# Source code for mmtfPyspark.datasets.uniProt

'''uniProt.py

This module downloads and reads UniProt sequence files in the FASTA format and
converts them to datasets. This module reads the following files:
- SWISS_PROT,
- TREMBL,
- UNIREF50,
- UNIREF90,
- UNIREF100.

References
----------
- `UniProt downloads <http://www.uniprot.org/downloads>`_
- The datasets have the following columns: http://www.uniprot.org/help/fasta-headers

Examples
--------
Download, read, and save the SWISS_PROT dataset:

>>> ds = uniProt.get_dataset(uniProt.SWISS_PROT)
>>> ds.printSchema()
>>> ds.show(5)
>>> ds.write.mode("overwrite").format("parquet").save(fileName)
'''

# Module metadata: authorship, contact address, version and release status.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

from collections import namedtuple
import tempfile
import urllib.request
import gzip
from pyspark.sql import SparkSession
from enum import Enum


# Root of the UniProt FTP tree; every dataset URL below is built from it.
baseUrl = "ftp://ftp.uniprot.org/pub/databases/uniprot/"

# Archives located under current_release/knowledgebase/complete/.
SWISS_PROT = f"{baseUrl}current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
TREMBL = f"{baseUrl}current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"

# Archives located under uniref/, one sub-directory per archive.
UNIREF50 = f"{baseUrl}uniref/uniref50/uniref50.fasta.gz"
UNIREF90 = f"{baseUrl}uniref/uniref90/uniref90.fasta.gz"
UNIREF100 = f"{baseUrl}uniref/uniref100/uniref100.fasta.gz"


def _parse_uniprot_header(line):
    '''Split one FASTA header line into its CSV column values.

    Parameters
    ----------
    line : str
       header line of the form ``db|uniqueIdentifier|entryName proteinName
       OS=... GN=... PE=... SV=...``; commas are expected to have been
       replaced already so the values are safe to write as CSV

    Returns
    -------
    tuple of str
       (db, uniqueIdentifier, entryName, proteinName, organismName,
       geneName, proteinExistence, sequenceVersion); a tag that is absent
       from the header yields an empty string

    Raises
    ------
    IndexError
       if the line has fewer than three ``|``-separated fields
    '''

    def take_tag(text, tag):
        # Everything before the first occurrence of `tag` is kept as the
        # remaining text; the piece right after it is the tag's value.
        pieces = text.split(tag)
        return pieces[0], (pieces[1] if len(pieces) > 1 else "")

    parts = line.split("|")
    # NOTE(review): the line is not stripped of its leading '>', so `db`
    # keeps it (e.g. ">sp") -- confirm whether that is intended.
    db = parts[0]
    uniqueIdentifier = parts[1]
    rest = parts[2]

    # When " OS=" occurs more than once, cut the description right after the
    # second " OS"-delimited piece; any tags following that point are dropped.
    if len(rest.split(" OS=")) > 2:
        os_pieces = rest.split(" OS")
        rest = rest[:len(os_pieces[0]) + len(os_pieces[1]) + 4]

    # Tags are peeled off from the end of the description towards the front.
    rest, sequenceVersion = take_tag(rest, " SV=")
    rest, proteinExistence = take_tag(rest, " PE=")
    rest, geneName = take_tag(rest, " GN=")
    rest, organismName = take_tag(rest, " OS=")

    # What is left is "<entryName> <proteinName...>".
    entryName = rest.split(" ")[0]
    proteinName = rest[len(entryName) + 1:]

    return (db, uniqueIdentifier, entryName, proteinName, organismName,
            geneName, proteinExistence, sequenceVersion)


def _uniprot_csv_row(fields, sequence_parts):
    '''Format parsed header fields plus sequence chunks as one CSV line.'''
    # NOTE(review): every space is removed from the row, so multi-word values
    # (protein and organism names) are written without spaces. Kept as-is to
    # leave the produced dataset unchanged -- confirm whether it is intended.
    row = ",".join(fields + ("".join(sequence_parts),))
    return row.replace(' ', '') + "\n"


def _get_uniprot_dataset(dataType):
    '''Download a gzip-compressed FASTA file and load it as a dataset.

    The file is streamed from the given URL, converted record by record into
    a temporary CSV file, and that file is read with Spark's CSV reader
    (header row, inferred schema).

    Parameters
    ----------
    dataType : str
       URL of the gzip-compressed FASTA file to download

    Returns
    -------
    dataset
       dataset with the columns db, uniqueIdentifier, entryName, proteinName,
       organismName, geneName, proteinExistence, sequenceVersion, sequence
    '''

    fields = None        # header fields of the record currently being read
    sequence_parts = []  # sequence lines collected for that record

    # delete=False: the CSV file is handed to Spark by path after this handle
    # is closed, so it must not be removed when the handle closes.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                     delete=False) as t:

        t.write(
            "db,uniqueIdentifier,entryName,proteinName,organismName,geneName,proteinExistence,sequenceVersion,sequence\n")

        with urllib.request.urlopen(dataType, timeout=6000) as inputStream:
            with gzip.GzipFile(fileobj=inputStream) as rd:

                for raw in rd:
                    # Decode the bytes properly instead of slicing their
                    # repr(), and drop only the line terminator.
                    line = raw.decode("utf-8", errors="replace").rstrip("\r\n")

                    if ">" in line:
                        # A new header closes the previous record, if any.
                        if fields is not None:
                            t.write(_uniprot_csv_row(fields, sequence_parts))

                        # Commas would break the CSV layout.
                        fields = _parse_uniprot_header(line.replace(",", ";"))
                        sequence_parts = []
                    else:
                        sequence_parts.append(line)

        # The final record has no following header to trigger its write.
        if fields is not None:
            t.write(_uniprot_csv_row(fields, sequence_parts))

    spark = SparkSession.builder.getOrCreate()

    dataset = spark.read \
                   .format("csv") \
                   .option("header", "true") \
                   .option("inferSchema", "true") \
                   .load(t.name)
    return dataset


def _parse_uniref_header(line):
    '''Split one FASTA header line into its CSV column values.

    Parameters
    ----------
    line : str
       header line of the form ``uniqueIdentifier clusterName n=... Tax=...
       TaxID=... RepID=...``; commas are expected to have been replaced
       already so the values are safe to write as CSV

    Returns
    -------
    tuple of str
       (uniqueIdentifier, clusterName, members, taxon, taxonID,
       representativeMember); a tag that is absent from the header yields an
       empty string
    '''

    def take_tag(text, tag):
        # Everything before the first occurrence of `tag` is kept as the
        # remaining text; the piece right after it is the tag's value.
        pieces = text.split(tag)
        return pieces[0], (pieces[1] if len(pieces) > 1 else "")

    # Tags are peeled off from the end of the header towards the front.
    rest, representativeMember = take_tag(line, " RepID=")
    rest, taxonID = take_tag(rest, " TaxID=")
    rest, taxon = take_tag(rest, " Tax=")
    rest, members = take_tag(rest, " n=")

    # What is left is "<uniqueIdentifier> <clusterName...>".
    # NOTE(review): the line is not stripped of its leading '>', so the
    # identifier keeps it -- confirm whether that is intended.
    uniqueIdentifier = rest.split(" ")[0]
    clusterName = rest[len(uniqueIdentifier) + 1:]

    return (uniqueIdentifier, clusterName, members, taxon, taxonID,
            representativeMember)


def _uniref_csv_row(fields, sequence_parts):
    '''Format parsed header fields plus sequence chunks as one CSV line.'''
    # NOTE(review): every space is removed from the row, so multi-word values
    # (cluster and taxon names) are written without spaces. Kept as-is to
    # leave the produced dataset unchanged -- confirm whether it is intended.
    row = ",".join(fields + ("".join(sequence_parts),))
    return row.replace(' ', '') + "\n"


def _get_uniref_dataset(dataType):
    '''Download a gzip-compressed FASTA file and load it as a dataset.

    The file is streamed from the given URL, converted record by record into
    a temporary CSV file, and that file is read with Spark's CSV reader
    (header row, inferred schema).

    Parameters
    ----------
    dataType : str
       URL of the gzip-compressed FASTA file to download

    Returns
    -------
    dataset
       dataset with the columns uniqueIdentifier, clusterName, members,
       taxon, taxonID, representativeMember, sequence
    '''

    fields = None        # header fields of the record currently being read
    sequence_parts = []  # sequence lines collected for that record

    # delete=False: the CSV file is handed to Spark by path after this handle
    # is closed, so it must not be removed when the handle closes.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                     delete=False) as t:

        t.write(
            "uniqueIdentifier,clusterName,members,taxon,taxonID,representativeMember,sequence\n")

        # Same socket timeout as _get_uniprot_dataset, so a stalled
        # connection cannot block forever.
        with urllib.request.urlopen(dataType, timeout=6000) as inputStream:
            with gzip.GzipFile(fileobj=inputStream) as rd:

                for raw in rd:
                    # Decode the bytes properly instead of slicing their
                    # repr(), and drop only the line terminator.
                    line = raw.decode("utf-8", errors="replace").rstrip("\r\n")

                    if ">" in line:
                        # A new header closes the previous record, if any.
                        if fields is not None:
                            t.write(_uniref_csv_row(fields, sequence_parts))

                        # Commas would break the CSV layout.
                        fields = _parse_uniref_header(line.replace(",", ";"))
                        sequence_parts = []
                    else:
                        sequence_parts.append(line)

        # The final record has no following header to trigger its write.
        if fields is not None:
            t.write(_uniref_csv_row(fields, sequence_parts))

    spark = SparkSession.builder.getOrCreate()

    dataset = spark.read \
                   .format("csv") \
                   .option("header", "true") \
                   .option("inferSchema", "true") \
                   .load(t.name)
    return dataset


def get_dataset(UniProtDataset):
    '''Returns the specified UniProt dataset.

    The reader is selected from the third-from-last ``/``-separated segment
    of the given URL: ``uniref`` or ``knowledgebase``.

    Parameters
    ----------
    UniProtDataset : str
       URL of the UniProt dataset, normally one of the module-level
       constants (SWISS_PROT, TREMBL, UNIREF50, UNIREF90, UNIREF100)

    Returns
    -------
    dataset
       dataset with sequence and metadata

    Raises
    ------
    ValueError
       if the argument is not a string or its third-from-last path segment
       is neither ``uniref`` nor ``knowledgebase``
    '''

    # Guard against non-string input and paths with fewer than three
    # segments so they produce the descriptive error below instead of an
    # AttributeError or IndexError.
    segments = UniProtDataset.split('/') if isinstance(UniProtDataset, str) else []
    family = segments[-3] if len(segments) >= 3 else None

    if family == "uniref":
        return _get_uniref_dataset(UniProtDataset)

    if family == "knowledgebase":
        return _get_uniprot_dataset(UniProtDataset)

    raise ValueError("Please use one of the pre-defined UniProt datasets, "
                     "e.g. uniProt.SWISS_PROT")