# Source code for mmtfPyspark.datasets.jpredDataset

'''jpredDataset.py

This module downloads the dataset used to train the JPred secondary structure
predictor. It can be used as a reference dataset for machine learning applications.

This dataset includes the ScopID, sequence, DSSP secondary structure assignment,
and a flag that indicates whether the data point was part of the training set.

References
----------
- `JPred4 <http://www.compbio.dundee.ac.uk/jpred/about_RETR_JNetv231_details.shtml>`_

'''
# Module metadata: authorship, contact address, version and release status.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

import urllib.request
import tarfile
from pyspark.sql import Row
from pyspark import SparkContext
from mmtfPyspark.ml import pythonRDDToDataset


def _decode_line(raw):
    '''Decodes one raw line read from an archive member.

    Parameters
    ----------
    raw : bytes
       line as returned by ``readline()`` on a binary file object

    Returns
    -------
    str
       the line as text, without its trailing line ending
    '''
    # Decode the bytes instead of slicing their repr: slicing silently drops
    # real characters when the line has no trailing newline and leaves escape
    # text behind for "\r\n" endings.
    return raw.decode("ascii", errors="replace").rstrip("\r\n")


def _read_record(memberFile):
    '''Reads the two-line record at the start of an archive member.

    Parameters
    ----------
    memberFile : file object
       binary file object positioned at the start of the member

    Returns
    -------
    tuple
       (scopID, content): the first line without its leading marker
       character, and the second line
    '''
    header = _decode_line(memberFile.readline())
    content = _decode_line(memberFile.readline())
    # The first character of the header line is a marker and is not part of
    # the identifier.
    return header[1:], content


def get_dataset():
    '''Gets JPred 4/JNet (v.2.3.1) secondary structure dataset.

    The archive is downloaded and read member by member. Members whose name
    contains ".dssp" supply the secondary structure (with '-' replaced by
    'C'); members whose name contains ".fasta" supply the sequence. The
    "trained" flag is the string "true" for fasta members located under
    "training/" and "false" for those located under "blind/".

    Returns
    -------
    dataset
       secondaryStructure dataset with the columns "scopID", "sequence",
       "secondaryStructure" and "trained", one row per ScopID, built in
       sorted ScopID order

    Raises
    ------
    KeyError
       if a sequence has no matching secondary structure, or its fasta
       member is located under neither "training/" nor "blind/"
    '''

    url = "http://www.compbio.dundee.ac.uk/jpred/downloads/retr231.tar.gz"
    secondaryStructures, sequences, trained = {}, {}, {}

    # Both the HTTP response and the archive are closed on exit, including
    # when parsing fails part-way through.
    with urllib.request.urlopen(url) as instream, \
            tarfile.open(fileobj=instream, mode="r:gz") as tf:

        for entry in tf:
            if entry.isdir():
                continue
            br = tf.extractfile(entry)
            if br is None:
                # extractfile() returns None for members that are neither
                # regular files nor links; they carry no data to parse.
                continue

            if ".dssp" in entry.name:
                scopID, secondaryStructure = _read_record(br)
                secondaryStructures[scopID] = secondaryStructure.replace('-', 'C')

            if ".fasta" in entry.name:
                scopID, sequence = _read_record(br)
                sequences[scopID] = sequence

                if "training/" in entry.name:
                    trained[scopID] = "true"
                elif "blind/" in entry.name:
                    trained[scopID] = "false"

    # Every fasta member contributes one key to `sequences`; sorting the keys
    # gives the same row order on every run.
    res = [Row(scopId, sequences[scopId],
               secondaryStructures[scopId], trained[scopId])
           for scopId in sorted(sequences)]

    sc = SparkContext.getOrCreate()
    data = sc.parallelize(res)
    colNames = ["scopID", "sequence", "secondaryStructure", "trained"]

    return pythonRDDToDataset.get_dataset(data, colNames)