Source code for mmtfPyspark.datasets.jpredDataset
'''jpredDataset.py
This module downloads the dataset used to train the secondary structure predictor.
It can be used as a reference dataset for machine learning applications.
The dataset includes the ScopID, the sequence, the DSSP secondary structure assignment,
and a flag that indicates whether the data point was part of the training set.
References
----------
- `JPred4 <http://www.compbio.dundee.ac.uk/jpred/about_RETR_JNetv231_details.shtml>`_
'''
# Module metadata: author, maintainer, contact e-mail, version and status.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
import urllib.request
import tarfile
from pyspark.sql import Row
from pyspark import SparkContext
from mmtfPyspark.ml import pythonRDDToDataset
def _read_record(member_file):
    '''Reads the two-line record at the start of an archive member.

    Parameters
    ----------
    member_file
        binary file object positioned at the start of the member

    Returns
    -------
    tuple
        (first line without its leading marker character, second line),
        both decoded to ``str`` with the line ending removed
    '''
    # Decode the bytes properly instead of slicing ``str(bytes)``: slicing the
    # repr drops real characters when a line has no trailing newline and
    # leaves escape sequences behind for CRLF or non-ASCII data.
    header = member_file.readline().decode("utf-8", errors="replace")
    payload = member_file.readline().decode("utf-8", errors="replace")
    return header.rstrip("\r\n")[1:], payload.rstrip("\r\n")


def get_dataset():
    '''Gets JPred 4/JNet (v.2.3.1) secondary structure dataset.

    Downloads the ``retr231.tar.gz`` archive, reads the first two lines of
    every ``.dssp`` and ``.fasta`` member and joins the records on their id.
    In the secondary structure string every ``-`` is replaced by ``C``.

    Returns
    -------
    dataset
        secondaryStructure dataset with the columns ``scopID``, ``sequence``,
        ``secondaryStructure`` and ``trained`` (the strings ``"true"`` for
        members under ``training/`` and ``"false"`` for members under
        ``blind/``)

    Raises
    ------
    KeyError
        if an id read from a ``.fasta`` member has no matching ``.dssp``
        record, or the member is in neither ``training/`` nor ``blind/``
    '''
    URL = "http://www.compbio.dundee.ac.uk/jpred/downloads/retr231.tar.gz"

    secondaryStructures, sequences, trained = {}, {}, {}

    # tarfile does not close a caller-supplied fileobj, so the HTTP response
    # gets its own context manager to release the connection.
    with urllib.request.urlopen(URL) as instream:
        with tarfile.open(fileobj=instream, mode="r:gz") as tf:
            for entry in tf:
                if entry.isdir():
                    continue

                is_dssp = ".dssp" in entry.name
                is_fasta = ".fasta" in entry.name
                if not (is_dssp or is_fasta):
                    continue

                br = tf.extractfile(entry)
                if br is None:
                    # Not a regular file (or a link to one): nothing to read.
                    continue

                with br:
                    if is_dssp:
                        scopID, secondaryStructure = _read_record(br)
                        secondaryStructures[scopID] = \
                            secondaryStructure.replace('-', 'C')

                    if is_fasta:
                        scopID, sequence = _read_record(br)
                        sequences[scopID] = sequence

                        if "training/" in entry.name:
                            trained[scopID] = "true"
                        elif "blind/" in entry.name:
                            trained[scopID] = "false"

    # Every id that has a sequence becomes one row; iterating the dict keeps
    # the rows in archive order instead of arbitrary set order.
    res = [
        Row(scopId, sequences[scopId],
            secondaryStructures[scopId], trained[scopId])
        for scopId in sequences
    ]

    sc = SparkContext.getOrCreate()
    data = sc.parallelize(res)
    colNames = ["scopID", "sequence", "secondaryStructure", "trained"]

    return pythonRDDToDataset.get_dataset(data, colNames)