Source code for mmtfPyspark.datasets.secondaryStructureElementExtractor
#!/user/bin/env python
'''secondaryStructureElementExtractor.py
Returns a datset of continuous segments of protein sequence with the specified
DSSP secondary structure code (E, H, C) of a minimum length.
Examples
--------
+-------------+-----+
|sequence |label|
+-------------+-----+
|TFIVTA |E |
|ALTGTYE |E |
+-------------+-----+
'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.mappers import StructureToSecondaryStructureElements
from mmtfPyspark.datasets import secondaryStructureExtractor
[docs]def get_dataset(structure, label, length=None):
'''Returns a dataset of continuous segments of protein sequence with the
specified DSSP secondary structure code (E, H, C) of a minimum length.
Parameters
----------
structure : structure
label : str
DSSP secondary structure label (E, H, C)
length : int
minimum length of secondary structure segment
Returns
-------
dataset
dataset of continuous segments of protein sequence
'''
colNames = ["sequence", "label"]
if length == None:
rows = secondaryStructureExtractor.get_python_rdd(structure) \
.flatMap(StructureToSecondaryStructureElements(label))
return pythonRDDToDataset.get_dataset(rows, colNames)
else:
rows = secondaryStructureExtractor.get_python_rdd(structure) \
.flatMap(StructureToSecondaryStructureElements(label, length))
return pythonRDDToDataset.get_dataset(rows, colNames)