Source code for mmtfPyspark.datasets.secondaryStructureSegmentExtractor

#!/usr/bin/env python
'''secondaryStructureSegmentExtractor.py:

This module creates a dataset of sequence segments of a specified length
and the associated secondary structure information. Sequence and secondary
structure strings are split into segments using a sliding window of the specified
segment length. The dataset contains the sequence segment and the DSSP Q8 and
DSSP Q3 secondary structure annotation of the center residue. Therefore, the segment
length must be an odd number.

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

from mmtfPyspark.datasets import secondaryStructureExtractor
from mmtfPyspark.mappers import StructureToSecondaryStructureSegments
from mmtfPyspark.ml import pythonRDDToDataset

def get_dataset(structureRDD, length):
    '''Returns a dataset of sequence segments of the specified length and
    the DSSP Q8 and Q3 code of the center residue in a segment.

    Parameters
    ----------
    structureRDD : structure
       structure RDD handed to secondaryStructureExtractor.get_python_rdd
    length : int
       segment length, must be a positive odd number

    Returns
    -------
    dataset
       dataset of segments with the columns "structureChainId",
       "sequence", "labelQ8" and "labelQ3"

    Raises
    ------
    ValueError
       if the segment length is an even number or is not positive
       (ValueError is a subclass of Exception, so callers that catch
       Exception keep working)
    '''
    # A segment is labelled by its center residue, so the length must be odd.
    if length % 2 == 0:
        raise ValueError("Segment length must be an odd number %i" % length)

    # Negative odd values (e.g. -3 % 2 == 1) slip through the parity check
    # above, so reject them explicitly before any Spark work is scheduled.
    if length < 1:
        raise ValueError(
            "Segment length must be a positive number %i" % length)

    rows = secondaryStructureExtractor.get_python_rdd(structureRDD) \
        .flatMap(StructureToSecondaryStructureSegments(length))

    colNames = ["structureChainId", "sequence", "labelQ8", "labelQ3"]

    return pythonRDDToDataset.get_dataset(rows, colNames)