Source code for mmtfPyspark.datasets.polymerSequenceExtractor

#!/usr/bin/env python
'''polymerSequenceExtractor.py:

Creates a dataset of polymer sequences using the full sequence
used in the experiment (i.e., the "SEQRES" record in PDB files).

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Debug"

from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.mappers import StructureToPolymerSequences
from pyspark.sql import Row


def get_dataset(structures):
    '''Builds a dataset of the polymer sequences contained in PDB entries,
    using the full sequence used in the experiment (i.e., the "SEQRES"
    record in PDB files).

    Parameters
    ----------
    structures : pythonRDD
        a set of PDB structures

    Returns
    -------
    dataset
        dataset with the two columns "structureChainId" and "sequence"
    '''
    column_names = ["structureChainId", "sequence"]

    # flatMap flattens whatever the mapper yields for each structure into
    # a single RDD of records.
    chain_records = structures.flatMap(StructureToPolymerSequences())

    # Only the first two fields of each record are kept; they line up
    # positionally with column_names.
    sequence_rows = chain_records.map(
        lambda record: Row(record[0], record[1]))

    return pythonRDDToDataset.get_dataset(sequence_rows, column_names)