Source code for mmtfPyspark.datasets.polymerSequenceExtractor

#!/usr/bin/env python
'''polymerSequenceExtractor.py:

Creates a dataset of polymer sequences using the full sequence
used in the experiment (i.e., the "SEQRES" record in PDB files).

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Debug"

from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.mappers import StructureToPolymerSequences
from pyspark.sql import Row


def get_dataset(structures):
    '''Returns a dataset of polymer sequences contained in PDB entries
    using the full sequence used in the experiment
    (i.e., the "SEQRES" record in PDB files)

    Parameters
    ----------
    structures : pythonRDD
       a set of PDB structures

    Returns
    -------
    dataset
       dataset with the columns "structureChainId" and "sequence"
    '''

    # flatMap flattens the mapper's per-structure output into one record
    # per item; only the first two fields of each item are kept, in the
    # same order as the column names below.
    # NOTE(review): presumably one item per polymer chain - confirm against
    # StructureToPolymerSequences.
    rows = structures.flatMap(StructureToPolymerSequences()) \
                     .map(lambda x: Row(x[0], x[1]))

    colNames = ["structureChainId", "sequence"]

    return pythonRDDToDataset.get_dataset(rows, colNames)