Source code for mmtfPyspark.datasets.secondaryStructureExtractor
#!/user/bin/env python
'''secondaryStructureExtractor.py
Creates a dataset of DSSP secondary structure assignments. The dataset
includes protein sequence, the DSSP 3-state (Q3) and 8-state (Q8)
assignments, and the fraction of alpha, beta, and coil within a chain. The
input to this class must be a single protein chain.
Examples
--------
get dataset of secondary structure assignments:
>>> pdb.flatMapToPair(new StructureToPolymerChains())
... .filter(new ContainsLProteinChain())
>>> secStruct = SecondaryStructureExtractor.getDataset(pdb)
>>> secStruct.show(10)
'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.utils import DsspSecondaryStructure
from pyspark.sql import Row
[docs]def get_dataset(structure):
'''Returns a dataset with protein sequence and secondary structure assignments.
Parameters
----------
structure : mmtfStructure
single protein chain
Returns
-------
dataset
dataset with sequence and secondary structure assignments
'''
rows = structure.map(
lambda x: _get_sec_struct_fractions(x)) # Map or flatMap
# convert to dataset
colNames = ["structureChainId", "sequence", "alpha", "beta",
"coil", "dsspQ8Code", "dsspQ3Code"]
return pythonRDDToDataset.get_dataset(rows, colNames)
[docs]def get_python_rdd(structure):
'''Returns a pythonRDD of 3-state secondary structure
Parameters
----------
structure : mmtfStructure
'''
return structure.map(lambda x: _get_sec_struct_fractions(x))
def _get_sec_struct_fractions(t):
'''Get factions of alpha, beta and coil within a chain
'''
key = t[0]
structure = t[1]
if structure.num_chains != 1:
raise Exception(
"This method can only be applied to single polyer chain.")
dsspQ8, dsspQ3 = '', ''
helix = 0
sheet = 0
coil = 0
dsspIndex = 0
structureIndex = 0
for code in structure.sec_struct_list:
seqIndex = structure.sequence_index_list[structureIndex]
while dsspIndex < seqIndex:
dsspQ3 += "X"
dsspQ8 += "X"
dsspIndex += 1
structureIndex += 1
dsspQ8 += DsspSecondaryStructure.get_dssp_code(
code).get_one_letter_code()
dsspIndex += 1
q3 = DsspSecondaryStructure.get_q3_code(code).name
if q3 == "ALPHA_HELIX":
helix += 1
dsspQ3 += "H"
elif q3 == "EXTENDED":
sheet += 1
dsspQ3 += "E"
elif q3 == "COIL":
coil += 1
dsspQ3 += "C"
while dsspIndex < len(structure.entity_list[0]['sequence']):
dsspQ8 += "X"
dsspQ3 += "X"
dsspIndex += 1
n = len(structure.sec_struct_list)
helix /= n
sheet /= n
coil /= n
return Row(key, structure.entity_list[0]['sequence'], helix, sheet,
coil, dsspQ8, dsspQ3)