This demo shows how to get a dataset of secondary structure segments.
In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
In [2]:
# Run Spark in local mode ("local[*]" uses as many worker threads as there
# are logical cores) under a descriptive application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("secondaryStructureSegmentDemo")
)
sc = SparkContext(conf=conf)
In [3]:
# Download the MMTF data for PDB entry 1STP and cache the result so the
# following cells can reuse it without downloading again.
pdb = mmtfReader.download_mmtf_files(['1STP'], sc).cache()
In [4]:
# Expand each structure into its polymer chains, then keep only the chains
# accepted by the ContainsLProteinChain filter.
pdb = (
    pdb
    .flatMap(StructureToPolymerChains())
    .filter(ContainsLProteinChain())
)
In [5]:
# Length of each sequence segment in the resulting dataset (the output
# below shows 25-character sequences).
segmentLength = 25

# Build the dataset of sequence segments with their Q8/Q3 labels.
ds = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)

# Print up to 50 rows without truncating long column values.
ds.show(n=50, truncate=False)
+----------------+-------------------------+-------+-------+
|structureChainId|sequence |labelQ8|labelQ3|
+----------------+-------------------------+-------+-------+
|1STP.A |DPSKDSKAQVSAAEAGITGTWYNQL|C |C |
|1STP.A |PSKDSKAQVSAAEAGITGTWYNQLG|H |H |
|1STP.A |SKDSKAQVSAAEAGITGTWYNQLGS|H |H |
|1STP.A |KDSKAQVSAAEAGITGTWYNQLGST|H |H |
|1STP.A |DSKAQVSAAEAGITGTWYNQLGSTF|H |H |
|1STP.A |SKAQVSAAEAGITGTWYNQLGSTFI|C |C |
|1STP.A |KAQVSAAEAGITGTWYNQLGSTFIV|E |E |
|1STP.A |AQVSAAEAGITGTWYNQLGSTFIVT|E |E |
|1STP.A |QVSAAEAGITGTWYNQLGSTFIVTA|E |E |
|1STP.A |VSAAEAGITGTWYNQLGSTFIVTAG|E |E |
|1STP.A |SAAEAGITGTWYNQLGSTFIVTAGA|E |E |
|1STP.A |AAEAGITGTWYNQLGSTFIVTAGAD|T |C |
|1STP.A |AEAGITGTWYNQLGSTFIVTAGADG|T |C |
|1STP.A |EAGITGTWYNQLGSTFIVTAGADGA|C |C |
|1STP.A |AGITGTWYNQLGSTFIVTAGADGAL|C |C |
|1STP.A |GITGTWYNQLGSTFIVTAGADGALT|E |E |
|1STP.A |ITGTWYNQLGSTFIVTAGADGALTG|E |E |
|1STP.A |TGTWYNQLGSTFIVTAGADGALTGT|E |E |
|1STP.A |GTWYNQLGSTFIVTAGADGALTGTY|E |E |
|1STP.A |TWYNQLGSTFIVTAGADGALTGTYE|E |E |
|1STP.A |WYNQLGSTFIVTAGADGALTGTYES|E |E |
|1STP.A |YNQLGSTFIVTAGADGALTGTYESA|C |C |
|1STP.A |NQLGSTFIVTAGADGALTGTYESAV|T |C |
|1STP.A |QLGSTFIVTAGADGALTGTYESAVG|T |C |
|1STP.A |LGSTFIVTAGADGALTGTYESAVGN|S |C |
|1STP.A |GSTFIVTAGADGALTGTYESAVGNA|E |E |
|1STP.A |STFIVTAGADGALTGTYESAVGNAE|E |E |
|1STP.A |TFIVTAGADGALTGTYESAVGNAES|E |E |
|1STP.A |FIVTAGADGALTGTYESAVGNAESR|E |E |
|1STP.A |IVTAGADGALTGTYESAVGNAESRY|E |E |
|1STP.A |VTAGADGALTGTYESAVGNAESRYV|E |E |
|1STP.A |TAGADGALTGTYESAVGNAESRYVL|E |E |
|1STP.A |AGADGALTGTYESAVGNAESRYVLT|C |C |
|1STP.A |GADGALTGTYESAVGNAESRYVLTG|S |C |
|1STP.A |ADGALTGTYESAVGNAESRYVLTGR|S |C |
|1STP.A |DGALTGTYESAVGNAESRYVLTGRY|S |C |
|1STP.A |GALTGTYESAVGNAESRYVLTGRYD|S |C |
|1STP.A |ALTGTYESAVGNAESRYVLTGRYDS|C |C |
|1STP.A |LTGTYESAVGNAESRYVLTGRYDSA|C |C |
|1STP.A |TGTYESAVGNAESRYVLTGRYDSAP|S |C |
|1STP.A |GTYESAVGNAESRYVLTGRYDSAPA|C |C |
|1STP.A |TYESAVGNAESRYVLTGRYDSAPAT|E |E |
|1STP.A |YESAVGNAESRYVLTGRYDSAPATD|E |E |
|1STP.A |ESAVGNAESRYVLTGRYDSAPATDG|E |E |
|1STP.A |SAVGNAESRYVLTGRYDSAPATDGS|E |E |
|1STP.A |AVGNAESRYVLTGRYDSAPATDGSG|E |E |
|1STP.A |VGNAESRYVLTGRYDSAPATDGSGT|E |E |
|1STP.A |GNAESRYVLTGRYDSAPATDGSGTA|E |E |
|1STP.A |NAESRYVLTGRYDSAPATDGSGTAL|C |C |
|1STP.A |AESRYVLTGRYDSAPATDGSGTALG|S |C |
+----------------+-------------------------+-------+-------+
only showing top 50 rows
In [6]:
# Shut down the SparkContext created at the start of the demo.
sc.stop()