Secondary Structure Segment Demo

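This demo shows how to get a dataset of secondary structure segments. Each row of the dataset contains a fixed-length sequence segment together with its Q8 and Q3 secondary structure labels.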

Imports

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor

Configure Spark

In [2]:
# Build the Spark configuration: master "local[*]" (local mode) and an
# application name that identifies this demo.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("secondaryStructureSegmentDemo")
)

# getOrCreate() returns the already-running SparkContext if one exists instead
# of raising an error, so this cell can be re-run without restarting the kernel.
sc = SparkContext.getOrCreate(conf=conf)

Download protein (1STP)

Note: The SparkContext must be passed as a parameter when downloading MMTF files.

In [3]:
# PDB IDs of the structures to download (a single entry for this demo).
pdb_ids = ['1STP']

# Download the MMTF data using the SparkContext and cache the result so later
# cells can reuse it (presumably an RDD of structures; confirm against the
# mmtfReader documentation).
pdb = mmtfReader.download_mmtf_files(pdb_ids, sc).cache()

Map protein to polymer chains and apply LProteinChain filter

In [4]:
# Step 1: map each downloaded structure to its polymer chains.
polymer_chains = pdb.flatMap(StructureToPolymerChains())

# Step 2: keep only the chains accepted by the ContainsLProteinChain filter.
# The name `pdb` is rebound because the following cell reads from it.
pdb = polymer_chains.filter(ContainsLProteinChain())

Extract secondary structure segments (segment length 25)

In [5]:
# Length of each sequence segment passed to the extractor.
segmentLength = 25

# Build the dataset of sequence segments with their labelQ8 / labelQ3 columns.
ds = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)

# Print up to 50 rows without truncating the column contents.
ds.show(n=50, truncate=False)
+----------------+-------------------------+-------+-------+
|structureChainId|sequence                 |labelQ8|labelQ3|
+----------------+-------------------------+-------+-------+
|1STP.A          |DPSKDSKAQVSAAEAGITGTWYNQL|C      |C      |
|1STP.A          |PSKDSKAQVSAAEAGITGTWYNQLG|H      |H      |
|1STP.A          |SKDSKAQVSAAEAGITGTWYNQLGS|H      |H      |
|1STP.A          |KDSKAQVSAAEAGITGTWYNQLGST|H      |H      |
|1STP.A          |DSKAQVSAAEAGITGTWYNQLGSTF|H      |H      |
|1STP.A          |SKAQVSAAEAGITGTWYNQLGSTFI|C      |C      |
|1STP.A          |KAQVSAAEAGITGTWYNQLGSTFIV|E      |E      |
|1STP.A          |AQVSAAEAGITGTWYNQLGSTFIVT|E      |E      |
|1STP.A          |QVSAAEAGITGTWYNQLGSTFIVTA|E      |E      |
|1STP.A          |VSAAEAGITGTWYNQLGSTFIVTAG|E      |E      |
|1STP.A          |SAAEAGITGTWYNQLGSTFIVTAGA|E      |E      |
|1STP.A          |AAEAGITGTWYNQLGSTFIVTAGAD|T      |C      |
|1STP.A          |AEAGITGTWYNQLGSTFIVTAGADG|T      |C      |
|1STP.A          |EAGITGTWYNQLGSTFIVTAGADGA|C      |C      |
|1STP.A          |AGITGTWYNQLGSTFIVTAGADGAL|C      |C      |
|1STP.A          |GITGTWYNQLGSTFIVTAGADGALT|E      |E      |
|1STP.A          |ITGTWYNQLGSTFIVTAGADGALTG|E      |E      |
|1STP.A          |TGTWYNQLGSTFIVTAGADGALTGT|E      |E      |
|1STP.A          |GTWYNQLGSTFIVTAGADGALTGTY|E      |E      |
|1STP.A          |TWYNQLGSTFIVTAGADGALTGTYE|E      |E      |
|1STP.A          |WYNQLGSTFIVTAGADGALTGTYES|E      |E      |
|1STP.A          |YNQLGSTFIVTAGADGALTGTYESA|C      |C      |
|1STP.A          |NQLGSTFIVTAGADGALTGTYESAV|T      |C      |
|1STP.A          |QLGSTFIVTAGADGALTGTYESAVG|T      |C      |
|1STP.A          |LGSTFIVTAGADGALTGTYESAVGN|S      |C      |
|1STP.A          |GSTFIVTAGADGALTGTYESAVGNA|E      |E      |
|1STP.A          |STFIVTAGADGALTGTYESAVGNAE|E      |E      |
|1STP.A          |TFIVTAGADGALTGTYESAVGNAES|E      |E      |
|1STP.A          |FIVTAGADGALTGTYESAVGNAESR|E      |E      |
|1STP.A          |IVTAGADGALTGTYESAVGNAESRY|E      |E      |
|1STP.A          |VTAGADGALTGTYESAVGNAESRYV|E      |E      |
|1STP.A          |TAGADGALTGTYESAVGNAESRYVL|E      |E      |
|1STP.A          |AGADGALTGTYESAVGNAESRYVLT|C      |C      |
|1STP.A          |GADGALTGTYESAVGNAESRYVLTG|S      |C      |
|1STP.A          |ADGALTGTYESAVGNAESRYVLTGR|S      |C      |
|1STP.A          |DGALTGTYESAVGNAESRYVLTGRY|S      |C      |
|1STP.A          |GALTGTYESAVGNAESRYVLTGRYD|S      |C      |
|1STP.A          |ALTGTYESAVGNAESRYVLTGRYDS|C      |C      |
|1STP.A          |LTGTYESAVGNAESRYVLTGRYDSA|C      |C      |
|1STP.A          |TGTYESAVGNAESRYVLTGRYDSAP|S      |C      |
|1STP.A          |GTYESAVGNAESRYVLTGRYDSAPA|C      |C      |
|1STP.A          |TYESAVGNAESRYVLTGRYDSAPAT|E      |E      |
|1STP.A          |YESAVGNAESRYVLTGRYDSAPATD|E      |E      |
|1STP.A          |ESAVGNAESRYVLTGRYDSAPATDG|E      |E      |
|1STP.A          |SAVGNAESRYVLTGRYDSAPATDGS|E      |E      |
|1STP.A          |AVGNAESRYVLTGRYDSAPATDGSG|E      |E      |
|1STP.A          |VGNAESRYVLTGRYDSAPATDGSGT|E      |E      |
|1STP.A          |GNAESRYVLTGRYDSAPATDGSGTA|E      |E      |
|1STP.A          |NAESRYVLTGRYDSAPATDGSGTAL|C      |C      |
|1STP.A          |AESRYVLTGRYDSAPATDGSGTALG|S      |C      |
+----------------+-------------------------+-------+-------+
only showing top 50 rows

Terminate Spark

In [6]:
# Stop the SparkContext now that the demo is finished.
sc.stop()