This demo shows how to create and query a DSSP dataset.
DSSP is a database of secondary structure assignments for all protein entries in the Protein Data Bank (PDB).
In [9]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.datasets import secondaryStructureExtractor
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
import time
In [10]:
# Configure Spark to run locally on all available cores.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("DSSPDemo")
)
# getOrCreate reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed.
sc = SparkContext.getOrCreate(conf)
In [11]:
# PDB entries to download; this demo uses the single entry 1STP.
pdbIds = ["1STP"]

# Download the MMTF files for the listed IDs and cache the result so that
# later cells do not trigger the download again.
structures = mmtfReader.download_mmtf_files(pdbIds, sc)
pdb = structures.cache()
In [12]:
# Flat-map every downloaded structure through StructureToPolymerChains.
# NOTE: this rebinds `pdb` from itself, so re-running this cell alone applies
# the mapper a second time; re-run the download cell first.
chain_mapper = StructureToPolymerChains()
pdb = pdb.flatMap(chain_mapper)
In [13]:
# Build the secondary-structure dataset from the chain records.
ds = secondaryStructureExtractor.get_dataset(pdb)

# Inspect the column layout, then print the first two rows without
# truncating the long sequence / DSSP code strings.
ds.printSchema()
ds.show(2, truncate=False)
root
|-- structureChainId: string (nullable = false)
|-- sequence: string (nullable = false)
|-- alpha: float (nullable = false)
|-- beta: float (nullable = false)
|-- coil: float (nullable = false)
|-- dsspQ8Code: string (nullable = false)
|-- dsspQ3Code: string (nullable = false)
+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|structureChainId|sequence |alpha |beta |coil |dsspQ8Code |dsspQ3Code |
+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1STP.A |DPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSAPATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNNGNPLDAVQQ|0.08264463|0.553719|0.36363637|XXXXXXXXXXXXCHHHHCEEEEETTCCEEEEEECTTSEEEEEEECSSSSCCSCEEEEEEECSSCCSSSCCEEEEEEEEEECSSCEEEEEEEEEEEEECSTTCEEEEEEEEEECCCGGGGGGCEEEEEEEEECCXXXXXXXXXXXXXXXXXXXXXXXXXX|XXXXXXXXXXXXCHHHHCEEEEECCCCEEEEEECCCCEEEEEEECCCCCCCCCEEEEEEECCCCCCCCCCEEEEEEEEEECCCCEEEEEEEEEEEEECCCCCEEEEEEEEEECCCHHHHHHCEEEEEEEEECCXXXXXXXXXXXXXXXXXXXXXXXXXX|
+----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
In [14]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()