This demo shows how to build a dataset of secondary structure elements from the protein chains of a PDB entry (1STP).
In [10]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureElementExtractor
In [11]:
# Configure Spark to run locally ("local[*]" master) under this demo's app name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("secondaryStructureElementDemo")
)
# getOrCreate() reuses a SparkContext that is already running in this kernel,
# so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf)
In [12]:
# Download the MMTF data for PDB entry 1STP through the Spark context `sc`.
pdb = mmtfReader.download_mmtf_files(['1STP'], sc)
# Mark the downloaded data as cached so Spark can reuse it in later cells.
pdb = pdb.cache()
In [13]:
# NOTE: this cell rebinds `pdb`, so it is meant to be run once after the
# download cell; re-running it applies the mapper to already-mapped entries.
pdb = (
    pdb
    # Expand every structure into its polymer chains (one entry per chain).
    .flatMap(StructureToPolymerChains())
    # Keep only the chains accepted by the ContainsLProteinChain filter.
    .filter(ContainsLProteinChain())
)
In [14]:
# Extract the secondary structure elements from the filtered chains.
# NOTE(review): 'E' is presumably the DSSP code for extended strands and 6 the
# minimum element length. Every row in the recorded output is labeled E and
# has at least 6 residues. Confirm against the extractor's documentation.
ds = secondaryStructureElementExtractor.get_dataset(pdb, 'E', 6)

# Print up to 50 rows without truncating long sequence strings.
ds.show(n=50, truncate=False)
+-------------+-----+
|sequence |label|
+-------------+-----+
|TFIVTA |E |
|ALTGTYE |E |
|VLTGRY |E |
|TALGWTVAWK |E |
|NAHSATTWSGQYV|E |
|INTQWLLTS |E |
|TLVGHDTFT |E |
+-------------+-----+
In [15]:
# Shut down the SparkContext created at the top of the notebook now that the demo is done.
sc.stop()