This Demo is a simple example of using Dataset operations to create a datset
In [12]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, when
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader
In [13]:
def add_protein_fold_type(data, minThreshold, maxThreshold):
'''
Adds a column "foldType" with three major secondary structure class:
"alpha", "beta", "alpha+beta", and "other" based upon the fraction of alpha/beta content.
The simplified syntax used in this method relies on two imports:
from pyspark.sql.functions import when
from pyspark.sql.functions import col
Attributes:
data (Dataset<Row>): input dataset with alpha, beta composition
minThreshold (float): below this threshold, the secondary structure is ignored
maxThreshold (float): above this threshold, the secondary structure is ignored
'''
return data.withColumn("foldType", \
when((col("alpha") > maxThreshold) & (col("beta") < minThreshold), "alpha"). \
when((col("beta") > maxThreshold) & (col("alpha") < minThreshold), "beta"). \
when((col("alpha") > maxThreshold) & (col("beta") > minThreshold), "alpha+beta"). \
otherwise("other")\
)
In [14]:
conf = SparkConf() \
.setMaster("local[*]") \
.setAppName("ProteinFoldDatasetCreatorDemo")
sc = SparkContext(conf = conf)
Create non-redundant set (<=40% seq. identity) if L-protein chains
In [15]:
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 40
resolution = 2.0
pdb = mmtfReader \
.read_sequence_file(path, sc) \
.filter(Pisces(sequenceIdentity, resolution)) \
.flatMap(StructureToPolymerChains()) \
.filter(Pisces(sequenceIdentity, resolution)) \
.filter(ContainsLProteinChain())
In [16]:
data = secondaryStructureExtractor.get_dataset(pdb)
In [17]:
minThreshold = 0.05
maxThreshold = 0.15
data = add_protein_fold_type(data, minThreshold, maxThreshold)
In [19]:
encoder = ProteinSequenceEncoder(data)
n = 2 # Create 2-grams
windowSize = 25 # 25-amino residue window size for Word2Vec
vectorSize = 50 # dimension of feature vector
# overlapping_ngram_word2vec_encode uses keyword attributes
data = encoder.overlapping_ngram_word2vec_encode(n = n, windowSize = windowSize, vectorSize=vectorSize).cache()
data.printSchema()
data.show(10)
root
|-- structureChainId: string (nullable = false)
|-- sequence: string (nullable = false)
|-- alpha: float (nullable = false)
|-- beta: float (nullable = false)
|-- coil: float (nullable = false)
|-- dsspQ8Code: string (nullable = false)
|-- dsspQ3Code: string (nullable = false)
|-- foldType: string (nullable = false)
|-- ngram: array (nullable = true)
| |-- element: string (containsNull = true)
|-- features: vector (nullable = true)
+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
|structureChainId| sequence| alpha| beta| coil| dsspQ8Code| dsspQ3Code| foldType| ngram| features|
+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
| 1FEC.B|SRAYDLVVIGAGSGGLE...| 0.32783505|0.27216494| 0.4|XCCEEEEEECCSHHHHH...|XCCEEEEEECCCHHHHH...|alpha+beta|[SR, RA, AY, YD, ...|[0.83933154100662...|
| 1FGY.A|TFFNPDREGWLLKLGGR...| 0.13385826|0.37007874| 0.496063|CCSSCSEEEEEEEECSS...|CCCCCCEEEEEEEECCC...| other|[TF, FF, FN, NP, ...|[0.71180595034393...|
| 1FHG.A|ISGMSGRKASGSSPTSP...|0.029411765| 0.5980392|0.37254903|XXXXXXXXXXXXXXXXX...|XXXXXXXXXXXXXXXXX...| beta|[IS, SG, GM, MS, ...|[0.72951849662010...|
| 1FIP.B|MFEQRVNSDVLTVSTVN...| 0.72602737| 0.0| 0.2739726|XXXXXXXXXXXXXXXXX...|XXXXXXXXXXXXXXXXX...| alpha|[MF, FE, EQ, QR, ...|[0.91695108936293...|
| 1FIT.A|MSFRFGQHLIKPSVVFL...| 0.34126985|0.25396827| 0.4047619|XCEEETTEEECGGGEEE...|XCEEECCEEECHHHEEE...|alpha+beta|[MS, SF, FR, RF, ...|[0.95015770865425...|
| 1FIU.D|MQPLFTQERRIFHKKLL...| 0.48951048|0.12937063| 0.3811189|CCSHHHHHHHHHHHHHH...|CCCHHHHHHHHHHHHHH...|alpha+beta|[MQ, QP, PL, LF, ...|[0.95298244105325...|
| 1FL2.A|AYDVLIVGSGPAGAAAA...| 0.2580645|0.30967742|0.43225807|CEEEEEECCSHHHHHHH...|CEEEEEECCCHHHHHHH...|alpha+beta|[AY, YD, DV, VL, ...|[0.88702770774508...|
| 1FLT.Y|GRPFVEMYSEIPEIIHM...|0.031914894| 0.4787234| 0.4893617|CCCBSSCCCSSCEEEEE...|CCCECCCCCCCCEEEEE...| beta|[GR, RP, PF, FV, ...|[0.94899295706381...|
| 1FM0.D|MIKVLFFAQVRELVGTD...| 0.28395063|0.27160493|0.44444445|CEEEEECHHHHHHHTCS...|CEEEEECHHHHHHHCCC...|alpha+beta|[MI, IK, KV, VL, ...|[1.00496734850457...|
| 1FM0.E|MAETKIVVGPQPFSVGE...| 0.36619717|0.35915494| 0.2746479|XCCEEEEEESSCCCHHH...|XCCEEEEEECCCCCHHH...|alpha+beta|[MA, AE, ET, TK, ...|[0.82849135274855...|
+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
only showing top 10 rows
In [20]:
data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features")
data.show(10)
+----------------+-----------+----------+----------+----------+--------------------+
|structureChainId| alpha| beta| coil| foldType| features|
+----------------+-----------+----------+----------+----------+--------------------+
| 1FEC.B| 0.32783505|0.27216494| 0.4|alpha+beta|[0.83933154100662...|
| 1FGY.A| 0.13385826|0.37007874| 0.496063| other|[0.71180595034393...|
| 1FHG.A|0.029411765| 0.5980392|0.37254903| beta|[0.72951849662010...|
| 1FIP.B| 0.72602737| 0.0| 0.2739726| alpha|[0.91695108936293...|
| 1FIT.A| 0.34126985|0.25396827| 0.4047619|alpha+beta|[0.95015770865425...|
| 1FIU.D| 0.48951048|0.12937063| 0.3811189|alpha+beta|[0.95298244105325...|
| 1FL2.A| 0.2580645|0.30967742|0.43225807|alpha+beta|[0.88702770774508...|
| 1FLT.Y|0.031914894| 0.4787234| 0.4893617| beta|[0.94899295706381...|
| 1FM0.D| 0.28395063|0.27160493|0.44444445|alpha+beta|[1.00496734850457...|
| 1FM0.E| 0.36619717|0.35915494| 0.2746479|alpha+beta|[0.82849135274855...|
+----------------+-----------+----------+----------+----------+--------------------+
only showing top 10 rows
In [21]:
sc.stop()