Protein Fold Dataset Creator Dmeo

This Demo is a simple example of using Dataset operations to create a datset

Imports

In [12]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, when
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader

Define addProteinFoldType function

In [13]:
def add_protein_fold_type(data, minThreshold, maxThreshold):
    '''
    Adds a column "foldType" with three major secondary structure class:
    "alpha", "beta", "alpha+beta", and "other" based upon the fraction of alpha/beta content.

    The simplified syntax used in this method relies on two imports:
        from pyspark.sql.functions import when
        from pyspark.sql.functions import col

    Attributes:
        data (Dataset<Row>): input dataset with alpha, beta composition
        minThreshold (float): below this threshold, the secondary structure is ignored
        maxThreshold (float): above this threshold, the secondary structure is ignored
    '''

    return data.withColumn("foldType", \
                           when((col("alpha") > maxThreshold) & (col("beta") < minThreshold), "alpha"). \
                           when((col("beta") > maxThreshold) & (col("alpha") < minThreshold), "beta"). \
                           when((col("alpha") > maxThreshold) & (col("beta") > minThreshold), "alpha+beta"). \
                           otherwise("other")\
                           )

Configure Spark Context

In [14]:
conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("ProteinFoldDatasetCreatorDemo")

sc = SparkContext(conf = conf)

Read MMTF Hadoop sequence file

Create non-redundant set (<=40% seq. identity) if L-protein chains

In [15]:
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 40
resolution = 2.0

pdb = mmtfReader \
        .read_sequence_file(path, sc) \
        .filter(Pisces(sequenceIdentity, resolution)) \
        .flatMap(StructureToPolymerChains()) \
        .filter(Pisces(sequenceIdentity, resolution)) \
        .filter(ContainsLProteinChain())

Get secondary structure content

In [16]:
data = secondaryStructureExtractor.get_dataset(pdb)

Classify chains by secondary structure type

In [17]:
minThreshold = 0.05
maxThreshold = 0.15
data = add_protein_fold_type(data, minThreshold, maxThreshold)

Add Word2Vec encoded feature vector

In [19]:
encoder = ProteinSequenceEncoder(data)
n = 2 # Create 2-grams
windowSize = 25 # 25-amino residue window size for Word2Vec
vectorSize = 50 # dimension of feature vector
# overlapping_ngram_word2vec_encode uses keyword attributes
data = encoder.overlapping_ngram_word2vec_encode(n = n, windowSize = windowSize, vectorSize=vectorSize).cache()

data.printSchema()
data.show(10)
root
 |-- structureChainId: string (nullable = false)
 |-- sequence: string (nullable = false)
 |-- alpha: float (nullable = false)
 |-- beta: float (nullable = false)
 |-- coil: float (nullable = false)
 |-- dsspQ8Code: string (nullable = false)
 |-- dsspQ3Code: string (nullable = false)
 |-- foldType: string (nullable = false)
 |-- ngram: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
|structureChainId|            sequence|      alpha|      beta|      coil|          dsspQ8Code|          dsspQ3Code|  foldType|               ngram|            features|
+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
|          1FEC.B|SRAYDLVVIGAGSGGLE...| 0.32783505|0.27216494|       0.4|XCCEEEEEECCSHHHHH...|XCCEEEEEECCCHHHHH...|alpha+beta|[SR, RA, AY, YD, ...|[0.83933154100662...|
|          1FGY.A|TFFNPDREGWLLKLGGR...| 0.13385826|0.37007874|  0.496063|CCSSCSEEEEEEEECSS...|CCCCCCEEEEEEEECCC...|     other|[TF, FF, FN, NP, ...|[0.71180595034393...|
|          1FHG.A|ISGMSGRKASGSSPTSP...|0.029411765| 0.5980392|0.37254903|XXXXXXXXXXXXXXXXX...|XXXXXXXXXXXXXXXXX...|      beta|[IS, SG, GM, MS, ...|[0.72951849662010...|
|          1FIP.B|MFEQRVNSDVLTVSTVN...| 0.72602737|       0.0| 0.2739726|XXXXXXXXXXXXXXXXX...|XXXXXXXXXXXXXXXXX...|     alpha|[MF, FE, EQ, QR, ...|[0.91695108936293...|
|          1FIT.A|MSFRFGQHLIKPSVVFL...| 0.34126985|0.25396827| 0.4047619|XCEEETTEEECGGGEEE...|XCEEECCEEECHHHEEE...|alpha+beta|[MS, SF, FR, RF, ...|[0.95015770865425...|
|          1FIU.D|MQPLFTQERRIFHKKLL...| 0.48951048|0.12937063| 0.3811189|CCSHHHHHHHHHHHHHH...|CCCHHHHHHHHHHHHHH...|alpha+beta|[MQ, QP, PL, LF, ...|[0.95298244105325...|
|          1FL2.A|AYDVLIVGSGPAGAAAA...|  0.2580645|0.30967742|0.43225807|CEEEEEECCSHHHHHHH...|CEEEEEECCCHHHHHHH...|alpha+beta|[AY, YD, DV, VL, ...|[0.88702770774508...|
|          1FLT.Y|GRPFVEMYSEIPEIIHM...|0.031914894| 0.4787234| 0.4893617|CCCBSSCCCSSCEEEEE...|CCCECCCCCCCCEEEEE...|      beta|[GR, RP, PF, FV, ...|[0.94899295706381...|
|          1FM0.D|MIKVLFFAQVRELVGTD...| 0.28395063|0.27160493|0.44444445|CEEEEECHHHHHHHTCS...|CEEEEECHHHHHHHCCC...|alpha+beta|[MI, IK, KV, VL, ...|[1.00496734850457...|
|          1FM0.E|MAETKIVVGPQPFSVGE...| 0.36619717|0.35915494| 0.2746479|XCCEEEEEESSCCCHHH...|XCCEEEEEECCCCCHHH...|alpha+beta|[MA, AE, ET, TK, ...|[0.82849135274855...|
+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
only showing top 10 rows

Keep only a subset of relevant fields for futher processing

In [20]:
data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features")

data.show(10)
+----------------+-----------+----------+----------+----------+--------------------+
|structureChainId|      alpha|      beta|      coil|  foldType|            features|
+----------------+-----------+----------+----------+----------+--------------------+
|          1FEC.B| 0.32783505|0.27216494|       0.4|alpha+beta|[0.83933154100662...|
|          1FGY.A| 0.13385826|0.37007874|  0.496063|     other|[0.71180595034393...|
|          1FHG.A|0.029411765| 0.5980392|0.37254903|      beta|[0.72951849662010...|
|          1FIP.B| 0.72602737|       0.0| 0.2739726|     alpha|[0.91695108936293...|
|          1FIT.A| 0.34126985|0.25396827| 0.4047619|alpha+beta|[0.95015770865425...|
|          1FIU.D| 0.48951048|0.12937063| 0.3811189|alpha+beta|[0.95298244105325...|
|          1FL2.A|  0.2580645|0.30967742|0.43225807|alpha+beta|[0.88702770774508...|
|          1FLT.Y|0.031914894| 0.4787234| 0.4893617|      beta|[0.94899295706381...|
|          1FM0.D| 0.28395063|0.27160493|0.44444445|alpha+beta|[1.00496734850457...|
|          1FM0.E| 0.36619717|0.35915494| 0.2746479|alpha+beta|[0.82849135274855...|
+----------------+-----------+----------+----------+----------+--------------------+
only showing top 10 rows

Terminate Spark Context

In [21]:
sc.stop()