Secondary Structure Elements Word2Vec Encoder Demo

This demo creates a dataset by extracting the secondary structure elements labeled “H”, then encodes each extracted sequence as an overlapping n-gram Word2Vec feature vector.

Imports

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureElementExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader

Configure Spark Context

In [2]:
# Build the Spark configuration: local master using all available cores,
# with an application name that identifies this demo.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructureElementsWord2VecEncoderDemo")
)

sc = SparkContext(conf=conf)

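## Read MMTF Hadoop sequence file and sample L-protein chains

Read the structures, split them into polymer chains, keep only the L-protein chains, and draw a random 5% sample. Note: the cell below does not apply a sequence-identity filter (e.g. Pisces), so the resulting set is not guaranteed to be non-redundant (<=20% seq. identity).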

In [3]:
# Input location and sampling parameters.
path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05  # fraction of chains kept by the random sample
seed = 123       # fixed seed so the sample is reproducible

# Read the sequence file, split each structure into polymer chains,
# keep the L-protein chains, then sample without replacement.
pdb = (
    mmtfReader
    .read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains(False, True))
    .filter(ContainsLProteinChain())
    .sample(withReplacement=False, fraction=fraction, seed=seed)
)

Extract Element “H” from Secondary Structure

In [4]:
# Secondary structure label to extract.
label = "H"

# Build the dataset of elements with that label and cache it, since it is
# both counted and displayed below.
data = secondaryStructureElementExtractor.get_dataset(pdb, label)
data = data.cache()

print(f"original data   : {data.count()}")
# Show the first 10 rows without truncating the column values.
data.show(n=10, truncate=False)
original data   : 3297
+-----------------------------+-----+
|sequence                     |label|
+-----------------------------+-----+
|ACAGV                        |H    |
|GIGLHLAVRLA                  |H    |
|RLWEAARA                     |H    |
|KSVAAARE                     |H    |
|EDAVASVLDVNVVGTVRMLQAFLPDMKRR|H    |
|VYCASKFALEGLCESLAVLLLPF      |H    |
|IHTFHRFYQYLALSKQVFREA        |H    |
|EEVAEVFLTALR                 |H    |
|LPLLRMRL                     |H    |
|NYVTAMHREVF                  |H    |
+-----------------------------+-----+
only showing top 10 rows

Word2Vec encoded feature Vector

In [6]:
# Encoding parameters.
segmentLength = 11
n = 2            # n-gram length (the output below shows overlapping 2-mers)
# Use integer (floor) division: the window size is a whole number of tokens.
# True division ("/") yields the float 5.0 under Python 3 instead of the int 5.
windowSize = (segmentLength - 1) // 2
vectorSize = 50  # requested Word2Vec vector size

encoder = ProteinSequenceEncoder(data)
# Arguments are passed by keyword so each value is explicitly bound to the
# parameter it is meant for.
# NOTE: this rebinds `data` to the encoded dataset, so the cell is not
# idempotent -- re-run the extraction cell before re-running this one.
data = encoder.overlapping_ngram_word2vec_encode(
    n=n,
    windowSize=windowSize,
    vectorSize=vectorSize,
)

data.show(5)
+--------------------+-----+--------------------+--------------------+
|            sequence|label|               ngram|            features|
+--------------------+-----+--------------------+--------------------+
|               ACAGV|    H|    [AC, CA, AG, GV]|[0.46121373027563...|
|         GIGLHLAVRLA|    H|[GI, IG, GL, LH, ...|[-0.2606903441250...|
|            RLWEAARA|    H|[RL, LW, WE, EA, ...|[0.16112836982522...|
|            KSVAAARE|    H|[KS, SV, VA, AA, ...|[1.15827076775687...|
|EDAVASVLDVNVVGTVR...|    H|[ED, DA, AV, VA, ...|[0.37046241248026...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows

Terminate Spark Context

In [7]:
# Stop the SparkContext created in the configuration cell now that the demo is done.
sc.stop()