This demo creates a dataset by extracting secondary structure elements of type “H”, then encodes each sequence as an overlapping n-gram Word2Vec feature vector.
In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureElementExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader
In [2]:
# Configure a Spark application that runs locally ("local[*]" master).
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructureElementsWord2VecEncoderDemo")
)
# getOrCreate reuses an already-running SparkContext, so re-running this cell
# in a live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf)
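## Read MMTF Hadoop sequence file
Create a non-redundant set (<= 20% seq. identity) of L-protein chains.
Note: the cell below only keeps L-protein chains and takes a random sample; it does not apply a sequence-identity (Pisces) filter.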
In [3]:
# Input location and sampling parameters.
path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

# Read the MMTF sequence file, split each structure into polymer chains,
# keep only L-protein chains, then draw a seeded random sample
# (without replacement) so the demo runs quickly and reproducibly.
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains(False, True))
    .filter(ContainsLProteinChain())
    .sample(withReplacement=False, fraction=fraction, seed=seed)
)
In [4]:
# Secondary structure element type to extract.
label = "H"

# Build the dataset of extracted elements and cache it, since it is
# both counted and displayed below and reused by later cells.
extracted = secondaryStructureElementExtractor.get_dataset(pdb, label)
data = extracted.cache()

print(f"original data : {data.count()}")
data.show(n=10, truncate=False)
original data : 3297
+-----------------------------+-----+
|sequence |label|
+-----------------------------+-----+
|ACAGV |H |
|GIGLHLAVRLA |H |
|RLWEAARA |H |
|KSVAAARE |H |
|EDAVASVLDVNVVGTVRMLQAFLPDMKRR|H |
|VYCASKFALEGLCESLAVLLLPF |H |
|IHTFHRFYQYLALSKQVFREA |H |
|EEVAEVFLTALR |H |
|LPLLRMRL |H |
|NYVTAMHREVF |H |
+-----------------------------+-----+
only showing top 10 rows
In [6]:
# Encoding parameters.
segmentLength = 11
n = 2  # n-gram length; n=2 yields overlapping 2-grams such as "AC", "CA"
# Floor division keeps windowSize an int (5). True division (`/`) would
# produce the float 5.0 in Python 3, but a window size is an integer count.
windowSize = (segmentLength - 1) // 2
vectorSize = 50  # presumably the dimension of the Word2Vec feature vector -- TODO confirm

encoder = ProteinSequenceEncoder(data)
# overlapping_ngram_word2vec_encode takes keyword arguments.
# NOTE(review): `data` is overwritten with the encoded dataset, so this cell
# is not idempotent -- re-create `data` (previous cell) before re-running it.
data = encoder.overlapping_ngram_word2vec_encode(n=n, windowSize=windowSize, vectorSize=vectorSize)
data.show(5)
+--------------------+-----+--------------------+--------------------+
| sequence|label| ngram| features|
+--------------------+-----+--------------------+--------------------+
| ACAGV| H| [AC, CA, AG, GV]|[0.46121373027563...|
| GIGLHLAVRLA| H|[GI, IG, GL, LH, ...|[-0.2606903441250...|
| RLWEAARA| H|[RL, LW, WE, EA, ...|[0.16112836982522...|
| KSVAAARE| H|[KS, SV, VA, AA, ...|[1.15827076775687...|
|EDAVASVLDVNVVGTVR...| H|[ED, DA, AV, VA, ...|[0.37046241248026...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows
In [7]:
# Shut down the SparkContext created at the start of the notebook.
sc.stop()