This demo creates a dataset of sequence segments derived from a non-redundant set. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 code of the center residue in a sequence segment, and a property encoding of the sequence segment.
In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader
In [2]:
# Configure a local Spark application (master "local[*]") for this demo
# and create the SparkContext that the remaining cells use.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructurePropertyEncoderDemo")
)
sc = SparkContext(conf=conf)
## Read MMTF Hadoop sequence file and
create a non-redundant set (<=20% seq. identity) of L-protein chains
In [3]:
# Location of the MMTF Hadoop sequence file to read.
path = "../../resources/mmtf_reduced_sample/"

# Cutoffs passed to the Pisces filter.
sequenceIdentity = 20
resolution = 2.0

# Sampling parameters: fraction of chains to keep and a fixed seed so the
# sample is reproducible.
fraction = 0.1
seed = 123

# Read the structures, split them into polymer chains, keep the chains that
# pass the Pisces filter and contain an L-protein chain, then subsample.
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)
In [4]:
# Build the dataset of sequence segments (length 11) from the selected
# chains and cache it, since the following cells count it repeatedly.
segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = data.cache()

# Report the number of segments before any de-duplication.
print(f"original data : {data.count()}")
original data : 2149
In [5]:
# Remove rows that share both the Q3 label and the segment sequence,
# cache the result, and report how many rows remain.
data = data.dropDuplicates(["labelQ3", "sequence"])
data = data.cache()
print(f"- duplicate Q3/seq : {data.count()}")
- duplicate Q3/seq : 2149
In [6]:
# Remove rows with an identical segment sequence, regardless of label,
# and report how many rows remain.
data = data.dropDuplicates(
    ["sequence"]
)
print(
    f"- duplicate seq : {data.count()}"
)
- duplicate seq : 2149
In [7]:
# Property-encode the segment sequences; the encoded dataset replaces `data`.
encoder = ProteinSequenceEncoder(data)
data = encoder.property_encode()

# Inspect the resulting columns and the first five rows without truncating
# the long feature vectors.
data.printSchema()
data.show(n=5, truncate=False)
root
|-- structureChainId: string (nullable = true)
|-- sequence: string (nullable = false)
|-- labelQ8: string (nullable = true)
|-- labelQ3: string (nullable = true)
|-- features: vector (nullable = true)
+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|structureChainId|sequence |labelQ8|labelQ3|features |
+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1A9X.F |DIDTRKLTRLL|H |H |[1.6,0.11,2.78,-0.77,2.95,0.25,0.2,4.19,0.19,4.0,1.8,6.04,0.3,0.45,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,3.03,0.11,2.6,0.26,5.6,0.21,0.36,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.59,0.19,4.0,1.7,6.04,0.39,0.31,3.03,0.11,2.6,0.26,5.6,0.21,0.36,2.59,0.19,4.0,1.7,6.04,0.39,0.31,2.59,0.19,4.0,1.7,6.04,0.39,0.31] |
|1FO8.A |DLEVAPDFFEY|T |C |[1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,3.67,0.14,3.0,1.22,6.02,0.27,0.49,2.34,0.29,6.13,-1.01,10.74,0.36,0.25,2.67,0.0,2.72,0.72,6.8,0.13,0.34,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.94,0.29,5.89,1.79,5.67,0.3,0.38,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,2.94,0.3,6.47,0.96,5.66,0.25,0.41] |
|1A9X.F |EDLSSYLKRHN|H |H |[1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,2.94,0.3,6.47,0.96,5.66,0.25,0.41,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.99,0.23,4.66,0.13,7.69,0.27,0.3,1.6,0.13,2.95,-0.6,6.52,0.21,0.22] |
|1EB6.A |GDESKFEEYFK|H |H |[0.0,0.0,0.0,0.0,6.07,0.13,0.15,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,2.94,0.3,6.47,0.96,5.66,0.25,0.41,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.89,0.22,4.77,-0.99,9.99,0.32,0.27]|
|1C1K.A |IISFETFILLD|H |H |[4.19,0.19,4.0,1.8,6.04,0.3,0.45,4.19,0.19,4.0,1.8,6.04,0.3,0.45,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,3.03,0.11,2.6,0.26,5.6,0.21,0.36,2.94,0.29,5.89,1.79,5.67,0.3,0.38,4.19,0.19,4.0,1.8,6.04,0.3,0.45,2.59,0.19,4.0,1.7,6.04,0.39,0.31,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.6,0.11,2.78,-0.77,2.95,0.25,0.2] |
+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows
In [8]:
# Stop the SparkContext created at the start of the notebook.
sc.stop()