Secondary Structure Property Encoder Demo

This demo creates a dataset of sequence segments derived from a non-redundant set. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 codes of the center residue in a sequence segment, and a property encoding of the sequence segment.

Imports

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader

Configure Spark Context

In [2]:
# Run Spark in local mode ("local[*]") under this demo's application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructurePropertyEncoderDemo")
)

# getOrCreate() returns the already-running SparkContext if there is one,
# so re-executing this cell in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once". Note that `conf` is only
# applied when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)

Read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity) of L-protein chains

In [3]:
# Input location plus the parameters for the Pisces filter and the sampling.
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20   # passed to Pisces as the sequence identity cutoff
resolution = 2.0        # passed to Pisces as the resolution cutoff
fraction = 0.1          # fraction of the filtered chains to sample
seed = 123              # fixed seed so the sample is reproducible

# Build the pipeline one step at a time:
# read the MMTF sequence file ...
pdb = mmtfReader.read_sequence_file(path, sc)
# ... flat-map every structure through StructureToPolymerChains ...
pdb = pdb.flatMap(StructureToPolymerChains())
# ... keep only entries accepted by the Pisces filter ...
pdb = pdb.filter(Pisces(sequenceIdentity, resolution))
# ... and by the L-protein chain filter ...
pdb = pdb.filter(ContainsLProteinChain())
# ... then take a random sample without replacement.
pdb = pdb.sample(False, fraction, seed)

Get content: extract the secondary structure segment dataset

In [4]:
# Length (in residues) of each extracted sequence segment.
segmentLength = 11

# Extract the segment dataset from the sampled chains, then cache it because
# it is counted here and reused by the following cells.
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = data.cache()
print(f"original data   : {data.count()}")
original data   : 2149

Drop Q3 and sequence duplicates

In [5]:
# Keep a single row for every distinct (labelQ3, sequence) combination and
# cache the reduced dataset for the steps that follow.
data = data.dropDuplicates(["labelQ3", "sequence"])
data = data.cache()

# Report how many rows remain after this de-duplication step.
remaining_rows = data.count()
print(f"- duplicate Q3/seq  : {remaining_rows}")
- duplicate Q3/seq  : 2149

Drop sequence duplicates

In [6]:
# Keep a single row per distinct sequence segment, regardless of its label.
data = data.dropDuplicates(["sequence"])

# Report how many rows remain after this de-duplication step.
remaining_rows = data.count()
print(f"- duplicate seq  : {remaining_rows}")
- duplicate seq  : 2149

Property Encoding

In [7]:
# Wrap the de-duplicated dataset in the encoder and apply the property
# encoding; the returned dataset replaces `data`.
encoder = ProteinSequenceEncoder(data)
data = encoder.property_encode()

# Inspect the result: print the column schema, then the first five rows
# without truncating long column values.
data.printSchema()
data.show(n=5, truncate=False)
root
 |-- structureChainId: string (nullable = true)
 |-- sequence: string (nullable = false)
 |-- labelQ8: string (nullable = true)
 |-- labelQ3: string (nullable = true)
 |-- features: vector (nullable = true)

+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|structureChainId|sequence   |labelQ8|labelQ3|features                                                                                                                                                                                                                                                                                                                                                                                     |
+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1A9X.F          |DIDTRKLTRLL|H      |H      |[1.6,0.11,2.78,-0.77,2.95,0.25,0.2,4.19,0.19,4.0,1.8,6.04,0.3,0.45,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,3.03,0.11,2.6,0.26,5.6,0.21,0.36,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.59,0.19,4.0,1.7,6.04,0.39,0.31,3.03,0.11,2.6,0.26,5.6,0.21,0.36,2.59,0.19,4.0,1.7,6.04,0.39,0.31,2.59,0.19,4.0,1.7,6.04,0.39,0.31]                                                                               |
|1FO8.A          |DLEVAPDFFEY|T      |C      |[1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,3.67,0.14,3.0,1.22,6.02,0.27,0.49,2.34,0.29,6.13,-1.01,10.74,0.36,0.25,2.67,0.0,2.72,0.72,6.8,0.13,0.34,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.94,0.29,5.89,1.79,5.67,0.3,0.38,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,2.94,0.3,6.47,0.96,5.66,0.25,0.41] |
|1A9X.F          |EDLSSYLKRHN|H      |H      |[1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,2.94,0.3,6.47,0.96,5.66,0.25,0.41,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.99,0.23,4.66,0.13,7.69,0.27,0.3,1.6,0.13,2.95,-0.6,6.52,0.21,0.22]                                        |
|1EB6.A          |GDESKFEEYFK|H      |H      |[0.0,0.0,0.0,0.0,6.07,0.13,0.15,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,2.94,0.3,6.47,0.96,5.66,0.25,0.41,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.89,0.22,4.77,-0.99,9.99,0.32,0.27]|
|1C1K.A          |IISFETFILLD|H      |H      |[4.19,0.19,4.0,1.8,6.04,0.3,0.45,4.19,0.19,4.0,1.8,6.04,0.3,0.45,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,3.03,0.11,2.6,0.26,5.6,0.21,0.36,2.94,0.29,5.89,1.79,5.67,0.3,0.38,4.19,0.19,4.0,1.8,6.04,0.3,0.45,2.59,0.19,4.0,1.7,6.04,0.39,0.31,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.6,0.11,2.78,-0.77,2.95,0.25,0.2]              |
+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows

Terminate Spark Context

In [8]:
# Shut down the SparkContext now that the demo is finished.
sc.stop()