Secondary Structure Blosum62 Encoder Demo

This demo creates a dataset of sequence segments derived from a non-redundant set. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 codes of the center residue in a sequence segment, and a Blosum62 encoding of the sequence segment.

Imports

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader

Configure Spark Context

In [2]:
# Build the Spark configuration for this demo: local master, named app.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructureBlosumEncoderDemo")
)

# The SparkContext is reused by the cells below to read the MMTF data.
sc = SparkContext(conf=conf)

## Read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity) of L-protein chains

In [3]:
# Input location and selection parameters.
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20  # passed to the Pisces filter (heading: <=20% seq. identity)
resolution = 2.0       # passed to the Pisces filter as the resolution cutoff
fraction = 0.1         # fraction of the filtered chains to sample
seed = 123             # fixed seed for the sampling step

# Read the structures, split them into polymer chains, apply the Pisces
# and L-protein filters, then draw a sample (first argument False:
# sampling without replacement).
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)

Get content

In [4]:
# Length of each extracted sequence segment.
segmentLength = 11

# Build the secondary-structure segment dataset and cache it, since it is
# counted here and reused by the following cells.
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = data.cache()
print(f"original data   : {data.count()}")
original data   : 2149

Drop Q3 and sequence duplicates

In [5]:
# Keep one row per distinct (labelQ3, sequence) pair, then cache the result.
data = data.dropDuplicates(subset=["labelQ3", "sequence"])
data = data.cache()
print(f"- duplicate Q3/seq  : {data.count()}")
- duplicate Q3/seq  : 2149

Drop sequence duplicates

In [6]:
# Keep one row per distinct sequence, regardless of its label.
data = data.dropDuplicates(subset=["sequence"])
print(f"- duplicate seq  : {data.count()}")
- duplicate seq  : 2149

Blosum62 Encoding

In [7]:
# Encode every sequence segment with the Blosum62 encoder; the resulting
# dataset replaces `data`.
encoder = ProteinSequenceEncoder(data)
data = encoder.blosum62_encode()

# Inspect the resulting schema and the first five rows, untruncated.
data.printSchema()
data.show(n=5, truncate=False)
root
 |-- structureChainId: string (nullable = true)
 |-- sequence: string (nullable = false)
 |-- labelQ8: string (nullable = true)
 |-- labelQ3: string (nullable = true)
 |-- features: vector (nullable = true)

+----------------+-----------+-------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|structureChainId|sequence   |labelQ8|labelQ3|features                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+----------------+-----------+-------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1A9X.F          |DIDTRKLTRLL|H      |H      |[-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,5.0,-2.0,-2.0,0.0,-1.0,5.0,0.0,-2.0,-3.0,1.0,0.0,-2.0,0.0,-3.0,-2.0,2.0,-1.0,-3.0,-2.0,-1.0,-1.0,-3.0,-2.0,-3.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,5.0,-2.0,-2.0,0.0,-1.0,5.0,0.0,-2.0,-3.0,1.0,0.0,-2.0,0.0,-3.0,-2.0,2.0,-1.0,-3.0,-2.0,-1.0,-1.0,-3.0,-2.0,-3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0]|
|1FO8.A          |DLEVAPDFFEY|T      |C      |[-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,0.0,-3.0,-3.0,-3.0,-1.0,-2.0,-2.0,-3.0,-3.0,3.0,1.0,-2.0,1.0,-1.0,-2.0,-2.0,0.0,-3.0,-1.0,4.0,4.0,-1.0,-2.0,-2.0,0.0,-1.0,-1.0,0.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0,-1.0,-2.0,-2.0,-1.0,-3.0,-1.0,-1.0,-2.0,-2.0,-3.0,-3.0,-1.0,-2.0,-4.0,7.0,-1.0,-1.0,-4.0,-3.0,-2.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,2.0,-1.0,-1.0,-2.0,-1.0,3.0,-3.0,-2.0,-2.0,2.0,7.0,-1.0]  |
|1A9X.F          |EDLSSYLKRHN|H      |H      |[-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,2.0,-1.0,-1.0,-2.0,-1.0,3.0,-3.0,-2.0,-2.0,2.0,7.0,-1.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-1.0,5.0,0.0,-2.0,-3.0,1.0,0.0,-2.0,0.0,-3.0,-2.0,2.0,-1.0,-3.0,-2.0,-1.0,-1.0,-3.0,-2.0,-3.0,-2.0,0.0,1.0,-1.0,-3.0,0.0,0.0,-2.0,8.0,-3.0,-3.0,-1.0,-2.0,-1.0,-2.0,-1.0,-2.0,-2.0,2.0,-3.0,-2.0,0.0,6.0,1.0,-3.0,0.0,0.0,0.0,1.0,-3.0,-3.0,0.0,-2.0,-3.0,-2.0,1.0,0.0,-4.0,-2.0,-3.0]               |
|1EB6.A          |GDESKFEEYFK|H      |H      |[0.0,-2.0,0.0,-1.0,-3.0,-2.0,-2.0,6.0,-2.0,-4.0,-4.0,-2.0,-3.0,-3.0,-2.0,0.0,-2.0,-2.0,-3.0,-3.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,2.0,-1.0,-1.0,-2.0,-1.0,3.0,-3.0,-2.0,-2.0,2.0,7.0,-1.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0]            |
|1C1K.A          |IISFETFILLD|H      |H      |[-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,5.0,-2.0,-2.0,0.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0]      |
+----------------+-----------+-------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows

Terminate Spark Context

In [8]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()