This demo creates a dataset of sequence segments derived from a non-redundant set of protein chains. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 codes of the center residue in a sequence segment, and a BLOSUM62 encoding of the sequence segment.
In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader
In [2]:
# Configure a Spark application that runs locally ("local[*]" master).
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SecondaryStructureBlosumEncoderDemo")
)
# getOrCreate() reuses an already-running SparkContext, so re-executing this
# cell in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once". Note that when a context
# already exists, it is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf)
## Read MMTF Hadoop sequence file and create a non-redundant set (<= 20% seq. identity) of L-protein chains
In [3]:
# Input location and parameters for filtering and subsampling.
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity, resolution = 20, 2.0  # arguments passed to the Pisces filter
fraction, seed = 0.1, 123               # sampling fraction and random seed

# Read the MMTF Hadoop sequence file and split each structure into its
# polymer chains.
structures = mmtfReader.read_sequence_file(path, sc)
chains = structures.flatMap(StructureToPolymerChains())

# Keep only chains that pass the Pisces filter and contain an L-protein
# chain, then draw a sample without replacement using the fixed seed.
pdb = (
    chains
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)
In [4]:
# Length of each extracted sequence segment, passed to the extractor.
segmentLength = 11

# Build the secondary-structure segment dataset from the sampled chains and
# cache it, since it is counted here and reused by the following cells.
segments = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = segments.cache()
print(f"original data : {data.count()}")
original data : 2149
In [5]:
# Remove rows that share both the Q3 label and the sequence, then cache the
# reduced dataset before counting it.
q3_and_sequence = ["labelQ3", "sequence"]
deduplicated = data.dropDuplicates(q3_and_sequence)
data = deduplicated.cache()
print(f"- duplicate Q3/seq : {data.count()}")
- duplicate Q3/seq : 2149
In [6]:
# Remove rows that share the same sequence, regardless of their labels.
sequence_only = ["sequence"]
data = data.dropDuplicates(sequence_only)
print(f"- duplicate seq : {data.count()}")
- duplicate seq : 2149
In [7]:
# Add a BLOSUM62 feature vector for each sequence segment.
encoder = ProteinSequenceEncoder(data)
data = encoder.blosum62_encode()

# Inspect the result: column schema, then the first 5 rows without
# truncating the long feature vectors.
data.printSchema()
data.show(n=5, truncate=False)
root
|-- structureChainId: string (nullable = true)
|-- sequence: string (nullable = false)
|-- labelQ8: string (nullable = true)
|-- labelQ3: string (nullable = true)
|-- features: vector (nullable = true)
+----------------+-----------+-------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|structureChainId|sequence |labelQ8|labelQ3|features |
+----------------+-----------+-------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1A9X.F |DIDTRKLTRLL|H |H |[-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,5.0,-2.0,-2.0,0.0,-1.0,5.0,0.0,-2.0,-3.0,1.0,0.0,-2.0,0.0,-3.0,-2.0,2.0,-1.0,-3.0,-2.0,-1.0,-1.0,-3.0,-2.0,-3.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,5.0,-2.0,-2.0,0.0,-1.0,5.0,0.0,-2.0,-3.0,1.0,0.0,-2.0,0.0,-3.0,-2.0,2.0,-1.0,-3.0,-2.0,-1.0,-1.0,-3.0,-2.0,-3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0]|
|1FO8.A |DLEVAPDFFEY|T |C |[-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,0.0,-3.0,-3.0,-3.0,-1.0,-2.0,-2.0,-3.0,-3.0,3.0,1.0,-2.0,1.0,-1.0,-2.0,-2.0,0.0,-3.0,-1.0,4.0,4.0,-1.0,-2.0,-2.0,0.0,-1.0,-1.0,0.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0,-1.0,-2.0,-2.0,-1.0,-3.0,-1.0,-1.0,-2.0,-2.0,-3.0,-3.0,-1.0,-2.0,-4.0,7.0,-1.0,-1.0,-4.0,-3.0,-2.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,2.0,-1.0,-1.0,-2.0,-1.0,3.0,-3.0,-2.0,-2.0,2.0,7.0,-1.0] |
|1A9X.F |EDLSSYLKRHN|H |H |[-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,2.0,-1.0,-1.0,-2.0,-1.0,3.0,-3.0,-2.0,-2.0,2.0,7.0,-1.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-1.0,5.0,0.0,-2.0,-3.0,1.0,0.0,-2.0,0.0,-3.0,-2.0,2.0,-1.0,-3.0,-2.0,-1.0,-1.0,-3.0,-2.0,-3.0,-2.0,0.0,1.0,-1.0,-3.0,0.0,0.0,-2.0,8.0,-3.0,-3.0,-1.0,-2.0,-1.0,-2.0,-1.0,-2.0,-2.0,2.0,-3.0,-2.0,0.0,6.0,1.0,-3.0,0.0,0.0,0.0,1.0,-3.0,-3.0,0.0,-2.0,-3.0,-2.0,1.0,0.0,-4.0,-2.0,-3.0] |
|1EB6.A |GDESKFEEYFK|H |H |[0.0,-2.0,0.0,-1.0,-3.0,-2.0,-2.0,6.0,-2.0,-4.0,-4.0,-2.0,-3.0,-3.0,-2.0,0.0,-2.0,-2.0,-3.0,-3.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,-2.0,-2.0,-2.0,-3.0,-2.0,-1.0,-2.0,-3.0,2.0,-1.0,-1.0,-2.0,-1.0,3.0,-3.0,-2.0,-2.0,2.0,7.0,-1.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,2.0,0.0,-1.0,-3.0,1.0,1.0,-2.0,-1.0,-3.0,-2.0,5.0,-1.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0] |
|1C1K.A |IISFETFILLD|H |H |[-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,-2.0,-2.0,0.0,-1.0,-2.0,-1.0,4.0,1.0,-3.0,-2.0,-2.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,0.0,0.0,2.0,-4.0,2.0,5.0,-2.0,0.0,-3.0,-3.0,1.0,-2.0,-3.0,-1.0,0.0,-1.0,-3.0,-2.0,-2.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,-2.0,-1.0,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,5.0,-2.0,-2.0,0.0,-2.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0,-1.0,0.0,0.0,-3.0,0.0,6.0,-4.0,-2.0,-2.0,1.0,3.0,-1.0,-1.0,-3.0,-3.0,-3.0,-1.0,-3.0,-3.0,-4.0,-3.0,4.0,2.0,-3.0,1.0,0.0,-3.0,-2.0,-1.0,-3.0,-1.0,3.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-1.0,-2.0,-3.0,-4.0,-1.0,-2.0,-3.0,-4.0,-3.0,2.0,4.0,-2.0,2.0,0.0,-3.0,-2.0,-1.0,-2.0,-1.0,1.0,-2.0,-2.0,1.0,6.0,-3.0,0.0,2.0,-1.0,-1.0,-3.0,-4.0,-1.0,-3.0,-3.0,-1.0,0.0,-1.0,-4.0,-3.0,-3.0] |
+----------------+-----------+-------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows
In [8]:
# Shut down the SparkContext now that the demo is finished.
sc.stop()