Example demonstrating how to extract protein chains from PDB entries. This example uses a flatMap function to transform a structure into its polymer chains.
In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.filters import PolymerComposition
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.mappers import StructureToPolymerChains
In [2]:
# Configure Spark with the "local[*]" master and name the application, then
# create the SparkContext that the remaining cells share.
conf = SparkConf().setMaster("local[*]").setAppName("FlatMapChainsDemo")
sc = SparkContext(conf=conf)
In [3]:
# Relative location of the reduced MMTF sample data set.
path = "../../resources/mmtf_reduced_sample/"
# Load the sample through mmtfReader using the shared SparkContext.
# NOTE(review): presumably returns an RDD of structures (flatMap is called on
# it in a later cell) -- confirm against the mmtfPyspark docs.
pdb = mmtfReader.read_sequence_file(path, sc)
**PolymerComposition.AMINO_ACIDS_20** = ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS","ILE","LEU","LYS","MET","PHE","PRO","SER","THR","TRP","TYR","VAL"]
**PolymerComposition.AMINO_ACIDS_22** = ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS","ILE","LEU","LYS","MET","PHE","PRO","SER","THR","TRP","TYR","VAL","SEC","PYL"]
**PolymerComposition.DNA_STD_NUCLEOTIDES** = ["DA","DC","DG","DT"]
**PolymerComposition.RNA_STD_NUCLEOTIDES** = ["A","C","G","U"]
In [4]:
# Pipeline: flat-map each structure to its polymer chains, keep the chains
# accepted by the AMINO_ACIDS_20 composition filter, and count what is left.
# NOTE(review): the two positional flags of StructureToPolymerChains are
# presumably (useChainIdInsteadOfChainName, excludeDuplicates) -- confirm
# against the mmtfPyspark documentation.
count = (
    pdb.flatMap(StructureToPolymerChains(False, True))
    .filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20))
    .count()
)
print(f"Chains with standard amino acids: {count}")
Chains with standard amino acids: 9386
In [5]:
# Shut down the SparkContext now that the demo has finished.
sc.stop()