This demo filters PDB chains by sequence similarity using RCSB PDB webservices.
In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.webfilters import SequenceSimilarity
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.io import mmtfReader
In [2]:
# Configure a Spark application with master "local[*]" and create the
# context that the later cells use to read data.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SequenceSimilaritySearchDemo")
)
sc = SparkContext(conf=conf)
In [6]:
# Location of the reduced MMTF sample data, read via read_sequence_file.
path = "../../resources/mmtf_reduced_sample/"

# Read the structures and flatten each one with StructureToPolymerChains.
chains = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains()
)

# Web filter that keeps chains similar to the query sequence, using the
# BLAST search tool with the cutoffs given below.
similarity_filter = SequenceSimilarity(
    sequence="NLVQFGVMIEKMTGKSALQYNDYGCYCGIGGSHWPVDQ",
    searchTool=SequenceSimilarity.BLAST,
    eValueCutoff=0.001,
    sequenceIdentityCutoff=40,
    maskLowComplexity=True,
)

# Apply the filter and bring the matching (id, structure) pairs to the driver.
pdb = chains.filter(similarity_filter).collect()

# Print each hit's id together with the sequence of its first entity.
for pdbId, structure in pdb:
    print(f"{pdbId} : {structure.entity_list[0]['sequence']}")
2H4C.A : NFFQFAEMIVKMTGKEAVHSYAIYGCYCGWGGQGKPQDATDRCCFVHDCCYGTVNDCNPKMATYSYSFENGDIVCGDNNLCLKTVCECDRAAAICLGQNVNTYDKNYENYAISHCTEESEQC
2H4C.C : NFFQFAEMIVKMTGKEAVHSYAIYGCYCGWGGQGKPQDATDRCCFVHDCCYGTVNDCNPKMATYSYSFENGDIVCGDNNLCLKTVCECDRAAAICLGQNVNTYDKNYENYAISHCTEESEQC
2H4C.E : NFFQFAEMIVKMTGKEAVHSYAIYGCYCGWGGQGKPQDATDRCCFVHDCCYGTVNDCNPKMATYSYSFENGDIVCGDNNLCLKTVCECDRAAAICLGQNVNTYDKNYENYAISHCTEESEQC
2H4C.G : NFFQFAEMIVKMTGKEAVHSYAIYGCYCGWGGQGKPQDATDRCCFVHDCCYGTVNDCNPKMATYSYSFENGDIVCGDNNLCLKTVCECDRAAAICLGQNVNTYDKNYENYAISHCTEESEQC
1GP7.A : MNPAHLLVLSAVCVSLLGASSIPPQPLHLIQFGNMIQCTVPGFLSWIKYADYGCYCGAGGSGTPVDKLDRCCQVHDNCYTQAQKLPACSSIMDSPYVKIYSYDCSERTVTCKADNDECAAFICNCDRVAAHCFAASPYNNNNYNIDTTTRC
1GP7.B : MNPAHLLVLSAVCVSLLGASSIPPQPLHLIQFGNMIQCTVPGFLSWIKYADYGCYCGAGGSGTPVDKLDRCCQVHDNCYTQAQKLPACSSIMDSPYVKIYSYDCSERTVTCKADNDECAAFICNCDRVAAHCFAASPYNNNNYNIDTTTRC
1GP7.C : MNPAHLLVLSAVCVSLLGASSIPPQPLHLIQFGNMIQCTVPGFLSWIKYADYGCYCGAGGSGTPVDKLDRCCQVHDNCYTQAQKLPACSSIMDSPYVKIYSYDCSERTVTCKADNDECAAFICNCDRVAAHCFAASPYNNNNYNIDTTTRC
3V9M.A : NLIQFGNMIQCANKGSRPSLDYADYGCYCGWGGSGTPVDELDRCCQVHDNCYEQAGKKGCFPKLTLYSWKCTGNVPTCNSKPGCKSFVCACDAAAAKCFAKAPYKKENYNIDTKKRCK
3V9M.B : NLIQFGNMIQCANKGSRPSLDYADYGCYCGWGGSGTPVDELDRCCQVHDNCYEQAGKKGCFPKLTLYSWKCTGNVPTCNSKPGCKSFVCACDAAAAKCFAKAPYKKENYNIDTKKRCK
3VC0.A : NLVQFGKMIECAIRNRRPALDFMNYGCYCGKGGSGTPVDDLDRCCQVHDECYAEAEKHGCYPSLTTYTWECRQVGPYCNSKTQCEVFVCACDFAAAKCFAQEDYNPAHSNINTGERCK
1VAP.A : NLFQFEKLIKKMTGKSGMLWYSAYGCYCGWGGQGRPKDATDRCCFVHDCCYGKVTGCNPKMDIYTYSVDNGNIVCGGTNPCKKQICECDRAAAICFRDNLKTYDSKTYWKYPKKNCKEESEPC
1VAP.B : NLFQFEKLIKKMTGKSGMLWYSAYGCYCGWGGQGRPKDATDRCCFVHDCCYGKVTGCNPKMDIYTYSVDNGNIVCGGTNPCKKQICECDRAAAICFRDNLKTYDSKTYWKYPKKNCKEESEPC
1VIP.A : NLFQFAEMIVKMTGKNPLSSYSDYGCYCGWGGKGKPQDATDRCCFVHDCCYEKVKSCKPKLSLYSYSFQNGGIVCGDNHSCKRAVCECDRVAATCFRDNLNTYDKKYHNYPPSQCTGTEQC
In [7]:
# Stop the SparkContext created at the top of the notebook now that the demo is done.
sc.stop()