This demo selects protein sequences that do not contain mutations in comparison with the reference UniProt sequences.
**Expression tags** (`includeExpressionTags`): Some PDB entries include expression tags that were added during the experiment. Select “No” to filter out sequences with expression tags.

**Percent coverage of UniProt sequence** (`percentSequenceCoverage`): PDB entries may contain only a portion of the referenced UniProt sequence. This option defines how much of a UniProt sequence must be contained in a PDB entry for it to be selected.
In [5]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import WildTypeQuery
In [6]:
# Configure Spark with master "local[*]" and the application name "wildTypeQuery".
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("wildTypeQuery")
)
sc = SparkContext(conf=conf)
In [7]:
# Location of the sample MMTF sequence files, relative to this notebook.
path = "../../resources/mmtf_reduced_sample/"

# Read the structures, then keep only those accepted by the wild-type filter,
# configured with the expression-tag flag set and the 95% coverage constant.
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .filter(
        WildTypeQuery(
            includeExpressionTags=True,
            percentSequenceCoverage=WildTypeQuery.SEQUENCE_COVERAGE_95,
        )
    )
)
In [8]:
# Count the entries that remain after filtering and report the total.
count = pdb.count()
print(f"Number of structures after filtering : {count}")
# Show five (PDB ID, structure) pairs as the cell's displayed result.
pdb.top(5)
Number of structures after filtering : 1440
Out[8]:
[('1GBS', <mmtfPyspark.utils.mmtfStructure.MmtfStructure at 0x7f0838750550>),
('1GAX', <mmtfPyspark.utils.mmtfStructure.MmtfStructure at 0x7f083869ef98>),
('1GAR', <mmtfPyspark.utils.mmtfStructure.MmtfStructure at 0x7f08386b4358>),
('1GAL', <mmtfPyspark.utils.mmtfStructure.MmtfStructure at 0x7f08254ce358>),
('1GAJ', <mmtfPyspark.utils.mmtfStructure.MmtfStructure at 0x7f08254c2470>)]
In [9]:
# Stop the SparkContext that this notebook created, now that the demo is done.
sc.stop()