Simple example of reading an MMTF Hadoop Sequence file, filtering the entries to keep only those that contain exclusively L-protein chains, and counting the entries that remain. This example also shows how methods can be chained for a more concise syntax.
In [2]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.structureViewer import view_structure
In [3]:
# Configure a local Spark application that uses all available cores ("local[*]").
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("FilterExclusivelyByLProtein")
)
# getOrCreate reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed
# in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)
In [4]:
# Location of the sample MMTF Hadoop Sequence file, relative to this notebook.
path = "../../resources/mmtf_reduced_sample/"

# Filter built with exclusive=True; per the notebook description this keeps
# entries exclusively made of L-protein chains (confirm against the
# mmtfPyspark filter documentation).
l_protein_only = ContainsLProteinChain(exclusive=True)

# Read the sequence file into an RDD and apply the filter.
structures = mmtfReader.read_sequence_file(path, sc).filter(l_protein_only)

# count() is an action: it triggers the read + filter and returns the tally.
print(f"Number of L-Proteins: {structures.count()}")
Number of L-Proteins: 4506
In [5]:
# Pull the keys of the filtered entries back to the driver as a plain list
# (presumably the structure identifiers — verify against the reader's output).
structure_keys = structures.keys()
structure_names = structure_keys.collect()

# Leave the viewer call as the cell's final expression so its result is displayed.
view_structure(structure_names, style="sphere")
Out[5]:
<function mmtfPyspark.structureViewer.view_structure.<locals>.view3d>
In [6]:
# Shut down the SparkContext created earlier in this notebook now that the
# example is finished.
sc.stop()