PDBj Mine 2 RDB keyword search query and MMTF filtering using pdbid.
In [8]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.io import mmtfReader
In [9]:
# Configure Spark to run locally, using all available cores ("local[*]").
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("SimpleQuerySearch")
)
# getOrCreate reuses an already-running SparkContext instead of raising when
# this cell is re-executed without restarting the kernel.
sc = SparkContext.getOrCreate(conf)
In [10]:
# Relative path to the sample MMTF data read by this notebook.
path = "../../resources/mmtf_full_sample/"
# Load the structures through the SparkContext; `pdb` is filtered by the query below.
# NOTE(review): presumably an RDD keyed by PDB ID (it supports .filter/.keys) — confirm.
pdb = mmtfReader.read_sequence_file(path, sc)
Very simple query: this gets the PDB IDs of all entries modified since 2016-06-28 with a resolution better than 1.5 Å.
In [11]:
# SQL sent to PDBj Mine: entries modified on/after 2016-06-28 with resolution < 1.5.
sql = "select pdbid from brief_summary where modification_date >= '2016-06-28' and resolution < 1.5"

# Wrap the query in a filter object and apply it to the structures read above.
search = PdbjMineSearch(sql)
matching_entries = pdb.filter(search)

# Count the keys of the entries that passed the filter.
count = matching_entries.keys().count()
print(f"Number of entries using sql to filter: {count}")
Number of entries using sql to filter: 11
In [12]:
# Run the same SQL query through pdbjMineDataset to get the result as a dataset
# rather than as a filter.
# NOTE(review): presumably a Spark DataFrame (it exposes .show) — confirm.
dataset = pdbjMineDataset.get_dataset(sql)
# Display only the first 5 rows.
dataset.show(5)
+-----------+
|structureId|
+-----------+
| 5U8P|
| 5U8U|
| 5U8V|
| 5U9D|
| 5UAM|
+-----------+
only showing top 5 rows
In [13]:
# Stop the SparkContext now that the notebook is finished with it.
sc.stop()