Source code for mmtfPyspark.webfilters.pdbjMineSearch
#!/usr/bin/env python
'''PdbjMineSearch.py
This filter runs a PDBj Mine 2 Search web service using an SQL query.
References
----------
- Each category represents a table, and fields represent database columns, see
available tables and columns: `MINE RDB DOCS <https://pdbj.org/mine-rdb-docs>`_
- Data are provided through: `MINE2-SQL <https://pdbj.org/help/mine2-sql>`_
- Queries can be designed with the interactive PDBjMine2 query service: `PDBjMine2 SQL <https://pdbj.org/mine/sql>`_
'''
# Module metadata: authorship, contact, version and release status.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
import urllib
import tempfile
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import pdbjMineDataset
from urllib.request import urlretrieve
import requests
class PdbjMineSearch(object):
    '''Filter entries by the IDs returned from a PDBj Mine 2 SQL query.

    The query is executed once, at construction time, through
    ``pdbjMineDataset.get_dataset``. The resulting IDs are stored on the
    instance, and calling the instance with a keyed record reports whether
    the record's key is among them.

    Parameters
    ----------
    sqlQuery : str
        the sql query passed to ``pdbjMineDataset.get_dataset``

    Attributes
    ----------
    chainLevel : bool
        True when the query result has a ``structureChainId`` column,
        False otherwise
    pdbIds : list
        IDs collected from the query result. For chain-level results this
        holds every chain ID followed by its first four characters.

    Raises
    ------
    Exception
        if ``pdbjMineDataset.get_dataset`` returns None
    '''

    # Endpoint of the PDBj Mine 2 SQL REST service. Not referenced by the
    # methods of this class; kept as a public class attribute.
    URL = "https://pdbj.org/rest/mine2_sql"

    def __init__(self, sqlQuery):
        '''Run the query and collect the IDs used for matching.

        Parameters
        ----------
        sqlQuery : str
            the sql query to run against PDBj Mine 2

        Raises
        ------
        Exception
            if no dataset is returned for the query
        '''
        self.chainLevel = False
        self.pdbIds = []

        # presumably a Spark DataFrame (it is used via .columns and
        # .select(...).collect()) -- confirm against pdbjMineDataset
        dataset = pdbjMineDataset.get_dataset(sqlQuery)

        # Identity comparison: an overloaded __eq__ on the dataset type must
        # not be able to influence the "no result" check.
        if dataset is None:
            raise Exception(
                "Dataset empty. Either provide an sql query or a dataset")

        # Structure-level result: one ID per row in 'structureId'.
        if 'structureId' in dataset.columns:
            self.chainLevel = False
            self.pdbIds = [row[0]
                           for row in dataset.select('structureId').collect()]

        # Chain-level result: takes precedence over 'structureId' when both
        # columns are present, because it is evaluated last.
        if 'structureChainId' in dataset.columns:
            self.chainLevel = True
            chainIds = [row[0] for row in
                        dataset.select('structureChainId').collect()]
            # Also keep the 4-character prefix of each chain ID so that
            # keys without a chain suffix still match.
            structureIds = [chainId[:4] for chainId in chainIds]
            self.pdbIds = chainIds + structureIds

    def __call__(self, t):
        '''Return whether the key of record ``t`` is in the query result.

        Parameters
        ----------
        t : sequence
            record whose first element ``t[0]`` is the ID string to test
            (presumably a (key, structure) pair -- confirm with callers)

        Returns
        -------
        bool
            True if the key, or for structure-level results its first four
            characters, is contained in ``pdbIds``
        '''
        key = t[0]

        # NOTE: pdbIds is a list, so this membership test is a linear scan.
        if key in self.pdbIds:
            return True

        # If the results are PDB IDs but the key contains a chain name,
        # truncate the chain name before matching (e.g. 4HHB.A -> 4HHB).
        if not self.chainLevel and len(key) > 4:
            return key[:4] in self.pdbIds

        return False