Source code for mmtfPyspark.webfilters.pdbjMineSearch

#!/usr/bin/env python
'''PdbjMineSearch.py

This filter runs a PDBj Mine 2 Search web service using an SQL query.

References
----------
- Each category represents a table, and fields represent database columns, see
available tables and columns: `MINE RDB DOCS <https://pdbj.org/mine-rdb-docs>`_

- Data are provided through: `MINE2-SQL <https://pdbj.org/help/mine2-sql>`_

- Queries can be designed with the interactive PDBjMine2 query service: `PDBjMine2 SQL <https://pdbj.org/mine/sql>`_
'''

# Module metadata: authorship, contact, version and status strings.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

import urllib
import tempfile
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import pdbjMineDataset
from urllib.request import urlretrieve
import requests


class PdbjMineSearch(object):
    '''Filter keyed records by the ids returned from a PDBj Mine 2 SQL query

    The query is executed once, at construction time, through
    ``pdbjMineDataset.get_dataset``. The resulting instance is a callable
    predicate: it takes a record ``t`` and reports whether ``t[0]`` (the
    record key) is among the ids returned by the query.
    NOTE(review): presumably meant to be passed to an RDD ``filter`` whose
    keys are PDB ids or ``<pdbId>.<chain>`` ids -- confirm against callers.

    Parameters
    ----------
    sqlQuery : str
       the sql query passed to the PDBj Mine 2 service

    Attributes
    ----------
    chainLevel : bool
       True if the result has a 'structureChainId' column, i.e. the ids
       identify chains (eg. 4HHB.A) rather than whole structures
    pdbIds : list
       ids collected from the query result; for chain level results this
       holds the chain ids followed by their first four characters.
       Treat it as read-only after construction: lookups are served from
       a set snapshot of this list taken in ``__init__``.

    Raises
    ------
    Exception
       if ``pdbjMineDataset.get_dataset`` returns None
    '''

    # Not referenced anywhere inside this class.
    URL = "https://pdbj.org/rest/mine2_sql"

    def __init__(self, sqlQuery):

        self.chainLevel = False
        self.pdbIds = []

        dataset = pdbjMineDataset.get_dataset(sqlQuery)

        # Identity test: None is a singleton, and `==` could be intercepted
        # by an overloaded comparison operator on the dataset object.
        if dataset is None:
            raise Exception(
                "Dataset empty. Either provide an sql query or a dataset")

        columns = dataset.columns

        # Structure level result: one id per row in 'structureId'
        if 'structureId' in columns:
            self.chainLevel = False
            self.pdbIds = [row[0]
                           for row in dataset.select('structureId').collect()]

        # Chain level result: checked last on purpose, so it takes
        # precedence when both columns are present.
        if 'structureChainId' in columns:
            self.chainLevel = True
            chainIds = [row[0] for row
                        in dataset.select('structureChainId').collect()]
            # Also keep the 4-character prefix of every chain id, so that
            # records keyed by the bare prefix still match.
            self.pdbIds = chainIds + [chainId[:4] for chainId in chainIds]

        # Hashed snapshot used by __call__, so each membership test is O(1)
        # instead of a linear scan of the list for every filtered record.
        self._pdbIdSet = frozenset(self.pdbIds)

    def __call__(self, t):
        '''Report whether the key of record ``t`` is in the query result

        Parameters
        ----------
        t : sequence
           record whose first element ``t[0]`` is the id to look up

        Returns
        -------
        bool
           True if ``t[0]`` is one of the collected ids or, for structure
           level results only, if its first four characters are
        '''
        key = t[0]

        # Fall back to the plain list for instances that were created
        # without running __init__ (no precomputed set available).
        ids = getattr(self, '_pdbIdSet', self.pdbIds)

        if key in ids:
            return True

        # If results are PDB IDs, but the keys contain chain names,
        # then truncate the chain name before matching (eg. 4HHB.A -> 4HHB)
        if not self.chainLevel and len(key) > 4:
            return key[:4] in ids

        return False