# Source code for mmtfPyspark.datasets.pdbjMineDataset
#!/usr/bin/env python
'''pdbjMineDataset.py
This filter runs a PDBj Mine 2 search web service using an SQL query
References
----------
- Data are provided through Mine2 SQL: https://pdbj.org/help/mine2-sql
- Queries can be designed with the interactive PDBj Mine 2 query service: https://pdbj.org/mine/sql
- PDB metadata are described in the PDB mmCIF Dictionary: http://mmcif.wwpdb.org/
'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
import urllib
import ssl
import tempfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, upper, concat
from urllib.request import urlretrieve
import requests
def get_dataset(sqlQuery):
    '''Runs a PDBj Mine 2 search web service using an SQL query.

    The query results are downloaded as CSV to a temporary file and read
    into a Spark DataFrame. If the results contain a ``pdbid`` column, it
    is upper-cased; when a ``chain`` column is also present, both are
    combined into a ``structureChainId`` primary key, otherwise ``pdbid``
    is renamed to ``structureId``.

    Parameters
    ----------
    sqlQuery : str
        the sql query for the web service

    Returns
    -------
    DataFrame
        Spark DataFrame holding the query results
    '''
    # The PDBj endpoint's certificate chain fails default verification,
    # so hostname checking and verification are disabled for this request.
    # NOTE(review): this weakens transport security — confirm whether the
    # endpoint now validates cleanly and remove this workaround if so.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    # URL-encode the SQL so it is safe to pass as a query parameter
    encodedSQL = urllib.parse.quote(sqlQuery)
    URL = "https://pdbj.org/rest/mine2_sql"

    # Download results to a temporary file so Spark can read them.
    # delete=False is required because Spark re-opens the file by name.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    infile = urllib.request.urlopen(URL + "?format=csv&q=" + encodedSQL, context=ctx)
    try:
        with open(tmp.name, 'wb') as output:
            output.write(infile.read())
    finally:
        # close the HTTP response explicitly (the original leaked it)
        infile.close()

    spark = SparkSession.builder.getOrCreate()
    ds = spark.read.format("csv") \
              .option("header", "true") \
              .option("inferSchema", "true") \
              .option("parserLib", "UNIVOCITY") \
              .load(tmp.name)

    # rename/concatenate columns to assign
    # consistent primary keys to datasets
    if "pdbid" in ds.columns:
        # this project uses upper case pdbids
        ds = ds.withColumn("pdbid", upper(col("pdbid")))
        if "chain" in ds.columns:
            ds = ds.withColumn("structureChainId",
                               concat(col("pdbid"), lit("."), col("chain")))
            # BUG FIX: DataFrame.drop returns a new DataFrame; the original
            # discarded the result, leaving pdbid/chain in the output.
            ds = ds.drop("pdbid", "chain")
        else:
            ds = ds.withColumnRenamed("pdbid", "structureId")
    return ds