#!/usr/bin/env python
'''drugBankDataset.py
This module provides access to DrugBank datasets containing drug structure and
drug target information. These datasets contain identifiers and names for
integration with other data resources.
References
----------
- Drug Bank. https://www.drugbank.ca
- Wishart DS, et al., DrugBank 5.0: a major update to the DrugBank database for 2018.
Nucleic Acids Res. 2017 Nov 8. https://dx.doi.org/10.1093/nar/gkx1037
'''
# Module metadata: author, maintainer, contact address, version and status.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
import requests
import tempfile
import io
from zipfile import ZipFile
from io import BytesIO
from pyspark.sql import SparkSession
# Drug groups accepted by get_drug_links() and get_drug_target_links().
# One entry per line with a trailing comma: adjacent string literals are
# concatenated implicitly, and a missing comma previously fused 'ILLICIT'
# and 'WITHDRAWN' into the single bogus entry 'ILLICITWITHDRAWN'.
DRUG_GROUP = [
    'ALL',
    'APPROVED',
    'EXPERIMENTAL',
    'NUTRACEUTICAL',
    'ILLICIT',
    'WITHDRAWN',
    'INVESTIGATIONAL',
]
# Drug types accepted (in addition to the drug groups) by
# get_drug_target_links().
DRUG_TYPE = ['SMALL_MOLECULE', 'BIOTECH']
# Common prefix of every DrugBank download URL built in this module.
BASE_URL = "https://www.drugbank.ca/releases/latest/downloads/"
# Buffer size constant; not referenced by the code visible in this module.
BUFFER = 2048
def get_open_drug_links():
    '''Fetch the DrugBank Open Data vocabulary as a dataset.

    The Open Data dataset lists drug common names, synonyms, CAS numbers and
    Standard InChIKeys. It is a public domain dataset released under a
    Creative Commons CC0 International License, so it may be used freely,
    including commercially. No DrugBank credentials are passed for this
    download.

    References
    ----------
    Open Data dataset. https://www.drugbank.ca/releases/latest#open-data

    Examples
    --------
    Get DrugBank open dataset:

    >>> openDrugLinks = DrugBankDataset.get_open_drug_links()
    >>> openDrugLinks.show()
    +----------+--------------------+-----------+--------------------+
    |DrugBankID|          Commonname|        CAS|    StandardInChIKey|
    +----------+--------------------+-----------+--------------------+
    |   DB00006|         Bivalirudin|128270-60-0|OIRCOABEOLEUMC-GE...|
    |   DB00014|           Goserelin| 65807-02-5|BLCLNMBMMGCOAS-UR...|
    +----------+--------------------+-----------+--------------------+

    Returns
    -------
    dataset
       DrugBank Dataset
    '''
    # Unauthenticated download: only the URL is handed to get_dataset.
    return get_dataset(BASE_URL + "all-drugbank-vocabulary")
def get_drug_links(drugGroup, username, password):
    '''Downloads drug structure external links and identifiers from DrugBank.

    Either all or subsets of data can be downloaded by specifying the
    drug group:
    ALL, APPROVED, EXPERIMENTAL, NUTRACEUTICAL, ILLICIT, WITHDRAWN,
    INVESTIGATIONAL.

    The structure external links datasets include drug structure information
    in the form of InChI/InChI Key/SMILES as well as identifiers for other
    drug-structure resources (such as ChEBI, ChEMBL, ChemSpider, BindingDB,
    etc.). Included in each dataset is also the PubChem Compound ID (CID) and
    the particular PubChem Substance ID (SID) for the given DrugBank record.

    These DrugBank datasets are released under a Creative Commons
    Attribution-NonCommercial 4.0 International License. They can be used
    freely in your non-commercial application or project. A DrugBank user
    account and authentication is required to download these datasets.

    References
    ----------
    External Drug Links:
    https://www.drugbank.ca/releases/latest#external-links

    Examples
    --------
    Get dataset of external links and identifiers of approved drugs:

    >>> username = "<your DrugBank username>"
    >>> password = "<your DrugBank password>"
    >>> drugLinks = get_drug_links("APPROVED", username, password)
    >>> drugLinks.show()

    Parameters
    ----------
    drugGroup : str
       specific dataset to be downloaded; matched case-insensitively against
       the pre-defined DRUG_GROUP list
    username : str
       DrugBank username
    password : str
       DrugBank password

    Returns
    -------
    dataset
       DrugBank Dataset

    Raises
    ------
    ValueError
       if drugGroup is not one of the pre-defined drug groups
    '''
    if drugGroup.upper() not in DRUG_GROUP:
        raise ValueError("drugGroup not in pre-defined durgGroups")

    # The check above is case-insensitive, so normalise the case before
    # building the URL instead of forwarding whatever casing the caller used.
    # Lowercase matches the other download slug used in this module
    # ("all-drugbank-vocabulary").
    # NOTE(review): confirm against DrugBank's current download URLs.
    url = BASE_URL + drugGroup.lower() + "-structure-links"
    return get_dataset(url, username, password)
def get_drug_target_links(drug, username, password):
    '''Downloads drug target external links and identifiers from DrugBank.

    Either all or subsets of data can be downloaded by specifying the
    drug group:
    ALL, APPROVED, EXPERIMENTAL, NUTRACEUTICAL, ILLICIT, WITHDRAWN,
    INVESTIGATIONAL.
    OR the drug type:
    SMALL_MOLECULE, BIOTECH.

    The drug target external links datasets include drug name, drug type
    (small molecule, biotech), UniProtID and UniProtName.

    These DrugBank datasets are released under the Creative Commons
    Attribution-NonCommercial 4.0 International License. They can be used
    freely in your non-commercial application or project. A DrugBank user
    account and authentication is required to download these datasets.

    References
    ----------
    Target Drug-UniProt:
    https://www.drugbank.ca/releases/latest#external-links

    Examples
    --------
    Get dataset of drug target external links and identifiers of all drugs
    in DrugBank:

    >>> username = "<your DrugBank username>"
    >>> password = "<your DrugBank password>"
    >>> drugTargetLinks = get_drug_target_links("ALL",
    ...                                         username,
    ...                                         password)
    >>> drugTargetLinks.show()

    Parameters
    ----------
    drug : str
       specific dataset to be downloaded; matched case-insensitively against
       the DRUG_GROUP list and the DRUG_TYPE list
    username : str
       DrugBank username
    password : str
       DrugBank password

    Returns
    -------
    dataset
       DrugBank Dataset

    Raises
    ------
    ValueError
       if drug is neither a pre-defined drug group nor a drug type
    '''
    selection = drug.upper()
    if selection not in DRUG_GROUP and selection not in DRUG_TYPE:
        raise ValueError("drug not in pre-defined durgGroups or drugTypes")

    # The check above is case-insensitive, so normalise the case before
    # building the URL instead of forwarding whatever casing the caller used.
    # Lowercase matches the other download slug used in this module
    # ("all-drugbank-vocabulary").
    # NOTE(review): confirm against DrugBank's current download URLs.
    url = BASE_URL + "target-" + drug.lower() + "-uniprot-links"
    return get_dataset(url, username, password)
def get_dataset(url, username=None, password=None):
    '''Downloads a DrugBank dataset

    The response body is treated as a zip archive: its first entry is
    extracted, written to a temporary CSV file, loaded into a Spark dataset,
    and the column names are stripped of spaces.

    Parameters
    ----------
    url : str
       DrugBank dataset download links
    username : str, optional
       DrugBank username <None>
    password : str, optional
       DrugBank password <None>

    Returns
    -------
    dataset
       DrugBank dataset

    Raises
    ------
    ValueError
       if credentials were supplied and DrugBank answered with
       'Invalid Email or password.'
    requests.HTTPError
       if the server answered with an HTTP error status
    '''
    if username is None and password is None:
        # Open dataset: plain, unauthenticated request.
        req = requests.get(url)
    else:
        # Dataset that requires authentication.
        req = requests.get(url, auth=(username, password))

        # Checked before the status code so that rejected credentials keep
        # surfacing as ValueError, whatever status accompanies the message.
        if req.text == 'Invalid Email or password.':
            raise ValueError('Invalid Email or password.')

    # Surface HTTP failures (e.g. 404 for a wrong URL) as such, rather than
    # handing an error page to the zip decoder and failing with an obscure
    # "not a zip file" error.
    req.raise_for_status()

    # Decode and unzip file
    unzipped = _decode_as_zip_input_stream(req.content)

    # Save data to a temporary file (the CSV reader requires an input file).
    # NOTE(review): the temporary file is not removed here; Spark presumably
    # reads it lazily, so confirm when it is safe to delete before adding
    # any cleanup.
    tempFileName = _save_temp_file(unzipped)

    # Load the temporary CSV file into a Spark dataset
    dataset = _read_csv(tempFileName)
    return _remove_spaces_from_column_names(dataset)
def _decode_as_zip_input_stream(content):
    '''Returns the decoded lines of the first zip file entry

    Parameters
    ----------
    content : bytes
       raw zip archive content from the request

    Returns
    -------
    list
       lines of the first archive entry, decoded as UTF-8 text with their
       line endings preserved

    Raises
    ------
    IndexError
       if the archive contains no entries
    '''
    # Context managers make sure both the archive and the entry stream are
    # closed, even if reading or decoding fails.
    with ZipFile(BytesIO(content)) as archive:
        first_entry = archive.namelist()[0]
        with archive.open(first_entry) as entry:
            # Explicit UTF-8 to match the encoding used when the lines are
            # written back out to the temporary CSV file.
            return [line.decode('utf-8') for line in entry.readlines()]
def _save_temp_file(unzipped):
    '''Saves tabular report as a temporary CSV file

    The file is created with delete=False, so it stays on disk after this
    function returns; removing it is left to the caller.

    Parameters
    ----------
    unzipped : list
       list of unzipped content (text lines, including line endings)

    Returns
    -------
    str
       path to the tempfile
    '''
    # Reserve a unique path, then close that handle right away: it was
    # previously left open (leaking a file descriptor), and an open handle
    # also prevents reopening the file by name on some platforms.
    with tempfile.NamedTemporaryFile(delete=False) as tempFile:
        tempFileName = tempFile.name

    with io.open(tempFileName, "w", encoding='utf-8') as t:
        t.writelines(unzipped)
    return tempFileName
def _read_csv(inputFileName):
    '''Load a CSV file into a Spark dataset

    The first row is used as the header and column types are inferred by
    Spark. The active Spark session is reused when one exists; otherwise a
    new one is created.

    Parameters
    ----------
    inputFileName : str
       path of the CSV file to read

    Returns
    -------
    dataset
       a spark dataset
    '''
    session = SparkSession.builder.getOrCreate()

    # Configure the reader step by step: CSV format, header row, inferred
    # column types.
    reader = session.read.format("csv")
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.load(inputFileName)
def _remove_spaces_from_column_names(original):
    '''Strip spaces out of every column name for compatibility with parquet

    Parameters
    ----------
    original : dataset
       the original dataset

    Returns
    -------
    dataset
       dataset whose column names contain no space characters
    '''
    renamed = original
    # Walk the column names of the input dataset and rename them one at a
    # time, carrying the progressively renamed dataset along.
    # TODO: double check dataset "withColumnRenamed" function
    for old_name in original.columns:
        renamed = renamed.withColumnRenamed(old_name,
                                            old_name.replace(' ', ''))
    return renamed