PDB Drug Bank Mapping

Join PDB, Drug Bank and PDBjMine dataset together

Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.datasets import customReportService, drugBankDataset, pdbjMineDataset
from mmtfPyspark import structureViewer

# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext(conf=conf)

Download open DrugBank dataset

In [2]:
drugBank = drugBankDataset.get_open_drug_links()
drugBank.toPandas().head(10)
Out[2]:
DrugBankID AccessionNumbers Commonname CAS UNII Synonyms StandardInChIKey
0 DB00001 BTD00024 | BIOD00024 Lepirudin 138068-37-8 Y43GF64R34 Hirudin variant-1 | Lepirudin recombinant None
1 DB00002 BTD00071 | BIOD00071 Cetuximab 205923-56-4 PQX0D8J21J Cetuximab | Cétuximab | Cetuximabum | Immunogl... None
2 DB00003 BTD00001 | BIOD00001 Dornase alfa 143831-71-4 953A26OA1Y Deoxyribonuclease (human clone 18-1 protein mo... None
3 DB00004 BTD00084 | BIOD00084 Denileukin diftitox 173146-27-5 25E79B5CTM Diphtheria toxin precursor | DT | NAD(+--dipht... None
4 DB00005 BTD00052 | BIOD00052 Etanercept 185243-69-0 OP401G7OJC Etanercept-szzs | RHU TNFR:FC | RHU-TNFR:FC | ... None
5 DB00006 BTD00076 | EXPT03302 | BIOD00076 | DB02351 Bivalirudin 128270-60-0 TN9BEX005G Bivalirudina | Bivalirudinum | Hirulog OIRCOABEOLEUMC-GEJPAHFPSA-N
6 DB00007 BTD00009 | BIOD00009 Leuprolide 53714-56-0 EFY6W0M8TG (D-Leu(6),des-gly-NH2(10),pro-ethylamide(9))-g... None
7 DB00008 BTD00043 | BIOD00043 Peginterferon alfa-2a 198153-51-4 Q46947FE7K Pegylated interferon alfa-2a | Pegylated inter... None
8 DB00009 BTD00050 | BIOD00050 Alteplase 105857-23-6 1RXS4UE564 Alteplase (genetical recombination) | Alteplas... None
9 DB00010 BTD00033 | BIOD00033 Sermorelin 86168-78-7 89243S03TE None None

Filter out DrugBank entries without StandardInChIKey

In [3]:
drugBank = drugBank.filter(drugBank.StandardInChIKey.isNotNull())
drugBank.toPandas().head(5)
Out[3]:
DrugBankID AccessionNumbers Commonname CAS UNII Synonyms StandardInChIKey
0 DB00006 BTD00076 | EXPT03302 | BIOD00076 | DB02351 Bivalirudin 128270-60-0 TN9BEX005G Bivalirudina | Bivalirudinum | Hirulog OIRCOABEOLEUMC-GEJPAHFPSA-N
1 DB00014 BTD00113 | BIOD00113 Goserelin 65807-02-5 0F65R8P09N None BLCLNMBMMGCOAS-URPVMXJPSA-N
2 DB00027 BTD00036 | BIOD00036 Gramicidin D 1405-97-6 5IE62321P4 Bacillus brevis gramicidin D | Gramicidin | Gr... NDAYQJDHGXTBJL-MWWSRJDJSA-N
3 DB00035 BTD00112 | BTD00061 | BIOD00112 | BIOD00061 Desmopressin 16679-58-6 ENR1LLB0FP 1-(3-mercaptopropionic acid)-8-D-arginine-vaso... NFLWUMRGJYTJIN-NXBWRCJVSA-N
4 DB00050 BTD00115 | APRD00686 | BIOD00115 Cetrorelix 120287-85-6 OON1HFZ4BA Cetrorelixum SBNPWPIBESPSIF-MHWMIDJBSA-N

Get PDB ligand annotations

In [4]:
ligands = customReportService.get_dataset(["ligandId","ligandMolecularWeight","ligandFormula","ligandSmiles","InChIKey"])
ligands.toPandas().head(10)
Out[4]:
structureChainId structureId chainId ligandId ligandMolecularWeight ligandFormula ligandSmiles InChIKey
0 100D.A 100D A SPM 202.34 C10 H26 N4 C(CCNCCCN)CNCCCN PFNFFQXMRSDOHW-UHFFFAOYSA-N
1 100D.B 100D B None NaN None None None
2 101D.A 101D A CBR 386.09 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... PLDRCXOBLRYJSZ-RRKCRQDMSA-N
3 101D.A 101D A MG 24.31 Mg 2 [Mg+2] JLVVSXFLKOJNIY-UHFFFAOYSA-N
4 101D.B 101D B CBR 386.09 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... PLDRCXOBLRYJSZ-RRKCRQDMSA-N
5 101D.B 101D B NT 430.46 C18 H26 N10 O3 Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=... IDBIFFKSXLYUOT-UHFFFAOYSA-N
6 101M.A 101M A HEM 616.49 C34 H32 Fe N4 O4 Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=[N]4[Fe]36[N]7... KABFMIBPWCXCRK-RGGAHWMASA-L
7 101M.A 101M A NBN 83.13 C5 H9 N CCCC[N+]#[C-] FSBLVBBRXSCOKU-UHFFFAOYSA-N
8 101M.A 101M A SO4 96.06 O4 S -2 [O-]S(=O)(=O)[O-] QAOWNCQODCNURD-UHFFFAOYSA-L
9 102D.A 102D A None NaN None None None

Filter out DrugBank entries without InChIKey

In [5]:
ligands = ligands.filter(ligands.InChIKey.isNotNull())
ligands.toPandas().head(5)
Out[5]:
structureChainId structureId chainId ligandId ligandMolecularWeight ligandFormula ligandSmiles InChIKey
0 100D.A 100D A SPM 202.34 C10 H26 N4 C(CCNCCCN)CNCCCN PFNFFQXMRSDOHW-UHFFFAOYSA-N
1 101D.A 101D A CBR 386.09 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... PLDRCXOBLRYJSZ-RRKCRQDMSA-N
2 101D.A 101D A MG 24.31 Mg 2 [Mg+2] JLVVSXFLKOJNIY-UHFFFAOYSA-N
3 101D.B 101D B CBR 386.09 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... PLDRCXOBLRYJSZ-RRKCRQDMSA-N
4 101D.B 101D B NT 430.46 C18 H26 N10 O3 Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=... IDBIFFKSXLYUOT-UHFFFAOYSA-N

Join ligand dataset with DrugBank info by InChIKey

In [6]:
ligands = ligands.join(drugBank, ligands.InChIKey == drugBank.StandardInChIKey)
ligands.toPandas().head(10)
Out[6]:
structureChainId structureId chainId ligandId ligandMolecularWeight ligandFormula ligandSmiles InChIKey DrugBankID AccessionNumbers Commonname CAS UNII Synonyms StandardInChIKey
0 100D.A 100D A SPM 202.34 C10 H26 N4 C(CCNCCCN)CNCCCN PFNFFQXMRSDOHW-UHFFFAOYSA-N DB00127 NUTR00055 | EXPT02947 | DB02564 Spermine 71-44-3 2FZ7Y3VOQX 4,9-Diaza-1,12-dodecanediamine | 4,9-Diazadode... PFNFFQXMRSDOHW-UHFFFAOYSA-N
1 101D.A 101D A MG 24.31 Mg 2 [Mg+2] JLVVSXFLKOJNIY-UHFFFAOYSA-N DB01378 None Magnesium 7439-95-4 T6V3LHY838 None JLVVSXFLKOJNIY-UHFFFAOYSA-N
2 101M.A 101M A NBN 83.13 C5 H9 N CCCC[N+]#[C-] FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 N-Butyl Isocyanide 2769-64-4 None None FSBLVBBRXSCOKU-UHFFFAOYSA-N
3 102D.B 102D B TNT 312.37 C17 H20 N4 O2 c1cc(ccc1C(=N)N)OCCCOc2ccc(cc2)C(=N)N WTFXJFJYEJZMFO-UHFFFAOYSA-N DB13296 None Propamidine 104-32-5 G20G12V769 None WTFXJFJYEJZMFO-UHFFFAOYSA-N
4 102L.A 102L A BME 78.13 C2 H6 O S C(CS)O DGVVWUTYPXICAM-UHFFFAOYSA-N DB03345 EXPT02882 | DB03131 Beta-Mercaptoethanol 60-24-2 14R9K67URN 2-Sulfhydryl-Ethanol DGVVWUTYPXICAM-UHFFFAOYSA-N
5 103L.A 103L A BME 78.13 C2 H6 O S C(CS)O DGVVWUTYPXICAM-UHFFFAOYSA-N DB03345 EXPT02882 | DB03131 Beta-Mercaptoethanol 60-24-2 14R9K67URN 2-Sulfhydryl-Ethanol DGVVWUTYPXICAM-UHFFFAOYSA-N
6 103M.A 103M A NBN 83.13 C5 H9 N CCCC[N+]#[C-] FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 N-Butyl Isocyanide 2769-64-4 None None FSBLVBBRXSCOKU-UHFFFAOYSA-N
7 104M.A 104M A NBN 83.13 C5 H9 N CCCC[N+]#[C-] FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 N-Butyl Isocyanide 2769-64-4 None None FSBLVBBRXSCOKU-UHFFFAOYSA-N
8 105M.A 105M A NBN 83.13 C5 H9 N CCCC[N+]#[C-] FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 N-Butyl Isocyanide 2769-64-4 None None FSBLVBBRXSCOKU-UHFFFAOYSA-N
9 106M.A 106M A ENC 56.09 C3 H6 N 1 CC[N+]#C JEGVKBYNUPNGJU-UHFFFAOYSA-N DB03399 EXPT01344 Ethyl Isocyanide None None None JEGVKBYNUPNGJU-UHFFFAOYSA-N

Show one example per drug molecule

In [7]:
ligands = ligands.dropDuplicates(["Commonname"])
ligands = ligands.select("structureChainId", "ligandId", "DrugBankID", "Commonname", "ligandMolecularWeight","ligandFormula", "InChIKey", "ligandSmiles")
ligands.sort("Commonname").toPandas().head(10)
Out[7]:
structureChainId ligandId DrugBankID Commonname ligandMolecularWeight ligandFormula InChIKey ligandSmiles
0 1NJ6.A A5A DB03376 '5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine 417.40 C13 H19 N7 O7 S CWWYMWDIYBJVLP-YTMOPEAISA-N C[C@@H](C(=O)NS(=O)(=O)OC[C@@H]1[C@H]([C@H]([C...
1 1NJ5.A P5A DB02510 '5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine 443.43 C15 H21 N7 O7 S LKVJEMXWEODCAY-JVEUSOJLSA-N c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...
2 1VQ2.A DDN DB04280 ((2r,3s,5r)-3-Hydroxy-5-(4-Hydroxy-2-Oxo-3,4-D... 310.20 C9 H15 N2 O8 P ILSIYJVILUIVPM-LXGUWJNJSA-N C1[C@@H]([C@H](O[C@H]1N2C=C[C@H](NC2=O)O)COP(=...
3 1A7A.A ADC DB03216 (1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Y... 233.23 C10 H11 N5 O2 RQPALADHFYHEHK-CHKWXVPMSA-N c1nc(c2c(n1)n(cn2)[C@@H]3C=CC([C@H]3O)O)N
4 1DCY.A I3N DB03121 (1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)ace... 309.36 C19 H19 N O3 ZEKCBTQHDTUHRJ-UHFFFAOYSA-N Cc1c(c2cc(ccc2n1Cc3ccccc3)OC)CC(=O)O
5 2E99.A B08 DB07404 (1-HYDROXY-1-PHOSPHONO-2-[1,1';3',1'']TERPHENY... 434.32 C20 H20 O7 P2 YXQQNSYZOQHKHD-UHFFFAOYSA-N c1ccc(cc1)c2cccc(c2)c3cccc(c3)CC(O)(P(=O)(O)O)...
6 2E9A.A B28 DB07409 (1-HYDROXY-1-PHOSPHONO-2-[1,1';4',1'']TERPHENY... 434.32 C20 H20 O7 P2 MPBUFKZCEBTBSK-UHFFFAOYSA-N c1ccc(cc1)c2ccc(cc2)c3cccc(c3)CC(O)(P(=O)(O)O)...
7 2Z52.A H23 DB07873 (1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID) 346.29 C12 H28 O7 P2 KKVZONPEMODBBG-UHFFFAOYSA-N CCCCCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
8 2Z50.A 028 DB06830 (1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID) 276.16 C7 H18 O7 P2 IJEGNOYPWRBKAE-UHFFFAOYSA-N CCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
9 2Z4X.A 252 DB06931 (1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID) 304.21 C9 H22 O7 P2 COHUUYPEYRMWTH-UHFFFAOYSA-N CCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O

Query structures with 2.7.11.1 EC number using PDBjMine

In [8]:
enzymeQuery = "SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'"
enzyme = pdbjMineDataset.get_dataset(enzymeQuery)

print(f"First 10 results for query: {enzymeQuery}")
enzyme.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'
+-----+-----+---------+---------+----------------+
|pdbid|chain|accession|ec_number|structureChainId|
+-----+-----+---------+---------+----------------+
| 1AUE|    A|   P42345| 2.7.11.1|          1AUE.A|
| 1AUE|    B|   P42345| 2.7.11.1|          1AUE.B|
| 1BPV|    A|   Q8WZ42| 2.7.11.1|          1BPV.A|
| 1C1Y|    B|   P04049| 2.7.11.1|          1C1Y.B|
| 1CF4|    B|   Q07912| 2.7.11.1|          1CF4.B|
| 1CJA|    A|   P80197| 2.7.11.1|          1CJA.A|
| 1CJA|    B|   P80197| 2.7.11.1|          1CJA.B|
| 1CKI|    A|   Q06486| 2.7.11.1|          1CKI.A|
| 1CKI|    B|   Q06486| 2.7.11.1|          1CKI.B|
| 1CKJ|    A|   Q06486| 2.7.11.1|          1CKJ.A|
+-----+-----+---------+---------+----------------+
only showing top 10 rows

Join ligand dataset with PDBjMine dataset with structureChainId

In [9]:
ligands = ligands.join(enzyme, ligands.structureChainId == enzyme.structureChainId)
print(f"Total number of structures: {ligands.count()}")

df = ligands.toPandas()
df.head()
Total number of structures: 172
Out[9]:
structureChainId ligandId DrugBankID Commonname ligandMolecularWeight ligandFormula InChIKey ligandSmiles pdbid chain accession ec_number structureChainId
0 1UVR.A BI8 DB01946 3-[1-(3-Aminopropyl)-1h-Indol-3-Yl]-4-(1-Methy... 398.46 C24 H22 N4 O2 UQHKJRCFSLMWIA-UHFFFAOYSA-N Cn1cc(c2c1cccc2)C3=C(C(=O)NC3=O)c4cn(c5c4cccc5... 1UVR A O15530 2.7.11.1 1UVR.A
1 4EKL.A 0RF DB11743 Ipatasertib 458.00 C24 H32 Cl N5 O2 GRZXWCHAXNAUHY-NSISKUIASA-N C[C@@H]1C[C@H](c2c1c(ncn2)N3CCN(CC3)C(=O)[C@H]... 4EKL A P31749 2.7.11.1 4EKL.A
2 2NP8.A CC3 DB07545 N-{3-[(4-{[3-(TRIFLUOROMETHYL)PHENYL]AMINO}PYR... 413.40 C21 H18 F3 N5 O RDTDWGQDFJPTPD-UHFFFAOYSA-N c1cc(cc(c1)Nc2ccnc(n2)Nc3cccc(c3)NC(=O)C4CC4)C... 2NP8 A O14965 2.7.11.1 2NP8.A
3 2C3L.A IDZ DB07959 3-(1H-BENZIMIDAZOL-2-YL)-1H-INDAZOLE 234.26 C14 H10 N4 JTKFRFMSUBOCIQ-UHFFFAOYSA-N c1ccc2c(c1)c(n[nH]2)c3[nH]c4ccccc4n3 2C3L A O14757 2.7.11.1 2C3L.A
4 3BQR.A 4RB DB07125 4-(6-{[(1R)-1-(hydroxymethyl)propyl]amino}imid... 326.35 C17 H18 N4 O3 KKZYGUVAFJCULH-CYBMUJFWSA-N CC[C@H](CO)Nc1ccc2ncc(n2n1)c3ccc(cc3)C(=O)O 3BQR A O43293 2.7.11.1 3BQR.A

Visualize protein kinase interaction

In [11]:
structureViewer.view_binding_site(df.pdbid, df.ligandId, df.chain, 4.0)
Out[11]:
<function mmtfPyspark.structureViewer.view_binding_site.<locals>.view3d(i=0)>

Terminate Spark

In [12]:
sc.stop()