# PDB Drug Bank Mapping

Join PDB, Drug Bank and PDBjMine dataset together 

## Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext                    
from mmtfPyspark.datasets import customReportService, drugBankDataset, pdbjMineDataset
from mmtfPyspark import structureViewer
                                                               
# Create variables                                             
APP_NAME = "MMTF_Spark"                                        
path = "../../resources/mmtf_full_sample/"

# Configure Spark                                              
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")  
sc = SparkContext(conf=conf)                                   

## Download open DrugBank dataset

In [2]:
drugBank = drugBankDataset.get_open_drug_links()
drugBank.toPandas().head(10)

Unnamed: 0,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,DB00001,BTD00024 | BIOD00024,Lepirudin,138068-37-8,Y43GF64R34,Hirudin variant-1 | Lepirudin recombinant,
1,DB00002,BTD00071 | BIOD00071,Cetuximab,205923-56-4,PQX0D8J21J,Cetuximab | Cétuximab | Cetuximabum | Immunogl...,
2,DB00003,BTD00001 | BIOD00001,Dornase alfa,143831-71-4,953A26OA1Y,Deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BTD00084 | BIOD00084,Denileukin diftitox,173146-27-5,25E79B5CTM,Diphtheria toxin precursor | DT | NAD(+--dipht...,
4,DB00005,BTD00052 | BIOD00052,Etanercept,185243-69-0,OP401G7OJC,Etanercept-szzs | RHU TNFR:FC | RHU-TNFR:FC | ...,
5,DB00006,BTD00076 | EXPT03302 | BIOD00076 | DB02351,Bivalirudin,128270-60-0,TN9BEX005G,Bivalirudina | Bivalirudinum | Hirulog,OIRCOABEOLEUMC-GEJPAHFPSA-N
6,DB00007,BTD00009 | BIOD00009,Leuprolide,53714-56-0,EFY6W0M8TG,"(D-Leu(6),des-gly-NH2(10),pro-ethylamide(9))-g...",
7,DB00008,BTD00043 | BIOD00043,Peginterferon alfa-2a,198153-51-4,Q46947FE7K,Pegylated interferon alfa-2a | Pegylated inter...,
8,DB00009,BTD00050 | BIOD00050,Alteplase,105857-23-6,1RXS4UE564,Alteplase (genetical recombination) | Alteplas...,
9,DB00010,BTD00033 | BIOD00033,Sermorelin,86168-78-7,89243S03TE,,


## Filter out DrugBank entries without StandardInChIKey

In [3]:
drugBank = drugBank.filter(drugBank.StandardInChIKey.isNotNull())
drugBank.toPandas().head(5)

Unnamed: 0,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,DB00006,BTD00076 | EXPT03302 | BIOD00076 | DB02351,Bivalirudin,128270-60-0,TN9BEX005G,Bivalirudina | Bivalirudinum | Hirulog,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00014,BTD00113 | BIOD00113,Goserelin,65807-02-5,0F65R8P09N,,BLCLNMBMMGCOAS-URPVMXJPSA-N
2,DB00027,BTD00036 | BIOD00036,Gramicidin D,1405-97-6,5IE62321P4,Bacillus brevis gramicidin D | Gramicidin | Gr...,NDAYQJDHGXTBJL-MWWSRJDJSA-N
3,DB00035,BTD00112 | BTD00061 | BIOD00112 | BIOD00061,Desmopressin,16679-58-6,ENR1LLB0FP,1-(3-mercaptopropionic acid)-8-D-arginine-vaso...,NFLWUMRGJYTJIN-NXBWRCJVSA-N
4,DB00050,BTD00115 | APRD00686 | BIOD00115,Cetrorelix,120287-85-6,OON1HFZ4BA,Cetrorelixum,SBNPWPIBESPSIF-MHWMIDJBSA-N


## Get PDB ligand annotations

In [4]:
ligands = customReportService.get_dataset(["ligandId","ligandMolecularWeight","ligandFormula","ligandSmiles","InChIKey"])
ligands.toPandas().head(10)

Unnamed: 0,structureChainId,structureId,chainId,ligandId,ligandMolecularWeight,ligandFormula,ligandSmiles,InChIKey
0,100D.A,100D,A,SPM,202.34,C10 H26 N4,C(CCNCCCN)CNCCCN,PFNFFQXMRSDOHW-UHFFFAOYSA-N
1,100D.B,100D,B,,,,,
2,101D.A,101D,A,CBR,386.09,C9 H13 Br N3 O7 P,C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...,PLDRCXOBLRYJSZ-RRKCRQDMSA-N
3,101D.A,101D,A,MG,24.31,Mg 2,[Mg+2],JLVVSXFLKOJNIY-UHFFFAOYSA-N
4,101D.B,101D,B,CBR,386.09,C9 H13 Br N3 O7 P,C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...,PLDRCXOBLRYJSZ-RRKCRQDMSA-N
5,101D.B,101D,B,NT,430.46,C18 H26 N10 O3,Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=...,IDBIFFKSXLYUOT-UHFFFAOYSA-N
6,101M.A,101M,A,HEM,616.49,C34 H32 Fe N4 O4,Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=[N]4[Fe]36[N]7...,KABFMIBPWCXCRK-RGGAHWMASA-L
7,101M.A,101M,A,NBN,83.13,C5 H9 N,CCCC[N+]#[C-],FSBLVBBRXSCOKU-UHFFFAOYSA-N
8,101M.A,101M,A,SO4,96.06,O4 S -2,[O-]S(=O)(=O)[O-],QAOWNCQODCNURD-UHFFFAOYSA-L
9,102D.A,102D,A,,,,,


## Filter out DrugBank entries without InChIKey

In [5]:
ligands = ligands.filter(ligands.InChIKey.isNotNull())
ligands.toPandas().head(5)

Unnamed: 0,structureChainId,structureId,chainId,ligandId,ligandMolecularWeight,ligandFormula,ligandSmiles,InChIKey
0,100D.A,100D,A,SPM,202.34,C10 H26 N4,C(CCNCCCN)CNCCCN,PFNFFQXMRSDOHW-UHFFFAOYSA-N
1,101D.A,101D,A,CBR,386.09,C9 H13 Br N3 O7 P,C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...,PLDRCXOBLRYJSZ-RRKCRQDMSA-N
2,101D.A,101D,A,MG,24.31,Mg 2,[Mg+2],JLVVSXFLKOJNIY-UHFFFAOYSA-N
3,101D.B,101D,B,CBR,386.09,C9 H13 Br N3 O7 P,C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...,PLDRCXOBLRYJSZ-RRKCRQDMSA-N
4,101D.B,101D,B,NT,430.46,C18 H26 N10 O3,Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=...,IDBIFFKSXLYUOT-UHFFFAOYSA-N


## Join ligand dataset with DrugBank info by InChIKey

In [6]:
ligands = ligands.join(drugBank, ligands.InChIKey == drugBank.StandardInChIKey)
ligands.toPandas().head(10)

Unnamed: 0,structureChainId,structureId,chainId,ligandId,ligandMolecularWeight,ligandFormula,ligandSmiles,InChIKey,DrugBankID,AccessionNumbers,Commonname,CAS,UNII,Synonyms,StandardInChIKey
0,100D.A,100D,A,SPM,202.34,C10 H26 N4,C(CCNCCCN)CNCCCN,PFNFFQXMRSDOHW-UHFFFAOYSA-N,DB00127,NUTR00055 | EXPT02947 | DB02564,Spermine,71-44-3,2FZ7Y3VOQX,"4,9-Diaza-1,12-dodecanediamine | 4,9-Diazadode...",PFNFFQXMRSDOHW-UHFFFAOYSA-N
1,101D.A,101D,A,MG,24.31,Mg 2,[Mg+2],JLVVSXFLKOJNIY-UHFFFAOYSA-N,DB01378,,Magnesium,7439-95-4,T6V3LHY838,,JLVVSXFLKOJNIY-UHFFFAOYSA-N
2,101M.A,101M,A,NBN,83.13,C5 H9 N,CCCC[N+]#[C-],FSBLVBBRXSCOKU-UHFFFAOYSA-N,DB01826,EXPT02302,N-Butyl Isocyanide,2769-64-4,,,FSBLVBBRXSCOKU-UHFFFAOYSA-N
3,102D.B,102D,B,TNT,312.37,C17 H20 N4 O2,c1cc(ccc1C(=N)N)OCCCOc2ccc(cc2)C(=N)N,WTFXJFJYEJZMFO-UHFFFAOYSA-N,DB13296,,Propamidine,104-32-5,G20G12V769,,WTFXJFJYEJZMFO-UHFFFAOYSA-N
4,102L.A,102L,A,BME,78.13,C2 H6 O S,C(CS)O,DGVVWUTYPXICAM-UHFFFAOYSA-N,DB03345,EXPT02882 | DB03131,Beta-Mercaptoethanol,60-24-2,14R9K67URN,2-Sulfhydryl-Ethanol,DGVVWUTYPXICAM-UHFFFAOYSA-N
5,103L.A,103L,A,BME,78.13,C2 H6 O S,C(CS)O,DGVVWUTYPXICAM-UHFFFAOYSA-N,DB03345,EXPT02882 | DB03131,Beta-Mercaptoethanol,60-24-2,14R9K67URN,2-Sulfhydryl-Ethanol,DGVVWUTYPXICAM-UHFFFAOYSA-N
6,103M.A,103M,A,NBN,83.13,C5 H9 N,CCCC[N+]#[C-],FSBLVBBRXSCOKU-UHFFFAOYSA-N,DB01826,EXPT02302,N-Butyl Isocyanide,2769-64-4,,,FSBLVBBRXSCOKU-UHFFFAOYSA-N
7,104M.A,104M,A,NBN,83.13,C5 H9 N,CCCC[N+]#[C-],FSBLVBBRXSCOKU-UHFFFAOYSA-N,DB01826,EXPT02302,N-Butyl Isocyanide,2769-64-4,,,FSBLVBBRXSCOKU-UHFFFAOYSA-N
8,105M.A,105M,A,NBN,83.13,C5 H9 N,CCCC[N+]#[C-],FSBLVBBRXSCOKU-UHFFFAOYSA-N,DB01826,EXPT02302,N-Butyl Isocyanide,2769-64-4,,,FSBLVBBRXSCOKU-UHFFFAOYSA-N
9,106M.A,106M,A,ENC,56.09,C3 H6 N 1,CC[N+]#C,JEGVKBYNUPNGJU-UHFFFAOYSA-N,DB03399,EXPT01344,Ethyl Isocyanide,,,,JEGVKBYNUPNGJU-UHFFFAOYSA-N


## Show one example per drug molecule

In [7]:
ligands = ligands.dropDuplicates(["Commonname"])
ligands = ligands.select("structureChainId", "ligandId", "DrugBankID", "Commonname", "ligandMolecularWeight","ligandFormula", "InChIKey", "ligandSmiles")
ligands.sort("Commonname").toPandas().head(10)

Unnamed: 0,structureChainId,ligandId,DrugBankID,Commonname,ligandMolecularWeight,ligandFormula,InChIKey,ligandSmiles
0,1NJ6.A,A5A,DB03376,'5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine,417.4,C13 H19 N7 O7 S,CWWYMWDIYBJVLP-YTMOPEAISA-N,C[C@@H](C(=O)NS(=O)(=O)OC[C@@H]1[C@H]([C@H]([C...
1,1NJ5.A,P5A,DB02510,'5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine,443.43,C15 H21 N7 O7 S,LKVJEMXWEODCAY-JVEUSOJLSA-N,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...
2,1VQ2.A,DDN,DB04280,"((2r,3s,5r)-3-Hydroxy-5-(4-Hydroxy-2-Oxo-3,4-D...",310.2,C9 H15 N2 O8 P,ILSIYJVILUIVPM-LXGUWJNJSA-N,C1[C@@H]([C@H](O[C@H]1N2C=C[C@H](NC2=O)O)COP(=...
3,1A7A.A,ADC,DB03216,"(1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Y...",233.23,C10 H11 N5 O2,RQPALADHFYHEHK-CHKWXVPMSA-N,c1nc(c2c(n1)n(cn2)[C@@H]3C=CC([C@H]3O)O)N
4,1DCY.A,I3N,DB03121,(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)ace...,309.36,C19 H19 N O3,ZEKCBTQHDTUHRJ-UHFFFAOYSA-N,Cc1c(c2cc(ccc2n1Cc3ccccc3)OC)CC(=O)O
5,2E99.A,B08,DB07404,"(1-HYDROXY-1-PHOSPHONO-2-[1,1';3',1'']TERPHENY...",434.32,C20 H20 O7 P2,YXQQNSYZOQHKHD-UHFFFAOYSA-N,c1ccc(cc1)c2cccc(c2)c3cccc(c3)CC(O)(P(=O)(O)O)...
6,2E9A.A,B28,DB07409,"(1-HYDROXY-1-PHOSPHONO-2-[1,1';4',1'']TERPHENY...",434.32,C20 H20 O7 P2,MPBUFKZCEBTBSK-UHFFFAOYSA-N,c1ccc(cc1)c2ccc(cc2)c3cccc(c3)CC(O)(P(=O)(O)O)...
7,2Z52.A,H23,DB07873,"(1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",346.29,C12 H28 O7 P2,KKVZONPEMODBBG-UHFFFAOYSA-N,CCCCCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
8,2Z50.A,028,DB06830,"(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",276.16,C7 H18 O7 P2,IJEGNOYPWRBKAE-UHFFFAOYSA-N,CCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
9,2Z4X.A,252,DB06931,"(1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID)",304.21,C9 H22 O7 P2,COHUUYPEYRMWTH-UHFFFAOYSA-N,CCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O


## Query structures with 2.7.11.1 EC number using PDBjMine

In [8]:
enzymeQuery = "SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'"
enzyme = pdbjMineDataset.get_dataset(enzymeQuery)

print(f"First 10 results for query: {enzymeQuery}")
enzyme.show(10)

First 10 results for query: SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'
+-----+-----+---------+---------+----------------+
|pdbid|chain|accession|ec_number|structureChainId|
+-----+-----+---------+---------+----------------+
| 1AUE|    A|   P42345| 2.7.11.1|          1AUE.A|
| 1AUE|    B|   P42345| 2.7.11.1|          1AUE.B|
| 1BPV|    A|   Q8WZ42| 2.7.11.1|          1BPV.A|
| 1C1Y|    B|   P04049| 2.7.11.1|          1C1Y.B|
| 1CF4|    B|   Q07912| 2.7.11.1|          1CF4.B|
| 1CJA|    A|   P80197| 2.7.11.1|          1CJA.A|
| 1CJA|    B|   P80197| 2.7.11.1|          1CJA.B|
| 1CKI|    A|   Q06486| 2.7.11.1|          1CKI.A|
| 1CKI|    B|   Q06486| 2.7.11.1|          1CKI.B|
| 1CKJ|    A|   Q06486| 2.7.11.1|          1CKJ.A|
+-----+-----+---------+---------+----------------+
only showing top 10 rows



## Join ligand dataset with PDBjMine dataset with structureChainId

In [9]:
ligands = ligands.join(enzyme, ligands.structureChainId == enzyme.structureChainId)
print(f"Total number of structures: {ligands.count()}")

df = ligands.toPandas()
df.head()

Total number of structures: 172


Unnamed: 0,structureChainId,ligandId,DrugBankID,Commonname,ligandMolecularWeight,ligandFormula,InChIKey,ligandSmiles,pdbid,chain,accession,ec_number,structureChainId.1
0,1UVR.A,BI8,DB01946,3-[1-(3-Aminopropyl)-1h-Indol-3-Yl]-4-(1-Methy...,398.46,C24 H22 N4 O2,UQHKJRCFSLMWIA-UHFFFAOYSA-N,Cn1cc(c2c1cccc2)C3=C(C(=O)NC3=O)c4cn(c5c4cccc5...,1UVR,A,O15530,2.7.11.1,1UVR.A
1,4EKL.A,0RF,DB11743,Ipatasertib,458.0,C24 H32 Cl N5 O2,GRZXWCHAXNAUHY-NSISKUIASA-N,C[C@@H]1C[C@H](c2c1c(ncn2)N3CCN(CC3)C(=O)[C@H]...,4EKL,A,P31749,2.7.11.1,4EKL.A
2,2NP8.A,CC3,DB07545,N-{3-[(4-{[3-(TRIFLUOROMETHYL)PHENYL]AMINO}PYR...,413.4,C21 H18 F3 N5 O,RDTDWGQDFJPTPD-UHFFFAOYSA-N,c1cc(cc(c1)Nc2ccnc(n2)Nc3cccc(c3)NC(=O)C4CC4)C...,2NP8,A,O14965,2.7.11.1,2NP8.A
3,2C3L.A,IDZ,DB07959,3-(1H-BENZIMIDAZOL-2-YL)-1H-INDAZOLE,234.26,C14 H10 N4,JTKFRFMSUBOCIQ-UHFFFAOYSA-N,c1ccc2c(c1)c(n[nH]2)c3[nH]c4ccccc4n3,2C3L,A,O14757,2.7.11.1,2C3L.A
4,3BQR.A,4RB,DB07125,4-(6-{[(1R)-1-(hydroxymethyl)propyl]amino}imid...,326.35,C17 H18 N4 O3,KKZYGUVAFJCULH-CYBMUJFWSA-N,CC[C@H](CO)Nc1ccc2ncc(n2n1)c3ccc(cc3)C(=O)O,3BQR,A,O43293,2.7.11.1,3BQR.A


## Visualize protein kinase interaction

In [11]:
structureViewer.view_binding_site(df.pdbid, df.ligandId, df.chain, 4.0)

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=171), Output())…

<function mmtfPyspark.structureViewer.view_binding_site.<locals>.view3d(i=0)>

## Terminate Spark

In [12]:
sc.stop()