Join PDB, Drug Bank and PDBjMine dataset together
In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.datasets import customReportService, drugBankDataset, pdbjMineDataset
from mmtfPyspark import structureViewer
# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"
# Configure Spark
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext(conf=conf)
In [2]:
drugBank = drugBankDataset.get_open_drug_links()
drugBank.toPandas().head(10)
Out[2]:
DrugBankID | AccessionNumbers | Commonname | CAS | UNII | Synonyms | StandardInChIKey | |
---|---|---|---|---|---|---|---|
0 | DB00001 | BTD00024 | BIOD00024 | Lepirudin | 138068-37-8 | Y43GF64R34 | Hirudin variant-1 | Lepirudin recombinant | None |
1 | DB00002 | BTD00071 | BIOD00071 | Cetuximab | 205923-56-4 | PQX0D8J21J | Cetuximab | Cétuximab | Cetuximabum | Immunogl... | None |
2 | DB00003 | BTD00001 | BIOD00001 | Dornase alfa | 143831-71-4 | 953A26OA1Y | Deoxyribonuclease (human clone 18-1 protein mo... | None |
3 | DB00004 | BTD00084 | BIOD00084 | Denileukin diftitox | 173146-27-5 | 25E79B5CTM | Diphtheria toxin precursor | DT | NAD(+--dipht... | None |
4 | DB00005 | BTD00052 | BIOD00052 | Etanercept | 185243-69-0 | OP401G7OJC | Etanercept-szzs | RHU TNFR:FC | RHU-TNFR:FC | ... | None |
5 | DB00006 | BTD00076 | EXPT03302 | BIOD00076 | DB02351 | Bivalirudin | 128270-60-0 | TN9BEX005G | Bivalirudina | Bivalirudinum | Hirulog | OIRCOABEOLEUMC-GEJPAHFPSA-N |
6 | DB00007 | BTD00009 | BIOD00009 | Leuprolide | 53714-56-0 | EFY6W0M8TG | (D-Leu(6),des-gly-NH2(10),pro-ethylamide(9))-g... | None |
7 | DB00008 | BTD00043 | BIOD00043 | Peginterferon alfa-2a | 198153-51-4 | Q46947FE7K | Pegylated interferon alfa-2a | Pegylated inter... | None |
8 | DB00009 | BTD00050 | BIOD00050 | Alteplase | 105857-23-6 | 1RXS4UE564 | Alteplase (genetical recombination) | Alteplas... | None |
9 | DB00010 | BTD00033 | BIOD00033 | Sermorelin | 86168-78-7 | 89243S03TE | None | None |
In [3]:
drugBank = drugBank.filter(drugBank.StandardInChIKey.isNotNull())
drugBank.toPandas().head(5)
Out[3]:
DrugBankID | AccessionNumbers | Commonname | CAS | UNII | Synonyms | StandardInChIKey | |
---|---|---|---|---|---|---|---|
0 | DB00006 | BTD00076 | EXPT03302 | BIOD00076 | DB02351 | Bivalirudin | 128270-60-0 | TN9BEX005G | Bivalirudina | Bivalirudinum | Hirulog | OIRCOABEOLEUMC-GEJPAHFPSA-N |
1 | DB00014 | BTD00113 | BIOD00113 | Goserelin | 65807-02-5 | 0F65R8P09N | None | BLCLNMBMMGCOAS-URPVMXJPSA-N |
2 | DB00027 | BTD00036 | BIOD00036 | Gramicidin D | 1405-97-6 | 5IE62321P4 | Bacillus brevis gramicidin D | Gramicidin | Gr... | NDAYQJDHGXTBJL-MWWSRJDJSA-N |
3 | DB00035 | BTD00112 | BTD00061 | BIOD00112 | BIOD00061 | Desmopressin | 16679-58-6 | ENR1LLB0FP | 1-(3-mercaptopropionic acid)-8-D-arginine-vaso... | NFLWUMRGJYTJIN-NXBWRCJVSA-N |
4 | DB00050 | BTD00115 | APRD00686 | BIOD00115 | Cetrorelix | 120287-85-6 | OON1HFZ4BA | Cetrorelixum | SBNPWPIBESPSIF-MHWMIDJBSA-N |
In [4]:
ligands = customReportService.get_dataset(["ligandId","ligandMolecularWeight","ligandFormula","ligandSmiles","InChIKey"])
ligands.toPandas().head(10)
Out[4]:
structureChainId | structureId | chainId | ligandId | ligandMolecularWeight | ligandFormula | ligandSmiles | InChIKey | |
---|---|---|---|---|---|---|---|---|
0 | 100D.A | 100D | A | SPM | 202.34 | C10 H26 N4 | C(CCNCCCN)CNCCCN | PFNFFQXMRSDOHW-UHFFFAOYSA-N |
1 | 100D.B | 100D | B | None | NaN | None | None | None |
2 | 101D.A | 101D | A | CBR | 386.09 | C9 H13 Br N3 O7 P | C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... | PLDRCXOBLRYJSZ-RRKCRQDMSA-N |
3 | 101D.A | 101D | A | MG | 24.31 | Mg 2 | [Mg+2] | JLVVSXFLKOJNIY-UHFFFAOYSA-N |
4 | 101D.B | 101D | B | CBR | 386.09 | C9 H13 Br N3 O7 P | C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... | PLDRCXOBLRYJSZ-RRKCRQDMSA-N |
5 | 101D.B | 101D | B | NT | 430.46 | C18 H26 N10 O3 | Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=... | IDBIFFKSXLYUOT-UHFFFAOYSA-N |
6 | 101M.A | 101M | A | HEM | 616.49 | C34 H32 Fe N4 O4 | Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=[N]4[Fe]36[N]7... | KABFMIBPWCXCRK-RGGAHWMASA-L |
7 | 101M.A | 101M | A | NBN | 83.13 | C5 H9 N | CCCC[N+]#[C-] | FSBLVBBRXSCOKU-UHFFFAOYSA-N |
8 | 101M.A | 101M | A | SO4 | 96.06 | O4 S -2 | [O-]S(=O)(=O)[O-] | QAOWNCQODCNURD-UHFFFAOYSA-L |
9 | 102D.A | 102D | A | None | NaN | None | None | None |
In [5]:
ligands = ligands.filter(ligands.InChIKey.isNotNull())
ligands.toPandas().head(5)
Out[5]:
structureChainId | structureId | chainId | ligandId | ligandMolecularWeight | ligandFormula | ligandSmiles | InChIKey | |
---|---|---|---|---|---|---|---|---|
0 | 100D.A | 100D | A | SPM | 202.34 | C10 H26 N4 | C(CCNCCCN)CNCCCN | PFNFFQXMRSDOHW-UHFFFAOYSA-N |
1 | 101D.A | 101D | A | CBR | 386.09 | C9 H13 Br N3 O7 P | C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... | PLDRCXOBLRYJSZ-RRKCRQDMSA-N |
2 | 101D.A | 101D | A | MG | 24.31 | Mg 2 | [Mg+2] | JLVVSXFLKOJNIY-UHFFFAOYSA-N |
3 | 101D.B | 101D | B | CBR | 386.09 | C9 H13 Br N3 O7 P | C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... | PLDRCXOBLRYJSZ-RRKCRQDMSA-N |
4 | 101D.B | 101D | B | NT | 430.46 | C18 H26 N10 O3 | Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=... | IDBIFFKSXLYUOT-UHFFFAOYSA-N |
In [6]:
ligands = ligands.join(drugBank, ligands.InChIKey == drugBank.StandardInChIKey)
ligands.toPandas().head(10)
Out[6]:
structureChainId | structureId | chainId | ligandId | ligandMolecularWeight | ligandFormula | ligandSmiles | InChIKey | DrugBankID | AccessionNumbers | Commonname | CAS | UNII | Synonyms | StandardInChIKey | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100D.A | 100D | A | SPM | 202.34 | C10 H26 N4 | C(CCNCCCN)CNCCCN | PFNFFQXMRSDOHW-UHFFFAOYSA-N | DB00127 | NUTR00055 | EXPT02947 | DB02564 | Spermine | 71-44-3 | 2FZ7Y3VOQX | 4,9-Diaza-1,12-dodecanediamine | 4,9-Diazadode... | PFNFFQXMRSDOHW-UHFFFAOYSA-N |
1 | 101D.A | 101D | A | MG | 24.31 | Mg 2 | [Mg+2] | JLVVSXFLKOJNIY-UHFFFAOYSA-N | DB01378 | None | Magnesium | 7439-95-4 | T6V3LHY838 | None | JLVVSXFLKOJNIY-UHFFFAOYSA-N |
2 | 101M.A | 101M | A | NBN | 83.13 | C5 H9 N | CCCC[N+]#[C-] | FSBLVBBRXSCOKU-UHFFFAOYSA-N | DB01826 | EXPT02302 | N-Butyl Isocyanide | 2769-64-4 | None | None | FSBLVBBRXSCOKU-UHFFFAOYSA-N |
3 | 102D.B | 102D | B | TNT | 312.37 | C17 H20 N4 O2 | c1cc(ccc1C(=N)N)OCCCOc2ccc(cc2)C(=N)N | WTFXJFJYEJZMFO-UHFFFAOYSA-N | DB13296 | None | Propamidine | 104-32-5 | G20G12V769 | None | WTFXJFJYEJZMFO-UHFFFAOYSA-N |
4 | 102L.A | 102L | A | BME | 78.13 | C2 H6 O S | C(CS)O | DGVVWUTYPXICAM-UHFFFAOYSA-N | DB03345 | EXPT02882 | DB03131 | Beta-Mercaptoethanol | 60-24-2 | 14R9K67URN | 2-Sulfhydryl-Ethanol | DGVVWUTYPXICAM-UHFFFAOYSA-N |
5 | 103L.A | 103L | A | BME | 78.13 | C2 H6 O S | C(CS)O | DGVVWUTYPXICAM-UHFFFAOYSA-N | DB03345 | EXPT02882 | DB03131 | Beta-Mercaptoethanol | 60-24-2 | 14R9K67URN | 2-Sulfhydryl-Ethanol | DGVVWUTYPXICAM-UHFFFAOYSA-N |
6 | 103M.A | 103M | A | NBN | 83.13 | C5 H9 N | CCCC[N+]#[C-] | FSBLVBBRXSCOKU-UHFFFAOYSA-N | DB01826 | EXPT02302 | N-Butyl Isocyanide | 2769-64-4 | None | None | FSBLVBBRXSCOKU-UHFFFAOYSA-N |
7 | 104M.A | 104M | A | NBN | 83.13 | C5 H9 N | CCCC[N+]#[C-] | FSBLVBBRXSCOKU-UHFFFAOYSA-N | DB01826 | EXPT02302 | N-Butyl Isocyanide | 2769-64-4 | None | None | FSBLVBBRXSCOKU-UHFFFAOYSA-N |
8 | 105M.A | 105M | A | NBN | 83.13 | C5 H9 N | CCCC[N+]#[C-] | FSBLVBBRXSCOKU-UHFFFAOYSA-N | DB01826 | EXPT02302 | N-Butyl Isocyanide | 2769-64-4 | None | None | FSBLVBBRXSCOKU-UHFFFAOYSA-N |
9 | 106M.A | 106M | A | ENC | 56.09 | C3 H6 N 1 | CC[N+]#C | JEGVKBYNUPNGJU-UHFFFAOYSA-N | DB03399 | EXPT01344 | Ethyl Isocyanide | None | None | None | JEGVKBYNUPNGJU-UHFFFAOYSA-N |
In [7]:
ligands = ligands.dropDuplicates(["Commonname"])
ligands = ligands.select("structureChainId", "ligandId", "DrugBankID", "Commonname", "ligandMolecularWeight","ligandFormula", "InChIKey", "ligandSmiles")
ligands.sort("Commonname").toPandas().head(10)
Out[7]:
structureChainId | ligandId | DrugBankID | Commonname | ligandMolecularWeight | ligandFormula | InChIKey | ligandSmiles | |
---|---|---|---|---|---|---|---|---|
0 | 1NJ6.A | A5A | DB03376 | '5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine | 417.40 | C13 H19 N7 O7 S | CWWYMWDIYBJVLP-YTMOPEAISA-N | C[C@@H](C(=O)NS(=O)(=O)OC[C@@H]1[C@H]([C@H]([C... |
1 | 1NJ5.A | P5A | DB02510 | '5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine | 443.43 | C15 H21 N7 O7 S | LKVJEMXWEODCAY-JVEUSOJLSA-N | c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3... |
2 | 1VQ2.A | DDN | DB04280 | ((2r,3s,5r)-3-Hydroxy-5-(4-Hydroxy-2-Oxo-3,4-D... | 310.20 | C9 H15 N2 O8 P | ILSIYJVILUIVPM-LXGUWJNJSA-N | C1[C@@H]([C@H](O[C@H]1N2C=C[C@H](NC2=O)O)COP(=... |
3 | 1A7A.A | ADC | DB03216 | (1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Y... | 233.23 | C10 H11 N5 O2 | RQPALADHFYHEHK-CHKWXVPMSA-N | c1nc(c2c(n1)n(cn2)[C@@H]3C=CC([C@H]3O)O)N |
4 | 1DCY.A | I3N | DB03121 | (1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)ace... | 309.36 | C19 H19 N O3 | ZEKCBTQHDTUHRJ-UHFFFAOYSA-N | Cc1c(c2cc(ccc2n1Cc3ccccc3)OC)CC(=O)O |
5 | 2E99.A | B08 | DB07404 | (1-HYDROXY-1-PHOSPHONO-2-[1,1';3',1'']TERPHENY... | 434.32 | C20 H20 O7 P2 | YXQQNSYZOQHKHD-UHFFFAOYSA-N | c1ccc(cc1)c2cccc(c2)c3cccc(c3)CC(O)(P(=O)(O)O)... |
6 | 2E9A.A | B28 | DB07409 | (1-HYDROXY-1-PHOSPHONO-2-[1,1';4',1'']TERPHENY... | 434.32 | C20 H20 O7 P2 | MPBUFKZCEBTBSK-UHFFFAOYSA-N | c1ccc(cc1)c2ccc(cc2)c3cccc(c3)CC(O)(P(=O)(O)O)... |
7 | 2Z52.A | H23 | DB07873 | (1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID) | 346.29 | C12 H28 O7 P2 | KKVZONPEMODBBG-UHFFFAOYSA-N | CCCCCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O |
8 | 2Z50.A | 028 | DB06830 | (1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID) | 276.16 | C7 H18 O7 P2 | IJEGNOYPWRBKAE-UHFFFAOYSA-N | CCCCCCC(O)(P(=O)(O)O)P(=O)(O)O |
9 | 2Z4X.A | 252 | DB06931 | (1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID) | 304.21 | C9 H22 O7 P2 | COHUUYPEYRMWTH-UHFFFAOYSA-N | CCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O |
In [8]:
enzymeQuery = "SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'"
enzyme = pdbjMineDataset.get_dataset(enzymeQuery)
print(f"First 10 results for query: {enzymeQuery}")
enzyme.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'
+-----+-----+---------+---------+----------------+
|pdbid|chain|accession|ec_number|structureChainId|
+-----+-----+---------+---------+----------------+
| 1AUE| A| P42345| 2.7.11.1| 1AUE.A|
| 1AUE| B| P42345| 2.7.11.1| 1AUE.B|
| 1BPV| A| Q8WZ42| 2.7.11.1| 1BPV.A|
| 1C1Y| B| P04049| 2.7.11.1| 1C1Y.B|
| 1CF4| B| Q07912| 2.7.11.1| 1CF4.B|
| 1CJA| A| P80197| 2.7.11.1| 1CJA.A|
| 1CJA| B| P80197| 2.7.11.1| 1CJA.B|
| 1CKI| A| Q06486| 2.7.11.1| 1CKI.A|
| 1CKI| B| Q06486| 2.7.11.1| 1CKI.B|
| 1CKJ| A| Q06486| 2.7.11.1| 1CKJ.A|
+-----+-----+---------+---------+----------------+
only showing top 10 rows
In [9]:
ligands = ligands.join(enzyme, ligands.structureChainId == enzyme.structureChainId)
print(f"Total number of structures: {ligands.count()}")
df = ligands.toPandas()
df.head()
Total number of structures: 172
Out[9]:
structureChainId | ligandId | DrugBankID | Commonname | ligandMolecularWeight | ligandFormula | InChIKey | ligandSmiles | pdbid | chain | accession | ec_number | structureChainId | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1UVR.A | BI8 | DB01946 | 3-[1-(3-Aminopropyl)-1h-Indol-3-Yl]-4-(1-Methy... | 398.46 | C24 H22 N4 O2 | UQHKJRCFSLMWIA-UHFFFAOYSA-N | Cn1cc(c2c1cccc2)C3=C(C(=O)NC3=O)c4cn(c5c4cccc5... | 1UVR | A | O15530 | 2.7.11.1 | 1UVR.A |
1 | 4EKL.A | 0RF | DB11743 | Ipatasertib | 458.00 | C24 H32 Cl N5 O2 | GRZXWCHAXNAUHY-NSISKUIASA-N | C[C@@H]1C[C@H](c2c1c(ncn2)N3CCN(CC3)C(=O)[C@H]... | 4EKL | A | P31749 | 2.7.11.1 | 4EKL.A |
2 | 2NP8.A | CC3 | DB07545 | N-{3-[(4-{[3-(TRIFLUOROMETHYL)PHENYL]AMINO}PYR... | 413.40 | C21 H18 F3 N5 O | RDTDWGQDFJPTPD-UHFFFAOYSA-N | c1cc(cc(c1)Nc2ccnc(n2)Nc3cccc(c3)NC(=O)C4CC4)C... | 2NP8 | A | O14965 | 2.7.11.1 | 2NP8.A |
3 | 2C3L.A | IDZ | DB07959 | 3-(1H-BENZIMIDAZOL-2-YL)-1H-INDAZOLE | 234.26 | C14 H10 N4 | JTKFRFMSUBOCIQ-UHFFFAOYSA-N | c1ccc2c(c1)c(n[nH]2)c3[nH]c4ccccc4n3 | 2C3L | A | O14757 | 2.7.11.1 | 2C3L.A |
4 | 3BQR.A | 4RB | DB07125 | 4-(6-{[(1R)-1-(hydroxymethyl)propyl]amino}imid... | 326.35 | C17 H18 N4 O3 | KKZYGUVAFJCULH-CYBMUJFWSA-N | CC[C@H](CO)Nc1ccc2ncc(n2n1)c3ccc(cc3)C(=O)O | 3BQR | A | O43293 | 2.7.11.1 | 3BQR.A |
In [11]:
structureViewer.view_binding_site(df.pdbid, df.ligandId, df.chain, 4.0)
Out[11]:
<function mmtfPyspark.structureViewer.view_binding_site.<locals>.view3d(i=0)>
In [12]:
sc.stop()