{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PDB Drug Bank Mapping\n", "\n", "Join the PDB, DrugBank, and PDBjMine datasets together." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports and variables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext\n", "from mmtfPyspark.datasets import customReportService, drugBankDataset, pdbjMineDataset\n", "from mmtfPyspark import structureViewer\n", "\n", "# Application name passed to the Spark configuration\n", "APP_NAME = \"MMTF_Spark\"\n", "\n", "# Configure Spark with a local[*] master.\n", "# getOrCreate() reuses an already-running context, so re-running this cell does not\n", "# fail with a 'Cannot run multiple SparkContexts at once' error.\n", "conf = SparkConf().setAppName(APP_NAME).setMaster(\"local[*]\")\n", "sc = SparkContext.getOrCreate(conf=conf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download open DrugBank dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DrugBankIDAccessionNumbersCommonnameCASUNIISynonymsStandardInChIKey
0DB00001BTD00024 | BIOD00024Lepirudin138068-37-8Y43GF64R34Hirudin variant-1 | Lepirudin recombinantNone
1DB00002BTD00071 | BIOD00071Cetuximab205923-56-4PQX0D8J21JCetuximab | Cétuximab | Cetuximabum | Immunogl...None
2DB00003BTD00001 | BIOD00001Dornase alfa143831-71-4953A26OA1YDeoxyribonuclease (human clone 18-1 protein mo...None
3DB00004BTD00084 | BIOD00084Denileukin diftitox173146-27-525E79B5CTMDiphtheria toxin precursor | DT | NAD(+--dipht...None
4DB00005BTD00052 | BIOD00052Etanercept185243-69-0OP401G7OJCEtanercept-szzs | RHU TNFR:FC | RHU-TNFR:FC | ...None
5DB00006BTD00076 | EXPT03302 | BIOD00076 | DB02351Bivalirudin128270-60-0TN9BEX005GBivalirudina | Bivalirudinum | HirulogOIRCOABEOLEUMC-GEJPAHFPSA-N
6DB00007BTD00009 | BIOD00009Leuprolide53714-56-0EFY6W0M8TG(D-Leu(6),des-gly-NH2(10),pro-ethylamide(9))-g...None
7DB00008BTD00043 | BIOD00043Peginterferon alfa-2a198153-51-4Q46947FE7KPegylated interferon alfa-2a | Pegylated inter...None
8DB00009BTD00050 | BIOD00050Alteplase105857-23-61RXS4UE564Alteplase (genetical recombination) | Alteplas...None
9DB00010BTD00033 | BIOD00033Sermorelin86168-78-789243S03TENoneNone
\n", "
" ], "text/plain": [ " DrugBankID AccessionNumbers \\\n", "0 DB00001 BTD00024 | BIOD00024 \n", "1 DB00002 BTD00071 | BIOD00071 \n", "2 DB00003 BTD00001 | BIOD00001 \n", "3 DB00004 BTD00084 | BIOD00084 \n", "4 DB00005 BTD00052 | BIOD00052 \n", "5 DB00006 BTD00076 | EXPT03302 | BIOD00076 | DB02351 \n", "6 DB00007 BTD00009 | BIOD00009 \n", "7 DB00008 BTD00043 | BIOD00043 \n", "8 DB00009 BTD00050 | BIOD00050 \n", "9 DB00010 BTD00033 | BIOD00033 \n", "\n", " Commonname CAS UNII \\\n", "0 Lepirudin 138068-37-8 Y43GF64R34 \n", "1 Cetuximab 205923-56-4 PQX0D8J21J \n", "2 Dornase alfa 143831-71-4 953A26OA1Y \n", "3 Denileukin diftitox 173146-27-5 25E79B5CTM \n", "4 Etanercept 185243-69-0 OP401G7OJC \n", "5 Bivalirudin 128270-60-0 TN9BEX005G \n", "6 Leuprolide 53714-56-0 EFY6W0M8TG \n", "7 Peginterferon alfa-2a 198153-51-4 Q46947FE7K \n", "8 Alteplase 105857-23-6 1RXS4UE564 \n", "9 Sermorelin 86168-78-7 89243S03TE \n", "\n", " Synonyms \\\n", "0 Hirudin variant-1 | Lepirudin recombinant \n", "1 Cetuximab | Cétuximab | Cetuximabum | Immunogl... \n", "2 Deoxyribonuclease (human clone 18-1 protein mo... \n", "3 Diphtheria toxin precursor | DT | NAD(+--dipht... \n", "4 Etanercept-szzs | RHU TNFR:FC | RHU-TNFR:FC | ... \n", "5 Bivalirudina | Bivalirudinum | Hirulog \n", "6 (D-Leu(6),des-gly-NH2(10),pro-ethylamide(9))-g... \n", "7 Pegylated interferon alfa-2a | Pegylated inter... \n", "8 Alteplase (genetical recombination) | Alteplas... 
\n", "9 None \n", "\n", " StandardInChIKey \n", "0 None \n", "1 None \n", "2 None \n", "3 None \n", "4 None \n", "5 OIRCOABEOLEUMC-GEJPAHFPSA-N \n", "6 None \n", "7 None \n", "8 None \n", "9 None " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drugBank = drugBankDataset.get_open_drug_links()\n", "# Preview only: limit() trims the rows in Spark before toPandas() collects them to the driver\n", "drugBank.limit(10).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filter out DrugBank entries without StandardInChIKey" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DrugBankIDAccessionNumbersCommonnameCASUNIISynonymsStandardInChIKey
0DB00006BTD00076 | EXPT03302 | BIOD00076 | DB02351Bivalirudin128270-60-0TN9BEX005GBivalirudina | Bivalirudinum | HirulogOIRCOABEOLEUMC-GEJPAHFPSA-N
1DB00014BTD00113 | BIOD00113Goserelin65807-02-50F65R8P09NNoneBLCLNMBMMGCOAS-URPVMXJPSA-N
2DB00027BTD00036 | BIOD00036Gramicidin D1405-97-65IE62321P4Bacillus brevis gramicidin D | Gramicidin | Gr...NDAYQJDHGXTBJL-MWWSRJDJSA-N
3DB00035BTD00112 | BTD00061 | BIOD00112 | BIOD00061Desmopressin16679-58-6ENR1LLB0FP1-(3-mercaptopropionic acid)-8-D-arginine-vaso...NFLWUMRGJYTJIN-NXBWRCJVSA-N
4DB00050BTD00115 | APRD00686 | BIOD00115Cetrorelix120287-85-6OON1HFZ4BACetrorelixumSBNPWPIBESPSIF-MHWMIDJBSA-N
\n", "
" ], "text/plain": [ " DrugBankID AccessionNumbers Commonname \\\n", "0 DB00006 BTD00076 | EXPT03302 | BIOD00076 | DB02351 Bivalirudin \n", "1 DB00014 BTD00113 | BIOD00113 Goserelin \n", "2 DB00027 BTD00036 | BIOD00036 Gramicidin D \n", "3 DB00035 BTD00112 | BTD00061 | BIOD00112 | BIOD00061 Desmopressin \n", "4 DB00050 BTD00115 | APRD00686 | BIOD00115 Cetrorelix \n", "\n", " CAS UNII Synonyms \\\n", "0 128270-60-0 TN9BEX005G Bivalirudina | Bivalirudinum | Hirulog \n", "1 65807-02-5 0F65R8P09N None \n", "2 1405-97-6 5IE62321P4 Bacillus brevis gramicidin D | Gramicidin | Gr... \n", "3 16679-58-6 ENR1LLB0FP 1-(3-mercaptopropionic acid)-8-D-arginine-vaso... \n", "4 120287-85-6 OON1HFZ4BA Cetrorelixum \n", "\n", " StandardInChIKey \n", "0 OIRCOABEOLEUMC-GEJPAHFPSA-N \n", "1 BLCLNMBMMGCOAS-URPVMXJPSA-N \n", "2 NDAYQJDHGXTBJL-MWWSRJDJSA-N \n", "3 NFLWUMRGJYTJIN-NXBWRCJVSA-N \n", "4 SBNPWPIBESPSIF-MHWMIDJBSA-N " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drugBank = drugBank.filter(drugBank.StandardInChIKey.isNotNull())\n", "# Preview only: limit() trims the rows in Spark before toPandas() collects them to the driver\n", "drugBank.limit(5).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get PDB ligand annotations" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureChainIdstructureIdchainIdligandIdligandMolecularWeightligandFormulaligandSmilesInChIKey
0100D.A100DASPM202.34C10 H26 N4C(CCNCCCN)CNCCCNPFNFFQXMRSDOHW-UHFFFAOYSA-N
1100D.B100DBNoneNaNNoneNoneNone
2101D.A101DACBR386.09C9 H13 Br N3 O7 PC1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...PLDRCXOBLRYJSZ-RRKCRQDMSA-N
3101D.A101DAMG24.31Mg 2[Mg+2]JLVVSXFLKOJNIY-UHFFFAOYSA-N
4101D.B101DBCBR386.09C9 H13 Br N3 O7 PC1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...PLDRCXOBLRYJSZ-RRKCRQDMSA-N
5101D.B101DBNT430.46C18 H26 N10 O3Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=...IDBIFFKSXLYUOT-UHFFFAOYSA-N
6101M.A101MAHEM616.49C34 H32 Fe N4 O4Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=[N]4[Fe]36[N]7...KABFMIBPWCXCRK-RGGAHWMASA-L
7101M.A101MANBN83.13C5 H9 NCCCC[N+]#[C-]FSBLVBBRXSCOKU-UHFFFAOYSA-N
8101M.A101MASO496.06O4 S -2[O-]S(=O)(=O)[O-]QAOWNCQODCNURD-UHFFFAOYSA-L
9102D.A102DANoneNaNNoneNoneNone
\n", "
" ], "text/plain": [ " structureChainId structureId chainId ligandId ligandMolecularWeight \\\n", "0 100D.A 100D A SPM 202.34 \n", "1 100D.B 100D B None NaN \n", "2 101D.A 101D A CBR 386.09 \n", "3 101D.A 101D A MG 24.31 \n", "4 101D.B 101D B CBR 386.09 \n", "5 101D.B 101D B NT 430.46 \n", "6 101M.A 101M A HEM 616.49 \n", "7 101M.A 101M A NBN 83.13 \n", "8 101M.A 101M A SO4 96.06 \n", "9 102D.A 102D A None NaN \n", "\n", " ligandFormula ligandSmiles \\\n", "0 C10 H26 N4 C(CCNCCCN)CNCCCN \n", "1 None None \n", "2 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... \n", "3 Mg 2 [Mg+2] \n", "4 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... \n", "5 C18 H26 N10 O3 Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=... \n", "6 C34 H32 Fe N4 O4 Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=[N]4[Fe]36[N]7... \n", "7 C5 H9 N CCCC[N+]#[C-] \n", "8 O4 S -2 [O-]S(=O)(=O)[O-] \n", "9 None None \n", "\n", " InChIKey \n", "0 PFNFFQXMRSDOHW-UHFFFAOYSA-N \n", "1 None \n", "2 PLDRCXOBLRYJSZ-RRKCRQDMSA-N \n", "3 JLVVSXFLKOJNIY-UHFFFAOYSA-N \n", "4 PLDRCXOBLRYJSZ-RRKCRQDMSA-N \n", "5 IDBIFFKSXLYUOT-UHFFFAOYSA-N \n", "6 KABFMIBPWCXCRK-RGGAHWMASA-L \n", "7 FSBLVBBRXSCOKU-UHFFFAOYSA-N \n", "8 QAOWNCQODCNURD-UHFFFAOYSA-L \n", "9 None " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ligands = customReportService.get_dataset([\"ligandId\", \"ligandMolecularWeight\", \"ligandFormula\", \"ligandSmiles\", \"InChIKey\"])\n", "# Preview only: limit() trims the rows in Spark before toPandas() collects them to the driver\n", "ligands.limit(10).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filter out ligand entries without InChIKey" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureChainIdstructureIdchainIdligandIdligandMolecularWeightligandFormulaligandSmilesInChIKey
0100D.A100DASPM202.34C10 H26 N4C(CCNCCCN)CNCCCNPFNFFQXMRSDOHW-UHFFFAOYSA-N
1101D.A101DACBR386.09C9 H13 Br N3 O7 PC1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...PLDRCXOBLRYJSZ-RRKCRQDMSA-N
2101D.A101DAMG24.31Mg 2[Mg+2]JLVVSXFLKOJNIY-UHFFFAOYSA-N
3101D.B101DBCBR386.09C9 H13 Br N3 O7 PC1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(...PLDRCXOBLRYJSZ-RRKCRQDMSA-N
4101D.B101DBNT430.46C18 H26 N10 O3Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=...IDBIFFKSXLYUOT-UHFFFAOYSA-N
\n", "
" ], "text/plain": [ " structureChainId structureId chainId ligandId ligandMolecularWeight \\\n", "0 100D.A 100D A SPM 202.34 \n", "1 101D.A 101D A CBR 386.09 \n", "2 101D.A 101D A MG 24.31 \n", "3 101D.B 101D B CBR 386.09 \n", "4 101D.B 101D B NT 430.46 \n", "\n", " ligandFormula ligandSmiles \\\n", "0 C10 H26 N4 C(CCNCCCN)CNCCCN \n", "1 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... \n", "2 Mg 2 [Mg+2] \n", "3 C9 H13 Br N3 O7 P C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(... \n", "4 C18 H26 N10 O3 Cn1cc(cc1C(=O)Nc2cc(n(c2)C)C(=O)NCCC(=N)N)NC(=... \n", "\n", " InChIKey \n", "0 PFNFFQXMRSDOHW-UHFFFAOYSA-N \n", "1 PLDRCXOBLRYJSZ-RRKCRQDMSA-N \n", "2 JLVVSXFLKOJNIY-UHFFFAOYSA-N \n", "3 PLDRCXOBLRYJSZ-RRKCRQDMSA-N \n", "4 IDBIFFKSXLYUOT-UHFFFAOYSA-N " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ligands = ligands.filter(ligands.InChIKey.isNotNull())\n", "# Preview only: limit() trims the rows in Spark before toPandas() collects them to the driver\n", "ligands.limit(5).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join ligand dataset with DrugBank info by InChIKey" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureChainIdstructureIdchainIdligandIdligandMolecularWeightligandFormulaligandSmilesInChIKeyDrugBankIDAccessionNumbersCommonnameCASUNIISynonymsStandardInChIKey
0100D.A100DASPM202.34C10 H26 N4C(CCNCCCN)CNCCCNPFNFFQXMRSDOHW-UHFFFAOYSA-NDB00127NUTR00055 | EXPT02947 | DB02564Spermine71-44-32FZ7Y3VOQX4,9-Diaza-1,12-dodecanediamine | 4,9-Diazadode...PFNFFQXMRSDOHW-UHFFFAOYSA-N
1101D.A101DAMG24.31Mg 2[Mg+2]JLVVSXFLKOJNIY-UHFFFAOYSA-NDB01378NoneMagnesium7439-95-4T6V3LHY838NoneJLVVSXFLKOJNIY-UHFFFAOYSA-N
2101M.A101MANBN83.13C5 H9 NCCCC[N+]#[C-]FSBLVBBRXSCOKU-UHFFFAOYSA-NDB01826EXPT02302N-Butyl Isocyanide2769-64-4NoneNoneFSBLVBBRXSCOKU-UHFFFAOYSA-N
3102D.B102DBTNT312.37C17 H20 N4 O2c1cc(ccc1C(=N)N)OCCCOc2ccc(cc2)C(=N)NWTFXJFJYEJZMFO-UHFFFAOYSA-NDB13296NonePropamidine104-32-5G20G12V769NoneWTFXJFJYEJZMFO-UHFFFAOYSA-N
4102L.A102LABME78.13C2 H6 O SC(CS)ODGVVWUTYPXICAM-UHFFFAOYSA-NDB03345EXPT02882 | DB03131Beta-Mercaptoethanol60-24-214R9K67URN2-Sulfhydryl-EthanolDGVVWUTYPXICAM-UHFFFAOYSA-N
5103L.A103LABME78.13C2 H6 O SC(CS)ODGVVWUTYPXICAM-UHFFFAOYSA-NDB03345EXPT02882 | DB03131Beta-Mercaptoethanol60-24-214R9K67URN2-Sulfhydryl-EthanolDGVVWUTYPXICAM-UHFFFAOYSA-N
6103M.A103MANBN83.13C5 H9 NCCCC[N+]#[C-]FSBLVBBRXSCOKU-UHFFFAOYSA-NDB01826EXPT02302N-Butyl Isocyanide2769-64-4NoneNoneFSBLVBBRXSCOKU-UHFFFAOYSA-N
7104M.A104MANBN83.13C5 H9 NCCCC[N+]#[C-]FSBLVBBRXSCOKU-UHFFFAOYSA-NDB01826EXPT02302N-Butyl Isocyanide2769-64-4NoneNoneFSBLVBBRXSCOKU-UHFFFAOYSA-N
8105M.A105MANBN83.13C5 H9 NCCCC[N+]#[C-]FSBLVBBRXSCOKU-UHFFFAOYSA-NDB01826EXPT02302N-Butyl Isocyanide2769-64-4NoneNoneFSBLVBBRXSCOKU-UHFFFAOYSA-N
9106M.A106MAENC56.09C3 H6 N 1CC[N+]#CJEGVKBYNUPNGJU-UHFFFAOYSA-NDB03399EXPT01344Ethyl IsocyanideNoneNoneNoneJEGVKBYNUPNGJU-UHFFFAOYSA-N
\n", "
" ], "text/plain": [ " structureChainId structureId chainId ligandId ligandMolecularWeight \\\n", "0 100D.A 100D A SPM 202.34 \n", "1 101D.A 101D A MG 24.31 \n", "2 101M.A 101M A NBN 83.13 \n", "3 102D.B 102D B TNT 312.37 \n", "4 102L.A 102L A BME 78.13 \n", "5 103L.A 103L A BME 78.13 \n", "6 103M.A 103M A NBN 83.13 \n", "7 104M.A 104M A NBN 83.13 \n", "8 105M.A 105M A NBN 83.13 \n", "9 106M.A 106M A ENC 56.09 \n", "\n", " ligandFormula ligandSmiles \\\n", "0 C10 H26 N4 C(CCNCCCN)CNCCCN \n", "1 Mg 2 [Mg+2] \n", "2 C5 H9 N CCCC[N+]#[C-] \n", "3 C17 H20 N4 O2 c1cc(ccc1C(=N)N)OCCCOc2ccc(cc2)C(=N)N \n", "4 C2 H6 O S C(CS)O \n", "5 C2 H6 O S C(CS)O \n", "6 C5 H9 N CCCC[N+]#[C-] \n", "7 C5 H9 N CCCC[N+]#[C-] \n", "8 C5 H9 N CCCC[N+]#[C-] \n", "9 C3 H6 N 1 CC[N+]#C \n", "\n", " InChIKey DrugBankID AccessionNumbers \\\n", "0 PFNFFQXMRSDOHW-UHFFFAOYSA-N DB00127 NUTR00055 | EXPT02947 | DB02564 \n", "1 JLVVSXFLKOJNIY-UHFFFAOYSA-N DB01378 None \n", "2 FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 \n", "3 WTFXJFJYEJZMFO-UHFFFAOYSA-N DB13296 None \n", "4 DGVVWUTYPXICAM-UHFFFAOYSA-N DB03345 EXPT02882 | DB03131 \n", "5 DGVVWUTYPXICAM-UHFFFAOYSA-N DB03345 EXPT02882 | DB03131 \n", "6 FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 \n", "7 FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 \n", "8 FSBLVBBRXSCOKU-UHFFFAOYSA-N DB01826 EXPT02302 \n", "9 JEGVKBYNUPNGJU-UHFFFAOYSA-N DB03399 EXPT01344 \n", "\n", " Commonname CAS UNII \\\n", "0 Spermine 71-44-3 2FZ7Y3VOQX \n", "1 Magnesium 7439-95-4 T6V3LHY838 \n", "2 N-Butyl Isocyanide 2769-64-4 None \n", "3 Propamidine 104-32-5 G20G12V769 \n", "4 Beta-Mercaptoethanol 60-24-2 14R9K67URN \n", "5 Beta-Mercaptoethanol 60-24-2 14R9K67URN \n", "6 N-Butyl Isocyanide 2769-64-4 None \n", "7 N-Butyl Isocyanide 2769-64-4 None \n", "8 N-Butyl Isocyanide 2769-64-4 None \n", "9 Ethyl Isocyanide None None \n", "\n", " Synonyms \\\n", "0 4,9-Diaza-1,12-dodecanediamine | 4,9-Diazadode... 
\n", "1 None \n", "2 None \n", "3 None \n", "4 2-Sulfhydryl-Ethanol \n", "5 2-Sulfhydryl-Ethanol \n", "6 None \n", "7 None \n", "8 None \n", "9 None \n", "\n", " StandardInChIKey \n", "0 PFNFFQXMRSDOHW-UHFFFAOYSA-N \n", "1 JLVVSXFLKOJNIY-UHFFFAOYSA-N \n", "2 FSBLVBBRXSCOKU-UHFFFAOYSA-N \n", "3 WTFXJFJYEJZMFO-UHFFFAOYSA-N \n", "4 DGVVWUTYPXICAM-UHFFFAOYSA-N \n", "5 DGVVWUTYPXICAM-UHFFFAOYSA-N \n", "6 FSBLVBBRXSCOKU-UHFFFAOYSA-N \n", "7 FSBLVBBRXSCOKU-UHFFFAOYSA-N \n", "8 FSBLVBBRXSCOKU-UHFFFAOYSA-N \n", "9 JEGVKBYNUPNGJU-UHFFFAOYSA-N " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ligands = ligands.join(drugBank, ligands.InChIKey == drugBank.StandardInChIKey)\n", "# Preview only: limit() trims the rows in Spark before toPandas() collects them to the driver\n", "ligands.limit(10).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Show one example per drug molecule" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureChainIdligandIdDrugBankIDCommonnameligandMolecularWeightligandFormulaInChIKeyligandSmiles
01NJ6.AA5ADB03376'5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine417.40C13 H19 N7 O7 SCWWYMWDIYBJVLP-YTMOPEAISA-NC[C@@H](C(=O)NS(=O)(=O)OC[C@@H]1[C@H]([C@H]([C...
11NJ5.AP5ADB02510'5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine443.43C15 H21 N7 O7 SLKVJEMXWEODCAY-JVEUSOJLSA-Nc1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...
21VQ2.ADDNDB04280((2r,3s,5r)-3-Hydroxy-5-(4-Hydroxy-2-Oxo-3,4-D...310.20C9 H15 N2 O8 PILSIYJVILUIVPM-LXGUWJNJSA-NC1[C@@H]([C@H](O[C@H]1N2C=C[C@H](NC2=O)O)COP(=...
31A7A.AADCDB03216(1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Y...233.23C10 H11 N5 O2RQPALADHFYHEHK-CHKWXVPMSA-Nc1nc(c2c(n1)n(cn2)[C@@H]3C=CC([C@H]3O)O)N
41DCY.AI3NDB03121(1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)ace...309.36C19 H19 N O3ZEKCBTQHDTUHRJ-UHFFFAOYSA-NCc1c(c2cc(ccc2n1Cc3ccccc3)OC)CC(=O)O
52E99.AB08DB07404(1-HYDROXY-1-PHOSPHONO-2-[1,1';3',1'']TERPHENY...434.32C20 H20 O7 P2YXQQNSYZOQHKHD-UHFFFAOYSA-Nc1ccc(cc1)c2cccc(c2)c3cccc(c3)CC(O)(P(=O)(O)O)...
62E9A.AB28DB07409(1-HYDROXY-1-PHOSPHONO-2-[1,1';4',1'']TERPHENY...434.32C20 H20 O7 P2MPBUFKZCEBTBSK-UHFFFAOYSA-Nc1ccc(cc1)c2ccc(cc2)c3cccc(c3)CC(O)(P(=O)(O)O)...
72Z52.AH23DB07873(1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID)346.29C12 H28 O7 P2KKVZONPEMODBBG-UHFFFAOYSA-NCCCCCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
82Z50.A028DB06830(1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID)276.16C7 H18 O7 P2IJEGNOYPWRBKAE-UHFFFAOYSA-NCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
92Z4X.A252DB06931(1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID)304.21C9 H22 O7 P2COHUUYPEYRMWTH-UHFFFAOYSA-NCCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O
\n", "
" ], "text/plain": [ " structureChainId ligandId DrugBankID \\\n", "0 1NJ6.A A5A DB03376 \n", "1 1NJ5.A P5A DB02510 \n", "2 1VQ2.A DDN DB04280 \n", "3 1A7A.A ADC DB03216 \n", "4 1DCY.A I3N DB03121 \n", "5 2E99.A B08 DB07404 \n", "6 2E9A.A B28 DB07409 \n", "7 2Z52.A H23 DB07873 \n", "8 2Z50.A 028 DB06830 \n", "9 2Z4X.A 252 DB06931 \n", "\n", " Commonname ligandMolecularWeight \\\n", "0 '5'-O-(N-(L-Alanyl)-Sulfamoyl)Adenosine 417.40 \n", "1 '5'-O-(N-(L-Prolyl)-Sulfamoyl)Adenosine 443.43 \n", "2 ((2r,3s,5r)-3-Hydroxy-5-(4-Hydroxy-2-Oxo-3,4-D... 310.20 \n", "3 (1'r,2's)-9-(2-Hydroxy-3'-Keto-Cyclopenten-1-Y... 233.23 \n", "4 (1-Benzyl-5-methoxy-2-methyl-1H-indol-3-yl)ace... 309.36 \n", "5 (1-HYDROXY-1-PHOSPHONO-2-[1,1';3',1'']TERPHENY... 434.32 \n", "6 (1-HYDROXY-1-PHOSPHONO-2-[1,1';4',1'']TERPHENY... 434.32 \n", "7 (1-HYDROXYDODECANE-1,1-DIYL)BIS(PHOSPHONIC ACID) 346.29 \n", "8 (1-HYDROXYHEPTANE-1,1-DIYL)BIS(PHOSPHONIC ACID) 276.16 \n", "9 (1-HYDROXYNONANE-1,1-DIYL)BIS(PHOSPHONIC ACID) 304.21 \n", "\n", " ligandFormula InChIKey \\\n", "0 C13 H19 N7 O7 S CWWYMWDIYBJVLP-YTMOPEAISA-N \n", "1 C15 H21 N7 O7 S LKVJEMXWEODCAY-JVEUSOJLSA-N \n", "2 C9 H15 N2 O8 P ILSIYJVILUIVPM-LXGUWJNJSA-N \n", "3 C10 H11 N5 O2 RQPALADHFYHEHK-CHKWXVPMSA-N \n", "4 C19 H19 N O3 ZEKCBTQHDTUHRJ-UHFFFAOYSA-N \n", "5 C20 H20 O7 P2 YXQQNSYZOQHKHD-UHFFFAOYSA-N \n", "6 C20 H20 O7 P2 MPBUFKZCEBTBSK-UHFFFAOYSA-N \n", "7 C12 H28 O7 P2 KKVZONPEMODBBG-UHFFFAOYSA-N \n", "8 C7 H18 O7 P2 IJEGNOYPWRBKAE-UHFFFAOYSA-N \n", "9 C9 H22 O7 P2 COHUUYPEYRMWTH-UHFFFAOYSA-N \n", "\n", " ligandSmiles \n", "0 C[C@@H](C(=O)NS(=O)(=O)OC[C@@H]1[C@H]([C@H]([C... \n", "1 c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3... \n", "2 C1[C@@H]([C@H](O[C@H]1N2C=C[C@H](NC2=O)O)COP(=... \n", "3 c1nc(c2c(n1)n(cn2)[C@@H]3C=CC([C@H]3O)O)N \n", "4 Cc1c(c2cc(ccc2n1Cc3ccccc3)OC)CC(=O)O \n", "5 c1ccc(cc1)c2cccc(c2)c3cccc(c3)CC(O)(P(=O)(O)O)... \n", "6 c1ccc(cc1)c2ccc(cc2)c3cccc(c3)CC(O)(P(=O)(O)O)... 
\n", "7 CCCCCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O \n", "8 CCCCCCC(O)(P(=O)(O)O)P(=O)(O)O \n", "9 CCCCCCCCC(O)(P(=O)(O)O)P(=O)(O)O " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ligands = ligands.dropDuplicates([\"Commonname\"])\n", "ligands = ligands.select(\"structureChainId\", \"ligandId\", \"DrugBankID\", \"Commonname\", \"ligandMolecularWeight\", \"ligandFormula\", \"InChIKey\", \"ligandSmiles\")\n", "# Preview only: sort in Spark, then limit() so just the first 10 rows are collected to the driver\n", "ligands.sort(\"Commonname\").limit(10).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Query structures with 2.7.11.1 EC number using PDBjMine" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "First 10 results for query: SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'\n", "+-----+-----+---------+---------+----------------+\n", "|pdbid|chain|accession|ec_number|structureChainId|\n", "+-----+-----+---------+---------+----------------+\n", "| 1AUE| A| P42345| 2.7.11.1| 1AUE.A|\n", "| 1AUE| B| P42345| 2.7.11.1| 1AUE.B|\n", "| 1BPV| A| Q8WZ42| 2.7.11.1| 1BPV.A|\n", "| 1C1Y| B| P04049| 2.7.11.1| 1C1Y.B|\n", "| 1CF4| B| Q07912| 2.7.11.1| 1CF4.B|\n", "| 1CJA| A| P80197| 2.7.11.1| 1CJA.A|\n", "| 1CJA| B| P80197| 2.7.11.1| 1CJA.B|\n", "| 1CKI| A| Q06486| 2.7.11.1| 1CKI.A|\n", "| 1CKI| B| Q06486| 2.7.11.1| 1CKI.B|\n", "| 1CKJ| A| Q06486| 2.7.11.1| 1CKJ.A|\n", "+-----+-----+---------+---------+----------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "enzymeQuery = \"SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'\"\n", "enzyme = pdbjMineDataset.get_dataset(enzymeQuery)\n", "\n", "print(f\"First 10 results for query: {enzymeQuery}\")\n", "enzyme.show(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join ligand dataset with PDBjMine dataset by structureChainId" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Total number of structures: 172\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureChainIdligandIdDrugBankIDCommonnameligandMolecularWeightligandFormulaInChIKeyligandSmilespdbidchainaccessionec_numberstructureChainId
01UVR.ABI8DB019463-[1-(3-Aminopropyl)-1h-Indol-3-Yl]-4-(1-Methy...398.46C24 H22 N4 O2UQHKJRCFSLMWIA-UHFFFAOYSA-NCn1cc(c2c1cccc2)C3=C(C(=O)NC3=O)c4cn(c5c4cccc5...1UVRAO155302.7.11.11UVR.A
14EKL.A0RFDB11743Ipatasertib458.00C24 H32 Cl N5 O2GRZXWCHAXNAUHY-NSISKUIASA-NC[C@@H]1C[C@H](c2c1c(ncn2)N3CCN(CC3)C(=O)[C@H]...4EKLAP317492.7.11.14EKL.A
22NP8.ACC3DB07545N-{3-[(4-{[3-(TRIFLUOROMETHYL)PHENYL]AMINO}PYR...413.40C21 H18 F3 N5 ORDTDWGQDFJPTPD-UHFFFAOYSA-Nc1cc(cc(c1)Nc2ccnc(n2)Nc3cccc(c3)NC(=O)C4CC4)C...2NP8AO149652.7.11.12NP8.A
32C3L.AIDZDB079593-(1H-BENZIMIDAZOL-2-YL)-1H-INDAZOLE234.26C14 H10 N4JTKFRFMSUBOCIQ-UHFFFAOYSA-Nc1ccc2c(c1)c(n[nH]2)c3[nH]c4ccccc4n32C3LAO147572.7.11.12C3L.A
43BQR.A4RBDB071254-(6-{[(1R)-1-(hydroxymethyl)propyl]amino}imid...326.35C17 H18 N4 O3KKZYGUVAFJCULH-CYBMUJFWSA-NCC[C@H](CO)Nc1ccc2ncc(n2n1)c3ccc(cc3)C(=O)O3BQRAO432932.7.11.13BQR.A
\n", "
" ], "text/plain": [ " structureChainId ligandId DrugBankID \\\n", "0 1UVR.A BI8 DB01946 \n", "1 4EKL.A 0RF DB11743 \n", "2 2NP8.A CC3 DB07545 \n", "3 2C3L.A IDZ DB07959 \n", "4 3BQR.A 4RB DB07125 \n", "\n", " Commonname ligandMolecularWeight \\\n", "0 3-[1-(3-Aminopropyl)-1h-Indol-3-Yl]-4-(1-Methy... 398.46 \n", "1 Ipatasertib 458.00 \n", "2 N-{3-[(4-{[3-(TRIFLUOROMETHYL)PHENYL]AMINO}PYR... 413.40 \n", "3 3-(1H-BENZIMIDAZOL-2-YL)-1H-INDAZOLE 234.26 \n", "4 4-(6-{[(1R)-1-(hydroxymethyl)propyl]amino}imid... 326.35 \n", "\n", " ligandFormula InChIKey \\\n", "0 C24 H22 N4 O2 UQHKJRCFSLMWIA-UHFFFAOYSA-N \n", "1 C24 H32 Cl N5 O2 GRZXWCHAXNAUHY-NSISKUIASA-N \n", "2 C21 H18 F3 N5 O RDTDWGQDFJPTPD-UHFFFAOYSA-N \n", "3 C14 H10 N4 JTKFRFMSUBOCIQ-UHFFFAOYSA-N \n", "4 C17 H18 N4 O3 KKZYGUVAFJCULH-CYBMUJFWSA-N \n", "\n", " ligandSmiles pdbid chain accession \\\n", "0 Cn1cc(c2c1cccc2)C3=C(C(=O)NC3=O)c4cn(c5c4cccc5... 1UVR A O15530 \n", "1 C[C@@H]1C[C@H](c2c1c(ncn2)N3CCN(CC3)C(=O)[C@H]... 4EKL A P31749 \n", "2 c1cc(cc(c1)Nc2ccnc(n2)Nc3cccc(c3)NC(=O)C4CC4)C... 
2NP8 A O14965 \n", "3 c1ccc2c(c1)c(n[nH]2)c3[nH]c4ccccc4n3 2C3L A O14757 \n", "4 CC[C@H](CO)Nc1ccc2ncc(n2n1)c3ccc(cc3)C(=O)O 3BQR A O43293 \n", "\n", " ec_number structureChainId \n", "0 2.7.11.1 1UVR.A \n", "1 2.7.11.1 4EKL.A \n", "2 2.7.11.1 2NP8.A \n", "3 2.7.11.1 2C3L.A \n", "4 2.7.11.1 3BQR.A " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Join on the shared column name so the result keeps a single structureChainId column,\n", "# and bind it to a new name so re-running this cell never joins an already-joined dataset.\n", "kinase_ligands = ligands.join(enzyme, \"structureChainId\")\n", "\n", "# Collect once; the row count comes from the collected frame instead of a second Spark job.\n", "df = kinase_ligands.toPandas()\n", "print(f\"Total number of structures: {len(df)}\")\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualize protein kinase interaction" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4ebc18af8b754d2e8c0f302196e962ab", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=171), Output())…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ ".view3d(i=0)>" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "structureViewer.view_binding_site(df.pdbid, df.ligandId, df.chain, 4.0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Terminate Spark" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }