This demo shows how to query PDB annotations from the SIFTS project.
The “Structure Integration with Function, Taxonomy and Sequence” (SIFTS) resource is the authoritative source of up-to-date residue-level annotation of structures in the PDB with data available in UniProt, IntEnz, CATH, SCOP, GO, InterPro, Pfam and PubMed. link to SIFTS
Data are provided through PDBj Mine 2 SQL
Queries can be designed with the interactive PDBj Mine 2 query service
In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from mmtfPyspark.datasets import pdbjMineDataset
In [2]:
# Obtain the Spark session for this demo: reuse an existing one or create a
# new one, with the "local[*]" master and the application name "SIFTSDataDemo".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SIFTSDataDemo")
    .getOrCreate()
)
In [4]:
# PubMed references: request the first 10 rows of sifts.pdb_pubmed.
pubmedQuery = "SELECT * FROM sifts.pdb_pubmed LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
pubmed = pdbjMineDataset.get_dataset(pubmedQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + pubmedQuery)
pubmed.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_pubmed LIMIT 10
+-----------+-------+---------+
|structureId|ordinal|pubmed_id|
+-----------+-------+---------+
| 100D| 0| 7816639|
| 101D| 0| 7711020|
| 102D| 0| 7608897|
| 102L| 0| 8429913|
| 103D| 0| 7966337|
| 103L| 0| 8429913|
| 104D| 0| 7857947|
| 104L| 0| 8429913|
| 105D| 0| 7743125|
| 106D| 0| 7743125|
+-----------+-------+---------+
In [9]:
# InterPro annotations: request the first 10 rows of sifts.pdb_chain_interpro.
interproQuery = "SELECT * FROM sifts.pdb_chain_interpro LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
interpro = pdbjMineDataset.get_dataset(interproQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + interproQuery)
interpro.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_interpro LIMIT 10
+-----+-----+-----------+----------------+
|pdbid|chain|interpro_id|structureChainId|
+-----+-----+-----------+----------------+
| 101M| A| IPR000971| 101M.A|
| 101M| A| IPR002335| 101M.A|
| 102L| A| IPR001165| 102L.A|
| 102L| A| IPR002196| 102L.A|
| 102L| A| IPR034690| 102L.A|
| 102M| A| IPR000971| 102M.A|
| 102M| A| IPR002335| 102M.A|
| 103L| A| IPR001165| 103L.A|
| 103L| A| IPR002196| 103L.A|
| 103L| A| IPR034690| 103L.A|
+-----+-----+-----------+----------------+
In [10]:
# UniProt mappings: request the first 10 rows of sifts.pdb_chain_uniprot.
uniprotQuery = "SELECT * FROM sifts.pdb_chain_uniprot LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
uniprot = pdbjMineDataset.get_dataset(uniprotQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + uniprotQuery)
uniprot.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_uniprot LIMIT 10
+-----+-----+----------+-------+-------+-------+-------+------+------+----------------+
|pdbid|chain|sp_primary|res_beg|res_end|pdb_beg|pdb_end|sp_beg|sp_end|structureChainId|
+-----+-----+----------+-------+-------+-------+-------+------+------+----------------+
| 101M| A| P02185| 1| 154| 0| 153| 1| 154| 101M.A|
| 102L| A| P00720| 1| 40| 1| 40| 1| 40| 102L.A|
| 102L| A| P00720| 42| 165| 41| None| 41| 164| 102L.A|
| 102M| A| P02185| 1| 154| 0| 153| 1| 154| 102M.A|
| 103L| A| P00720| 1| 40| 1| None| 1| 40| 103L.A|
| 103L| A| P00720| 44| 167| 41| None| 41| 164| 103L.A|
| 103M| A| P02185| 1| 154| 0| 153| 1| 154| 103M.A|
| 104L| A| P00720| 1| 44| 1| 44| 1| 44| 104L.A|
| 104L| A| P00720| 47| 166| 45| None| 45| 164| 104L.A|
| 104L| B| P00720| 1| 44| 1| 44| 1| 44| 104L.B|
+-----+-----+----------+-------+-------+-------+-------+------+------+----------------+
In [11]:
# Taxonomy annotations: request the first 10 rows of sifts.pdb_chain_taxonomy.
taxonomyQuery = "SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
taxonomy = pdbjMineDataset.get_dataset(taxonomyQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + taxonomyQuery)
taxonomy.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10
+-----+-----+------+--------------------+----------------+
|pdbid|chain|tax_id| scientific_name|structureChainId|
+-----+-----+------+--------------------+----------------+
| 101M| A| 9755| PHYCD| 101M.A|
| 101M| A| 9755| Physeter catodon| 101M.A|
| 101M| A| 9755|Physeter catodon ...| 101M.A|
| 101M| A| 9755|Physeter catodon ...| 101M.A|
| 101M| A| 9755|Physeter macrocep...| 101M.A|
| 101M| A| 9755| Sperm whale| 101M.A|
| 101M| A| 9755| sperm whale| 101M.A|
| 102L| A| 10665| BPT4| 102L.A|
| 102L| A| 10665| Bacteriophage T4| 102L.A|
| 102L| A| 10665|Enterobacteria ph...| 102L.A|
+-----+-----+------+--------------------+----------------+
In [12]:
# Pfam annotations: request the first 10 rows of sifts.pdb_chain_pfam.
pfamQuery = "SELECT * FROM sifts.pdb_chain_pfam LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
pfam = pdbjMineDataset.get_dataset(pfamQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + pfamQuery)
pfam.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_pfam LIMIT 10
+-----+-----+----------+-------+----------------+
|pdbid|chain|sp_primary|pfam_id|structureChainId|
+-----+-----+----------+-------+----------------+
| 101M| A| P02185|PF00042| 101M.A|
| 102L| A| P00720|PF00959| 102L.A|
| 102M| A| P02185|PF00042| 102M.A|
| 103L| A| P00720|PF00959| 103L.A|
| 103M| A| P02185|PF00042| 103M.A|
| 104L| A| P00720|PF00959| 104L.A|
| 104L| B| P00720|PF00959| 104L.B|
| 104M| A| P02185|PF00042| 104M.A|
| 105M| A| P02185|PF00042| 105M.A|
| 106M| A| P02185|PF00042| 106M.A|
+-----+-----+----------+-------+----------------+
In [4]:
# NOTE(review): this cell repeats the sifts.pdb_pubmed query that already runs
# earlier in the notebook (same statements, same execution count); it looks
# like a leftover copy and is a candidate for removal.
pubmedQuery = "SELECT * FROM sifts.pdb_pubmed LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
pubmed = pdbjMineDataset.get_dataset(pubmedQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + pubmedQuery)
pubmed.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_pubmed LIMIT 10
+-----------+-------+---------+
|structureId|ordinal|pubmed_id|
+-----------+-------+---------+
| 100D| 0| 7816639|
| 101D| 0| 7711020|
| 102D| 0| 7608897|
| 102L| 0| 8429913|
| 103D| 0| 7966337|
| 103L| 0| 8429913|
| 104D| 0| 7857947|
| 104L| 0| 8429913|
| 105D| 0| 7743125|
| 106D| 0| 7743125|
+-----------+-------+---------+
In [13]:
# SCOP annotations: request the first 10 rows of sifts.pdb_chain_scop_uniprot.
scopQuery = "SELECT * FROM sifts.pdb_chain_scop_uniprot LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
scop = pdbjMineDataset.get_dataset(scopQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + scopQuery)
scop.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_scop_uniprot LIMIT 10
+-----+-----+----------+-----+-------+----------------+
|pdbid|chain|sp_primary|sunid|scop_id|structureChainId|
+-----+-----+----------+-----+-------+----------------+
| 101M| A| P02185|15125|d101ma_| 101M.A|
| 102L| A| P00720|36724|d102la_| 102L.A|
| 102M| A| P02185|15073|d102ma_| 102M.A|
| 103L| A| P00720|36870|d103la_| 103L.A|
| 103M| A| P02185|15124|d103ma_| 103M.A|
| 104L| A| P00720|36969|d104la_| 104L.A|
| 104L| B| P00720|36970|d104lb_| 104L.B|
| 104M| A| P02185|15041|d104ma_| 104M.A|
| 105M| A| P02185|15128|d105ma_| 105M.A|
| 106M| A| P02185|15101|d106ma_| 106M.A|
+-----+-----+----------+-----+-------+----------------+
In [14]:
# Enzyme (EC number) annotations: request the first 10 rows of sifts.pdb_chain_enzyme.
enzymeQuery = "SELECT * FROM sifts.pdb_chain_enzyme LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
enzyme = pdbjMineDataset.get_dataset(enzymeQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + enzymeQuery)
enzyme.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_enzyme LIMIT 10
+-----+-----+---------+---------+----------------+
|pdbid|chain|accession|ec_number|structureChainId|
+-----+-----+---------+---------+----------------+
| 102L| A| P00720| 3.2.1.17| 102L.A|
| 103L| A| P00720| 3.2.1.17| 103L.A|
| 104L| A| P00720| 3.2.1.17| 104L.A|
| 104L| B| P00720| 3.2.1.17| 104L.B|
| 107L| A| P00720| 3.2.1.17| 107L.A|
| 108L| A| P00720| 3.2.1.17| 108L.A|
| 109L| A| P00720| 3.2.1.17| 109L.A|
| 10GS| A| P09211| 2.5.1.18| 10GS.A|
| 10GS| B| P09211| 2.5.1.18| 10GS.B|
| 10MH| A| P05102| 2.1.1.37| 10MH.A|
+-----+-----+---------+---------+----------------+
In [15]:
# GO annotations: request the first 10 rows of sifts.pdb_chain_go.
goQuery = "SELECT * FROM sifts.pdb_chain_go LIMIT 10"

# Run the SQL through pdbjMineDataset and keep the returned dataset.
go = pdbjMineDataset.get_dataset(goQuery)

# Echo the query, then display the rows.
print("First 10 results for query: " + goQuery)
go.show(10)
First 10 results for query: SELECT * FROM sifts.pdb_chain_go LIMIT 10
+-----+-----+----------+--------------------+--------+----------+----------------+
|pdbid|chain|sp_primary| with_string|evidence| go_id|structureChainId|
+-----+-----+----------+--------------------+--------+----------+----------------+
| 107L| A| P00720| n| IDA|GO:0003796| 107L.A|
| 108L| A| P00720| n| IDA|GO:0003796| 108L.A|
| 109L| A| P00720| n| IDA|GO:0003796| 109L.A|
| 10GS| A| P09211| GO:0005515| IC|GO:0010804| 10GS.A|
| 10GS| A| P09211|UniProtKB:P19712:...| IPI|GO:0005515| 10GS.A|
| 10GS| A| P09211| UniProtKB:P30041| IPI|GO:0005515| 10GS.A|
| 10GS| A| P09211| UniProtKB:P60409| IPI|GO:0005515| 10GS.A|
| 10GS| A| P09211| UniProtKB:P60411| IPI|GO:0005515| 10GS.A|
| 10GS| A| P09211| UniProtKB:Q12933| IPI|GO:0005515| 10GS.A|
| 10GS| A| P09211| UniProtKB:Q15323| IPI|GO:0005515| 10GS.A|
+-----+-----+----------+--------------------+--------+----------+----------------+
In [23]:
# Stop the Spark session that was obtained at the start of the notebook.
spark.stop()