Keyword Search Demo

pdbj

pdbj

This demo runs a PDBj Mine 2 RDB keyword search query and filters MMTF structures by pdbid. The filter searches the ‘keyword’ column in the brief_summary table for a keyword and returns a couple of columns for the matching entries.

PDBj Mine Search Website

Imports

In [17]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.io import mmtfReader

Configure Spark Context

In [18]:
# Build the Spark configuration: master "local[*]" and application name
# "keywordSearch", then create the SparkContext used by the cells below.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("keywordSearch")
)
sc = SparkContext(conf=conf)

Read in MMTF files from local directory

In [20]:
# Location of the MMTF sample data, given relative to the working directory.
# NOTE(review): the directory name suggests a "reduced" MMTF sample set — confirm.
path = "../../resources/mmtf_reduced_sample/"

# Load the data at `path` through the Spark context `sc`. The result is later
# used with .filter() and .keys(), so it is presumably a key/value RDD keyed
# by PDB ID — verify against mmtfReader.
pdb = mmtfReader.read_sequence_file(path, sc)

Apply a SQL search on PDBj using a filter

In [21]:
# SQL query handed to the PdbjMineSearch filter; it selects the pdbid column
# from a keyword search for 'porin'.
sql = "select pdbid from keyword_search('porin')"

# Keep only the entries accepted by the PdbjMineSearch filter.
pdb = pdb.filter(PdbjMineSearch(sql))

# Collect the matching keys once and derive the count from that list.
# Calling pdb.count() in addition to collect() would launch a second Spark
# job that re-reads and re-filters the (uncached) input.
matching_ids = pdb.keys().collect()
print(matching_ids)
print("\n")
print(f"Number of entries matching query: {len(matching_ids)}")
['3M8B', '3M8D', '4AUI', '4PR7', '4RJW', '4RJX', '4RLC', '2L26', '1R1M', '3UPG', '3UU2', '1BT9', '2WVP', '2X9K', '3VY8', '3VY9', '3VZT', '3VZU', '3VZW', '2O4V', '2ODJ', '2OMF', '1H6S', '3SY7', '3SY9', '3SYB', '3SYS', '3SZD', '3SZV', '3T0S', '3T20', '3T24', '3JBU', '4JML', '4K34', '4K7K', '4K7R', '1NQE', '1NQG', '1NQH', '4MJT', '4MKO', '5NIK', '5NIL', '1E54', '3FCG', '5O8O', '2D57', '2GUF', '1A0S', '1A0T', '1AF6', '5DL6', '5DL7', '5DL8', '3JTY', '3K19', '3K1B', '3FIP', '3FMO', '3FMP', '3FYX', '4D65', '2KNS', '2KS4', '2KSM', '1GFM', '1GFN', '1GFO', '1GFP', '1GFQ', '2K0L', '2YSU', '3NB3', '5LDT', '5LDV', '2VDA', '2VDD', '2VDE', '2VQG', '2VQH', '2VQI', '2VQK', '2VQL', '2MLH', '3TZG', '2GSK', '5U1H', '4BUM', '2BR3', '2BR4', '2BR5', '2BRR', '2XE1', '2XE2', '2XE3', '2XE5', '2XET', '2XG6', '2XMN', '2PV1', '2PV2', '2PV3', '3PGR', '3PGS', '3PGU', '3PIK', '3POQ', '3POR', '3POU', '3POX', '3PRN', '4MKQ', '2ZZ9', '3A2S', '3L48', '2BM8', '2BM9', '3EMN', '2V9U', '2LBT', '2LCA', '2WJQ', '2WJR', '2WMZ', '4LSE', '4LSF', '4LSH', '4LSI', '3RFZ', '3RGM', '3RGN', '1UJW', '1UUN', '1UWX', '3NJT', '3NSG', '1PRN', '3WI4', '3WI5', '4QLP', '5JDP', '1P4T', '1PHO', '1G90', '4D5U', '4D64', '2ZFG', '2ZLD', '1TLW', '1TLY', '1TLZ', '1TQQ', '2GE4', '3BWU', '3C02', '1HXT', '1HXU', '1HXX', '3PF1', '1BH3', '1M5Y', '1MAL', '2KT6', '1MPM', '4FMS', '4FOZ', '4FQE', '4FRT', '4FRX', '4FSO', '4FSP', '4FT6', '4FUV', '2FGQ', '2FGR', '2IWV', '2IWW', '1MPF', '1MPN', '1MPO', '1MPQ', '1MPR', '2ZLE', '5NG5', '5V5S', '3O0E', '4KR4', '4KR8', '4KRA', '2POR', '2MAF', '2J1N', '2J4U', '5JIR', '1T16', '1T1L', '1OH2', '1OPF', '1OSM', '4J3O', '4QKY', '4QL0', '4CTD', '4C69', '2LQV', '3DWN', '3IYZ', '2MPR', '2JK4', '2JMM', '1QJP', '1QKZ', '1BXW', '3TD3', '3TD4', '3TD5', '4B0E', '4B0M', '3HW9', '3HWB', '1NQF', '3OHN', '2PRN', '2F1C', '2MPA', '2QTK', '2R4L', '2R4N', '2R4O', '2R4P', '2R88', '2R89', '2R8A', '4A8D', '2K4T', '5FVN', '1EK9', '2KGS', '2KGW', '2Y0H', '2Y0K', '2Y0L', '2Y2X', '5O66', '6PRN', '7PRN', '8PRN', '2JQY', 
'4JFB', '5M2Q', '4G4Y', '4G4Z', '4G88', '4GCP', '4GCQ', '4GCS', '4GEY', '4GF4', '4GHB', '1ZDV', '1ZDX', '1ZE3', '5XDN', '5XDO', '5PRN']
Number of entries matching query: 266

Apply a SQL search on PDBj and get a dataset

In [14]:
# Same keyword search as before, but selecting several columns and ordering
# the hits by descending hit_score. The literal is split across two lines
# purely for readability; the resulting string is unchanged.
sql = (
    "select pdbid, resolution, biol_species, db_uniprot, db_pfam, hit_score "
    "from keyword_search('porin') order by hit_score desc"
)

# Run the query through pdbjMineDataset and display the first 10 rows.
dataset = pdbjMineDataset.get_dataset(sql)
dataset.show(10)
+-----------+----------+--------------------+--------------------+-----------+---------+
|structureId|resolution|        biol_species|          db_uniprot|    db_pfam|hit_score|
+-----------+----------+--------------------+--------------------+-----------+---------+
|       3POR|       2.5|Rhodobacter capsu...|['P31243', 'PORI_...|['PF13609']| 0.095809|
|       2OMF|       2.4|Escherichia coli K12|['OMPF_ECOLI', 'P...|['PF00267']|0.0954989|
|       2POR|       1.8|Rhodobacter capsu...|['P31243', 'PORI_...|['PF13609']|0.0951392|
|       1GFP|       2.7|    Escherichia coli|['OMPF_ECOLI', 'P...|['PF00267']| 0.094717|
|       1GFQ|       2.8|    Escherichia coli|['OMPF_ECOLI', 'P...|['PF00267']| 0.094717|
|       1GFM|       3.5|    Escherichia coli|['OMPF_ECOLI', 'P...|['PF00267']| 0.094717|
|       1GFN|       3.1|    Escherichia coli|['OMPF_ECOLI', 'P...|         []| 0.094717|
|       1GFO|       3.3|    Escherichia coli|['OMPF_ECOLI', 'P...|['PF00267']| 0.094717|
|       1BT9|       3.0|    Escherichia coli|['OMPF_ECOLI', 'P...|['PF00267']| 0.094717|
|       1H6S|       3.0|RHODOPSEUDOMONAS ...|['P39767', 'PORI_...|['PF13609']| 0.094717|
+-----------+----------+--------------------+--------------------+-----------+---------+
only showing top 10 rows

Terminate Spark Context

In [15]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()