Source code for mmtfPyspark.datasets.swissModelDataset

#!/usr/bin/env python
'''swissModelDataset

This module provides access to SWISS-MODEL datasets containing homology models.

References
----------
- SWISS-MODEL API: https://swissmodel.expasy.org/docs/repository_help#smr_api
- Bienert S, Waterhouse A, de Beer TA, Tauriello G, Studer G, Bordoli L,
  Schwede T (2017). The SWISS-MODEL Repository - new features and
  functionality, Nucleic Acids Res. 45(D1):D313-D319.  https://dx.doi.org/10.1093/nar/gkw1132
- Biasini M, Bienert S, Waterhouse A, Arnold K, Studer G, Schmidt T, Kiefer F,
  Gallo Cassarino T, Bertoni M, Bordoli L, Schwede T (2014). The SWISS-MODEL
  Repository - modelling protein tertiary and quaternary structure using
  evolutionary information, Nucleic Acids Res. 42(W1):W252–W258.  https://doi.org/10.1093/nar/gku340

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

import requests
import tempfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

SWISS_MODEL_REST_URL = "https://swissmodel.expasy.org/repository/uniprot/"
SWISS_MODEL_PROVIDER = ".json?provider=swissmodel"
PDB_PROVIDER = ".json?provider=pdb"


def get_swiss_models(uniProtIds):
    '''Downloads metadata for SWISS-MODEL homology models for a list of
    UniProt Ids and returns it flattened into a row-based schema.

    The raw, hierarchical dataset is fetched with
    get_swiss_models_raw_data() and then reshaped by _flatten_dataset().

    Examples
    --------
    >>> uniProtIds = ["P36575", "P24539", "O00244"]
    >>> ds = swissModelDataset.get_swiss_models(uniProtIds)
    >>> ds.show()
    +------+--------+----+---+-----+----------+----+--------+-----------+--------+--------+--------+----------+-----------+
    |    ac|sequence|from| to|qmean|qmean_norm|gmqe|coverage|oligo-state|  method|template|identity|similarity|coordinates|
    +------+--------+----+---+-----+----------+----+--------+-----------+--------+--------+--------+----------+-----------+
    |P36575|MSKVF...|   2|371|-3.06|0.66345522|0.75|0.953608|    monomer|Homology|1suj.1.A|68.66484|0.50463312|https://...|
    |P24539|MLSRV...|  76|249|-2.51|0.67113881|0.65|0.679687|    monomer|Homology|5ara.1.S|84.48275|0.54788881|https://...|
    |O00244|MPKHE...|   1| 68| 1.04|0.84233218|0.98|     1.0| homo-2-mer|Homology|1fe4.1.A|   100.0|0.60686457|https://...|
    +------+--------+----+---+-----+----------+----+--------+-----------+--------+--------+--------+----------+-----------+

    Parameters
    ----------
    uniProtIds : list
        list of UniProt Ids

    Returns
    -------
    dataset
        SwissModel dataset in the flattened, row-based schema
    '''
    # fetch the hierarchical data first, then flatten it in one step
    return _flatten_dataset(get_swiss_models_raw_data(uniProtIds))
def get_swiss_models_raw_data(uniProtIds):
    '''Downloads the raw metadata for SWISS-MODEL homology models.

    One request is sent per UniProt Id. Every usable response is stored in
    its own temporary file and all of those files are then loaded into a
    single Spark dataset that keeps the original SWISS-MODEL data schema.

    An Id is skipped, with a warning printed to stdout, when the length of
    its response body minus the length of the Id is below 103 bytes.

    Parameters
    ----------
    uniProtIds : list
        list of UniProt Ids

    Returns
    -------
    dataset
        SwissModel dataset in original data schema
    '''
    json_paths = []

    for accession in uniProtIds:
        endpoint = SWISS_MODEL_REST_URL + accession + SWISS_MODEL_PROVIDER
        # NOTE(review): no timeout is passed to requests.get and the HTTP
        # status code is not inspected.
        payload = requests.get(endpoint).content

        # TODO temporary solution for inability to retrieve data: a body
        # this short is treated as "no data available" for the Id
        if len(payload) - len(accession) >= 103:
            # the dataset reader needs files as input, so park the decoded
            # response in a temporary file
            json_paths.append(_save_temp_file(payload.decode("utf-8")))
        else:
            print(f"WARNING: Counld not load data for: {accession}")

    # load temporary JSON files into Spark dataset
    return _read_json_files(json_paths)
def _flatten_dataset(ds):
    '''Flattens the original hierarchical data schema into a simple
    row-based schema. Some less useful data are excluded.

    Parameters
    ----------
    ds : dataset
        the original spark dataset; it must provide the ``query.ac``,
        ``result.sequence``, ``result.md5`` and ``result.structures`` fields

    Returns
    -------
    dataset
        flattened dataset built from the exploded ``result.structures``
    '''
    # replace the nested array by one "structures" value per row
    ds = ds.withColumn("structures", explode(ds.result.structures))

    # NOTE(review): "result.md5" and "structures.md5" are selected without
    # an alias, so the output presumably carries two columns named "md5" --
    # confirm before referring to either of them by name.
    return ds.select(
        col("query.ac"), col("result.sequence"),
        col("structures.from"), col("structures.to"),
        col("structures.qmean"), col("structures.qmean_norm"),
        col("structures.gmqe"), col("structures.coverage"),
        col("structures.oligo-state"), col("structures.method"),
        col("structures.template"), col("structures.identity"),
        col("structures.similarity"), col("structures.coordinates"),
        col("result.md5"), col("structures.md5"),
    )


def _save_temp_file(inputStream):
    '''Saves a downloaded JSON document to a temporary file

    The text is always written as UTF-8, independent of the platform
    default encoding, which matches the UTF-8 decoding applied to the
    response before it is passed in. The file is created with
    ``delete=False`` and is not removed by this function.

    Parameters
    ----------
    inputStream : str
        text content downloaded from swiss model

    Returns
    -------
    str
        path to the tempfile
    '''
    # Write through the temporary file's own handle: the handle is closed
    # deterministically by the context manager and the path is never opened
    # a second time while the first handle is still alive.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                     delete=False) as tempFile:
        # a single write(); writelines() on a str would emit it one
        # character at a time
        tempFile.write(inputStream)

    return tempFile.name


def _read_json_files(paths):
    '''Reads a list of json files to Spark dataset

    Parameters
    ----------
    paths : list
        a list of paths to temporary json files

    Returns
    -------
    dataset
        a spark dataset
    '''
    # reuse the active Spark session if there is one, otherwise create it
    spark = SparkSession.builder.getOrCreate()

    dataset = spark.read.format("json").load(paths)

    return dataset