Source code for mmtfPyspark.webfilters.blastCluster

#!/usr/bin/env python
'''blastCluster.py

This filter passes through representative structures from the RCSB PDB
BlastClust clusters. A sequence identity threshold needs to be specified.
The representative for each cluster is the first chain in a cluster.

References
----------
BlastClust cluster field names:
`Field names <http://www.rcsb.org/pdb/statistics/clusterStatistics.do>`_

Examples
--------
Find representative PDB entries at 90% sequence identity:
>>> sequenceIdentity = 90
>>> pdb = pdb.filter(BlastCluster(sequenceIdentity))

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

import urllib.request


class BlastCluster(object):
    '''Filter that keeps the representative chains of BlastClust clusters.

    On construction the cluster file for the requested sequence identity is
    downloaded and the first chain of every cluster (its representative) is
    recorded, both as a chain id ("1ABC.A") and as a bare four-character
    entry id ("1ABC"). Calling the instance on a tuple tests whether the
    tuple's first element is one of the recorded ids.

    Parameters
    ----------
    sequenceIdentity : int
        sequence identity (percent) of the BlastClust clustering; must be one
        of 30, 40, 50, 70, 90, 95 or 100

    Attributes
    ----------
    pdbIds : set
        ids of the representative chains and of the entries they belong to

    Raises
    ------
    ValueError
        if no cluster file exists for the requested sequence identity
    '''

    # Sequence identity levels for which a cluster file is requested.
    _VALID_IDENTITIES = (30, 40, 50, 70, 90, 95, 100)

    # Base location of the "bc-<identity>.out" cluster files.
    _CORE_URL = "https://cdn.rcsb.org/sequence/clusters/"

    def __init__(self, sequenceIdentity):
        clusters = self._fetch_clusters(sequenceIdentity)
        self.pdbIds = self._representative_ids(clusters)

    def __call__(self, t):
        '''Return True if the first element of `t` is a representative id.

        Parameters
        ----------
        t : tuple
            record whose first element is the structure id to look up
            (presumably a (key, structure) pair -- confirm against callers)
        '''
        return t[0] in self.pdbIds

    def get_blast_cluster(self, sequenceIdentity):
        '''Download the cluster file and return all chain ids as a flat list.

        Parameters
        ----------
        sequenceIdentity : int
            sequence identity (percent) of the BlastClust clustering

        Returns
        -------
        list
            every chain id in the file, in file order, written as
            "PDBID.CHAIN"

        Raises
        ------
        ValueError
            if `sequenceIdentity` is not one of the supported levels
        '''
        clusters = self._fetch_clusters(sequenceIdentity)
        return [chain for cluster in clusters for chain in cluster]

    def _fetch_clusters(self, sequenceIdentity):
        '''Validate the identity level and return one list of ids per cluster.'''
        if sequenceIdentity not in self._VALID_IDENTITIES:
            raise ValueError(
                "Error: representative chains are not available for "
                f"sequence identity {sequenceIdentity}. Must be one of "
                f"{list(self._VALID_IDENTITIES)}")

        url = f"{self._CORE_URL}bc-{sequenceIdentity}.out"

        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(url) as inputStream:
            parsed = [self._parse_cluster_line(line) for line in inputStream]

        # Blank lines produce empty clusters; they carry no representative.
        return [cluster for cluster in parsed if cluster]

    @staticmethod
    def _parse_cluster_line(raw):
        '''Turn one raw line of the cluster file into a list of chain ids.'''
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw

        # Ids are stored as "1ABC_A"; the filter uses "1ABC.A". Splitting on
        # any whitespace also discards the line terminator and empty tokens.
        return text.replace("_", ".").split()

    @staticmethod
    def _representative_ids(clusters):
        '''Collect the first chain of each cluster plus its 4-char entry id.'''
        ids = set()
        for cluster in clusters:
            if not cluster:
                continue
            representative = cluster[0]
            ids.add(representative)
            ids.add(representative[:4])
        return ids