Source code for mmtfPyspark.webfilters.blastCluster

#!/usr/bin/env python
'''blastCluster.py

This filter passes through representative structures from the RCSB PDB
BlastClust clusters. A sequence identity threshold needs to be specified.
The representative for each cluster is the first chain in a cluster.

References
----------
BlastClust cluster field names:
`Field names <http://www.rcsb.org/pdb/statistics/clusterStatistics.do>`_

Examples
--------
Find representative PDB entries at 90% sequence identity:

>>> sequenceIdentity = 90
>>> pdb = pdb.filter(BlastCluster(sequenceIdentity))

'''
# Module metadata: authorship, contact address, version and release status.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

import urllib.request


class BlastCluster(object):
    '''Filter that keeps entries whose ID is listed in a BlastClust file.

    An instance is a callable predicate: it downloads the RCSB PDB
    BlastClust file for the requested sequence identity once, at
    construction time, and afterwards answers membership queries against
    the IDs found in that file.

    Parameters
    ----------
    sequenceIdentity : int
       sequence identity threshold for blast; must be one of
       30, 40, 50, 70, 90, 95 or 100

    Attributes
    ----------
    pdbIds : set of str
       every chain ID read from the cluster file (``"1ABC.A"`` form) plus
       the first four characters of each chain ID (the PDB entry ID)

    Raises
    ------
    ValueError
       if no cluster file exists for the given sequence identity
    '''

    def __init__(self, sequenceIdentity):
        clusters = self.get_blast_cluster(sequenceIdentity)

        # Store both the chain-level ID and its 4-character entry prefix so
        # the filter matches keys at either granularity.
        self.pdbIds = set(clusters)
        self.pdbIds.update(chainId[:4] for chainId in clusters)

    def __call__(self, t):
        '''Return True if the first element of `t` is a known ID.

        Parameters
        ----------
        t : sequence
           item to test; only ``t[0]`` is inspected (presumably a
           ``(structureId, structure)`` pair -- confirm against callers)

        Returns
        -------
        bool
           whether ``t[0]`` is contained in ``self.pdbIds``
        '''
        return t[0] in self.pdbIds

    def get_blast_cluster(self, sequenceIdentity):
        '''Download and parse the BlastClust file for a sequence identity.

        Parameters
        ----------
        sequenceIdentity : int
           one of 30, 40, 50, 70, 90, 95 or 100

        Returns
        -------
        list of str
           all chain IDs in the file, in file order, with ``_`` replaced
           by ``.`` (e.g. ``"1ABC_A"`` becomes ``"1ABC.A"``)

        Raises
        ------
        ValueError
           if `sequenceIdentity` is not one of the supported values
        '''
        # NOTE(review): the module docstring says the representative of a
        # cluster is its first chain, but every chain on every line is
        # returned here. Confirm which behaviour is intended before changing.
        if sequenceIdentity not in (30, 40, 50, 70, 90, 95, 100):
            raise ValueError(
                f"Error: representative chains are not available for "
                f"sequence Identity {sequenceIdentity}.\n Must be in "
                f"range [30,40,50,70,90,95,100]")

        coreUrl = "https://cdn.rcsb.org/sequence/clusters/"

        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(
                f"{coreUrl}bc-{sequenceIdentity}.out") as inputStream:
            return self._parse_cluster_lines(inputStream)

    @staticmethod
    def _parse_cluster_lines(lines):
        '''Convert raw cluster-file lines into a flat list of chain IDs.'''
        chainIds = []
        for line in lines:
            # Decode properly instead of slicing the repr() of a bytes object,
            # which corrupted IDs ending in "n" and unterminated last lines.
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            # split() without arguments discards the line terminator and any
            # blank fields, so no empty IDs are produced.
            chainIds.extend(line.replace("_", ".").split())
        return chainIds