Source code for mmtfPyspark.webservices.piscesDownloader

#!/usr/bin/env python
'''piscesDownloader.py

This class downloads representative protein chains from the PISCES
CulledPDB sets. A CulledPDB set is selected by specifying
sequenceIdentity and resolution cutoff values from the following
list:

- sequenceIdentity = [20, 25, 30, 40, 50, 60, 70, 80, 90]
- resolution = [1.6, 1.8, 2.0, 2.2, 2.5, 3.0]

References
----------
- PISCES: http://dunbrack.fccc.edu/PISCES.php
- G. Wang and R. L. Dunbrack, Jr. PISCES: a protein sequence culling server. Bioinformatics, 19:1589-1591, 2003.
'''

# Module metadata: authorship, contact, version, and release status strings.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

from urllib.request import urlopen
import gzip


class PiscesDownloader(object):
    '''Downloads representative protein chains from the PISCES CulledPDB sets.

    A CulledPDB set is selected by specifying sequenceIdentity and
    resolution cutoff values from the following lists:

    - sequenceIdentity = 20, 25, 30, 40, 50, 60, 70, 80, 90
    - resolution = 1.6, 1.8, 2.0, 2.2, 2.5, 3.0

    Attributes
    ----------
    sequenceIdentity : int
        sequence identity cutoff, one of SEQ_ID_LIST [0]
    resolution : float
        resolution cutoff, one of RESOLUTION_LIST [0.0]

    Raises
    ------
    ValueError
        If sequenceIdentity or resolution is not one of the listed
        values. The defaults (0 and 0.0) are not in the lists, so both
        arguments must be supplied explicitly.
    '''

    URL = "http://dunbrack.fccc.edu/Guoli/culledpdb_hh"
    SEQ_ID_LIST = [20, 25, 30, 40, 50, 60, 70, 80, 90]
    RESOLUTION_LIST = [1.6, 1.8, 2.0, 2.2, 2.5, 3.0]

    def __init__(self, sequenceIdentity=0, resolution=0.0):
        if sequenceIdentity not in self.SEQ_ID_LIST:
            raise ValueError("Invalid sequenceIdentity")
        if resolution not in self.RESOLUTION_LIST:
            raise ValueError("Invalid resolution value")

        # Normalise the numeric types: the values are later rendered with
        # str() to build the file-name key, and 2 -> "2" or 20.0 -> "20.0"
        # would not match the intended "res2.0" / "pc20" file names.
        self.sequenceIdentity = int(sequenceIdentity)
        self.resolution = float(resolution)

    def get_structure_chain_ids(self):
        '''Download the selected CulledPDB set and return its chain ids.

        Returns
        -------
        list of str
            Identifiers of the form "<4-character PDB id>.<chain id>",
            e.g. "1ABC.A".

        Raises
        ------
        OSError
            If no matching file is listed on the server, or the download
            or decompression fails.
        '''
        fileName = self._get_file_name()
        if not fileName:
            # Without this guard the index page itself would be fetched
            # and fail later with an unrelated gzip error.
            raise OSError(
                "No PISCES CulledPDB file found for sequenceIdentity={} "
                "and resolution={}".format(self.sequenceIdentity,
                                           self.resolution))

        fileURL = self.URL + '/' + fileName
        with urlopen(fileURL) as response:
            with gzip.GzipFile(fileobj=response) as archive:
                data = archive.read()
        return self._parse_chain_ids(data)

    def _get_file_name(self):
        '''Look up the file name of the selected set in the server index.

        Returns
        -------
        str
            The matching file name, or "" if none is listed.
        '''
        key = "pc" + str(self.sequenceIdentity) + "_res" + str(self.resolution)
        with urlopen(self.URL) as listing:
            return self._match_file_name(listing, key)

    @staticmethod
    def _parse_chain_ids(data):
        '''Extract "pdbId.chainId" strings from decompressed file content.

        Parameters
        ----------
        data : bytes
            Raw (already decompressed) content of a CulledPDB list file.

        Returns
        -------
        list of str
            One entry per data row, built from the first five characters
            of the row's first whitespace-separated field.
        '''
        text = data.decode('utf-8', errors='replace')
        chainIds = []
        for row in text.splitlines():
            fields = row.split()
            if len(fields) < 2:
                continue  # blank or single-token line, not a data row
            token = fields[0]
            # Classic 4-character PDB ids begin with a digit; this also
            # drops a column-header row, if present, and tokens too short
            # to hold an id plus a chain character.
            if len(token) < 5 or not token[0].isdigit():
                continue
            chainIds.append(token[:4] + '.' + token[4])
        return chainIds

    @staticmethod
    def _match_file_name(lines, key):
        '''Find the first non-FASTA file whose name contains ``key``.

        Parameters
        ----------
        lines : iterable of bytes
            Lines of the server's index page.
        key : str
            Substring identifying the wanted set, e.g. "pc20_res1.6".

        Returns
        -------
        str
            The matching file name, or "" if the listing ends or an
            entry starting with "log" is reached before a match.
        '''
        for raw in lines:
            parts = raw.decode('utf-8', errors='replace').split('"')
            if len(parts) < 2:
                continue  # no quoted link target on this line
            # The first quoted string on the line is taken as the link
            # target; only its last path component is the file name.
            name = parts[1].split('/')[-1]
            if name[:3] == 'log':
                break
            if key in name and "fasta" not in name:
                return name
        return ""