Source code for mmtfPyspark.mappers.structureToPolymerSequences
#!/usr/bin/env python
'''structureToPolymerSequences.py:
This mapper maps a structure to its polypeptide and polynucleotide chain sequences.
For a multi-model structure, only the first model is considered.
'''
# Module metadata.
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
class StructureToPolymerSequences(object):
    '''Map a structure to its polypeptide and polynucleotide chain sequences.

    Instances are callable: given a ``(key, structure)`` pair they return a
    list of ``(chain key, sequence)`` pairs, one per polymer chain.
    For a multi-model structure, only the first model is considered.
    '''

    def __init__(self, useChainIdInsteadOfChainName=False, excludeDuplicates=False):
        '''Configure how polymer chains are extracted from a structure.

        If ``useChainIdInsteadOfChainName`` is true, the assigned key is
        <PDB ID.Chain ID>, where Chain ID is the unique identifier assigned
        to each molecular entity in an mmCIF file. This Chain ID corresponds to the
        `_atom_site.label_asym_id <http://mmcif.wwpdb.org/dictionaries/mmcif_mdb.dic/Items/_atom_site.label_asym_id.html>`_
        field in an mmCIF file. Otherwise the chain name is used in the key.

        Parameters
        ----------
        useChainIdInsteadOfChainName : bool
            if true, use the Chain Id in the key assignments
        excludeDuplicates : bool
            if true, return only the first chain of each polymer entity
            (all chains of one entity share that entity's sequence)
        '''
        self.useChainIdInsteadOfChainName = useChainIdInsteadOfChainName
        self.excludeDuplicates = excludeDuplicates

    def __call__(self, t):
        '''Extract the polymer chain sequences of the first model.

        Parameters
        ----------
        t : tuple
            ``(key, structure)`` pair. ``key`` is a string; anything from its
            first '.' onwards is discarded. ``structure`` must expose
            ``chains_per_model``, ``num_chains``, ``entity_list``,
            ``chain_id_list`` and ``chain_name_list``.

        Returns
        -------
        list
            ``(key + '.' + chain label, sequence)`` tuples, in chain order.
            Chains that are not assigned to any entity are skipped.
        '''
        structure = t[1]

        # A structure without any model has no chains to report.
        if len(structure.chains_per_model) == 0:
            return []

        chain_to_entity = self._get_chain_to_entity_index(structure)

        if self.useChainIdInsteadOfChainName:
            chain_labels = structure.chain_id_list
        else:
            chain_labels = structure.chain_name_list

        # Only the part of the incoming key before the first '.' is kept, so
        # a key that already carries a suffix is not extended a second time.
        prefix = t[0].split('.')[0] + '.'

        seen_entities = set()
        sequences = []
        # Only the chains of the first model are visited.
        for i in range(structure.chains_per_model[0]):
            entity_index = chain_to_entity[i]
            if entity_index is None:
                # The chain is not listed by any entity, so its type and
                # sequence are unknown; do not guess.
                continue

            entity = structure.entity_list[entity_index]
            if entity['type'] != 'polymer':
                continue

            if self.excludeDuplicates:
                if entity_index in seen_entities:
                    continue
                seen_entities.add(entity_index)

            sequences.append((prefix + chain_labels[i], entity['sequence']))
        return sequences

    def _get_chain_to_entity_index(self, structure):
        '''Build a lookup table from chain index to entity index.

        Parameters
        ----------
        structure : object
            structure exposing ``num_chains`` and ``entity_list``, where each
            entity carries a ``'chainIndexList'`` of chain indices

        Returns
        -------
        list
            list of length ``structure.num_chains``; element ``j`` is the
            index of the entity that lists chain ``j``, or None if no entity
            lists it. If several entities list the same chain, the last wins.
        '''
        chain_to_entity = [None] * structure.num_chains
        for entity_index, entity in enumerate(structure.entity_list):
            for chain_index in entity['chainIndexList']:
                chain_to_entity[chain_index] = entity_index
        return chain_to_entity