Source code for mmtfPyspark.mappers.structureToPolymerChains

#!/user/bin/env python
'''structureToPolymerChain.py:

Maps a structure to its individual polymer chains. Polymer chains
include polypeptides, polynucleotides, and linear and branched polysaccharides.
For a multi-model structure, only the first model is considered.

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "debug"
from mmtf.utils import *
from mmtf.api.mmtf_writer import MMTFEncoder
from mmtfPyspark.utils import MmtfStructure

[docs]class StructureToPolymerChains(object): '''Extracts all polymer chains from a structure. If the argument is set to true, the assigned key is: <PDB ID.Chain ID>, where Chain ID is the unique identifier assigned to each molecular entity in an mmCIF file. This Chain ID corresponds to `_atom_size.label_asym_id <http://mmcif.wwpdb.org/dictionaries/mmcif_mdb.dic/Items/_atom_site.label_asym_id.html>`_ field in an mmCIF file. Attributes ---------- useChainIdInsteadOfChainName : bool if true, use Chain Id in the key assignments excludeDuplicates : bool if true return only one chain for each sequence ''' def __init__(self, useChainIdInsteadOfChainName = False, excludeDuplicates = False): self.useChainIdInsteadOfChainName = useChainIdInsteadOfChainName self.excludeDuplicates = excludeDuplicates def __call__(self, t): if type(t[1]) == MmtfStructure and t[1].alt_loc_set == False: structure = t[1].set_alt_loc_list() else: structure = t[1] # Precalculate indices numChains = structure.chains_per_model[0] chainToEntityIndex = self._get_chain_to_entity_index(structure) atomsPerChain, bondsPerChain = self._get_num_atoms_and_bonds(structure) chainList = list() seqSet = set() groupCounter = 0 atomCounter = 0 for i in range(numChains): polymerChain = MMTFEncoder() entityToChainIndex = chainToEntityIndex[i] chain_type = structure.entity_list[entityToChainIndex]['type'] polymer = chain_type == "polymer" polymerAtomCount = 0 atomMap = {} structureId = '' if polymer: # To avoid of information loss, add chainName/IDs and entity id # This required by some queries structureId = structure.structure_id + '.' +\ structure.chain_name_list[i] + '.' +\ structure.chain_id_list[i] + '.' +\ str(entityToChainIndex + 1) # Set header polymerChain.init_structure(bondsPerChain[i], atomsPerChain[i], structure.groups_per_chain[i], 1, 1, structureId) decoder_utils.add_xtalographic_info(structure, polymerChain) decoder_utils.add_header_info(structure, polymerChain) # Set model info (only one model: 0) polymerChain.set_model_info(0,1) # Set entity and chain info polymerChain.set_entity_info([0], structure.entity_list[entityToChainIndex]['sequence'], structure.entity_list[entityToChainIndex]['description'], structure.entity_list[entityToChainIndex]['type']) polymerChain.set_chain_info(structure.chain_id_list[i], structure.chain_name_list[i], structure.groups_per_chain[i]) for j in range(structure.groups_per_chain[i]): groupIndex = structure.group_type_list[groupCounter] if polymer: # Set group info polymerChain.set_group_info(structure.group_list[groupIndex]['groupName'], structure.group_id_list[groupCounter], structure.ins_code_list[groupCounter], structure.group_list[groupIndex]['chemCompType'], len(structure.group_list[groupIndex]['atomNameList']), len(structure.group_list[groupIndex]['bondOrderList']), structure.group_list[groupIndex]['singleLetterCode'], structure.sequence_index_list[groupCounter], structure.sec_struct_list[groupCounter]) for k in range(len(structure.group_list[groupIndex]['atomNameList'])): if polymer: atomMap[atomCounter] = polymerAtomCount polymerAtomCount += 1 polymerChain.set_atom_info( structure.group_list[groupIndex]['atomNameList'][k], structure.atom_id_list[atomCounter], structure.alt_loc_list[atomCounter], structure.x_coord_list[atomCounter], structure.y_coord_list[atomCounter], structure.z_coord_list[atomCounter], structure.occupancy_list[atomCounter], structure.b_factor_list[atomCounter], structure.group_list[groupIndex]['elementList'][k], structure.group_list[groupIndex]['formalChargeList'][k],) atomCounter += 1 if polymer: # Add intra-group bond info for l in range(len(structure.group_list[groupIndex]['bondOrderList'])): bondIndOne = structure.group_list[groupIndex]['bondAtomList'][l*2] bondIndTwo = structure.group_list[groupIndex]['bondAtomList'][l*2+1] bondOrder = structure.group_list[groupIndex]['bondOrderList'][l] polymerChain.set_group_bond(bondIndOne, bondIndTwo, bondOrder) groupCounter += 1 if polymer: # TODO skipping adding inter group bond info for now polymerChain.finalize_structure() chId = structure.chain_name_list[i] if self.useChainIdInsteadOfChainName : chId = structure.chain_id_list[i] if self.excludeDuplicates: if chainToEntityIndex[i] in seqSet: continue seqSet.add(chainToEntityIndex[i]) chainList.append((structure.structure_id + "." + chId, polymerChain)) return chainList def _get_num_atoms_and_bonds(self, structure): '''Gets the number of atoms and bonds per chain ''' numChains = structure.chains_per_model[0] atomsPerChain = [0] * numChains bondsPerChain = [0] * numChains groupCounter = 0 for i in range(numChains): for j in range(structure.groups_per_chain[i]): groupIndex = structure.group_type_list[groupCounter] atomsPerChain[i] += len(structure.group_list[groupIndex]['atomNameList']) bondsPerChain[i] += len(structure.group_list[groupIndex]['bondOrderList']) groupCounter += 1 return atomsPerChain, bondsPerChain def _get_chain_to_entity_index(self, structure): '''Returns an list that maps a chain index to an entity index. Parameters ---------- structure : structureDataInterFace ''' entityChainIndex = [0] * structure.num_chains for i in range(len(structure.entity_list)): for j in structure.entity_list[i]["chainIndexList"]: entityChainIndex[j] = i return entityChainIndex