Source code for mmtfPyspark.mappers.structureToBioassembly

#!/user/bin/env python
'''structureToPolymerChain.py:

Maps a structure to its individual polymer chains. Polymer chains
include polypeptides, polynucleotides, and linear and branched polysaccharides.
For a multi-model structure, only the first model is considered.

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "debug"
from mmtf.utils import *
from mmtf.api.mmtf_writer import MMTFEncoder
from numpy import *


[docs]class StructureToBioassembly(object): '''Extracts all polymer chains from a structure. If the argument is set to true, the assigned key is: <PDB ID.Chain ID>, where Chain ID is the unique identifier assigned to each molecular entity in an mmCIF file. This Chain ID corresponds to `_atom_size.label_asym_id <http://mmcif.wwpdb.org/dictionaries/mmcif_mdb.dic/Items/_atom_site.label_asym_id.html>`_ field in an mmCIF file. Attributes ---------- useChainIdInsteadOfChainName : bool if true, use Chain Id in the key assignments excludeDuplicates : bool if true return only one chain for each sequence ''' def __init__(self, useChainIdInsteadOfChainName=False, excludeDuplicates=False): self.useChainIdInsteadOfChainName = useChainIdInsteadOfChainName self.excludeDuplicates = excludeDuplicates def __call__(self, t): structure = t[1] if not structure.alt_loc_set: structure = structure.set_alt_loc_list() # print(structure.group_type_list) bioassemblies = structure.bio_assembly numBioassembly = len(bioassemblies) resList = list() for i in range(numBioassembly): bioAssembly = MMTFEncoder() structureId = structure.structure_id + '-BioAssembly' + \ bioassemblies[i]['name'] totAtoms = 0 totBonds = 0 totGroups = 0 totChains = 0 totModels = structure.num_models numTrans = len(bioassemblies[i]['transformList']) bioChainList = [[]] * numTrans transMatrix = [[]] * numTrans for ii in range(numTrans): bioChainList[ii] = bioassemblies[i]['transformList'][ii]['chainIndexList'] transMatrix[ii] = bioassemblies[i]['transformList'][ii]['matrix'] for j in range(totModels): totChains = totChains + len(bioChainList[ii]) groupCounter = 0 for k in range(structure.chains_per_model[j]): adding = False for currChain in bioChainList[ii]: if currChain == k: adding = True if adding: totGroups = totGroups + \ structure.groups_per_chain[k] for h in range(structure.groups_per_chain[k]): if adding: groupIndex = structure.group_type_list[groupCounter] totAtoms = totAtoms + \ len(structure.group_list[groupIndex] ['atomNameList']) totBonds = totBonds + \ len(structure.group_list[groupIndex] ['bondOrderList']) groupCounter = groupCounter + 1 # Set header bioAssembly.init_structure( totBonds, totAtoms, totGroups, totChains, totModels, structureId) decoder_utils.add_xtalographic_info(structure, bioAssembly) decoder_utils.add_header_info(structure, bioAssembly) modelIndex = 0 chainIndex = 0 groupIndex = 0 atomIndex = 0 chainCounter = 0 description = {} for ii in range(totModels): numChainsPerModel = structure.chains_per_model[modelIndex] * numTrans bioAssembly.set_model_info(modelIndex, numChainsPerModel) chainToEntityIndex = self._getChainToEntityIndex(structure) for j in range(structure.chains_per_model[modelIndex]): currGroupIndex = groupIndex currAtomIndex = atomIndex for k in range(numTrans): currChainList = bioChainList[k] currMatrix = transMatrix[k] addThisChain = False for currChain in currChainList: if currChain == j: addThisChain = True groupIndex = currGroupIndex atomIndex = currAtomIndex xCoords = structure.x_coord_list yCoords = structure.y_coord_list zCoords = structure.z_coord_list m = reshape(matrix(currMatrix), (4, 4)) if addThisChain: entityToChainIndex = chainToEntityIndex[chainIndex] if structure.entity_list[entityToChainIndex]['description'] in description: index = description[structure.entity_list[entityToChainIndex]['description']] bioAssembly.entity_list[index]['chainIndexList'].append(chainCounter) else: bioAssembly.set_entity_info([chainCounter], structure.entity_list[entityToChainIndex]['sequence'], structure.entity_list[entityToChainIndex]['description'], structure.entity_list[entityToChainIndex]['type']) description[bioAssembly.entity_list[-1]['description']] = len(bioAssembly.entity_list)-1 bioAssembly.set_chain_info(structure.chain_id_list[chainIndex], structure.chain_name_list[chainIndex], structure.groups_per_chain[chainIndex]) chainCounter = chainCounter + 1 for jj in range(structure.groups_per_chain[chainIndex]): # print(structure.group_type_list) currgroup = structure.group_type_list[groupIndex] # if ii == 0 and j == 0 and jj < 10: # print(currgroup) if addThisChain: bioAssembly.set_group_info(structure.group_list[currgroup]['groupName'], structure.group_id_list[groupIndex], structure.ins_code_list[groupIndex], structure.group_list[currgroup]['chemCompType'], len( structure.group_list[currgroup]['atomNameList']), len( structure.group_list[currgroup]['bondOrderList']), structure.group_list[currgroup]['singleLetterCode'], structure.sequence_index_list[groupIndex], structure.sec_struct_list[groupIndex]) for kk in range(len(structure.group_list[currgroup]['atomNameList'])): if addThisChain: p1 = array( [xCoords[atomIndex], yCoords[atomIndex], zCoords[atomIndex], 1]) p2 = matmul(p1, m) bioAssembly.set_atom_info( structure.group_list[currgroup]['atomNameList'][kk], structure.atom_id_list[atomIndex], structure.alt_loc_list[atomIndex], p2.item(0), p2.item(1), p2.item(2), structure.occupancy_list[atomIndex], structure.b_factor_list[atomIndex], structure.group_list[currgroup]['elementList'][kk], structure.group_list[currgroup]['formalChargeList'][kk], ) atomIndex = atomIndex + 1 # bond not implemented if addThisChain: for l in range(len(structure.group_list[currgroup]['bondOrderList'])): bondIndOne = structure.group_list[currgroup]['bondAtomList'][l * 2] bondIndTwo = structure.group_list[currgroup]['bondAtomList'][l * 2 + 1] bondOrder = structure.group_list[currgroup]['bondOrderList'][l] #newChain.set_group_bond(bondIndOne, bondIndTwo, bondOrder) bioAssembly.current_group.bond_atom_list += [ bondIndOne, bondIndTwo] bioAssembly.current_group.bond_order_list.append( bondOrder) groupIndex = groupIndex + 1 chainIndex = chainIndex + 1 modelIndex = modelIndex + 1 # print(type(currMatrix)) bioAssembly.finalize_structure() resList.append((structureId, bioAssembly)) return resList def _getNumAtomsAndBonds(self, structure): '''Gets the number of atoms and bonds per chain ''' numChains = structure.chains_per_model[0] atomsPerChain = [0] * numChains bondsPerChain = [0] * numChains groupCounter = 0 for i in range(numChains): for j in range(structure.groups_per_chain[i]): groupIndex = structure.group_type_list[groupCounter] atomsPerChain[i] = len( structure.group_list[groupIndex]['atomNameList']) bondsPerChain[i] = len( structure.group_list[groupIndex]['bondOrderList']) groupCounter += 1 return atomsPerChain, bondsPerChain def _getChainToEntityIndex(self, structure): '''Returns an list that maps a chain index to an entity index. Parameters ---------- structure : structureDataInterFace ''' entityChainIndex = [0] * structure.num_chains for i in range(len(structure.entity_list)): for j in structure.entity_list[i]["chainIndexList"]: entityChainIndex[j] = i return entityChainIndex