Source code for mmtfPyspark.utils.columnarStructure
#!/user/bin/env python
'''columnarStructure.py
Provides efficient access to structure information in the form of atom-based arrays.
Data are lazily initialized as needs.
'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"
import numpy as np
[docs]class ColumnarStructure(object):
'''Column based data structure to efficiently access structure information
Attributes
----------
structure : mmtfStructure
mmtf structure
firstModelOnly : bool
use only the first model in a structure if True
'''
def __init__(self, structure, firstModelOnly=True):
# Set class variables
self.numAtoms = 0
self.numChains = 0
self.numGroups = 0
self.numModels = 0
self.atomNames = None
self.atomToChainIndices = None
self.atomToGroupIndices = None
self.chainIds = None
self.chainNames = None
self.chemCompType = None
self.elements = None
self.entityTypes = None
self.entityChainIndex = None
self.entityIndices = None
self.groupNames = None
self.groupNumbers = None
self.groupToAtomIndices = None
self.polymer = None
self.sequencePositions = None
self.structure = structure
self.groupToAtomIndices = None
self.chainToAtomIndices = None
self.chainToGroupIndices = None
if firstModelOnly:
self.numModels = 1
else:
self.numModels = structure.num_models
[docs] def get_group_to_atom_indices(self):
if self.groupToAtomIndices is None:
self.get_indices()
return self.groupToAtomIndices
[docs] def get_chain_to_atom_indices(self):
if self.chainToAtomIndices is None:
self.get_indices()
return self.chainToAtomIndices
[docs] def get_chain_to_group_indices(self):
if self.chainToGroupIndices is None:
self.get_indices
return self.chainToGroupIndices
[docs] def get_num_atoms(self):
self.get_indices()
return self.numAtoms
[docs] def get_num_groups(self):
self.get_indices()
return self.numGroups
[docs] def get_num_chains(self):
self.get_indices()
return self.numChains
[docs] def get_num_models(self):
self.get_indices()
return self.numModels
[docs] def get_x_coords(self):
return self.structure.x_coord_list
[docs] def get_y_coords(self):
return self.structure.y_coord_list
[docs] def get_z_coords(self):
return self.structure.z_coord_list
[docs] def get_occupancies(self):
return self.structure.occupancy_list
[docs] def get_b_factors(self):
return self.structure.b_factor_list
[docs] def get_alt_loc_list(self):
if not self.structure.alt_loc_set:
self.structure = self.structure.set_alt_loc_list()
return self.structure.alt_loc_list
[docs] def get_group_types(self):
return self.structure.group_type_list
[docs] def get_atom_to_group_indices(self):
if self.atomToGroupIndices is None:
self.atomToGroupIndices = np.empty(
self.get_num_atoms(), dtype='>i4')
for i in range(self.get_num_groups()):
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.atomToGroupIndices[start:end] = i
return self.atomToGroupIndices
[docs] def get_chem_comp_types(self):
if self.chemCompType is None:
# Max chemCompType has 17 characters
self.chemCompType = np.empty(
self.get_num_atoms(), dtype=np.object_)
self.groupTypeIndices = self.get_group_types()
for i in range(self.get_num_groups()):
index = self.groupTypeIndices[i]
value = self.structure.group_list[index]['chemCompType']
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.chemCompType[start:end] = value
return self.chemCompType
[docs] def get_elements(self):
if self.elements is None:
self.elements = np.empty(self.get_num_atoms(), dtype=np.object_)
self.groupTypeIndices = self.get_group_types()
for i in range(self.get_num_groups()):
index = self.groupTypeIndices[i]
elementNames = self.structure.group_list[index]['elementList']
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.elements[start:end] = elementNames
return self.elements
[docs] def get_atom_names(self):
if self.atomNames is None:
# in case of 2 cjacter group + 3 digits
self.atomNames = np.empty(self.get_num_atoms(), dtype=np.object_)
self.groupTypeIndices = self.get_group_types()
for i in range(self.get_num_groups()):
index = self.groupTypeIndices[i]
atomNames = self.structure.group_list[index]['atomNameList']
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.atomNames[start:end] = atomNames
return self.atomNames
[docs] def get_entity_types(self):
if self.entityTypes is None:
self.entityTypes = np.empty(self.get_num_atoms(), dtype=np.object_)
# Instantiate required data
self.is_polymer()
self.get_chem_comp_types()
self.get_elements()
self.get_group_names()
for i in range(self.get_num_groups()):
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
poly = self.polymer[start]
ccType = self.chemCompType[start]
if poly:
if "PEPTIDE" in ccType:
eType = "PRO"
elif "DNA" in ccType:
eType = "DNA"
elif "RNA" in ccType:
eType = "RNA"
elif "SACCHARIDE" in ccType:
eType = "PSR"
else:
eType = "UNK"
elif (self.groupNames[start] == "HOH" or self.groupNames[start] == "DOD"):
eType = "WAT"
elif "SACCHARIDE" in ccType:
eType = "SAC"
else:
# If group contains at least one carbon atom, it is
# considered organic, but see expection below
organic = False
for j in range(start, end):
if self.elements[j] == "C":
organic = True
break
# Handle exceptions: carbon dioxide, carbon monoxide,
# cyanide ion are considered inorganic
if self.groupNames[start] == "CO2" or \
self.groupNames[start] == "CMO" or \
self.groupNames[start] == "CYN":
organic = False
if organic:
eType = "LGO"
else:
eType = "LGI"
self.entityTypes[start:end] = eType
return self.entityTypes
[docs] def get_group_names(self):
if self.groupNames is None:
self.groupNames = np.empty(self.get_num_atoms(), dtype=np.object_)
self.groupTypeIndices = self.structure.group_type_list
for i in range(self.get_num_groups()):
index = self.groupTypeIndices[i]
value = self.structure.group_list[index]['groupName']
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.groupNames[start:end] = value
return self.groupNames
[docs] def get_group_numbers(self):
if self.groupNumbers is None:
# In case of 4 digit group id + 2 ins code
self.groupNumbers = np.empty(
self.get_num_atoms(), dtype=np.object_)
for i in range(self.get_num_groups()):
value = str(self.structure.group_id_list[i])
insCode = self.structure.ins_code_list[i]
if insCode != "\x00":
value += insCode
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.groupNumbers[start:end] = value
return self.groupNumbers
[docs] def get_chain_ids(self):
if self.chainIds is None:
self.chainIds = np.empty(self.get_num_atoms(), dtype=np.object_)
for i in range(self.get_num_chains()):
start = self.chainToAtomIndices[i]
end = self.chainToAtomIndices[i + 1]
self.chainIds[start:end] = self.structure.chain_id_list[i]
return self.chainIds
[docs] def get_chain_names(self):
if self.chainNames is None:
self.chainNames = np.empty(self.get_num_atoms(), dtype=np.object_)
for i in range(self.get_num_chains()):
start = self.chainToAtomIndices[i]
end = self.chainToAtomIndices[i + 1]
self.chainNames[start:end] = self.structure.chain_name_list[i]
return self.chainNames
[docs] def is_polymer(self):
if self.polymer is None:
# TODO: Numpy bool type is different than python bools, might need to fix
self.polymer = np.empty(self.get_num_atoms(), dtype=bool)
self.get_chain_to_entity_index()
for i in range(self.get_num_chains()):
index = self.entityChainIndex[i]
start = self.chainToAtomIndices[i]
end = self.chainToAtomIndices[i + 1]
poly = self.structure.entity_list[index]['type'] == 'polymer'
self.polymer[start:end] = poly
return self.polymer
[docs] def get_entity_indices(self):
if self.entityIndices is None:
self.entityIndices = np.empty(
self.get_num_atoms(), dtype=np.object_)
self.get_chain_to_entity_index()
for i in range(self.get_num_chains()):
start = self.chainToAtomIndices[i]
end = self.chainToAtomIndices[i + 1]
self.entityIndices[start:end] = self.entityChainIndex[i]
return self.entityIndices
[docs] def get_atom_to_chain_indices(self):
if self.atomToChainIndices is None:
self.atomToChainIndices = np.empty(
self.get_num_atoms(), dtype='>i4')
for i in range(self.get_num_chains()):
start = self.chainToAtomIndices[i]
end = self.chainToAtomIndices[i + 1]
self.atomToChainIndices[start:end] = i
return self.atomToChainIndices
[docs] def get_sequence_positions(self):
if self.sequencePositions is None:
self.sequencePositions = np.empty(
self.get_num_atoms(), dtype='>i4')
groupSequenceIndices = self.structure.sequence_index_list
for i in range(self.get_num_groups()):
value = groupSequenceIndices[i]
start = self.groupToAtomIndices[i]
end = self.groupToAtomIndices[i + 1]
self.sequencePositions[start:end] = value
return self.sequencePositions
[docs] def get_chain_to_entity_index(self):
'''Returns an array that maps a chain index to an entity index
Returns
-------
:obj:`array <numpy.ndarray>`
index that maps chain index to an entity index
'''
if self.entityChainIndex is None:
self.entityChainIndex = np.empty(
self.structure.num_chains, dtype='>i4')
for i, entity in enumerate(self.structure.entity_list):
chainIndexList = entity['chainIndexList']
self.entityChainIndex[chainIndexList] = i
return self.entityChainIndex
[docs] def get_indices(self):
if self.groupToAtomIndices is None:
self.groupToAtomIndices = np.empty(
self.structure.num_groups + 1, dtype='>i4')
self.chainToAtomIndices = np.empty(
self.structure.num_chains + 1, dtype='>i4')
self.chainToGroupIndices = np.empty(
self.structure.num_chains + 1, dtype='>i4')
chainCount, groupCount, atomCount = 0, 0, 0
# Loop over all models
for m in range(self.numModels):
# Loop over all chains
for i in range(self.structure.chains_per_model[m]):
self.chainToAtomIndices[chainCount] = atomCount
self.chainToGroupIndices[chainCount] = groupCount
# Loop over all groups in chain
for j in range(self.structure.groups_per_chain[i]):
groupType = self.structure.group_type_list[groupCount]
self.groupToAtomIndices[groupCount] = atomCount
atomsInGroup = len(
self.structure.group_list[groupType]['formalChargeList'])
groupCount += 1
atomCount += atomsInGroup
chainCount += 1
self.groupToAtomIndices[groupCount] = atomCount
self.chainToAtomIndices[chainCount] = atomCount
self.chainToGroupIndices[chainCount] = groupCount
self.numAtoms = atomCount
self.numGroups = groupCount
self.numChains = chainCount
if self.numAtoms < self.structure.num_atoms:
self.groupToAtomIndices = self.groupToAtomIndices[:groupCount + 1]
self.chainToAtomIndices = self.chainToAtomIndices[:chainCount + 1]
self.chainToGroupIndices = self.chainToGroupIndices[:chainCount + 1]