Source code for mmtfPyspark.utils.columnarStructure

#!/user/bin/env python
'''columnarStructure.py

Provides efficient access to structure information in the form of atom-based arrays.
Data are lazily initialized as needs.

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

import numpy as np


[docs]class ColumnarStructure(object):
    '''Column based data structure to efficiently access structure information

    Attributes
    ----------
    structure : mmtfStructure
       mmtf structure
    firstModelOnly : bool
       use only the first model in a structure if True
    '''

    def __init__(self, structure, firstModelOnly=True):

        # Set class variables
        self.numAtoms = 0
        self.numChains = 0
        self.numGroups = 0
        self.numModels = 0

        self.atomNames = None
        self.atomToChainIndices = None
        self.atomToGroupIndices = None
        self.chainIds = None
        self.chainNames = None
        self.chemCompType = None
        self.elements = None
        self.entityTypes = None
        self.entityChainIndex = None
        self.entityIndices = None
        self.groupNames = None
        self.groupNumbers = None
        self.groupToAtomIndices = None
        self.polymer = None
        self.sequencePositions = None
        self.structure = structure
        self.groupToAtomIndices = None
        self.chainToAtomIndices = None
        self.chainToGroupIndices = None


        if firstModelOnly:
            self.numModels = 1
        else:
            self.numModels = structure.num_models

[docs]    def get_group_to_atom_indices(self):
        if self.groupToAtomIndices is None:
            self.get_indices()
        return self.groupToAtomIndices

[docs]    def get_chain_to_atom_indices(self):
        if self.chainToAtomIndices is None:
            self.get_indices()
        return self.chainToAtomIndices

[docs]    def get_chain_to_group_indices(self):
        if self.chainToGroupIndices is None:
            self.get_indices
        return self.chainToGroupIndices

[docs]    def get_num_atoms(self):
        self.get_indices()
        return self.numAtoms

[docs]    def get_num_groups(self):
        self.get_indices()
        return self.numGroups

[docs]    def get_num_chains(self):
        self.get_indices()
        return self.numChains

[docs]    def get_num_models(self):
        self.get_indices()
        return self.numModels

[docs]    def get_x_coords(self):
        return self.structure.x_coord_list

[docs]    def get_y_coords(self):
        return self.structure.y_coord_list

[docs]    def get_z_coords(self):
        return self.structure.z_coord_list

[docs]    def get_occupancies(self):
        return self.structure.occupancy_list

[docs]    def get_b_factors(self):
        return self.structure.b_factor_list

[docs]    def get_alt_loc_list(self):
        if not self.structure.alt_loc_set:
            self.structure = self.structure.set_alt_loc_list()
        return self.structure.alt_loc_list

[docs]    def get_group_types(self):
        return self.structure.group_type_list

[docs]    def get_atom_to_group_indices(self):

        if self.atomToGroupIndices is None:
            self.atomToGroupIndices = np.empty(
                self.get_num_atoms(), dtype='>i4')

            for i in range(self.get_num_groups()):
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                self.atomToGroupIndices[start:end] = i

        return self.atomToGroupIndices

[docs]    def get_chem_comp_types(self):

        if self.chemCompType is None:
            # Max chemCompType has 17 characters
            self.chemCompType = np.empty(
                self.get_num_atoms(), dtype=np.object_)
            self.groupTypeIndices = self.get_group_types()

            for i in range(self.get_num_groups()):
                index = self.groupTypeIndices[i]
                value = self.structure.group_list[index]['chemCompType']
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                self.chemCompType[start:end] = value

        return self.chemCompType

[docs]    def get_elements(self):

        if self.elements is None:
            self.elements = np.empty(self.get_num_atoms(), dtype=np.object_)
            self.groupTypeIndices = self.get_group_types()

            for i in range(self.get_num_groups()):
                index = self.groupTypeIndices[i]
                elementNames = self.structure.group_list[index]['elementList']
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                self.elements[start:end] = elementNames

        return self.elements

[docs]    def get_atom_names(self):

        if self.atomNames is None:
            # in case of 2 cjacter group + 3 digits
            self.atomNames = np.empty(self.get_num_atoms(), dtype=np.object_)
            self.groupTypeIndices = self.get_group_types()

            for i in range(self.get_num_groups()):
                index = self.groupTypeIndices[i]
                atomNames = self.structure.group_list[index]['atomNameList']
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                self.atomNames[start:end] = atomNames

        return self.atomNames

[docs]    def get_entity_types(self):

        if self.entityTypes is None:
            self.entityTypes = np.empty(self.get_num_atoms(), dtype=np.object_)

            # Instantiate required data
            self.is_polymer()
            self.get_chem_comp_types()
            self.get_elements()
            self.get_group_names()

            for i in range(self.get_num_groups()):
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                poly = self.polymer[start]
                ccType = self.chemCompType[start]

                if poly:
                    if "PEPTIDE" in ccType:
                        eType = "PRO"
                    elif "DNA" in ccType:
                        eType = "DNA"
                    elif "RNA" in ccType:
                        eType = "RNA"
                    elif "SACCHARIDE" in ccType:
                        eType = "PSR"
                    else:
                        eType = "UNK"
                elif (self.groupNames[start] == "HOH" or self.groupNames[start] == "DOD"):
                    eType = "WAT"
                elif "SACCHARIDE" in ccType:
                    eType = "SAC"
                else:
                    # If group contains at least one carbon atom, it is
                    # considered organic, but see expection below
                    organic = False
                    for j in range(start, end):
                        if self.elements[j] == "C":
                            organic = True
                            break

                    # Handle exceptions: carbon dioxide, carbon monoxide,
                    # cyanide ion are considered inorganic
                    if self.groupNames[start] == "CO2" or \
                       self.groupNames[start] == "CMO" or \
                       self.groupNames[start] == "CYN":
                        organic = False

                    if organic:
                        eType = "LGO"
                    else:
                        eType = "LGI"

                self.entityTypes[start:end] = eType

        return self.entityTypes

[docs]    def get_group_names(self):

        if self.groupNames is None:
            self.groupNames = np.empty(self.get_num_atoms(), dtype=np.object_)
            self.groupTypeIndices = self.structure.group_type_list

            for i in range(self.get_num_groups()):
                index = self.groupTypeIndices[i]
                value = self.structure.group_list[index]['groupName']
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                self.groupNames[start:end] = value

        return self.groupNames

[docs]    def get_group_numbers(self):

        if self.groupNumbers is None:
            # In case of 4 digit group id + 2 ins code
            self.groupNumbers = np.empty(
                self.get_num_atoms(), dtype=np.object_)

            for i in range(self.get_num_groups()):
                value = str(self.structure.group_id_list[i])
                insCode = self.structure.ins_code_list[i]

                if insCode != "\x00":
                    value += insCode

                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]
                self.groupNumbers[start:end] = value

        return self.groupNumbers

[docs]    def get_chain_ids(self):

        if self.chainIds is None:
            self.chainIds = np.empty(self.get_num_atoms(), dtype=np.object_)

            for i in range(self.get_num_chains()):
                start = self.chainToAtomIndices[i]
                end = self.chainToAtomIndices[i + 1]
                self.chainIds[start:end] = self.structure.chain_id_list[i]

        return self.chainIds

[docs]    def get_chain_names(self):

        if self.chainNames is None:
            self.chainNames = np.empty(self.get_num_atoms(), dtype=np.object_)

            for i in range(self.get_num_chains()):
                start = self.chainToAtomIndices[i]
                end = self.chainToAtomIndices[i + 1]
                self.chainNames[start:end] = self.structure.chain_name_list[i]

        return self.chainNames

[docs]    def is_polymer(self):

        if self.polymer is None:
            # TODO: Numpy bool type is different than python bools, might need to fix
            self.polymer = np.empty(self.get_num_atoms(), dtype=bool)
            self.get_chain_to_entity_index()

            for i in range(self.get_num_chains()):
                index = self.entityChainIndex[i]
                start = self.chainToAtomIndices[i]
                end = self.chainToAtomIndices[i + 1]
                poly = self.structure.entity_list[index]['type'] == 'polymer'

                self.polymer[start:end] = poly

        return self.polymer

[docs]    def get_entity_indices(self):

        if self.entityIndices is None:
            self.entityIndices = np.empty(
                self.get_num_atoms(), dtype=np.object_)
            self.get_chain_to_entity_index()

            for i in range(self.get_num_chains()):
                start = self.chainToAtomIndices[i]
                end = self.chainToAtomIndices[i + 1]

                self.entityIndices[start:end] = self.entityChainIndex[i]

        return self.entityIndices

[docs]    def get_atom_to_chain_indices(self):

        if self.atomToChainIndices is None:
            self.atomToChainIndices = np.empty(
                self.get_num_atoms(), dtype='>i4')

            for i in range(self.get_num_chains()):
                start = self.chainToAtomIndices[i]
                end = self.chainToAtomIndices[i + 1]

                self.atomToChainIndices[start:end] = i

        return self.atomToChainIndices

[docs]    def get_sequence_positions(self):

        if self.sequencePositions is None:

            self.sequencePositions = np.empty(
                self.get_num_atoms(), dtype='>i4')
            groupSequenceIndices = self.structure.sequence_index_list

            for i in range(self.get_num_groups()):
                value = groupSequenceIndices[i]
                start = self.groupToAtomIndices[i]
                end = self.groupToAtomIndices[i + 1]

                self.sequencePositions[start:end] = value

        return self.sequencePositions

[docs]    def get_chain_to_entity_index(self):
        '''Returns an array that maps a chain index to an entity index

        Returns
        -------
        :obj:`array <numpy.ndarray>`
           index that maps chain index to an entity index
        '''

        if self.entityChainIndex is None:

            self.entityChainIndex = np.empty(
                self.structure.num_chains, dtype='>i4')

            for i, entity in enumerate(self.structure.entity_list):

                chainIndexList = entity['chainIndexList']
                self.entityChainIndex[chainIndexList] = i

        return self.entityChainIndex

[docs]    def get_indices(self):

        if self.groupToAtomIndices is None:

            self.groupToAtomIndices = np.empty(
                self.structure.num_groups + 1, dtype='>i4')
            self.chainToAtomIndices = np.empty(
                self.structure.num_chains + 1, dtype='>i4')
            self.chainToGroupIndices = np.empty(
                self.structure.num_chains + 1, dtype='>i4')

            chainCount, groupCount, atomCount = 0, 0, 0

            # Loop over all models
            for m in range(self.numModels):

                # Loop over all chains
                for i in range(self.structure.chains_per_model[m]):

                    self.chainToAtomIndices[chainCount] = atomCount
                    self.chainToGroupIndices[chainCount] = groupCount

                    # Loop over all groups in chain
                    for j in range(self.structure.groups_per_chain[i]):

                        groupType = self.structure.group_type_list[groupCount]
                        self.groupToAtomIndices[groupCount] = atomCount
                        atomsInGroup = len(
                            self.structure.group_list[groupType]['formalChargeList'])

                        groupCount += 1
                        atomCount += atomsInGroup

                    chainCount += 1

            self.groupToAtomIndices[groupCount] = atomCount
            self.chainToAtomIndices[chainCount] = atomCount
            self.chainToGroupIndices[chainCount] = groupCount

            self.numAtoms = atomCount
            self.numGroups = groupCount
            self.numChains = chainCount

            if self.numAtoms < self.structure.num_atoms:
                self.groupToAtomIndices = self.groupToAtomIndices[:groupCount + 1]
                self.chainToAtomIndices = self.chainToAtomIndices[:chainCount + 1]
                self.chainToGroupIndices = self.chainToGroupIndices[:chainCount + 1]