Source code for mmtfPyspark.mappers.structureToProteinDimers

#!/user/bin/env python
'''structureToProteinDimers.py:

Maps a structure to its protein dimers

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "done"

from mmtf.utils import *
from mmtf.api.mmtf_writer import MMTFEncoder
import itertools
from sympy import Point3D
from mmtfPyspark.utils import DistanceBox
import time
import numpy as np
import math


[docs]class StructureToProteinDimers(object):
    '''Maps a protein structure to it's protein dimers

    Attributes
    ----------
    cutoffDistance : float
       cutoff distance for protein dimers [8.0]
    contacts : int
       number of contacts [20]
    useAllAtoms : bool
       flag to use all atoms [False]
    exclusive : bool
       exclusive flag [False]
    '''

    def __init__(self, cutoffDistance=8.0, contacts=20,
                 useAllAtoms=False, exclusive=False):
        self.cutoffDistance = cutoffDistance
        self.contacts = contacts
        self.useAllAtoms = useAllAtoms
        self.exclusive = exclusive

    def __call__(self, t):
        structure = t[1]

        # split structure into a list of chains
        chains = self._split_to_chains(structure)
        chainVectors = self._get_chain_vectors(chains)
        resList = []

        if self.useAllAtoms:
            boxes = self._get_all_atoms_distance_boxes(
                chains, self.cutoffDistance)
        else:
            boxes = self._get_c_beta_atoms_distance_boxes(
                chains, self.cutoffDistance)

        self.exclusiveHashSet = np.empty([0, 3])

        for i in range(len(chains)):

            j = 0

            while j < i:

                if self._check_pair(boxes[i], boxes[j], chains[i], chains[j], self.cutoffDistance, self.contacts):

                    if self.exclusive:

                        newVec = chainVectors[i] - chainVectors[j]
                        if not self._check_list(newVec, self.exclusiveHashSet):

                            resList.append(self._combine_chains(
                                chains[i], chains[j]))

                            self.exclusiveHashSet = np.append(
                                self.exclusiveHashSet, [newVec], axis=0)
                    else:
                        resList.append(self._combine_chains(
                            chains[i], chains[j]))

                j += 1

        return resList

    def _check_list(self, vec, exclusiveList):

        for point in exclusiveList:

            if np.linalg.norm(vec - point) < 0.1 and self._angle(vec, point) < 0.1:
                return True
            vec = vec * -1

            if np.linalg.norm(vec - point) < 0.1 and self._angle(vec, point) < 0.1:
                return True
            vec = vec * -1

        return False

    def _angle(self, a, b):

        arccosInput = np.dot(a, b) / np.linagl.norm(a) / np.linagl.norm(b)
        arccosInput = 1.0 if arccosInput > 1.0 else arccosInput
        arccosInput = -1.0 if arccosInput < -1.0 else arccosInput

        return math.acos(arccosInput)

    def _get_chain_vectors(self, chains):

        chainVectors = []

        for chain in chains:

            chainVectors.append(self._calc_average_vec(chain))

        return chainVectors

    def _calc_average_vec(self, s1):

        totX, totY, totZ = 0, 0, 0

        for i in range(s1.num_atoms):  # TODO double check num atoms

            totX += s1.x_coord_list[i]
            totY += s1.y_coord_list[i]
            totZ += s1.z_coord_list[i]

        return np.array([totX / s1.num_atoms, totY / s1.num_atoms, totZ / s1.num_atoms])

    def _distance(self, s1, s2, index1, index2):

        xCoord = s1.x_coord_list[index1]
        yCoord = s1.y_coord_list[index1]
        zCoord = s1.z_coord_list[index1]

        newPoint1 = Point3D(xCoord, yCoord, zCoord)

        xCoord = s2.x_coord_list[index2]
        yCoord = s2.y_coord_list[index2]
        zCoord = s2.z_coord_list[index2]

        newPoint2 = Point3D(xCoord, yCoord, zCoord)

        return newPoint1.distance(newPoint2)

    def _check_pair(self, box1, box2, s1, s2, cutoffDistance, contacts):

        pointsInBox2 = box1.getIntersection(box2)
        pointsInBox1 = box2.getIntersection(box1)

        hs1, hs2 = set(), set()

        num = 0

        for i in range(len(pointsInBox2)):

            for j in range(len(pointsInBox1)):

                if (i in hs1) or (j in hs2):
                    continue

                if self._distance(s1, s2, pointsInBox2[i], pointsInBox1[j]) < cutoffDistance:

                    num += 1
                    hs1.add(i)
                    hs2.add(j)

                if num > contacts:
                    return True

        return False

    def _get_c_beta_atoms_distance_boxes(self, chains, cutoffDistance):
        '''Get distance boxes for all atoms
        '''
        distanceBoxes = []

        for i in range(len(chains)):

            tmp = chains[i]
            newbox = DistanceBox(cutoffDistance)
            groupIndex = 0
            atomIndex = 0

            for k in range(tmp.groups_per_chain[0]):
                groupType = tmp.group_type_list[groupIndex]

                for m in range(len(tmp.group_list[groupType]["formalChargeList"])):

                    atomName = tmp.group_list[groupType]["atomNameList"][m]

                    if atomName == "CB":

                        xCoord = tmp.x_coord_list[atomIndex]
                        yCoord = tmp.y_coord_list[atomIndex]
                        zCoord = tmp.z_coord_list[atomIndex]

                        # TODO trying numpy instead of Point3D
                        #newPoint = Point3D(xCoord, yCoord, zCoord)
                        newPoint = np.array([xCoord, yCoord, zCoord])

                        newbox.add_point(newPoint, atomIndex)

                    atomIndex += 1

                groupIndex += 1

            distanceBoxes.append(newbox)

        return distanceBoxes

    def _get_all_atoms_distance_boxes(self, chains, cutoffDistance):
        '''Get distance boxes for all atoms
        '''
        distanceBoxes = []

        for i in range(len(chains)):

            tmp = chains[i]
            newbox = DistanceBox(cutoffDistance)

            for j in range(tmp.num_atoms):

                xCoord = tmp.x_coord_list[j]
                yCoord = tmp.y_coord_list[j]
                zCoord = tmp.z_coord_list[j]

                # TODO trying numpy instead of Point3D
                # newPoint = Point3D(xCoord, yCoord, zCoord)
                newPoint = np.array([xCoord, yCoord, zCoord])

                newbox.add_point(newPoint, j)

            distanceBoxes.append(newbox)

        return distanceBoxes

    def _split_to_chains(self, s):
        '''split structure to a list of chains
        '''
        chains = []
        numChains = s.chains_per_model[0]

        chainToEntityIndex = self._get_chain_to_entity_index(s)
        atomsPerChain, bondsPerChain = self._get_num_atoms_and_bonds(s)

        groupCounter = 0
        atomCounter = 0

        for i in range(numChains):
            atomMap = {}
            newChain = MMTFEncoder()
            entityToChainIndex = chainToEntityIndex[i]

            structureId = s.structure_id + '.' +\
                s.chain_name_list[i] + '.' +\
                s.chain_id_list[i] + '.' +\
                str(entityToChainIndex + 1)

            # Set header
            newChain.init_structure(bondsPerChain[i],
                                    atomsPerChain[i],
                                    s.groups_per_chain[i],
                                    1, 1, structureId)
            decoder_utils.add_xtalographic_info(s, newChain)
            decoder_utils.add_header_info(s, newChain)

            # Set model info (only one model: 0)
            newChain.set_model_info(0, 1)

            # Set entity and chain info
            newChain.set_entity_info([0],
                                     s.entity_list[entityToChainIndex]['sequence'],
                                     s.entity_list[entityToChainIndex]['description'],
                                     s.entity_list[entityToChainIndex]['type'])

            newChain.set_chain_info(s.chain_id_list[i],
                                    s.chain_name_list[i],
                                    s.groups_per_chain[i])

            for j in range(s.groups_per_chain[i]):
                groupIndex = s.group_type_list[groupCounter]
                # print(s.group_type_list)

                # Set group info
                newChain.set_group_info(s.group_list[groupIndex]['groupName'],
                                        s.group_id_list[groupCounter],
                                        s.ins_code_list[groupCounter],
                                        s.group_list[groupIndex]['chemCompType'],
                                        len(s.group_list[groupIndex]
                                            ['atomNameList']),
                                        len(s.group_list[groupIndex]
                                            ['bondOrderList']),
                                        s.group_list[groupIndex]['singleLetterCode'],
                                        s.sequence_index_list[groupCounter],
                                        s.sec_struct_list[groupCounter])

                for k in range(len(s.group_list[groupIndex]['atomNameList'])):
                    newChain.set_atom_info(s.group_list[groupIndex]['atomNameList'][k],
                                           s.atom_id_list[atomCounter],
                                           s.alt_loc_list[atomCounter],
                                           s.x_coord_list[atomCounter],
                                           s.y_coord_list[atomCounter],
                                           s.z_coord_list[atomCounter],
                                           s.occupancy_list[atomCounter],
                                           s.b_factor_list[atomCounter],
                                           s.group_list[groupIndex]['elementList'][k],
                                           s.group_list[groupIndex]['formalChargeList'][k])

                    atomCounter += 1

                for l in range(len(s.group_list[groupIndex]['bondOrderList'])):

                    bondIndOne = s.group_list[groupIndex]['bondAtomList'][l * 2]
                    bondIndTwo = s.group_list[groupIndex]['bondAtomList'][l * 2 + 1]
                    bondOrder = s.group_list[groupIndex]['bondOrderList'][l]

                    #newChain.set_group_bond(bondIndOne, bondIndTwo, bondOrder)
                    newChain.current_group.bond_atom_list += [
                        bondIndOne, bondIndTwo]
                    newChain.current_group.bond_order_list.append(bondOrder)

                groupCounter += 1

                # TODO skipping adding inter group bond info for now

            newChain.finalize_structure()

            # TODO double check if just getting from chain 0
            chain_type = [chain['type'] for chain in newChain.entity_list]

            polymer = "polymer" in chain_type

            if polymer:

                match = True

                for j in range(newChain.groups_per_chain[0]):

                    #print(newChain.groups_per_chain, newChain.group_type_list)
                    groupIndex = newChain.group_type_list[j]

                    if match:
                        _type = newChain.group_list[groupIndex]["chemCompType"]

                        match = (
                            _type == "L-PEPTIDE LINKING") or (_type == "PEPTIDE LINKING")

                if match:
                    chains.append(newChain)

        return chains

    def _combine_chains(self, s1, s2):

        if not s1.alt_loc_set:
            s1 = s1.set_alt_loc_list()
        if not s2.alt_loc_set:
            s2 = s2.set_alt_loc_list()

        groupCounter = 0
        atomCounter = 0

        structureId = s1.structure_id + "_append_" + s2.structure_id

        combinedStructure = MMTFEncoder()

        # Set header
        combinedStructure.init_structure(s1.num_bonds + s2.num_bonds,
                                         s1.num_atoms + s2.num_atoms,
                                         s1.num_groups + s2.num_groups,
                                         2, 1, structureId)
        decoder_utils.add_xtalographic_info(s1, combinedStructure)
        decoder_utils.add_header_info(s1, combinedStructure)

        # Set model info (only one model: 0)
        combinedStructure.set_model_info(0, 2)

        chainToEntityIndex = self._get_chain_to_entity_index(s1)[0]

        # Set entity and chain info
        combinedStructure.set_entity_info([0],
                                          s1.entity_list[chainToEntityIndex]['sequence'],
                                          s1.entity_list[chainToEntityIndex]['description'],
                                          s1.entity_list[chainToEntityIndex]['type'])

        combinedStructure.set_chain_info(s1.chain_id_list[0],
                                         s1.chain_name_list[0],
                                         s1.groups_per_chain[0])

        for i in range(s1.groups_per_chain[0]):
            groupIndex = s1.group_type_list[groupCounter]

            # Set group info
            combinedStructure.set_group_info(s1.group_list[groupIndex]['groupName'],
                                             s1.group_id_list[groupCounter],
                                             s1.ins_code_list[groupCounter],
                                             s1.group_list[groupIndex]['chemCompType'],
                                             len(s1.group_list[groupIndex]
                                                 ['atomNameList']),
                                             len(s1.group_list[groupIndex]
                                                 ['bondOrderList']),
                                             s1.group_list[groupIndex]['singleLetterCode'],
                                             s1.sequence_index_list[groupCounter],
                                             s1.sec_struct_list[groupCounter])

            for j in range(len(s1.group_list[groupIndex]['atomNameList'])):
                combinedStructure.set_atom_info(s1.group_list[groupIndex]['atomNameList'][j],
                                                s1.atom_id_list[atomCounter],
                                                s1.alt_loc_list[atomCounter],
                                                s1.x_coord_list[atomCounter],
                                                s1.y_coord_list[atomCounter],
                                                s1.z_coord_list[atomCounter],
                                                s1.occupancy_list[atomCounter],
                                                s1.b_factor_list[atomCounter],
                                                s1.group_list[groupIndex]['elementList'][j],
                                                s1.group_list[groupIndex]['formalChargeList'][j])

                atomCounter += 1

            # TODO not sure if we should add bonds like this
            # TODO bondAtomList == getGroupBondIndices?
            for k in range(len(s1.group_list[groupIndex]["bondOrderList"])):
                bondIndOne = s1.group_list[groupIndex]["bondAtomList"][k * 2]
                bondIndTwo = s1.group_list[groupIndex]["bondAtomList"][k * 2 + 1]
                bondOrder = s1.group_list[groupIndex]["bondOrderList"][k]
                combinedStructure.set_group_bond(
                    bondIndOne, bondIndTwo, bondOrder)

            groupCounter += 1

        # Set entity and chain info for s2
        chainToEntityIndex = self._get_chain_to_entity_index(s2)[0]
        combinedStructure.set_entity_info([1],
                                          s2.entity_list[chainToEntityIndex]['sequence'],
                                          s2.entity_list[chainToEntityIndex]['description'],
                                          s2.entity_list[chainToEntityIndex]['type'])

        combinedStructure.set_chain_info(s2.chain_id_list[0],
                                         s2.chain_name_list[0],
                                         s2.groups_per_chain[0])

        groupCounter = 0
        atomCounter = 0

        for i in range(s2.groups_per_chain[0]):
            groupIndex = s2.group_type_list[groupCounter]

            # Set group info
            combinedStructure.set_group_info(s2.group_list[groupIndex]['groupName'],
                                             s2.group_id_list[groupCounter],
                                             s2.ins_code_list[groupCounter],
                                             s2.group_list[groupIndex]['chemCompType'],
                                             len(s2.group_list[groupIndex]
                                                 ['atomNameList']),
                                             len(s2.group_list[groupIndex]
                                                 ['bondOrderList']),
                                             s2.group_list[groupIndex]['singleLetterCode'],
                                             s2.sequence_index_list[groupCounter],
                                             s2.sec_struct_list[groupCounter])

            for j in range(len(s2.group_list[groupIndex]['atomNameList'])):
                combinedStructure.set_atom_info(s2.group_list[groupIndex]['atomNameList'][j],
                                                s2.atom_id_list[atomCounter],
                                                s2.alt_loc_list[atomCounter],
                                                s2.x_coord_list[atomCounter],
                                                s2.y_coord_list[atomCounter],
                                                s2.z_coord_list[atomCounter],
                                                s2.occupancy_list[atomCounter],
                                                s2.b_factor_list[atomCounter],
                                                s2.group_list[groupIndex]['elementList'][j],
                                                s2.group_list[groupIndex]['formalChargeList'][j])

                atomCounter += 1

            # TODO not sure if we should add bonds like this
            # TODO bondAtomList == getGroupBondIndices?
            for k in range(len(s2.group_list[groupIndex]["bondOrderList"])):
                bondIndOne = s2.group_list[groupIndex]["bondAtomList"][k * 2]
                bondIndTwo = s2.group_list[groupIndex]["bondAtomList"][k * 2 + 1]
                bondOrder = s2.group_list[groupIndex]["bondOrderList"][k]
                combinedStructure.set_group_bond(
                    bondIndOne, bondIndTwo, bondOrder)

            groupCounter += 1

        combinedStructure.finalize_structure()
        return (structureId, combinedStructure)

    def _get_num_atoms_and_bonds(self, structure):
        '''Gets the number of atoms and bonds per chain
        '''
        numChains = structure.chains_per_model[0]
        atomsPerChain = [0] * numChains
        bondsPerChain = [0] * numChains
        groupCounter = 0

        for i in range(numChains):

            for j in range(structure.groups_per_chain[i]):
                groupIndex = structure.group_type_list[groupCounter]
                atomsPerChain[i] = len(
                    structure.group_list[groupIndex]['atomNameList'])
                bondsPerChain[i] = len(
                    structure.group_list[groupIndex]['bondOrderList'])
                groupCounter += 1

        return atomsPerChain, bondsPerChain

    def _get_chain_to_entity_index(self, structure):
        '''Returns an list that maps a chain index to an entity index.

        Parameters
        ----------
        structure: structureDataInterFace
        '''
        entityChainIndex = [0] * structure.num_chains

        for i in range(len(structure.entity_list)):

            for j in structure.entity_list[i]["chainIndexList"]:
                entityChainIndex[j] = i

        return entityChainIndex