# Source code for mmtfPyspark.interactions.interactionFingerprinter

#!/usr/bin/env python
'''interactionFingerprinter.py

This class creates datasets of ligand - macromolecule and macromolecule -
macromolecule interaction information. The criteria used to select
interactions are specified by an InteractionFilter.

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "done"

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkContext
from mmtfPyspark.interactions import LigandInteractionFingerprint, PolymerInteractionFingerprint


class InteractionFingerprinter(object):
    '''Builds Spark datasets that describe molecular interactions.

    Both methods are static: they use no instance state and are meant to be
    called directly on the class, e.g.
    ``InteractionFingerprinter.get_polymer_interactions(structures, filter)``.
    '''

    @staticmethod
    def get_ligand_polymer_interactions(structures, interactionFilter):
        '''Returns a dataset of ligand - macromolecule interacting residues.

        The dataset contains the following columns:
        - structureChainId - pdbId.chainName of chain that interacts with ligand
        - queryLigandId - id of ligand from PDB chemical component dictionary
        - queryLigandNumber - group number of ligand including insertion code
        - queryLigandChainId - chain name of ligand
        - targetChainId - name of chain for which the interaction data are listed
        - groupNumbers - array of residue number of interacting groups including insertion code (e.g. 101A)
        - sequenceIndices - array of zero-based index of interaction groups (residues) mapped onto target sequence
        - sequence - interacting polymer sequence
        - interactingChains - total number of chains that interact with ligand

        All columns, and the elements of the two array columns, are declared
        non-nullable in the schema.

        Parameters
        ----------
        structures : PythonRDD
           a set of PDB structures
        interactionFilter : InteractionFilter
           interaction criteria

        Returns
        -------
        dataset
           dataset with interacting residue information
        '''

        # Find all interactions: the fingerprint object is applied to every
        # structure through flatMap, and its results are flattened into one RDD.
        rows = structures.flatMap(LigandInteractionFingerprint(interactionFilter))

        # Schema used to convert the RDD into a Dataset; the column order must
        # match the order of the values in each row.
        fields = [
            StructField("structureChainId", StringType(), nullable=False),
            StructField("queryLigandId", StringType(), nullable=False),
            StructField("queryLigandNumber", StringType(), nullable=False),
            StructField("queryLigandChainId", StringType(), nullable=False),
            StructField("targetChainId", StringType(), nullable=False),
            StructField("groupNumbers",
                        ArrayType(StringType(), containsNull=False),
                        nullable=False),
            StructField("sequenceIndices",
                        ArrayType(IntegerType(), containsNull=False),
                        nullable=False),
            StructField("sequence", StringType(), nullable=False),
            StructField("interactingChains", IntegerType(), nullable=False),
        ]

        schema = StructType(fields)
        spark = SparkSession.builder.getOrCreate()
        return spark.createDataFrame(rows, schema)

    @staticmethod
    def get_polymer_interactions(structures, interactionFilter):
        '''Returns a dataset of macromolecule - macromolecule interaction information.

        The dataset contains the following columns:
        - structureChainId - pdbId.chainName for which the interaction data are listed
        - queryChainId - name of chain that interacts with target chain
        - targetChainId - name of chain for which the interaction data are listed
        - groupNumbers - array of residue number of interacting groups including insertion code (e.g. 101A)
        - sequenceIndices - array of zero-based index of interaction groups (residues) mapped onto target sequence
        - sequence - target polymer sequence

        All columns, and the elements of the two array columns, are declared
        non-nullable in the schema.

        Parameters
        ----------
        structures : PythonRDD
           a set of PDB structures
        interactionFilter : InteractionFilter
           interaction criteria

        Returns
        -------
        dataset
           dataset with interacting residue information
        '''

        # Find all interactions: the fingerprint object is applied to every
        # structure through flatMap, and its results are flattened into one RDD.
        rows = structures.flatMap(PolymerInteractionFingerprint(interactionFilter))

        # Schema used to convert the RDD into a Dataset; the column order must
        # match the order of the values in each row.
        fields = [
            StructField("structureChainId", StringType(), nullable=False),
            StructField("queryChainId", StringType(), nullable=False),
            StructField("targetChainId", StringType(), nullable=False),
            StructField("groupNumbers",
                        ArrayType(StringType(), containsNull=False),
                        nullable=False),
            StructField("sequenceIndices",
                        ArrayType(IntegerType(), containsNull=False),
                        nullable=False),
            StructField("sequence", StringType(), nullable=False),
        ]

        schema = StructType(fields)
        spark = SparkSession.builder.getOrCreate()
        return spark.createDataFrame(rows, schema)