Source code for mmtfPyspark.interactions.interactionFingerprinter

#!/usr/bin/env python
'''interactionFingerprinter.py

This class creates datasets of ligand - macromolecule and macromolecule -
macromolecule interaction information. The criteria used to select
interactions are specified by an InteractionFilter.

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "done"

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkContext
from mmtfPyspark.interactions import LigandInteractionFingerprint, PolymerInteractionFingerprint


class InteractionFingerprinter(object):
    '''Creates datasets of interaction information from PDB structures.

    Both methods are static, so they can be called either on the class
    itself or on an instance:

        InteractionFingerprinter.get_polymer_interactions(structures, filter)
    '''

    @staticmethod
    def get_ligand_polymer_interactions(structures, interactionFilter):
        '''Returns a dataset of ligand - macromolecule interacting residues.

        The dataset contains the following columns:

        - structureChainId - pdbId.chainName of chain that interacts with
          ligand
        - queryLigandId - id of ligand from PDB chemical component dictionary
        - queryLigandNumber - group number of ligand including insertion code
        - queryLigandChainId - chain name of ligand
        - targetChainId - name of chain for which the interaction data are
          listed
        - groupNumbers - array of residue number of interacting groups
          including insertion code (e.g. 101A)
        - sequenceIndices - array of zero-based index of interaction groups
          (residues) mapped onto target sequence
        - sequence - interacting polymer sequence
        - interactingChains - total number of chains that interact with ligand

        Parameters
        ----------
        structures : PythonRDD
           a set of PDB structures
        interactionFilter : InteractionFilter
           interaction criteria

        Returns
        -------
        dataset
           dataset with interacting residue information
        '''

        # find all interactions
        rows = structures.flatMap(
            LigandInteractionFingerprint(interactionFilter))

        # convert RDD to a Dataset with the following columns; every column
        # (and, via ArrayType's second argument, every array element) is
        # declared as non-nullable
        nullable = False
        fields = [
            StructField("structureChainId", StringType(), nullable),
            StructField("queryLigandId", StringType(), nullable),
            StructField("queryLigandNumber", StringType(), nullable),
            StructField("queryLigandChainId", StringType(), nullable),
            StructField("targetChainId", StringType(), nullable),
            StructField("groupNumbers",
                        ArrayType(StringType(), nullable), nullable),
            StructField("sequenceIndices",
                        ArrayType(IntegerType(), nullable), nullable),
            StructField("sequence", StringType(), nullable),
            StructField("interactingChains", IntegerType(), nullable),
        ]
        schema = StructType(fields)

        # reuse the active Spark session if there is one
        spark = SparkSession.builder.getOrCreate()
        return spark.createDataFrame(rows, schema)

    @staticmethod
    def get_polymer_interactions(structures, interactionFilter):
        '''Returns a dataset of macromolecule - macromolecule interaction
        information.

        The dataset contains the following columns:

        - structureChainId - pdbId.chainName for which the interaction data
          are listed
        - queryChainId - name of chain that interacts with target chain
        - targetChainId - name of chain for which the interaction data are
          listed
        - groupNumbers - array of residue number of interacting groups
          including insertion code (e.g. 101A)
        - sequenceIndices - array of zero-based index of interaction groups
          (residues) mapped onto target sequence
        - sequence - target polymer sequence

        Parameters
        ----------
        structures : PythonRDD
           a set of PDB structures
        interactionFilter : InteractionFilter
           interaction criteria

        Returns
        -------
        dataset
           dataset with interacting residue information
        '''

        # find all interactions
        rows = structures.flatMap(
            PolymerInteractionFingerprint(interactionFilter))

        # convert RDD to a Dataset with the following columns; every column
        # (and, via ArrayType's second argument, every array element) is
        # declared as non-nullable
        nullable = False
        fields = [
            StructField("structureChainId", StringType(), nullable),
            StructField("queryChainId", StringType(), nullable),
            StructField("targetChainId", StringType(), nullable),
            StructField("groupNumbers",
                        ArrayType(StringType(), nullable), nullable),
            StructField("sequenceIndices",
                        ArrayType(IntegerType(), nullable), nullable),
            StructField("sequence", StringType(), nullable),
        ]
        schema = StructType(fields)

        # reuse the active Spark session if there is one
        spark = SparkSession.builder.getOrCreate()
        return spark.createDataFrame(rows, schema)