# Example of using mmtfPyspark to find water interactions


## Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import col
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor, ExcludedLigandSets
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
import matplotlib.pyplot as plt
import pandas as pd
import py3Dmol
import time

                                                               
# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark to run locally on all available cores ("local[*]").
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate reuses a SparkContext that is already running, so re-executing
# this cell in the same kernel does not fail with a "multiple SparkContexts"
# error; when no context exists, one is created from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

## Define Variables

In [2]:
# Input parameters for the water-interaction search.
# resolution, distanceCutoff and bFactorCutoff carry the same values that are
# passed to the Resolution filter and to the InteractionFilter cutoffs.
resolution, distanceCutoff, bFactorCutoff = 2.0, 3.0, 1.645
# Lower and upper bound on the number of interactions per query water.
minInteractions, maxInteractions = 2, 4
# When False, "HOH" is added to the prohibited target groups.
includeWaters = True

## Read PDB and filter by resolution and only include proteins

In [3]:
# Load the structures stored under `path` using the Spark context `sc`.
pdb = mmtfReader.read_sequence_file(path, sc)
# Keep entries whose resolution lies in [0.0, resolution] and that pass the
# exclusive L-protein-chain filter. The upper bound comes from the
# `resolution` input parameter rather than a repeated literal, so changing
# the parameter actually changes the filter.
pdb = (
    pdb.filter(Resolution(minResolution=0.0, maxResolution=resolution))
       .filter(ContainsLProteinChain(exclusive=True))
)

## Set up criteria for water interactions

In [4]:
# Build the interaction filter from the input parameters defined above, so
# that editing those parameters is reflected here (they were previously
# duplicated as literals with the same values).
interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_normalized_b_factor_cutoff(bFactorCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
# Query side: water groups ("HOH"), restricted to the oxygen atom.
# NOTE(review): the leading True presumably selects "include" rather than
# "exclude" semantics -- confirm against the InteractionFilter API.
interactions_filter.set_query_groups(True, ["HOH"])
interactions_filter.set_query_elements(True, "O")    # Only use water oxygen
# Target side: only O, N and S atoms are considered as interaction partners.
interactions_filter.set_target_elements(True, ["O", "N", "S"])

## Exclude "uninteresting" ligands 

In [5]:
# Work on a copy: ExcludedLigandSets.ALL_GROUPS is a shared library constant,
# and adding "HOH" to it in place would leak into every later use of that
# constant in the same session.
prohibitedGroups = set(ExcludedLigandSets.ALL_GROUPS)
if not includeWaters:
    # Also exclude waters as interaction targets.
    prohibitedGroups.add("HOH")
interactions_filter.set_prohibited_target_groups(prohibitedGroups)

## Calculate interactions

In [6]:
# Run the interaction extraction on the filtered structures using the
# criteria collected in `interactions_filter`.
data = GroupInteractionExtractor().get_interactions(
    structures=pdb,
    interactionFilter=interactions_filter,
)

## Define Filter Bridging Water Interactions Function

In [7]:
def filter_bridging_water_interactions(data, maxInteractions):
    """Keep rows that have both an "LGO" and a "PRO" interaction partner.

    Two filters are applied in sequence: the first keeps rows where at least
    one of the columns ``type1`` .. ``type<maxInteractions>`` equals "LGO",
    the second keeps rows where at least one of them equals "PRO". Per the
    notebook text these correspond to an organic ligand and a protein
    interaction, respectively.

    Args:
        data: Object exposing ``filter`` that accepts column expressions
            (the notebook passes the extracted interaction dataset).
        maxInteractions: Number of ``type`` columns to inspect; must be
            2, 3 or 4.

    Returns:
        The filtered dataset.

    Raises:
        ValueError: If ``maxInteractions`` is not 2, 3 or 4.
    """
    supported_counts = (2, 3, 4)
    if maxInteractions not in supported_counts:
        raise ValueError("maxInteractions > 4 are not supported yet")

    # Normalise to the matching int so it can drive range() below.
    n_partners = supported_counts[supported_counts.index(maxInteractions)]
    type_columns = [col(f"type{i}") for i in range(1, n_partners + 1)]

    # Ligand filter first, then protein filter, as two separate passes.
    for label in ("LGO", "PRO"):
        condition = type_columns[0] == label
        for column in type_columns[1:]:
            condition = condition | (column == label)
        data = data.filter(condition)
    return data

## Keep only interactions with at least one organic ligand and one protein interaction

In [8]:
# Use the `maxInteractions` input parameter (instead of a repeated literal 4)
# so the set of inspected type columns follows the configured value. The
# result is cached because it is used twice below (count and toPandas).
data = filter_bridging_water_interactions(data, maxInteractions=maxInteractions).cache()

print(f"Hits(all): {data.count()}")
# Collect the filtered rows into a pandas DataFrame and show the first 50.
data = data.toPandas()
data.head(50)

Hits(all): 25050


Unnamed: 0,pdbId,polyChains,q3,q4,q5,q6,atom0,element0,group0,groupNum0,...,type4,chain4,nbFactor4,distance4,angle1-2,angle1-3,angle1-4,angle2-3,angle2-4,angle3-4
0,5VNX,2,0.917888,,,,O,O,HOH,802,...,,,,0.0,1.808243,2.442852,2.032056,0.0,,
1,5VNX,2,0.923362,,,,O,O,HOH,720,...,,,,0.0,1.99951,2.444744,1.837754,0.0,,
2,5VNX,1,0.981236,0.859927,,,O,O,HOH,832,...,WAT,A,0.107415,2.835226,2.201767,1.899949,1.955327,1.868961,1.98702,1.355446
3,5VNX,2,0.913232,,,,O,O,HOH,753,...,,,,0.0,2.480631,1.956003,1.844749,0.0,,
4,4X9D,1,,,,,O,O,HOH,233,...,,,,0.0,1.821897,,,,,
5,5H9N,1,0.997502,,,,O,O,HOH,341,...,,,,0.0,2.036827,2.099268,2.143945,0.0,,
6,5H9N,1,0.694474,,,,O,O,HOH,349,...,,,,0.0,2.397284,1.392314,2.267734,0.0,,
7,5LCA,1,0.920665,,,,O,O,HOH,621,...,,,,0.0,1.736914,2.296526,2.103182,0.0,,
8,5LCA,3,0.957595,0.839895,,,O,O,HOH,633,...,PRO,A,-0.677593,2.87464,1.989787,2.061019,1.597358,1.819018,2.430154,1.556288
9,5LCF,3,0.977555,0.80175,,,O,O,HOH,634,...,PRO,A,-0.090881,2.792826,2.465741,1.989851,1.971329,1.450782,1.566221,1.943373


## Terminate Spark

In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()