Example of using mmtfPyspark to find water interactions

Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import col
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor, ExcludedLigandSets
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
import matplotlib.pyplot as plt
import pandas as pd
import py3Dmol
import time


# Application name reported to Spark, and the location handed to
# mmtfReader.read_sequence_file further down.
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark to run locally.
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate reuses an already-running context, so re-executing this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

Define Variables

In [2]:
# Input parameters -- everything a reader might want to tune lives here.

# Structure selection
resolution = 2.0          # resolution cutoff (presumably in Angstrom -- confirm)

# Interaction geometry and data quality
distanceCutoff = 3.0      # distance cutoff (presumably in Angstrom -- confirm)
bFactorCutoff = 1.645     # cutoff on the normalized B-factor

# Allowed number of interactions per query water
minInteractions, maxInteractions = 2, 4

# When False, "HOH" is added to the prohibited target groups below.
includeWaters = True

Read PDB structures, filter by resolution, and keep only protein structures

In [3]:
# Load the structures found under `path`.
pdb = mmtfReader.read_sequence_file(path, sc)

# Apply the `resolution` input parameter instead of repeating the literal, so
# that changing the parameter cell actually changes this filter.
pdb = (
    pdb.filter(Resolution(minResolution=0.0, maxResolution=resolution))
       .filter(ContainsLProteinChain(exclusive=True))
)

Set up criteria for water interactions

In [4]:
# Build the interaction filter from the input parameters defined above rather
# than from repeated literals, so the parameter cell is the single source of
# truth for these cutoffs.
interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_normalized_b_factor_cutoff(bFactorCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)

# Query: water molecules, represented by their oxygen atom.
interactions_filter.set_query_groups(True, ["HOH"])
interactions_filter.set_query_elements(True, "O")    # Only use water oxygen

# Targets: oxygen, nitrogen and sulfur atoms.
interactions_filter.set_target_elements(True, ["O", "N", "S"])

Exclude “uninteresting” ligands

In [5]:
# Work on a copy: adding "HOH" directly to ExcludedLigandSets.ALL_GROUPS would
# modify the library's shared constant for the rest of the kernel session.
prohibitedGroups = set(ExcludedLigandSets.ALL_GROUPS)
if not includeWaters:
    # Treat water itself as a prohibited target group.
    prohibitedGroups.add("HOH")
interactions_filter.set_prohibited_target_groups(prohibitedGroups)

Calculate interactions

In [6]:
data = GroupInteractionExtractor().get_interactions(structures=pdb, interactionFilter=interactions_filter)

Define Filter Bridging Water Interactions Function

In [7]:
def filter_bridging_water_interactions(data, maxInteractions):
    """Keep rows that have at least one "LGO" and at least one "PRO" partner.

    A row passes when at least one of the columns ``type1`` ..
    ``type<maxInteractions>`` equals ``"LGO"`` and at least one of those same
    columns equals ``"PRO"``.

    Parameters
    ----------
    data
        Object exposing ``filter(condition)`` (here a Spark DataFrame) with
        columns ``type1`` .. ``type<maxInteractions>``.
    maxInteractions : int
        Number of ``type<i>`` columns to inspect; must be 2, 3 or 4.

    Returns
    -------
    The result of applying both filters to ``data``.

    Raises
    ------
    ValueError
        If ``maxInteractions`` is not 2, 3 or 4.
    """
    if maxInteractions not in (2, 3, 4):
        raise ValueError(
            f"maxInteractions must be 2, 3 or 4, got {maxInteractions!r} "
            "(maxInteractions > 4 are not supported yet)"
        )

    def any_type_is(label):
        # OR together (type1 == label) | (type2 == label) | ... in column order.
        condition = col("type1") == label
        for index in range(2, int(maxInteractions) + 1):
            condition = condition | (col(f"type{index}") == label)
        return condition

    # Two successive filters: the row needs an LGO partner AND a PRO partner.
    return data.filter(any_type_is("LGO")).filter(any_type_is("PRO"))

Keep only interactions with at least one organic ligand and one protein interaction

In [8]:
# Pass the configured maxInteractions (not a repeated literal) so this call
# stays in sync with the parameter cell. The result gets its own name so that
# `data` remains the Spark DataFrame and this cell can be re-run safely.
bridging_interactions = filter_bridging_water_interactions(
    data, maxInteractions=maxInteractions
).cache()

print(f"Hits(all): {bridging_interactions.count()}")

# Convert to pandas for display; only the first 50 rows are shown.
bridging_interactions_df = bridging_interactions.toPandas()
bridging_interactions_df.head(50)
Hits(all): 25050
Out[8]:
pdbId polyChains q3 q4 q5 q6 atom0 element0 group0 groupNum0 ... type4 chain4 nbFactor4 distance4 angle1-2 angle1-3 angle1-4 angle2-3 angle2-4 angle3-4
0 5VNX 2 0.917888 NaN None None O O HOH 802 ... None None NaN 0.000000 1.808243 2.442852 2.032056 0.000000 NaN NaN
1 5VNX 2 0.923362 NaN None None O O HOH 720 ... None None NaN 0.000000 1.999510 2.444744 1.837754 0.000000 NaN NaN
2 5VNX 1 0.981236 0.859927 None None O O HOH 832 ... WAT A 0.107415 2.835226 2.201767 1.899949 1.955327 1.868961 1.987020 1.355446
3 5VNX 2 0.913232 NaN None None O O HOH 753 ... None None NaN 0.000000 2.480631 1.956003 1.844749 0.000000 NaN NaN
4 4X9D 1 NaN NaN None None O O HOH 233 ... None None NaN 0.000000 1.821897 NaN NaN NaN NaN NaN
5 5H9N 1 0.997502 NaN None None O O HOH 341 ... None None NaN 0.000000 2.036827 2.099268 2.143945 0.000000 NaN NaN
6 5H9N 1 0.694474 NaN None None O O HOH 349 ... None None NaN 0.000000 2.397284 1.392314 2.267734 0.000000 NaN NaN
7 5LCA 1 0.920665 NaN None None O O HOH 621 ... None None NaN 0.000000 1.736914 2.296526 2.103182 0.000000 NaN NaN
8 5LCA 3 0.957595 0.839895 None None O O HOH 633 ... PRO A -0.677593 2.874640 1.989787 2.061019 1.597358 1.819018 2.430154 1.556288
9 5LCF 3 0.977555 0.801750 None None O O HOH 634 ... PRO A -0.090881 2.792826 2.465741 1.989851 1.971329 1.450782 1.566221 1.943373
10 5LCF 1 0.916752 NaN None None O O HOH 665 ... None None NaN 0.000000 2.078905 1.759737 2.373434 0.000000 NaN NaN
11 5LCH 3 0.971902 0.788855 None None O O HOH 648 ... LGO A 0.079471 2.683238 2.009260 1.608932 1.911190 2.478863 1.954119 1.394372
12 5LE1 3 0.933710 0.769073 None None O O HOH 649 ... PRO A 0.597268 2.743888 2.396844 2.197642 1.976304 1.412055 1.552328 1.759869
13 5MMN 3 0.805579 0.840680 None None O O HOH 424 ... PRO A -0.191379 2.660059 1.835154 1.547962 2.337603 1.530142 2.067027 1.857040
14 5MMN 2 0.857262 NaN None None O O HOH 473 ... None None NaN 0.000000 1.892428 1.890819 1.644718 0.000000 NaN NaN
15 5MMO 3 0.794642 0.836989 None None O O HOH 417 ... PRO A -0.385945 2.628243 1.841165 1.538125 2.314125 1.529892 2.101835 1.814811
16 5MMO 2 0.817830 NaN None None O O HOH 462 ... None None NaN 0.000000 1.919041 1.823202 1.590212 0.000000 NaN NaN
17 5MMO 1 NaN NaN None None O O HOH 472 ... None None NaN 0.000000 2.270356 NaN NaN NaN NaN NaN
18 5N49 1 0.923594 NaN None None O O HOH 810 ... None None NaN 0.000000 2.150182 1.716559 2.009910 0.000000 NaN NaN
19 5UEW 1 0.911381 NaN None None O O HOH 611 ... None None NaN 0.000000 1.717024 2.102965 1.905225 0.000000 NaN NaN
20 5UEW 1 0.904136 NaN None None O O HOH 602 ... None None NaN 0.000000 1.691051 1.944444 2.179145 0.000000 NaN NaN
21 5UEZ 1 0.939357 NaN None None O O HOH 607 ... None None NaN 0.000000 1.965640 1.787436 2.223777 0.000000 NaN NaN
22 5UEZ 1 0.895376 NaN None None O O HOH 642 ... None None NaN 0.000000 1.716711 2.391761 2.165344 0.000000 NaN NaN
23 3U96 2 0.886328 NaN None None O O HOH 481 ... None None NaN 0.000000 2.144969 1.817851 1.708172 0.000000 NaN NaN
24 3U98 1 NaN NaN None None O O HOH 1043 ... None None NaN 0.000000 2.333080 NaN NaN NaN NaN NaN
25 3U9H 1 0.850342 0.863505 None None O O HOH 226 ... PRO A 0.032156 2.768258 2.431111 1.714931 1.655102 1.811743 2.035461 1.652409
26 3U9N 1 0.966877 NaN None None O O HOH 435 ... None None NaN 0.000000 2.094525 1.839143 2.156703 0.000000 NaN NaN
27 3U9N 1 NaN NaN None None O O HOH 498 ... None None NaN 0.000000 1.831271 NaN NaN NaN NaN NaN
28 3UAG 3 0.803613 0.563889 None None O O HOH 501 ... PRO A -0.554693 2.793061 1.127710 2.601418 1.737904 1.548108 2.379534 1.935428
29 3UAG 2 0.860623 NaN None None O O HOH 502 ... None None NaN 0.000000 2.273910 1.625290 1.923401 0.000000 NaN NaN
30 3UAG 2 0.123106 NaN None None O O HOH 600 ... None None NaN 0.000000 1.360162 2.408705 1.063054 0.000000 NaN NaN
31 3UAG 1 0.203524 0.364127 None None O O HOH 837 ... LGO A -0.562174 2.777843 1.462554 2.436885 2.061869 2.259292 2.534806 0.862076
32 3UAL 1 NaN NaN None None O O HOH 258 ... None None NaN 0.000000 2.087371 NaN NaN NaN NaN NaN
33 3UAL 1 0.888170 NaN None None O O HOH 293 ... None None NaN 0.000000 2.161019 1.666536 2.305244 0.000000 NaN NaN
34 3UAZ 1 0.921715 NaN None None O O HOH 262 ... None None NaN 0.000000 2.290687 2.221030 1.754788 0.000000 NaN NaN
35 3UB6 1 0.899168 0.672460 None None O O HOH 407 ... PRO A -0.475994 2.863447 1.311276 1.740830 2.432327 2.251388 1.456739 2.101638
36 3UB6 1 0.901412 0.670772 None None O O HOH 403 ... PRO B -0.653865 2.873985 1.741537 1.311803 2.421566 2.262811 2.109684 1.452782
37 3UB7 1 0.896540 0.692493 None None O O HOH 407 ... PRO A -0.405171 2.855674 1.329261 1.740892 2.445819 2.226469 1.484874 2.083471
38 3UB7 1 0.897268 0.685547 None None O O HOH 407 ... PRO B -0.610487 2.902715 1.739871 1.330280 2.440397 2.233240 2.092597 1.465206
39 3UBD 1 NaN NaN None None O O HOH 530 ... None None NaN 0.000000 2.258912 NaN NaN NaN NaN NaN
40 3UBW 1 0.713606 0.820527 None None O O HOH 307 ... WAT A -0.608621 2.584318 2.006831 1.449188 1.849012 2.064086 1.650956 2.424514
41 3UBW 1 NaN NaN None None O O HOH 365 ... None None NaN 0.000000 2.160672 NaN NaN NaN NaN NaN
42 3UD5 1 NaN NaN None None O O HOH 242 ... None None NaN 0.000000 1.420573 NaN NaN NaN NaN NaN
43 3UDE 1 NaN NaN None None O O HOH 221 ... None None NaN 0.000000 1.464474 NaN NaN NaN NaN NaN
44 3UDH 1 NaN NaN None None O O HOH 843 ... None None NaN 0.000000 2.213616 NaN NaN NaN NaN NaN
45 3UDP 1 NaN NaN None None O O HOH 821 ... None None NaN 0.000000 2.270605 NaN NaN NaN NaN NaN
46 3UDR 1 NaN NaN None None O O HOH 684 ... None None NaN 0.000000 2.133965 NaN NaN NaN NaN NaN
47 3UDV 1 NaN NaN None None O O HOH 263 ... None None NaN 0.000000 2.302813 NaN NaN NaN NaN NaN
48 3UDY 1 NaN NaN None None O O HOH 654 ... None None NaN 0.000000 1.924380 NaN NaN NaN NaN NaN
49 3UEQ 2 0.860294 NaN None None O O HOH 808 ... None None NaN 0.000000 1.664870 2.468495 2.081199 0.000000 NaN NaN

50 rows × 51 columns

Terminate Spark

In [ ]:
# Stop the SparkContext created in the first code cell.
sc.stop()