Example of using PySpark to find metal interactions

Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
from mmtfPyspark.webfilters import Pisces
import matplotlib.pyplot as plt
import pandas as pd
import py3Dmol
import time

# Create variables
APP_NAME = "MMTF_Spark"  # application name registered with Spark
path = "../../resources/mmtf_full_sample/"  # sample MMTF data, relative to this notebook

# Configure Spark to run locally, using all available cores ("local[*]")
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")

# getOrCreate() returns the already-running SparkContext when there is one, so
# re-running this cell in a live kernel no longer fails with "Cannot run multiple
# SparkContexts at once". Note that `conf` is only applied when a new context
# is actually created.
sc = SparkContext.getOrCreate(conf=conf)

Define Variables

In [11]:
# --- input parameters ---------------------------------------------------
sequenceIdentityCutoff = 30   # passed to the Pisces filter as sequenceIdentity
resolution = 2.5              # passed to the Pisces filter as resolution
minInteractions = 4           # lower bound given to the interaction filter
maxInteractions = 6           # upper bound given to the interaction filter
distanceCutoff = 3.0          # distance cutoff given to the interaction filter

# Chemical component codes of metals in different oxidation states,
# one row per element.
metals = {
    "V",
    "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}

Read PDB and create PISCES non-redundant set

In [12]:
# Read the MMTF sequence files under `path`, then keep only the entries that
# pass the PISCES filter for the chosen sequence-identity and resolution cutoffs.
pisces_filter = Pisces(sequenceIdentity=sequenceIdentityCutoff, resolution=resolution)
pdb = mmtfReader.read_sequence_file(path, sc).filter(pisces_filter)

Set up criteria for metal interactions

In [13]:
# Build the filter that describes which interactions should be extracted.
interactions_filter = InteractionFilter()
# Distance cutoff for an interaction (units not shown here; presumably Angstroms -- TODO confirm).
interactions_filter.set_distance_cutoff(distanceCutoff)
# Bounds on the number of interactions per site (presumably inclusive -- verify against InteractionFilter).
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
# Query groups are the metal chemical component codes; the leading True presumably
# means "include these groups" -- confirm against the InteractionFilter documentation.
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions: the leading False presumably marks H, C and P as
# excluded target elements -- confirm against the InteractionFilter documentation.
interactions_filter.set_target_elements(False, ['H','C','P'])

Tabulate interactions in a DataFrame

In [14]:
# Extract the interactions matching the filter and cache the result, since it
# is counted here and reused by the following cells.
extractor = GroupInteractionExtractor()
interactions = extractor.get_interactions(pdb, interactions_filter).cache()

print(f"Metal interactions: {interactions.count()}")
Metal interactions: 52

Select interacting atoms and orientational order parameters (q4-q6)

In [15]:
# Columns describing the structure, the order parameters and atom 0.
base_columns = ["pdbId", "q4", "q5", "q6", "element0", "groupNum0", "chain0"]

# For each of the interacting atoms 1..6: element, group number, chain, distance.
neighbor_columns = [
    f"{field}{index}"
    for index in range(1, 7)
    for field in ("element", "groupNum", "chain", "distance")
]

interactions = interactions.select(*base_columns, *neighbor_columns).cache()

# Show some example interactions, one row per PDB entry.
ds = interactions.dropDuplicates(["pdbId"])
# Convert to a pandas DataFrame so the table fits in a Jupyter notebook cell.
df = ds.toPandas()
df.head()
Out[15]:
pdbId q4 q5 q6 element0 groupNum0 chain0 element1 groupNum1 chain1 ... chain4 distance4 element5 groupNum5 chain5 distance5 element6 groupNum6 chain6 distance6
0 1A2P 0.545893 NaN NaN Zn 112 C O 139 C ... C 2.120991 None None None 0.000000 None None None 0.00000
1 1FN9 0.982773 NaN NaN Zn 1001 A S 51 A ... A 2.219631 None None None 0.000000 None None None 0.00000
2 1BYF 0.455097 0.396868 NaN Zn 302 A O 126 A ... A 2.788446 O 127 A 1.971687 None None None 0.00000
3 1AH7 0.877007 0.970126 NaN Zn 246 A N 118 A ... A 2.196544 O 249 A 2.191126 None None None 0.00000
4 1B9M 0.581767 0.817296 0.478988 Ni 263 A O 148 A ... A 2.238572 N 146 A 2.063766 O 465 A 2.13188

5 rows × 31 columns

Count unique interactions by metal

In [16]:
print("Unique interactions by metal: ")

# Count the rows per metal element (element0) and order by ascending count.
unique_ds = (
    interactions
    .groupBy("element0")
    .count()
    .sort("count")
)
unique_ds.show()
Unique interactions by metal:
+--------+-----+
|element0|count|
+--------+-----+
|      Ni|    1|
|      Cu|    2|
|      Fe|    9|
|      Mn|   12|
|      Zn|   28|
+--------+-----+

Plot a bar chart of the unique interaction counts by metal

In [17]:
# Bring the per-metal counts into pandas and draw them as a bar chart
# (one bar per metal, bar height = interaction count).
unique_df = unique_ds.toPandas()
unique_df.plot.bar(x='element0', y='count')
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdb408783c8>
../../../_images/_static_demos_interactions_MetalInteractionsExample_16_1.png

Terminate Spark

In [18]:
# Shut down the SparkContext `sc` created in the first cell.
sc.stop()