Example Of Using PySpark To Find Metal Interactions

Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.structureViewer import group_interaction_viewer, metal_distance_widget
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import py3Dmol
import time

# Notebook-wide settings: Spark application name and the location of the
# MMTF sample data (relative to this notebook).
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark to run locally on all available cores.
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")

# Use getOrCreate rather than the constructor: constructing a second
# SparkContext in the same kernel raises "Cannot run multiple SparkContexts
# at once", which makes re-running this cell fail. getOrCreate returns the
# active context if one exists and otherwise creates one from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

Read PDB and create PISCES non-redundant set

In [2]:
# Load the MMTF sequence file(s) found under `path` using the Spark context.
all_structures = mmtfReader.read_sequence_file(path, sc)

# Restrict to the PISCES subset at the given thresholds
# (presumably 30 % sequence identity and 2.5 Å resolution — TODO confirm units).
pisces_filter = Pisces(sequenceIdentity=30, resolution=2.5)
pdb = all_structures.filter(pisces_filter)

Setup criteria for metal interactions

In [3]:
# Chemical component codes of metals in different oxidation states
metals = {
    "V",
    "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}

# Interaction criteria: 3.0 distance cutoff and between 4 and 6 interactions.
interactions_filter = InteractionFilter(
    distanceCutoff=3.0,
    minInteractions=4,
    maxInteractions=6,
)

# Query groups are the metals listed above
# (the leading True presumably means "include" — TODO confirm against the API).
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions (targets that are H, C or P)
interactions_filter.set_target_elements(False, ['H', 'C', 'P'])

Tabulate interactions in a Dataframe

In [4]:
# Run the extraction with the configured filter and cache the result,
# since it is reused by the cells that follow.
extractor = GroupInteractionExtractor()
interactions = extractor.get_interactions(pdb, interactions_filter).cache()

n_interactions = interactions.count()
print(f"Metal interactions: {n_interactions}")
Metal interactions: 1577

Select interacting atoms and orientational order parameters (q4-q6)

In [5]:
# Columns to keep, in display order: structure id, order parameters,
# the index-0 atom (the metal, per the element0 values shown downstream),
# then element/group/chain/distance for each of the atoms 1-6.
selected_columns = ["pdbId", "q4", "q5", "q6", "element0", "groupNum0", "chain0"]
for atom_idx in range(1, 7):
    selected_columns.extend([
        f"element{atom_idx}",
        f"groupNum{atom_idx}",
        f"chain{atom_idx}",
        f"distance{atom_idx}",
    ])

interactions = interactions.select(*selected_columns).cache()

# Show some example interactions: one row per PDB entry, converted to a
# pandas DataFrame so the table fits in a notebook cell.
ds = interactions.dropDuplicates(["pdbId"])
df = ds.toPandas()
df.head()
Out[5]:
pdbId q4 q5 q6 element0 groupNum0 chain0 element1 groupNum1 chain1 ... chain4 distance4 element5 groupNum5 chain5 distance5 element6 groupNum6 chain6 distance6
0 1M4L 0.810257 0.425694 NaN Zn 1308 A N 196 A ... A 2.245983 N 69 A 2.034816 None None None 0.0
1 1YIX 0.683122 NaN NaN Zn 601 A O 205 A ... A 2.179354 None None None 0.000000 None None None 0.0
2 2ETV 0.622655 NaN NaN Ni 1 A O 495 A ... A 2.234011 None None None 0.000000 None None None 0.0
3 3VK6 0.995256 NaN NaN Zn 103 A S 4 A ... A 2.352950 None None None 0.000000 None None None 0.0
4 1ZKP 0.922594 0.455911 NaN Zn 245 A N 59 A ... A 1.983932 O 155 A 2.543866 None None None 0.0

5 rows × 31 columns

Count Unique interactions by metal

In [6]:
print("Unique interactions by metal: ")

# Number of interaction rows per metal element, smallest count first.
per_metal_counts = interactions.groupBy(['element0']).count()
unique_ds = per_metal_counts.sort("count")
unique_ds.show()
Unique interactions by metal:
+--------+-----+
|element0|count|
+--------+-----+
|      Cu|   30|
|      Co|   42|
|      Ni|  122|
|      Fe|  147|
|      Mn|  205|
|      Zn| 1031|
+--------+-----+

Violin plot using Seaborn

In [7]:
# Transform the Spark Dataset into a pandas DataFrame for plotting
df = interactions.toPandas()

# Enlarge seaborn fonts
sns.set(font_scale=2)

# One row of three panels sharing the y axis
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(30, 5))

# Draw one violin plot per order parameter (q4, q5, q6), grouped by metal
for panel_idx, panel in enumerate(ax):
    violin = sns.violinplot(
        x="element0",
        y=f"q{panel_idx+4}",
        palette="muted",
        data=df,
        ax=panel,
    )
    violin.set(xlabel="Metals", ylabel="", title=f"q{panel_idx+4}")
/home/marshuang80/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:588: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  kde_data = remove_na(group_data)
/home/marshuang80/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:816: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  violin_data = remove_na(group_data)
../../../_images/_static_demos_applications_MetalInteractionsAdvanced_14_1.png

Make Violin plots for Metal-Elements distances

In [8]:
# Build one (Metal, Element, Distance) table for each of the atoms 1-6
df_sub = []
for atom_idx in range(1, 7):
    atom_columns = df[['element0', f'element{atom_idx}', f'distance{atom_idx}']]
    df_sub.append(
        atom_columns.rename(columns={
            'element0': 'Metal',
            f'element{atom_idx}': 'Element',
            f'distance{atom_idx}': 'Distance',
        })
    )

# Stack the six tables vertically and drop rows with missing values
# (NaN or None in any column)
df_concat = pd.concat(df_sub).dropna()

metal_distance_widget(df_concat)
Out[8]:
<function mmtfPyspark.structureViewer.metal_distance_widget.<locals>.metal_distance_violinplot(metal)>

Create subset based on Metal and q values

In [9]:
# Filter only zinc interactions
df_sub = df[df["element0"] == 'Zn']

# Sort by q4 (highest first) and drop rows where q4 is NaN
df_sub = df_sub.sort_values(["q4"], ascending=False).dropna(subset=['q4'])

# Remove interactions where q5 has a value, i.e. keep only rows whose q5 is NaN.
# NaN never compares equal to anything (including itself), so the previous
# `df_sub['q5'] != np.nan` was True for every row and filtered nothing;
# isna() is the correct missing-value test.
df_sub = df_sub[df_sub['q5'].isna()]

df_sub.head(10)
Out[9]:
pdbId q4 q5 q6 element0 groupNum0 chain0 element1 groupNum1 chain1 ... chain4 distance4 element5 groupNum5 chain5 distance5 element6 groupNum6 chain6 distance6
614 5GRQ 0.999569 NaN NaN Zn 212 A N 96 B ... A 2.064547 None None None 0.0 None None None 0.0
874 2YHO 0.998952 NaN NaN Zn 1002 G S 418 G ... G 2.338642 None None None 0.0 None None None 0.0
872 2YHO 0.998921 NaN NaN Zn 1002 E N 404 E ... E 2.310509 None None None 0.0 None None None 0.0
760 1ZY7 0.998688 NaN NaN Zn 802 B S 451 B ... B 2.058088 None None None 0.0 None None None 0.0
868 2YHO 0.998687 NaN NaN Zn 1002 A S 421 A ... A 2.301190 None None None 0.0 None None None 0.0
496 1OQJ 0.998562 NaN NaN Zn 183 A S 113 A ... A 2.371518 None None None 0.0 None None None 0.0
1429 3ZVS 0.998459 NaN NaN Zn 1162 B S 132 B ... B 2.310093 None None None 0.0 None None None 0.0
1269 3UFF 0.998175 NaN NaN Zn 3 B N 231 B ... B 2.315904 None None None 0.0 None None None 0.0
1316 1Q08 0.998095 NaN NaN Zn 401 A O 301 A ... A 2.426980 None None None 0.0 None None None 0.0
1209 3H0N 0.997777 NaN NaN Zn 201 A S 168 A ... A 2.297602 None None None 0.0 None None None 0.0

10 rows × 31 columns

Visualize Data

In [10]:
# Open the interactive viewer on the zinc subset built above.
# NOTE(review): the second argument 'q4' presumably names the column the viewer
# reports for each interaction — confirm against mmtfPyspark.structureViewer.
group_interaction_viewer(df_sub, 'q4')
Out[10]:
<function mmtfPyspark.structureViewer.group_interaction_viewer.<locals>.view3d(i=0)>

Terminate Spark

In [11]:
# Shut down the SparkContext created in the first cell.
sc.stop()