Example Of Using PySpark To Find Metal Interactions

Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.structureViewer import group_interaction_viewer, metal_distance_widget
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import py3Dmol
import time

# Create variables
# APP_NAME was referenced below but never defined, which raised NameError on a
# fresh kernel. The value is only a label for the Spark application.
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark: run locally using all available cores ("local[*]").
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate reuses an already-running context, so re-running this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

Read PDB and create PISCES non-redundant set

In [2]:
# Load the structures found under `path` using the Spark context.
structures = mmtfReader.read_sequence_file(path, sc)

# Keep only the PISCES non-redundant subset
# (sequence identity cutoff 30, resolution cutoff 2.5).
pdb = structures.filter(
    Pisces(sequenceIdentity=30, resolution=2.5)
)

Setup criteria for metal interactions

In [3]:
# Chemical component codes of metals in different oxidation states
metals = {
    "V",
    "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}

# Elements treated as non-polar; they are excluded from the target atoms below.
non_polar_elements = ['H', 'C', 'P']

# Interaction criteria: distance cutoff of 3.0 and between 4 and 6 interactions.
interactions_filter = InteractionFilter(
    distanceCutoff=3.0,
    minInteractions=4,
    maxInteractions=6,
)

# Query groups: include (True) the metal component codes listed above.
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions (False = do not include these elements).
interactions_filter.set_target_elements(False, non_polar_elements)

Tabulate interactions in a DataFrame

In [4]:
# Extract the interactions matching the filter and cache the result, since it
# is reused by the cells that follow.
extractor = GroupInteractionExtractor()
interactions = extractor.get_interactions(pdb, interactions_filter).cache()

# count() is the first action on the cached dataset.
n_interactions = interactions.count()
print(f"Metal interactions: {n_interactions}")
Metal interactions: 1577

Select interacting atoms and orientational order parameters (q4-q6)

In [5]:
# Columns to keep: structure id, the q4-q6 order parameters, the atom at
# index 0 (no distance column), and the atoms at indices 1-6 with their
# distances.
# The original select(...) call was never closed and omitted the index-6
# columns, which the distance violin-plot cell needs (element6 / distance6).
selected_columns = ["pdbId", "q4", "q5", "q6", "element0", "groupNum0", "chain0"]
for i in range(1, 7):
    selected_columns += [f"element{i}", f"groupNum{i}", f"chain{i}", f"distance{i}"]

interactions = interactions.select(*selected_columns)

# show some example interactions (one row per PDB entry)
ds = interactions.dropDuplicates(["pdbId"])
df = ds.toPandas() # convert to pandas dataframe to fit table in jupyter notebook cell
df.head()
pdbId q4 q5 q6 element0 groupNum0 chain0 element1 groupNum1 chain1 ... chain4 distance4 element5 groupNum5 chain5 distance5 element6 groupNum6 chain6 distance6
0 1M4L 0.810257 0.425694 NaN Zn 1308 A N 196 A ... A 2.245983 N 69 A 2.034816 None None None 0.0
1 1YIX 0.683122 NaN NaN Zn 601 A O 205 A ... A 2.179354 None None None 0.000000 None None None 0.0
2 2ETV 0.622655 NaN NaN Ni 1 A O 495 A ... A 2.234011 None None None 0.000000 None None None 0.0
3 3VK6 0.995256 NaN NaN Zn 103 A S 4 A ... A 2.352950 None None None 0.000000 None None None 0.0
4 1ZKP 0.922594 0.455911 NaN Zn 245 A N 59 A ... A 1.983932 O 155 A 2.543866 None None None 0.0

5 rows × 31 columns

Count unique interactions by metal

In [6]:
# Count interactions per metal element (element0), sorted by ascending count.
print("Unique interactions by metal: ")
unique_ds = interactions.groupBy(['element0']).count().sort("count")

# The counts were computed but never displayed; show() prints the table.
unique_ds.show()
Unique interactions by metal:
|      Cu|   30|
|      Co|   42|
|      Ni|  122|
|      Fe|  147|
|      Mn|  205|
|      Zn| 1031|

Violin plot using Seaborn

In [7]:
# Transform the Spark Dataset into a pandas DataFrame for plotting
df = interactions.toPandas()

# One panel per order parameter (q4, q5, q6), all sharing the same y axis
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(30, 5))

# Draw a violin plot of each q value, grouped by metal element
for panel, order in zip(ax, range(4, 7)):
    violin = sns.violinplot(
        x="element0",
        y=f"q{order}",
        palette="muted",
        data=df,
        ax=panel,
    )
    violin.set(xlabel="Metals", ylabel="", title=f"q{order}")
/home/marshuang80/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:588: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  kde_data = remove_na(group_data)
/home/marshuang80/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:816: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  violin_data = remove_na(group_data)

Make violin plots of metal–element distances

In [8]:
# Create dataframe subsets for elements 1-6: one (Metal, Element, Distance)
# frame per interaction index, with uniform column names so they can be stacked.
df_sub = [
    df[['element0', f'element{i}', f'distance{i}']]
    .rename(columns={'element0': 'Metal', f'element{i}': 'Element', f'distance{i}': 'Distance'})
    for i in range(1, 7)
]

# Vertically concatenate the subsets and drop rows with missing values.
# Chained dropna() replaces dropna(inplace=True) so the cell is a single,
# re-runnable expression.
df_concat = pd.concat(df_sub).dropna()

# Display the distance widget. This call was missing although the cell's
# recorded output is the widget's return value.
# NOTE(review): assumes metal_distance_widget takes the concatenated frame as
# its only argument; confirm against mmtfPyspark.structureViewer.
metal_distance_widget(df_concat)

<function mmtfPyspark.structureViewer.metal_distance_widget.<locals>.metal_distance_violinplot(metal)>

Create subset based on Metal and q values

In [9]:
# Filter only zinc interactions
df_sub = df[df["element0"] == 'Zn']

# Sort by q4 (highest first) and drop rows where q4 is NaN
df_sub = df_sub.sort_values(["q4"], ascending=False).dropna(subset=['q4'])

# Remove interactions where q5 has a value (keep rows whose q5 is NaN).
# The original test `!= np.nan` is always True because NaN never compares
# equal to anything, so it filtered nothing; isna() is the correct check.
df_sub = df_sub[df_sub['q5'].isna()]

# Show the ten highest-q4 zinc interactions
df_sub.head(10)

pdbId q4 q5 q6 element0 groupNum0 chain0 element1 groupNum1 chain1 ... chain4 distance4 element5 groupNum5 chain5 distance5 element6 groupNum6 chain6 distance6
614 5GRQ 0.999569 NaN NaN Zn 212 A N 96 B ... A 2.064547 None None None 0.0 None None None 0.0
874 2YHO 0.998952 NaN NaN Zn 1002 G S 418 G ... G 2.338642 None None None 0.0 None None None 0.0
872 2YHO 0.998921 NaN NaN Zn 1002 E N 404 E ... E 2.310509 None None None 0.0 None None None 0.0
760 1ZY7 0.998688 NaN NaN Zn 802 B S 451 B ... B 2.058088 None None None 0.0 None None None 0.0
868 2YHO 0.998687 NaN NaN Zn 1002 A S 421 A ... A 2.301190 None None None 0.0 None None None 0.0
496 1OQJ 0.998562 NaN NaN Zn 183 A S 113 A ... A 2.371518 None None None 0.0 None None None 0.0
1429 3ZVS 0.998459 NaN NaN Zn 1162 B S 132 B ... B 2.310093 None None None 0.0 None None None 0.0
1269 3UFF 0.998175 NaN NaN Zn 3 B N 231 B ... B 2.315904 None None None 0.0 None None None 0.0
1316 1Q08 0.998095 NaN NaN Zn 401 A O 301 A ... A 2.426980 None None None 0.0 None None None 0.0
1209 3H0N 0.997777 NaN NaN Zn 201 A S 168 A ... A 2.297602 None None None 0.0 None None None 0.0

10 rows × 31 columns

Visualize Data

In [10]:
# Pass the zinc subset to mmtfPyspark's group_interaction_viewer.
# NOTE(review): 'q4' is presumably the column the viewer reports or orders by;
# confirm against the viewer's documentation.
group_interaction_viewer(df_sub, 'q4')
<function mmtfPyspark.structureViewer.group_interaction_viewer.<locals>.view3d(i=0)>

Terminate Spark

In [11]: