In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.structureViewer import group_interaction_viewer, metal_distance_widget
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import py3Dmol
import time
# Application name reported to Spark and location of the MMTF sample data
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark to run locally ("local[*]" master).
# getOrCreate() returns the already-running SparkContext when there is one,
# so re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". Note that an existing context
# is returned as-is; `conf` only takes effect when a new context is created.
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)
In [2]:
# Read the structures from the sequence file at `path`, then keep only the
# entries accepted by the Pisces web filter (sequenceIdentity=30, resolution=2.5)
pisces_filter = Pisces(sequenceIdentity=30, resolution=2.5)
pdb = mmtfReader.read_sequence_file(path, sc).filter(pisces_filter)
In [3]:
# Chemical component codes of metals in different oxidation states
metals = {
    "V", "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}
# Target elements to leave out, so that non-polar interactions are excluded
excluded_elements = ['H', 'C', 'P']

# Interactions within a distance cutoff of 3.0, with 4 to 6 interactions per group.
# NOTE(review): the leading True/False flag presumably means include/exclude —
# confirm against the InteractionFilter documentation.
interactions_filter = InteractionFilter(distanceCutoff=3.0, minInteractions=4, maxInteractions=6)
interactions_filter.set_query_groups(True, metals)
interactions_filter.set_target_elements(False, excluded_elements)
In [4]:
# Extract the interactions that satisfy `interactions_filter` and cache the
# resulting dataset, since the following cells reuse it several times
extractor = GroupInteractionExtractor()
interactions = extractor.get_interactions(pdb, interactions_filter).cache()

# Report how many interactions were found
print(f"Metal interactions: {interactions.count()}")
Metal interactions: 1577
In [5]:
# Columns to keep: the structure id, the q4/q5/q6 values, the metal
# (index 0) and, for each of the up to six interacting partners (1-6),
# its element, group number, chain and distance
selected_columns = ["pdbId", "q4", "q5", "q6", "element0", "groupNum0", "chain0"]
for partner in range(1, 7):
    selected_columns.extend(
        f"{field}{partner}" for field in ("element", "groupNum", "chain", "distance")
    )
interactions = interactions.select(*selected_columns).cache()

# Show some example interactions, one row per PDB entry
ds = interactions.dropDuplicates(["pdbId"])
# Convert to a pandas DataFrame so the table fits in a Jupyter notebook cell
df = ds.toPandas()
df.head()
Out[5]:
pdbId | q4 | q5 | q6 | element0 | groupNum0 | chain0 | element1 | groupNum1 | chain1 | ... | chain4 | distance4 | element5 | groupNum5 | chain5 | distance5 | element6 | groupNum6 | chain6 | distance6 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1M4L | 0.810257 | 0.425694 | NaN | Zn | 1308 | A | N | 196 | A | ... | A | 2.245983 | N | 69 | A | 2.034816 | None | None | None | 0.0 |
1 | 1YIX | 0.683122 | NaN | NaN | Zn | 601 | A | O | 205 | A | ... | A | 2.179354 | None | None | None | 0.000000 | None | None | None | 0.0 |
2 | 2ETV | 0.622655 | NaN | NaN | Ni | 1 | A | O | 495 | A | ... | A | 2.234011 | None | None | None | 0.000000 | None | None | None | 0.0 |
3 | 3VK6 | 0.995256 | NaN | NaN | Zn | 103 | A | S | 4 | A | ... | A | 2.352950 | None | None | None | 0.000000 | None | None | None | 0.0 |
4 | 1ZKP | 0.922594 | 0.455911 | NaN | Zn | 245 | A | N | 59 | A | ... | A | 1.983932 | O | 155 | A | 2.543866 | None | None | None | 0.0 |
5 rows × 31 columns
In [6]:
# Count the interactions for each metal element (column "element0") and
# list the metals ordered by that count
print("Unique interactions by metal: ")
counts_by_metal = interactions.groupBy("element0").count()
unique_ds = counts_by_metal.sort("count")
unique_ds.show()
Unique interactions by metal:
+--------+-----+
|element0|count|
+--------+-----+
| Cu| 30|
| Co| 42|
| Ni| 122|
| Fe| 147|
| Mn| 205|
| Zn| 1031|
+--------+-----+
In [7]:
# Transform the Dataset into a pandas DataFrame
df = interactions.toPandas()

# Set fonts
sns.set(font_scale=2)

# One row of three subplots sharing the y axis: one violin plot per
# q column (q4, q5, q6), grouped by metal element
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(30, 5))
for q_index, panel in enumerate(ax, start=4):
    q_column = f"q{q_index}"
    subplot = sns.violinplot(x="element0", y=q_column, palette="muted", data=df, ax=panel)
    subplot.set(xlabel="Metals", ylabel="", title=q_column)
/home/marshuang80/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:588: FutureWarning: remove_na is deprecated and is a private function. Do not use.
kde_data = remove_na(group_data)
/home/marshuang80/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:816: FutureWarning: remove_na is deprecated and is a private function. Do not use.
violin_data = remove_na(group_data)
In [8]:
def _metal_element_distance(frame, i):
    """Return the metal, the i-th interacting element and their distance
    from `frame`, renamed to the common columns Metal / Element / Distance."""
    renamed = {'element0': 'Metal', f'element{i}': 'Element', f'distance{i}': 'Distance'}
    return frame[list(renamed)].rename(columns=renamed)


# One DataFrame subset per interacting element 1-6
df_sub = [_metal_element_distance(df, i) for i in range(1, 7)]
# Stack the subsets vertically and drop the rows that contain missing values
df_concat = pd.concat(df_sub).dropna()
metal_distance_widget(df_concat)
Out[8]:
<function mmtfPyspark.structureViewer.metal_distance_widget.<locals>.metal_distance_violinplot(metal)>
In [9]:
# Keep only the zinc interactions
df_sub = df[df["element0"] == 'Zn']
# Sort by q4, highest first, and drop the rows that have no q4 value
df_sub = df_sub.sort_values(["q4"], ascending=False).dropna(subset=['q4'])
# Remove the interactions where q5 has a value, i.e. keep the rows whose q5 is NaN.
# NaN never compares equal to anything (not even itself), so the previous
# `df_sub['q5'] != np.nan` was True for every row and filtered nothing;
# isna() performs the intended missing-value test.
df_sub = df_sub[df_sub['q5'].isna()]
df_sub.head(10)
Out[9]:
pdbId | q4 | q5 | q6 | element0 | groupNum0 | chain0 | element1 | groupNum1 | chain1 | ... | chain4 | distance4 | element5 | groupNum5 | chain5 | distance5 | element6 | groupNum6 | chain6 | distance6 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
614 | 5GRQ | 0.999569 | NaN | NaN | Zn | 212 | A | N | 96 | B | ... | A | 2.064547 | None | None | None | 0.0 | None | None | None | 0.0 |
874 | 2YHO | 0.998952 | NaN | NaN | Zn | 1002 | G | S | 418 | G | ... | G | 2.338642 | None | None | None | 0.0 | None | None | None | 0.0 |
872 | 2YHO | 0.998921 | NaN | NaN | Zn | 1002 | E | N | 404 | E | ... | E | 2.310509 | None | None | None | 0.0 | None | None | None | 0.0 |
760 | 1ZY7 | 0.998688 | NaN | NaN | Zn | 802 | B | S | 451 | B | ... | B | 2.058088 | None | None | None | 0.0 | None | None | None | 0.0 |
868 | 2YHO | 0.998687 | NaN | NaN | Zn | 1002 | A | S | 421 | A | ... | A | 2.301190 | None | None | None | 0.0 | None | None | None | 0.0 |
496 | 1OQJ | 0.998562 | NaN | NaN | Zn | 183 | A | S | 113 | A | ... | A | 2.371518 | None | None | None | 0.0 | None | None | None | 0.0 |
1429 | 3ZVS | 0.998459 | NaN | NaN | Zn | 1162 | B | S | 132 | B | ... | B | 2.310093 | None | None | None | 0.0 | None | None | None | 0.0 |
1269 | 3UFF | 0.998175 | NaN | NaN | Zn | 3 | B | N | 231 | B | ... | B | 2.315904 | None | None | None | 0.0 | None | None | None | 0.0 |
1316 | 1Q08 | 0.998095 | NaN | NaN | Zn | 401 | A | O | 301 | A | ... | A | 2.426980 | None | None | None | 0.0 | None | None | None | 0.0 |
1209 | 3H0N | 0.997777 | NaN | NaN | Zn | 201 | A | S | 168 | A | ... | A | 2.297602 | None | None | None | 0.0 | None | None | None | 0.0 |
10 rows × 31 columns
In [10]:
# Open the group interaction viewer on the filtered zinc interactions.
# NOTE(review): 'q4' is presumably the column the viewer reports for each
# entry — confirm against the mmtfPyspark structureViewer documentation.
group_interaction_viewer(df_sub, 'q4')
Out[10]:
<function mmtfPyspark.structureViewer.group_interaction_viewer.<locals>.view3d(i=0)>
In [11]:
# Shut down the SparkContext now that the analysis is finished
sc.stop()