In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
from mmtfPyspark.webfilters import Pisces
import matplotlib.pyplot as plt
import pandas as pd
import py3Dmol
import time
# Create variables
APP_NAME = "MMTF_Spark"  # application name registered with Spark
path = "../../resources/mmtf_full_sample/"  # input location passed to mmtfReader.read_sequence_file
# Configure Spark with a local master ("local[*]").
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate reuses a context that is already running in this kernel, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
In [11]:
# Input parameters for the Pisces filter and the interaction search
sequenceIdentityCutoff = 30  # passed to Pisces(sequenceIdentity=...)
resolution = 2.5             # passed to Pisces(resolution=...)
minInteractions = 4          # passed to InteractionFilter.set_min_interactions
maxInteractions = 6          # passed to InteractionFilter.set_max_interactions
distanceCutoff = 3.0         # passed to InteractionFilter.set_distance_cutoff (presumably Angstrom - confirm)
# Chemical component codes of metals in different oxidation states,
# one line per element.
metals = {
    "V",
    "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}
In [12]:
# Read the MMTF sequence file at `path` and keep only the entries accepted by
# the Pisces filter configured with the sequence-identity and resolution cutoffs.
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .filter(Pisces(sequenceIdentity=sequenceIdentityCutoff, resolution=resolution))
)
In [13]:
# Build the filter that describes which interactions should be extracted,
# using the input parameters defined in the parameter cell.
interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
# Query groups are the metal chemical component codes.
# NOTE(review): the leading True presumably means "include these groups" -
# confirm against the InteractionFilter API.
interactions_filter.set_query_groups(True, metals)
# Exclude non-polar interactions.
# NOTE(review): the leading False presumably marks H, C and P as target
# elements to exclude - confirm against the InteractionFilter API.
interactions_filter.set_target_elements(False, ['H','C','P'])
In [14]:
# Extract the interactions selected by `interactions_filter` from the filtered
# structures and cache the result, since it is reused by the cells below.
interactions = (
    GroupInteractionExtractor()
    .get_interactions(pdb, interactions_filter)
    .cache()
)
print(f"Metal interactions: {interactions.count()}")
Metal interactions: 52
In [15]:
# Keep the identifier, the q4/q5/q6 columns, the index-0 site columns and, for
# each of the partner indices 1..6, its element/groupNum/chain/distance columns.
# The generated names follow the same order as spelling them out by hand.
interactions = interactions.select(
    "pdbId",
    "q4", "q5", "q6",
    "element0", "groupNum0", "chain0",
    *(
        f"{field}{index}"
        for index in range(1, 7)
        for field in ("element", "groupNum", "chain", "distance")
    )
).cache()
# Show some example interactions: one row per PDB entry.
ds = interactions.dropDuplicates(["pdbId"])
# Convert to a pandas DataFrame so the table fits in a Jupyter notebook cell.
df = ds.toPandas()
df.head()
Out[15]:
pdbId | q4 | q5 | q6 | element0 | groupNum0 | chain0 | element1 | groupNum1 | chain1 | ... | chain4 | distance4 | element5 | groupNum5 | chain5 | distance5 | element6 | groupNum6 | chain6 | distance6 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1A2P | 0.545893 | NaN | NaN | Zn | 112 | C | O | 139 | C | ... | C | 2.120991 | None | None | None | 0.000000 | None | None | None | 0.00000 |
1 | 1FN9 | 0.982773 | NaN | NaN | Zn | 1001 | A | S | 51 | A | ... | A | 2.219631 | None | None | None | 0.000000 | None | None | None | 0.00000 |
2 | 1BYF | 0.455097 | 0.396868 | NaN | Zn | 302 | A | O | 126 | A | ... | A | 2.788446 | O | 127 | A | 1.971687 | None | None | None | 0.00000 |
3 | 1AH7 | 0.877007 | 0.970126 | NaN | Zn | 246 | A | N | 118 | A | ... | A | 2.196544 | O | 249 | A | 2.191126 | None | None | None | 0.00000 |
4 | 1B9M | 0.581767 | 0.817296 | 0.478988 | Ni | 263 | A | O | 148 | A | ... | A | 2.238572 | N | 146 | A | 2.063766 | O | 465 | A | 2.13188 |
5 rows × 31 columns
In [16]:
# Count the extracted interactions per value of element0 (labelled as the
# metal below) and order the result by ascending count.
print("Unique interactions by metal: ")
unique_ds = (
    interactions
    .groupBy("element0")
    .count()
    .orderBy("count")
)
unique_ds.show()
Unique interactions by metal:
+--------+-----+
|element0|count|
+--------+-----+
| Ni| 1|
| Cu| 2|
| Fe| 9|
| Mn| 12|
| Zn| 28|
+--------+-----+
In [17]:
# Bring the per-metal counts to the driver as a pandas DataFrame for plotting.
unique_df = unique_ds.toPandas()
# Draw the bar chart with a title and axis labels so the figure can be read on
# its own. The legend is dropped because the y-axis label already names the
# single plotted series.
ax = unique_df.plot(x='element0', y='count', kind='bar', legend=False)
ax.set(
    title="Metal interactions by element",
    xlabel="Metal (element0)",
    ylabel="Number of interactions",
)
# Keep the Axes as the cell's final expression so the cell still returns it.
ax
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdb408783c8>
In [18]:
# Stop the SparkContext created in the first cell now that the analysis is done.
sc.stop()