{
"cells": [
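{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Example Of Using PySpark To Find Metal Interactions\n",
"\n",
"This notebook reads a sample of PDB structures in MMTF format, keeps a PISCES non-redundant subset, and uses mmtfPyspark's `GroupInteractionExtractor` to tabulate metal interactions."
]
},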
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports and variables"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from pyspark import SparkConf, SparkContext \n",
"from mmtfPyspark.io import mmtfReader\n",
"from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor\n",
"from mmtfPyspark.filters import ContainsLProteinChain, Resolution\n",
"from mmtfPyspark.webfilters import Pisces\n",
"from mmtfPyspark.structureViewer import group_interaction_viewer, metal_distance_widget\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import pandas as pd\n",
"import py3Dmol\n",
"import time\n",
" \n",
"# Create variables\n",
"APP_NAME = \"MMTF_Spark\"\n",
"path = \"../../resources/mmtf_full_sample/\"\n",
"\n",
"# Configure Spark.\n",
"# getOrCreate() returns the already-running context when there is one, so\n",
"# re-running this cell in a live kernel no longer fails with\n",
"# \"Cannot run multiple SparkContexts at once\". Note that `conf` is only\n",
"# applied when a new context is actually created.\n",
"conf = SparkConf().setAppName(APP_NAME).setMaster(\"local[*]\")\n",
"sc = SparkContext.getOrCreate(conf=conf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read PDB and create PISCES non-redundant set"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Read the structures found under `path`, then keep only the entries that\n",
"# pass the PISCES filter (sequenceIdentity=30, resolution=2.5).\n",
"pisces_filter = Pisces(sequenceIdentity=30, resolution=2.5)\n",
"pdb = mmtfReader.read_sequence_file(path, sc).filter(pisces_filter)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up criteria for metal interactions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Chemical component codes of metals in different oxidation states\n",
"metals = {\n",
"    \"V\", \"CR\", \"MN\", \"MN3\", \"FE\", \"FE2\", \"CO\", \"3CO\", \"NI\", \"3NI\",\n",
"    \"CU\", \"CU1\", \"CU3\", \"ZN\", \"MO\", \"4MO\", \"6MO\",\n",
"}\n",
"\n",
"# Interaction criteria: distance cutoff plus the allowed number of interactions\n",
"interactions_filter = InteractionFilter(\n",
"    distanceCutoff=3.0,\n",
"    minInteractions=4,\n",
"    maxInteractions=6,\n",
")\n",
"\n",
"# Query groups: the metal codes listed above\n",
"interactions_filter.set_query_groups(True, metals)\n",
"\n",
"# Exclude non-polar interactions\n",
"interactions_filter.set_target_elements(False, ['H', 'C', 'P'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tabulate interactions in a DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Metal interactions: 1577\n"
]
}
],
"source": [
"# Extract the interactions that match the filter and cache the result\n",
"extractor = GroupInteractionExtractor()\n",
"interactions = extractor.get_interactions(pdb, interactions_filter).cache()\n",
"\n",
"# Report how many interactions were found\n",
"print(f\"Metal interactions: {interactions.count()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select interacting atoms and orientational order parameters (q4-q6)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | pdbId | \n", "q4 | \n", "q5 | \n", "q6 | \n", "element0 | \n", "groupNum0 | \n", "chain0 | \n", "element1 | \n", "groupNum1 | \n", "chain1 | \n", "... | \n", "chain4 | \n", "distance4 | \n", "element5 | \n", "groupNum5 | \n", "chain5 | \n", "distance5 | \n", "element6 | \n", "groupNum6 | \n", "chain6 | \n", "distance6 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1M4L | \n", "0.810257 | \n", "0.425694 | \n", "NaN | \n", "Zn | \n", "1308 | \n", "A | \n", "N | \n", "196 | \n", "A | \n", "... | \n", "A | \n", "2.245983 | \n", "N | \n", "69 | \n", "A | \n", "2.034816 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
1 | \n", "1YIX | \n", "0.683122 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "601 | \n", "A | \n", "O | \n", "205 | \n", "A | \n", "... | \n", "A | \n", "2.179354 | \n", "None | \n", "None | \n", "None | \n", "0.000000 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
2 | \n", "2ETV | \n", "0.622655 | \n", "NaN | \n", "NaN | \n", "Ni | \n", "1 | \n", "A | \n", "O | \n", "495 | \n", "A | \n", "... | \n", "A | \n", "2.234011 | \n", "None | \n", "None | \n", "None | \n", "0.000000 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
3 | \n", "3VK6 | \n", "0.995256 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "103 | \n", "A | \n", "S | \n", "4 | \n", "A | \n", "... | \n", "A | \n", "2.352950 | \n", "None | \n", "None | \n", "None | \n", "0.000000 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
4 | \n", "1ZKP | \n", "0.922594 | \n", "0.455911 | \n", "NaN | \n", "Zn | \n", "245 | \n", "A | \n", "N | \n", "59 | \n", "A | \n", "... | \n", "A | \n", "1.983932 | \n", "O | \n", "155 | \n", "A | \n", "2.543866 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
5 rows × 31 columns
\n", "\n", " | pdbId | \n", "q4 | \n", "q5 | \n", "q6 | \n", "element0 | \n", "groupNum0 | \n", "chain0 | \n", "element1 | \n", "groupNum1 | \n", "chain1 | \n", "... | \n", "chain4 | \n", "distance4 | \n", "element5 | \n", "groupNum5 | \n", "chain5 | \n", "distance5 | \n", "element6 | \n", "groupNum6 | \n", "chain6 | \n", "distance6 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
614 | \n", "5GRQ | \n", "0.999569 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "212 | \n", "A | \n", "N | \n", "96 | \n", "B | \n", "... | \n", "A | \n", "2.064547 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
874 | \n", "2YHO | \n", "0.998952 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "1002 | \n", "G | \n", "S | \n", "418 | \n", "G | \n", "... | \n", "G | \n", "2.338642 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
872 | \n", "2YHO | \n", "0.998921 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "1002 | \n", "E | \n", "N | \n", "404 | \n", "E | \n", "... | \n", "E | \n", "2.310509 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
760 | \n", "1ZY7 | \n", "0.998688 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "802 | \n", "B | \n", "S | \n", "451 | \n", "B | \n", "... | \n", "B | \n", "2.058088 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
868 | \n", "2YHO | \n", "0.998687 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "1002 | \n", "A | \n", "S | \n", "421 | \n", "A | \n", "... | \n", "A | \n", "2.301190 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
496 | \n", "1OQJ | \n", "0.998562 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "183 | \n", "A | \n", "S | \n", "113 | \n", "A | \n", "... | \n", "A | \n", "2.371518 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
1429 | \n", "3ZVS | \n", "0.998459 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "1162 | \n", "B | \n", "S | \n", "132 | \n", "B | \n", "... | \n", "B | \n", "2.310093 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
1269 | \n", "3UFF | \n", "0.998175 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "3 | \n", "B | \n", "N | \n", "231 | \n", "B | \n", "... | \n", "B | \n", "2.315904 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
1316 | \n", "1Q08 | \n", "0.998095 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "401 | \n", "A | \n", "O | \n", "301 | \n", "A | \n", "... | \n", "A | \n", "2.426980 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
1209 | \n", "3H0N | \n", "0.997777 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "201 | \n", "A | \n", "S | \n", "168 | \n", "A | \n", "... | \n", "A | \n", "2.297602 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "None | \n", "None | \n", "None | \n", "0.0 | \n", "
10 rows × 31 columns
\n", "