{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example of using PySpark to find metal interactions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkConf, SparkContext\n",
    "from mmtfPyspark.io import mmtfReader\n",
    "from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor\n",
    "from mmtfPyspark.filters import ContainsLProteinChain, Resolution\n",
    "from mmtfPyspark.webfilters import Pisces\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import py3Dmol\n",
    "import time\n",
    "\n",
    "# Create variables\n",
    "APP_NAME = \"MMTF_Spark\"\n",
    "path = \"../../resources/mmtf_full_sample/\"\n",
    "\n",
    "# Configure Spark (master \"local[*]\": run locally)\n",
    "conf = SparkConf().setAppName(APP_NAME).setMaster(\"local[*]\")\n",
    "\n",
    "# getOrCreate() reuses a SparkContext that is already running in this kernel,\n",
    "# so re-running this cell does not fail the way a second SparkContext(conf=conf)\n",
    "# call would. NOTE: when an existing context is reused, `conf` is not re-applied.\n",
    "sc = SparkContext.getOrCreate(conf=conf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input parameters\n",
    "sequenceIdentityCutoff = 30  # passed to Pisces as sequenceIdentity (presumably percent - TODO confirm)\n",
    "resolution = 2.5             # passed to Pisces as resolution (presumably Angstrom - TODO confirm)\n",
    "minInteractions = 4          # lower bound given to InteractionFilter.set_min_interactions\n",
    "maxInteractions = 6          # upper bound given to InteractionFilter.set_max_interactions\n",
    "distanceCutoff = 3.0         # given to InteractionFilter.set_distance_cutoff (presumably Angstrom - TODO confirm)\n",
    "\n",
    "# chemical component codes of metals in different oxidation states\n",
    "metals = {\"V\", \"CR\", \"MN\", \"MN3\", \"FE\", \"FE2\", \"CO\", \"3CO\", \"NI\", \"3NI\",\n",
    "          \"CU\", \"CU1\", \"CU3\", \"ZN\", \"MO\", \"4MO\", \"6MO\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read PDB and create PISCES non-redundant set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the sequence file(s) under `path`, then keep only the entries that pass\n",
    "# the Pisces filter for the configured sequence-identity and resolution cutoffs\n",
    "pdb = mmtfReader.read_sequence_file(path, sc)\n",
    "pdb = pdb.filter(Pisces(sequenceIdentity=sequenceIdentityCutoff, resolution=resolution))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up criteria for metal interactions"
   ]
  },
  {
   "cell_type": "code",
"execution_count": 13, "metadata": {}, "outputs": [], "source": [ "interactions_filter = InteractionFilter()\n", "interactions_filter.set_distance_cutoff(distanceCutoff)\n", "interactions_filter.set_min_interactions(minInteractions)\n", "interactions_filter.set_max_interactions(maxInteractions)\n", "interactions_filter.set_query_groups(True, metals)\n", "\n", "#Exclude non-polar interactions\n", "interactions_filter.set_target_elements(False, ['H','C','P'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tabulate interactions in a Dataframe" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Metal interactions: 52\n" ] } ], "source": [ "interactions = GroupInteractionExtractor().get_interactions(pdb,interactions_filter).cache()\n", "print(f\"Metal interactions: {interactions.count()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select interacting atoms and orientational order parameters (q4-q6)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pdbIdq4q5q6element0groupNum0chain0element1groupNum1chain1...chain4distance4element5groupNum5chain5distance5element6groupNum6chain6distance6
01A2P0.545893NaNNaNZn112CO139C...C2.120991NoneNoneNone0.000000NoneNoneNone0.00000
11FN90.982773NaNNaNZn1001AS51A...A2.219631NoneNoneNone0.000000NoneNoneNone0.00000
21BYF0.4550970.396868NaNZn302AO126A...A2.788446O127A1.971687NoneNoneNone0.00000
31AH70.8770070.970126NaNZn246AN118A...A2.196544O249A2.191126NoneNoneNone0.00000
41B9M0.5817670.8172960.478988Ni263AO148A...A2.238572N146A2.063766O465A2.13188
\n", "

5 rows × 31 columns

\n", "
" ], "text/plain": [ " pdbId q4 q5 q6 element0 groupNum0 chain0 element1 \\\n", "0 1A2P 0.545893 NaN NaN Zn 112 C O \n", "1 1FN9 0.982773 NaN NaN Zn 1001 A S \n", "2 1BYF 0.455097 0.396868 NaN Zn 302 A O \n", "3 1AH7 0.877007 0.970126 NaN Zn 246 A N \n", "4 1B9M 0.581767 0.817296 0.478988 Ni 263 A O \n", "\n", " groupNum1 chain1 ... chain4 distance4 element5 groupNum5 chain5 \\\n", "0 139 C ... C 2.120991 None None None \n", "1 51 A ... A 2.219631 None None None \n", "2 126 A ... A 2.788446 O 127 A \n", "3 118 A ... A 2.196544 O 249 A \n", "4 148 A ... A 2.238572 N 146 A \n", "\n", " distance5 element6 groupNum6 chain6 distance6 \n", "0 0.000000 None None None 0.00000 \n", "1 0.000000 None None None 0.00000 \n", "2 1.971687 None None None 0.00000 \n", "3 2.191126 None None None 0.00000 \n", "4 2.063766 O 465 A 2.13188 \n", "\n", "[5 rows x 31 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interactions = interactions.select(\"pdbId\", \\\n", " \"q4\",\"q5\",\"q6\", \\\n", " \"element0\",\"groupNum0\",\"chain0\", \\\n", " \"element1\",\"groupNum1\",\"chain1\",\"distance1\", \\\n", " \"element2\",\"groupNum2\",\"chain2\",\"distance2\", \\\n", " \"element3\",\"groupNum3\",\"chain3\",\"distance3\", \\\n", " \"element4\",\"groupNum4\",\"chain4\",\"distance4\", \\\n", " \"element5\",\"groupNum5\",\"chain5\",\"distance5\", \\\n", " \"element6\",\"groupNum6\",\"chain6\",\"distance6\").cache();\n", "\n", "# show some example interactions\n", "ds = interactions.dropDuplicates([\"pdbId\"])\n", "df = ds.toPandas() # convert to pandas dataframe to fit table in jupyter notebook cell\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count Unique interactions by metal" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique interactions by metal: \n", "+--------+-----+\n", "|element0|count|\n", 
"+--------+-----+\n", "| Ni| 1|\n", "| Cu| 2|\n", "| Fe| 9|\n", "| Mn| 12|\n", "| Zn| 28|\n", "+--------+-----+\n", "\n" ] } ], "source": [ "print(\"Unique interactions by metal: \")\n", "unique_ds = interactions.groupBy(['element0']).count().sort(\"count\")\n", "unique_ds.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot histogram for unique interactions count" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEPCAYAAABShj9RAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvFvnyVgAAEdFJREFUeJzt3X+QVfV9xvHn4UdEgSTCrgREXWttBCs/zIamUkd+1JTGadBW26oQdCxrGxUd/cdopmrrdOxQCdVmjBhRaIlO1KikWo0R1EGtuigjIKZaB80CwgodxWQIIJ/+cc/CQna5d/fevWf3e9+vmZ2995xz73n2zO7D4XvPD0eEAAB9X7+8AwAAKoNCB4BEUOgAkAgKHQASQaEDQCIodABIBIUOAImg0AEgERQ6ACRiQDVXVldXFw0NDdVcJQD0eatXr/4oIuqLLVfVQm9oaFBzc3M1VwkAfZ7t90tZjiEXAEgEhQ4AiaDQASARVR1D78iePXvU0tKiXbt25R2l6gYNGqTRo0dr4MCBeUcBkIDcC72lpUVDhw5VQ0ODbOcdp2oiQtu3b1dLS4tOPPHEvOMASEDuQy67du3S8OHDa6rMJcm2hg8fXpP/MwHQM3IvdEk1V+ZtavXnBtAzekWhAwDKl/sY+qEarn+iou+38bZzKvp+XbVw4UI1NTXpqKOOyjUHgPT1ukJPzcKFCzVr1iwKHcjB9/92Rd4RdMUPplVtXQy5SFq6dKnGjRun8ePHa/bs2dq4caOmTZumcePGafr06frggw8kSZdccokefvjh/a8bMmSIJOm5557TlClTdP755+uUU07RxRdfrIjQHXfcoc2bN2vq1KmaOnVqLj8bgNpR83vo69ev16233qqXXnpJdXV12rFjh+bMmbP/a/HixZo3b54ee+yxw77PG2+8ofXr12vUqFGaPHmyXnzxRc2bN08LFizQypUrVVdXV6WfCECtqvk99BUrVuiCCy7YX7jDhg3Tyy+/rIsuukiSNHv2bK1ataro+0yaNEmjR49Wv379NGHCBG3cuLEnYwPAb6n5Qu+KAQMGaN++fZKkffv2affu3fvnHXHEEfsf9+/fX3v37q16PgC1reYLfdq0aXrooYe0fft2SdKOHTt0xhln6MEHH5QkLVu2TGeeeaakwuV/V69eLUlavny59uzZU/T9hw4dqp07d/ZQegA4oNeNoVf7MMNTTz1VN954o8466yz1799fEydO1J133qlLL71U8+fPV319ve677z5J0ty5czVz5kyNHz9eM2bM0ODBg4u+f1NTk2bMmKFRo0Zp5cqVPf3jAKhhjoiqrayxsTEOvcHFhg0bNGbMmKpl6G1q/ecHelIqhy3aXh0RjcWWq/kh
FwBIBYUOAInoFYVezWGf3qRWf24APSP3Qh80aJC2b99ec+XWdj30QYMG5R0FQCJyP8pl9OjRamlpUWtra95Rqq7tjkUAUAm5F/rAgQO5Yw8AVEDuQy4AgMqg0AEgERQ6ACSCQgeARBQtdNvH2V5p+y3b621fnU2/2fYm22uyr2/0fFwAQGdKOcplr6TrIuJ120Mlrbb9TDbvexHxLz0XDwBQqqKFHhFbJG3JHu+0vUHSsT0dDADQNV0aQ7fdIGmipFeySVfaftP2YttHVzgbAKALSi5020MkPSLpmoj4RNJdkk6SNEGFPfjbO3ldk+1m2821eDYoAFRLSYVue6AKZb4sIn4iSRGxNSI+i4h9ku6RNKmj10bEoohojIjG+vr6SuUGAByilKNcLOleSRsiYkG76SPbLXaepHWVjwcAKFUpR7lMljRb0lrba7JpN0i60PYESSFpo6TLeyQhAKAkpRzlskqSO5j1ZOXjAAC6izNFASARFDoAJIJCB4BEUOgAkAgKHQASQaEDQCIodABIBIUOAImg0AEgERQ6ACSCQgeARFDoAJAICh0AEkGhA0AiKHQASASFDgCJoNABIBEUOgAkgkIHgERQ6ACQCAodABJBoQNAIih0AEgEhQ4AiaDQASARFDoAJIJCB4BEUOgAkIiihW77ONsrbb9le73tq7Ppw2w/Y/ud7PvRPR8XANCZUvbQ90q6LiLGSvqapCtsj5V0vaRnI+JkSc9mzwEAOSla6BGxJSJezx7vlLRB0rGSZkpaki22RNK5PRUSAFBcl8bQbTdImijpFUkjImJLNutDSSMqmgwA0CUlF7rtIZIekXRNRHzSfl5EhKTo5HVNtpttN7e2tpYVFgDQuZIK3fZAFcp8WUT8JJu81fbIbP5ISds6em1ELIqIxohorK+vr0RmAEAHSjnKxZLulbQhIha0m7Vc0pzs8RxJj1c+HgCgVANKWGaypNmS1tpek027QdJtkn5s+zJJ70v6y56JCAAoRdFCj4hVktzJ7OmVjQMA6C7OFAWARFDoAJAICh0AEkGhA0AiKHQASASFDgCJoNABIBEUOgAkgkIHgERQ6ACQCAodABJBoQNAIih0AEgEhQ4AiaDQASARFDoAJIJCB4BEUOgAkAgKHQASQaEDQCIodABIBIUOAImg0AEgERQ6ACSCQgeARFDoAJAICh0AEkGhA0Aiiha67cW2t9le127azbY32V6TfX2jZ2MCAIopZQ/9fkkzOpj+vYiYkH09WdlYAICuKlroEfGCpB1VyAIAKEM5Y+hX2n4zG5I5umKJAADd0t1Cv0vSSZImSNoi6fbOFrTdZLvZdnNra2s3VwcAKKZbhR4RWyPis4jYJ+keSZMOs+yiiGiMiMb6+vru5gQAFNGtQrc9st3T8ySt62xZAEB1DCi2gO0HJE2RVGe7RdJNkqbYniApJG2UdHkPZgQAlKBooUfEhR1MvrcHsgAAysCZogCQCAodABJBoQNAIih0AEgEhQ4AiaDQASARFDoAJIJCB4BEUOgAkAgKHQASQaEDQCIodABIBIUOAImg0AEgERQ6ACSCQgeARFDoAJCIoncsAtC3bDhlTN4RNObtDXlHqEnsoQNAIih0AEgEhQ4AiaDQASARFDoAJIJCB4BEUOgAkAgKHQASQaEDQCIodABIRNFCt73Y9jbb69pNG2b7GdvvZN+P7tmYAIBiStlDv1/SjEOmXS/p2Yg4WdKz2XMAQI6KFnpEvCBpxyGTZ0pakj1eIuncCucCAHRRd8fQR0TEluzxh5JGVCgPAKCbyv5QNCJCUnQ233aT7Wbbza2treWuDgDQie4W+lbbIyUp+76tswUjYlFENEZEY319fTdXBwAopruFvlzSnOzxHEmPVyYOAKC7Sjls8QFJL0v6su0W25dJuk3S2bbfkfTH2XMAQI6K3oIuIi7sZNb0CmcBAJSBM0UBIBEUOgAkouiQC9AXnLbktLwjaO2ctXlHQI1jDx0AEkGhA0Ai
KHQASASFDgCJoNABIBEUOgAkgkIHgERQ6ACQCAodABJBoQNAIih0AEgEhQ4AiaDQASARFDoAJIJCB4BEUOgAkAgKHQASQaEDQCIodABIBIUOAImg0AEgERQ6ACSCQgeARFDoAJAICh0AEjGgnBfb3ihpp6TPJO2NiMZKhAIAdF1ZhZ6ZGhEfVeB9AABlYMgFABJRbqGHpJ/ZXm27qRKBAADdU+6Qyx9FxCbbx0h6xvbbEfFC+wWyom+SpOOPP77M1QEAOlPWHnpEbMq+b5P0qKRJHSyzKCIaI6Kxvr6+nNUBAA6j24Vue7DtoW2PJX1d0rpKBQMAdE05Qy4jJD1qu+19fhQRT1UkFQCgy7pd6BHxnqTxFcwCACgDhy0CQCIodABIBIUOAImg0AEgERQ6ACSCQgeARFDoAJAICh0AEkGhA0AiKHQASASFDgCJoNABIBEUOgAkgkIHgERQ6ACQCAodABJBoQNAIsq5BR3ydvMX8k4g3fxx3gkAZNhDB4BEUOgAkAgKHQASQaEDQCIodABIBIUOAInoc4ctNlz/RN4RtPG2c/KOAAC/hT10AEgEhQ4AiaDQASARZRW67Rm2f2H7XdvXVyoUAKDrul3otvtL+r6kP5U0VtKFtsdWKhgAoGvK2UOfJOndiHgvInZLelDSzMrEAgB0VTmFfqykX7Z73pJNAwDkoMePQ7fdJKkpe/qp7V/09DqLqJP0UTlv4H+uUJL8lb0tdIsrkyR/5f9eXMK22M9sizZX3l2RHCeUslA5hb5J0nHtno/Oph0kIhZJWlTGeirKdnNENOadozdgWxzAtjiAbXFAX9sW5Qy5vCbpZNsn2v6cpL+WtLwysQAAXdXtPfSI2Gv7SklPS+ovaXFErK9YMgBAl5Q1hh4RT0p6skJZqqXXDP/0AmyLA9gWB7AtDuhT28IRkXcGAEAFcOo/ACSCQgeARFDoAJCIPneDC5TH9rc6mh4RS6udpTewfYKkkyPi57aPlDQgInbmnSsP2fWZRqhdL0TEB/klyo/tMyQ16OBt0ev/RpIudNuzIuI/bF/b0fyIWFDtTL3AV9s9HiRpuqTXJfX6X9ZKsz1XhbOYh0k6SYWT436gwjapKbavknSTpK2S9mWTQ9K43ELlxPa/q/D7sEbSZ9nkUB/4G0m60CUNzr4PzTVFLxIRV7V/bvuLKlxYrRZdocJF5l6RpIh4x/Yx+UbKzdWSvhwR2/MO0gs0ShobffAQwKQLPSLuzr7fkneWXuxXkn4n7xA5+U1E7HZ23RHbA1TYE6tFv5T0cd4heol1kr4kaUveQboq6UK3/feHmR0R8Y9VC9NL2P6pDpRWPxWuZf/j/BLl6nnbN0g60vbZkr4t6ac5Z8rLe5Kes/2EpN+0TazRYck6SW/ZflUHb4tv5hepNEmfWGT7ug4mD5Z0maThETGkypFyY/t3dcgHXpL2SrKkLRHxv7kEy5Htfir8Lnxdhe3wtKQf9sX/apfL9k0dTa+l/93a/pOIeNr2WR3MPiYiHqp6qC5KutDbsz1UhXHCy1TYI709Irblm6p6bP+npO9ExNpDpp8m6Z8i4s/ySVZ9to+v1aM30Dnbn0l6QdKsiNh0yLzXI+L0fJKVLukhF0myPUzStZIulrRE0ukR8X/5psrFiEPLXJIiYq3thurHydVjkk6XJNuPRMRf5JwnN7YPe4XUvjDMUEFvSvqRpJdtXxsRD7eb1ycu8J50odueL+nPVbjAzmkR8WnOkfL0xcPMO7JqKXqH9n+ctfqBcJs/VOED0QdUONqnTxRXD4mIuMf285KW2T5H0hUR8Wv1kQ/LUz9T9DpJoyR9V9Jm259kXzttf5Jztmprzo67Pojtv5G0Ooc8eYpOHteiL0m6QdLvS/pXSWdL+igino+I53NNlpOI+B8V/qHbKukN23+Qc6SS1cwYeq2zPULSo5J260CBN0r6nKTzIuLDvLJVWzZW+isV9kaPlPTrtlkq7KV9Pq9sebJ9hKQLJc2X
dEtE/FvOkarK9hsRMfGQaVMkLZZUHxG9/nwWCr3G2J6qwt6YJK2PiBV55kH+siI/R4Uyb1DhzmOLD/1gMHW2z42IxzqYfrSkyyPithxidQmFDtQw20tV+Af+SUkPRsS6nCOhDBQ6UMNs71Nh+Ek6+POEmh5+6qsodABIROpHuQBAzaDQASARFDr6PNsbbdfltO5rbB/V7vlXbK+1/a7tO9x2KUegCih0oDzXSDqq3fO7JM2VdHL2NSOPUKhNFDr6FNuzbL9qe43tu7PbphWdb/tT2/Ntr7f9c9uTbD9n+z3b38yW6Z8t85rtN21fnk2fki37sO23bS9zwTwVzkReaXul7ZGSPh8R/51dsXGppHOruoFQ0yh09Bm2x0j6K0mTI2KCCrcHu7jE+YMlrYiIUyXtlHSrCqe5nyfpH7JlLpP0cUR8VYVb9c21fWI2b6IKe+NjVbj+y+SIuEPSZklTI2KqpGMltbSL3JJNA6oi6YtzITnTJX1F0mvZ0PSRkraVOH+3pKeyx2tVuFvRHttrVTg7UipcF32c7fOz519QYdhkt6RXI6JFkmyvyV6zqrI/HlAeCh19iSUtiYjvHDTRvuRw8zN72t24Yp+yO9FExL7s1nNtr78qIp4+5P2nqN2da1TY8+/ob2eTCjeabjM6mwZUBUMu6EuelXR+242cbQ+zfUIX5hfztKS/sz0we/3v2R5c5DU7ld2EPCK2SPrE9teyo1u+JenxLqwfKAt76OgzIuIt29+V9LPs9nF7JF1Rwvz3S1zFD1UYSnk9K+RWFf9Qc5Gkp2xvzsbRvy3pfhWGe/4r+wKqglP/ASARDLkAQCIodABIBIUOAImg0AEgERQ6ACSCQgeARFDoAJAICh0AEvH/DPhqqNqo5icAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "unique_df = unique_ds.toPandas()\n", "unique_df.plot(x='element0', y='count', kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Terminate Spark" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }