{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example of using PySpark to find metal interactions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports and variables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext \n", "from mmtfPyspark.io import mmtfReader\n", "from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor\n", "from mmtfPyspark.filters import ContainsLProteinChain, Resolution\n", "from mmtfPyspark.webfilters import Pisces\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import py3Dmol\n", "import time\n", " \n", "# Create variables \n", "APP_NAME = \"MMTF_Spark\" \n", "path = \"../../resources/mmtf_full_sample/\"\n", "\n", "# Configure Spark \n", "conf = SparkConf().setAppName(APP_NAME).setMaster(\"local[*]\") \n", "# getOrCreate reuses an already-running SparkContext, so re-running this cell does not raise\n", "sc = SparkContext.getOrCreate(conf=conf) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define Variables" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# input parameters\n", "sequenceIdentityCutoff = 30  # sequenceIdentity argument of the Pisces filter\n", "resolution = 2.5  # resolution argument of the Pisces filter\n", "minInteractions = 4  # value given to InteractionFilter.set_min_interactions\n", "maxInteractions = 6  # value given to InteractionFilter.set_max_interactions\n", "distanceCutoff = 3.0  # value given to InteractionFilter.set_distance_cutoff\n", "\n", "# chemical component codes of metals in different oxidation states\n", "metals = {\"V\",\"CR\",\"MN\",\"MN3\",\"FE\",\"FE2\",\"CO\",\"3CO\",\"NI\",\"3NI\", \"CU\",\"CU1\",\"CU3\",\"ZN\",\"MO\",\"4MO\",\"6MO\"}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read PDB and create PISCES non-redundant set" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "pdb = mmtfReader.read_sequence_file(path, sc)\n", "pdb = pdb.filter(Pisces(sequenceIdentity = sequenceIdentityCutoff, resolution = resolution)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set up criteria for metal interactions" ] }, { "cell_type": "code", 
"execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Build the filter that defines which interactions are extracted\n", "interactions_filter = InteractionFilter()\n", "interactions_filter.set_distance_cutoff(distanceCutoff)\n", "interactions_filter.set_min_interactions(minInteractions)\n", "interactions_filter.set_max_interactions(maxInteractions)\n", "# Query groups are the metal component codes; True presumably means 'include' (confirm against the InteractionFilter API)\n", "interactions_filter.set_query_groups(True, metals)\n", "\n", "# Exclude non-polar interactions: target elements H, C and P are passed with False\n", "interactions_filter.set_target_elements(False, ['H','C','P'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tabulate interactions in a DataFrame" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Metal interactions: 52\n" ] } ], "source": [ "# Extract the interactions that pass the filter; cache() is requested so the result can be reused after count()\n", "interactions = GroupInteractionExtractor().get_interactions(pdb,interactions_filter).cache()\n", "print(f\"Metal interactions: {interactions.count()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select interacting atoms and orientational order parameters (q4-q6)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | pdbId | \n", "q4 | \n", "q5 | \n", "q6 | \n", "element0 | \n", "groupNum0 | \n", "chain0 | \n", "element1 | \n", "groupNum1 | \n", "chain1 | \n", "... | \n", "chain4 | \n", "distance4 | \n", "element5 | \n", "groupNum5 | \n", "chain5 | \n", "distance5 | \n", "element6 | \n", "groupNum6 | \n", "chain6 | \n", "distance6 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1A2P | \n", "0.545893 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "112 | \n", "C | \n", "O | \n", "139 | \n", "C | \n", "... | \n", "C | \n", "2.120991 | \n", "None | \n", "None | \n", "None | \n", "0.000000 | \n", "None | \n", "None | \n", "None | \n", "0.00000 | \n", "
1 | \n", "1FN9 | \n", "0.982773 | \n", "NaN | \n", "NaN | \n", "Zn | \n", "1001 | \n", "A | \n", "S | \n", "51 | \n", "A | \n", "... | \n", "A | \n", "2.219631 | \n", "None | \n", "None | \n", "None | \n", "0.000000 | \n", "None | \n", "None | \n", "None | \n", "0.00000 | \n", "
2 | \n", "1BYF | \n", "0.455097 | \n", "0.396868 | \n", "NaN | \n", "Zn | \n", "302 | \n", "A | \n", "O | \n", "126 | \n", "A | \n", "... | \n", "A | \n", "2.788446 | \n", "O | \n", "127 | \n", "A | \n", "1.971687 | \n", "None | \n", "None | \n", "None | \n", "0.00000 | \n", "
3 | \n", "1AH7 | \n", "0.877007 | \n", "0.970126 | \n", "NaN | \n", "Zn | \n", "246 | \n", "A | \n", "N | \n", "118 | \n", "A | \n", "... | \n", "A | \n", "2.196544 | \n", "O | \n", "249 | \n", "A | \n", "2.191126 | \n", "None | \n", "None | \n", "None | \n", "0.00000 | \n", "
4 | \n", "1B9M | \n", "0.581767 | \n", "0.817296 | \n", "0.478988 | \n", "Ni | \n", "263 | \n", "A | \n", "O | \n", "148 | \n", "A | \n", "... | \n", "A | \n", "2.238572 | \n", "N | \n", "146 | \n", "A | \n", "2.063766 | \n", "O | \n", "465 | \n", "A | \n", "2.13188 | \n", "
5 rows × 31 columns
\n", "