{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced Zinc Interaction Analysis Example\n",
    "\n",
    "<img src=\"./figures/zinc_interaction.png\" style=\"width: 300px;\"/>\n",
    "\n",
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkConf, SparkContext\n",
    "from pyspark.sql.functions import *\n",
    "from mmtfPyspark.datasets import groupInteractionExtractor\n",
    "from mmtfPyspark.io import mmtfReader\n",
    "from mmtfPyspark.webfilters import Pisces"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configure Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run Spark in local mode; local[*] uses as many worker threads as cores.\n",
    "conf = (\n",
    "    SparkConf()\n",
    "    .setMaster(\"local[*]\")\n",
    "    .setAppName(\"advancedZincInteractionDemo\")\n",
    ")\n",
    "\n",
    "# getOrCreate() hands back the already-running SparkContext if there is one,\n",
    "# so re-executing this cell in a live kernel no longer fails with\n",
    "# 'Cannot run multiple SparkContexts at once'. The conf is only applied\n",
    "# when a new context is actually created.\n",
    "sc = SparkContext.getOrCreate(conf=conf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read PDB in MMTF format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative path to the sample MMTF data shipped with the project.\n",
    "path = \"../../resources/mmtf_full_sample/\"\n",
    "\n",
    "# Read the structures using the SparkContext `sc`; later cells filter and\n",
    "# analyze `pdb`.\n",
    "pdb = mmtfReader.read_sequence_file(path, sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use only representative structures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cutoffs handed to the Pisces filter.\n",
    "# NOTE(review): presumably percent sequence identity and resolution in\n",
    "# Angstrom -- confirm against the mmtfPyspark Pisces documentation.\n",
    "seqId = 40\n",
    "resolution = 2.0\n",
    "\n",
    "# Build the filter first, then keep only the structures it accepts.\n",
    "pisces_filter = Pisces(seqId, resolution)\n",
    "pdb = pdb.filter(pisces_filter)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract proteins with Zn interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extractor for interactions involving the ZN group.\n",
    "# NOTE(review): 3 looks like a distance cutoff -- every distance in the\n",
    "# stored outputs of this notebook is below 3; confirm the unit.\n",
    "finder = groupInteractionExtractor(\"ZN\", 3)\n",
    "\n",
    "# Cache the dataset because several of the following cells query it.\n",
    "interactions = finder.get_dataset(pdb).cache()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect the Zn interaction dataset: schema, first 20 rows, and total count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- structureId: string (nullable = false)\n",
      " |-- residue1: string (nullable = false)\n",
      " |-- atom1: string (nullable = false)\n",
      " |-- element1: string (nullable = false)\n",
      " |-- index1: integer (nullable = false)\n",
      " |-- residue2: string (nullable = false)\n",
      " |-- atom2: string (nullable = false)\n",
      " |-- element2: string (nullable = false)\n",
      " |-- index2: integer (nullable = false)\n",
      " |-- distance: float (nullable = false)\n",
      "\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    50|2.3709755|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    53|2.3940797|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     HIS|  NE2|       N|    70|2.2196307|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    72|2.3465357|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   415|2.3747551|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   418|2.3680198|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     HIS|  NE2|       N|   435|2.1647959|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   437|2.3763454|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     HIS|  CE1|       C|    53|2.9807622|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     HIS|  NE2|       N|    53| 2.040789|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|   CG|       C|    67| 2.754825|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|  OD1|       O|    67|2.8967845|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|  OD2|       O|    67|1.9672809|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     HIS|  NE2|       N|    10|2.2776458|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     HIS|  NE2|       N|    12|2.1644206|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     GLU|  OE2|       O|   123|2.3778422|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     ASP|  OD1|       O|   241|  2.41581|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   583|     GLU|   CD|       C|   123|2.7811828|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   583|     GLU|  OE1|       O|   123|2.1997967|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   583|     HIS|  ND1|       N|   156|2.2733805|\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "only showing top 20 rows\n",
      "\n",
      "Number of interactions: 238\n"
     ]
    }
   ],
   "source": [
    "# Column names and types of the interaction dataset.\n",
    "interactions.printSchema()\n",
    "\n",
    "# Preview the first 20 rows.\n",
    "interactions.show(20)\n",
    "\n",
    "# Total number of interactions (carbon contacts included). Later cells\n",
    "# reuse `n` as the denominator when computing frequencies.\n",
    "n = interactions.count()\n",
    "\n",
    "print(f\"Number of interactions: {n}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show the top 10 interacting group/atom types\n",
    "\n",
    "#### Exclude Carbon Interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count interactions per (residue, atom) pair, ignoring carbon partners.\n",
    "topGroupsAndAtoms = (\n",
    "    interactions\n",
    "    .filter(\"element2 != 'C'\")\n",
    "    .groupBy(\"residue2\", \"atom2\")\n",
    "    .count()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Add column with frequency of occurrence\n",
    "#### Filter out occurrences < 1%\n",
    "#### Sort descending"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+-----+--------------------+\n",
      "|residue2|atom2|count|           frequency|\n",
      "+--------+-----+-----+--------------------+\n",
      "|     CYS|   SG|   43| 0.18067226890756302|\n",
      "|     HOH|    O|   37| 0.15546218487394958|\n",
      "|     HIS|  NE2|   30| 0.12605042016806722|\n",
      "|     HIS|  ND1|   24| 0.10084033613445378|\n",
      "|     ASP|  OD2|   11|0.046218487394957986|\n",
      "|     GLU|  OE1|   11|0.046218487394957986|\n",
      "|     GLU|  OE2|   11|0.046218487394957986|\n",
      "|     ASP|  OD1|    9|0.037815126050420166|\n",
      "|     ACT|    O|    4| 0.01680672268907563|\n",
      "|     ACT|  OXT|    4| 0.01680672268907563|\n",
      "+--------+-----+-----+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# `n` counts every interaction, carbon contacts included, so the\n",
    "# frequencies of the carbon-free groups below do not sum to 1.\n",
    "(\n",
    "    topGroupsAndAtoms\n",
    "    .withColumn(\"frequency\", topGroupsAndAtoms[\"count\"] / n)\n",
    "    .filter(\"frequency > 0.01\")\n",
    "    .sort(\"frequency\", ascending=False)\n",
    "    .show(20)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Print the top interacting elements\n",
    "\n",
    "#### Exclude carbon interactions and group by element 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count interactions per partner element, ignoring carbon.\n",
    "topElements = (\n",
    "    interactions\n",
    "    .filter(\"element2 != 'C'\")\n",
    "    .groupBy(\"element2\")\n",
    "    .count()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Add column with frequency of occurrence\n",
    "#### Filter out occurrences < 1%\n",
    "#### Sort descending"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+--------------------+\n",
      "|element2|count|           frequency|\n",
      "+--------+-----+--------------------+\n",
      "|       O|   91| 0.38235294117647056|\n",
      "|       N|   56| 0.23529411764705882|\n",
      "|       S|   43| 0.18067226890756302|\n",
      "|       H|    3|0.012605042016806723|\n",
      "+--------+-----+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Frequencies are relative to `n`, the total number of interactions\n",
    "# (carbon contacts included).\n",
    "(\n",
    "    topElements\n",
    "    .withColumn(\"frequency\", topElements[\"count\"] / n)\n",
    "    .filter(\"frequency > 0.01\")\n",
    "    .sort(\"frequency\", ascending=False)\n",
    "    .show(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+------------------+\n",
      "|element2|     avg(distance)|\n",
      "+--------+------------------+\n",
      "|       N| 2.247671846832548|\n",
      "|      Cl|2.3399999141693115|\n",
      "|       O| 2.340171109189044|\n",
      "|       S|2.3423283100128174|\n",
      "|       C| 2.727002328092402|\n",
      "|       H|2.8938498497009277|\n",
      "+--------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Mean interaction distance per partner element, smallest mean first.\n",
    "# Carbon is not filtered out here.\n",
    "(\n",
    "    interactions\n",
    "    .groupBy(\"element2\")\n",
    "    .avg(\"distance\")\n",
    "    .sort(\"avg(distance)\")\n",
    "    .show(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregate multiple statistics\n",
    "\n",
    "### NOTE: requires `from pyspark.sql.functions import *` (see the Imports cell)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+---------------+------------------+-------------+-------------+-------------------+\n",
      "|element2|count(distance)|     avg(distance)|min(distance)|max(distance)| kurtosis(distance)|\n",
      "+--------+---------------+------------------+-------------+-------------+-------------------+\n",
      "|       O|             91| 2.340171109189044|    1.8502038|    2.9841056|-0.5095228492389405|\n",
      "|       C|             44| 2.727002328092402|    1.8144855|    2.9990435|  2.050274417960135|\n",
      "|       N|             56| 2.247671846832548|    1.9923105|    2.9953997|  2.470076287060217|\n",
      "|      Cl|              1|2.3399999141693115|         2.34|         2.34|                NaN|\n",
      "|       S|             43|2.3423283100128174|    2.2196188|    2.4604716| 0.3902514824014989|\n",
      "|       H|              3|2.8938498497009277|     2.844304|     2.979628|-1.4999999999999993|\n",
      "+--------+---------------+------------------+-------------+-------------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# count, avg, min, max and kurtosis are the pyspark.sql.functions versions\n",
    "# brought in by the star import; min and max shadow the Python builtins.\n",
    "(\n",
    "    interactions\n",
    "    .groupBy(\"element2\")\n",
    "    .agg(\n",
    "        count(\"distance\"),\n",
    "        avg(\"distance\"),\n",
    "        min(\"distance\"),\n",
    "        max(\"distance\"),\n",
    "        kurtosis(\"distance\"),\n",
    "    )\n",
    "    .show(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Terminate Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shut down the SparkContext held in `sc`.\n",
    "sc.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}