{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simple Zinc Interaction Analysis Example\n",
    "\n",
    "<img src=\"./figures/zinc_interaction.png\" style=\"width: 300px;\"/>\n",
    "\n",
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkConf, SparkContext\n",
    "from mmtfPyspark.datasets import groupInteractionExtractor\n",
    "from mmtfPyspark.io import mmtfReader\n",
    "from mmtfPyspark.webfilters import Pisces"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configure Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "conf = SparkConf().setMaster(\"local[*]\") \\\n",
    "                  .setAppName(\"simpleZincInteractionDemo\")\n",
    "\n",
    "sc = SparkContext(conf = conf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read PDB in MMTF format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../../resources/mmtf_full_sample/\"\n",
    "\n",
    "pdb = mmtfReader.read_sequence_file(path, sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use only representative structures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "seqId = 40\n",
    "resolution = 2.0\n",
    "\n",
    "pdb = pdb.filter(Pisces(seqId, resolution))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract proteins with Zn interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "finder = groupInteractionExtractor(\"ZN\",3)\n",
    "\n",
    "interactions = finder.get_dataset(pdb).cache()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List the top 10 residue types that interact with Zn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- structureId: string (nullable = false)\n",
      " |-- residue1: string (nullable = false)\n",
      " |-- atom1: string (nullable = false)\n",
      " |-- element1: string (nullable = false)\n",
      " |-- index1: integer (nullable = false)\n",
      " |-- residue2: string (nullable = false)\n",
      " |-- atom2: string (nullable = false)\n",
      " |-- element2: string (nullable = false)\n",
      " |-- index2: integer (nullable = false)\n",
      " |-- distance: float (nullable = false)\n",
      "\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    50|2.3709755|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    53|2.3940797|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     HIS|  NE2|       N|    70|2.2196307|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    72|2.3465357|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   415|2.3747551|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   418|2.3680198|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     HIS|  NE2|       N|   435|2.1647959|\n",
      "|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   437|2.3763454|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     HIS|  CE1|       C|    53|2.9807622|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     HIS|  NE2|       N|    53| 2.040789|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|   CG|       C|    67| 2.754825|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|  OD1|       O|    67|2.8967845|\n",
      "|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|  OD2|       O|    67|1.9672809|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     HIS|  NE2|       N|    10|2.2776458|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     HIS|  NE2|       N|    12|2.1644206|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     GLU|  OE2|       O|   123|2.3778422|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   582|     ASP|  OD1|       O|   241|  2.41581|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   583|     GLU|   CD|       C|   123|2.7811828|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   583|     GLU|  OE1|       O|   123|2.1997967|\n",
      "|       1BF6|      ZN|   ZN|      Zn|   583|     HIS|  ND1|       N|   156|2.2733805|\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "only showing top 20 rows\n",
      "\n",
      "Number of interactions: 238\n"
     ]
    }
   ],
   "source": [
    "interactions.printSchema()\n",
    "\n",
    "interactions.show(20)\n",
    "\n",
    "print(f\"Number of interactions: {interactions.count()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show the top 10 interacting groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+\n",
      "|residue2|count|\n",
      "+--------+-----+\n",
      "|     HIS|   76|\n",
      "|     CYS|   43|\n",
      "|     HOH|   37|\n",
      "|     GLU|   34|\n",
      "|     ASP|   27|\n",
      "|     ACT|   12|\n",
      "|     TRP|    4|\n",
      "|     LYS|    2|\n",
      "|     VAL|    2|\n",
      "|      CL|    1|\n",
      "+--------+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "interactions.groupBy(\"residue2\") \\\n",
    "            .count() \\\n",
    "            .sort(\"count\", ascending = False) \\\n",
    "            .show(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Terminate Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}