{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ATP Interaction Anaylsis\n",
    "\n",
    "This demo shows how to create a dataset of ATP Interating atoms.\n",
    "\n",
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkConf, SparkContext, SQLContext\n",
    "from mmtfPyspark.datasets import groupInteractionExtractor\n",
    "from mmtfPyspark.io import mmtfReader\n",
    "from mmtfPyspark.webfilters import Pisces\n",
    "import time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configure Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "conf = SparkConf().setMaster(\"local[*]\") \\\n",
    "                  .setAppName(\"ATPInteractionAnalysisDemo\")\n",
    "    \n",
    "sc = SparkContext(conf = conf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read PDB in MMTF format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../../resources/mmtf_full_sample/\"\n",
    "\n",
    "pdb = mmtfReader.read_sequence_file(path, sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filter by sequence identity subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "seqId = 40\n",
    "resolution = 2.0\n",
    "\n",
    "pdb = pdb.filter(Pisces(seqId, resolution))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find ATP interactions within 3 Angstroms\n",
    "\n",
    "![ATPInteraction](./figures/atp-dist2.jpg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "finder = groupInteractionExtractor(\"ATP\", 3)\n",
    "\n",
    "interactions = finder.get_dataset(pdb).cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "interactions = interactions.filter(\"atom1 LIKE('O%G')\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show the data schema of the dataset and some data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- structureId: string (nullable = false)\n",
      " |-- residue1: string (nullable = false)\n",
      " |-- atom1: string (nullable = false)\n",
      " |-- element1: string (nullable = false)\n",
      " |-- index1: integer (nullable = false)\n",
      " |-- residue2: string (nullable = false)\n",
      " |-- atom2: string (nullable = false)\n",
      " |-- element2: string (nullable = false)\n",
      " |-- index2: integer (nullable = false)\n",
      " |-- distance: float (nullable = false)\n",
      "\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "|       1B0U|     ATP|  O3G|       O|   261|     SER|   OG|       O|    36|2.6808183|\n",
      "|       1B0U|     ATP|  O3G|       O|   261|     HOH|    O|       O|   272|2.7428646|\n",
      "|       1B0U|     ATP|  O1G|       O|   261|     HOH|    O|       O|   293| 2.750308|\n",
      "|       1B0U|     ATP|  O2G|       O|   261|     HOH|    O|       O|   299|2.8596847|\n",
      "|       1B0U|     ATP|  O3G|       O|   261|     HOH|    O|       O|   356|2.6914153|\n",
      "+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "interactions.printSchema()\n",
    "\n",
    "interactions.show(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count number of interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of interactions: 5\n"
     ]
    }
   ],
   "source": [
    "n = interactions.count()\n",
    "\n",
    "print(f\"Number of interactions: {n}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identify top interacting groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+\n",
      "|residue2|count|\n",
      "+--------+-----+\n",
      "|     HOH|    4|\n",
      "|     SER|    1|\n",
      "+--------+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "topGroups = interactions.groupBy(\"residue2\").count()\n",
    "\n",
    "topGroups.sort(\"count\", ascending = False).show(10) # Sort descending by count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Top interacting groups/atoms types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+-----+---------+\n",
      "|residue2|atom2|count|frequency|\n",
      "+--------+-----+-----+---------+\n",
      "|     HOH|    O|    4|      0.8|\n",
      "|     SER|   OG|    1|      0.2|\n",
      "+--------+-----+-----+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "topGroupsAndAtoms = interactions.groupBy(\"residue2\",\"atom2\").count()\n",
    "\n",
    "topGroupsAndAtoms.withColumn(\"frequency\", topGroupsAndAtoms[\"count\"] / n)\\\n",
    "                 .sort(\"frequency\", ascending = False) \\\n",
    "                 .show(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Terminate Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}