{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Protein Fold Dataset Creator Dmeo\n", "\n", "This Demo is a simple example of using Dataset operations to create a datset\n", "\n", "## Imports" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext, SQLContext\n", "from pyspark.sql.functions import col, when\n", "from mmtfPyspark.ml import ProteinSequenceEncoder\n", "from mmtfPyspark.mappers import StructureToPolymerChains\n", "from mmtfPyspark.filters import ContainsLProteinChain\n", "from mmtfPyspark.datasets import secondaryStructureExtractor\n", "from mmtfPyspark.webfilters import Pisces\n", "from mmtfPyspark.io import mmtfReader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define addProteinFoldType function" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def add_protein_fold_type(data, minThreshold, maxThreshold):\n", " '''\n", " Adds a column \"foldType\" with three major secondary structure class:\n", " \"alpha\", \"beta\", \"alpha+beta\", and \"other\" based upon the fraction of alpha/beta content.\n", "\n", " The simplified syntax used in this method relies on two imports:\n", " from pyspark.sql.functions import when\n", " from pyspark.sql.functions import col\n", "\n", " Attributes:\n", " data (Dataset): input dataset with alpha, beta composition\n", " minThreshold (float): below this threshold, the secondary structure is ignored\n", " maxThreshold (float): above this threshold, the secondary structure is ignored\n", " '''\n", "\n", " return data.withColumn(\"foldType\", \\\n", " when((col(\"alpha\") > maxThreshold) & (col(\"beta\") < minThreshold), \"alpha\"). \\\n", " when((col(\"beta\") > maxThreshold) & (col(\"alpha\") < minThreshold), \"beta\"). \\\n", " when((col(\"alpha\") > maxThreshold) & (col(\"beta\") > minThreshold), \"alpha+beta\"). \\\n", " otherwise(\"other\")\\\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure Spark Context" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "conf = SparkConf() \\\n", " .setMaster(\"local[*]\") \\\n", " .setAppName(\"ProteinFoldDatasetCreatorDemo\")\n", "\n", "sc = SparkContext(conf = conf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read MMTF Hadoop sequence file\n", "\n", "Create non-redundant set (<=40% seq. identity) if L-protein chains" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "path = \"../../resources/mmtf_reduced_sample/\"\n", "sequenceIdentity = 40\n", "resolution = 2.0\n", "\n", "pdb = mmtfReader \\\n", " .read_sequence_file(path, sc) \\\n", " .filter(Pisces(sequenceIdentity, resolution)) \\\n", " .flatMap(StructureToPolymerChains()) \\\n", " .filter(Pisces(sequenceIdentity, resolution)) \\\n", " .filter(ContainsLProteinChain())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get secondary structure content" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "data = secondaryStructureExtractor.get_dataset(pdb)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classify chains by secondary structure type" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "minThreshold = 0.05\n", "maxThreshold = 0.15\n", "data = add_protein_fold_type(data, minThreshold, maxThreshold)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Add Word2Vec encoded feature vector" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- structureChainId: string (nullable = false)\n", " |-- sequence: string (nullable = false)\n", " |-- alpha: float (nullable = false)\n", " |-- beta: float (nullable = false)\n", " |-- coil: float (nullable = false)\n", " |-- dsspQ8Code: string (nullable = false)\n", " |-- dsspQ3Code: string (nullable = false)\n", " |-- foldType: string (nullable = false)\n", " |-- ngram: array (nullable = true)\n", " | |-- element: string (containsNull = true)\n", " |-- features: vector (nullable = true)\n", "\n", "+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+\n", "|structureChainId| sequence| alpha| beta| coil| dsspQ8Code| dsspQ3Code| foldType| ngram| features|\n", "+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+\n", "| 1FEC.B|SRAYDLVVIGAGSGGLE...| 0.32783505|0.27216494| 0.4|XCCEEEEEECCSHHHHH...|XCCEEEEEECCCHHHHH...|alpha+beta|[SR, RA, AY, YD, ...|[0.83933154100662...|\n", "| 1FGY.A|TFFNPDREGWLLKLGGR...| 0.13385826|0.37007874| 0.496063|CCSSCSEEEEEEEECSS...|CCCCCCEEEEEEEECCC...| other|[TF, FF, FN, NP, ...|[0.71180595034393...|\n", "| 1FHG.A|ISGMSGRKASGSSPTSP...|0.029411765| 0.5980392|0.37254903|XXXXXXXXXXXXXXXXX...|XXXXXXXXXXXXXXXXX...| beta|[IS, SG, GM, MS, ...|[0.72951849662010...|\n", "| 1FIP.B|MFEQRVNSDVLTVSTVN...| 0.72602737| 0.0| 0.2739726|XXXXXXXXXXXXXXXXX...|XXXXXXXXXXXXXXXXX...| alpha|[MF, FE, EQ, QR, ...|[0.91695108936293...|\n", "| 1FIT.A|MSFRFGQHLIKPSVVFL...| 0.34126985|0.25396827| 0.4047619|XCEEETTEEECGGGEEE...|XCEEECCEEECHHHEEE...|alpha+beta|[MS, SF, FR, RF, ...|[0.95015770865425...|\n", "| 1FIU.D|MQPLFTQERRIFHKKLL...| 0.48951048|0.12937063| 0.3811189|CCSHHHHHHHHHHHHHH...|CCCHHHHHHHHHHHHHH...|alpha+beta|[MQ, QP, PL, LF, ...|[0.95298244105325...|\n", "| 1FL2.A|AYDVLIVGSGPAGAAAA...| 0.2580645|0.30967742|0.43225807|CEEEEEECCSHHHHHHH...|CEEEEEECCCHHHHHHH...|alpha+beta|[AY, YD, DV, VL, ...|[0.88702770774508...|\n", "| 1FLT.Y|GRPFVEMYSEIPEIIHM...|0.031914894| 0.4787234| 0.4893617|CCCBSSCCCSSCEEEEE...|CCCECCCCCCCCEEEEE...| beta|[GR, RP, PF, FV, ...|[0.94899295706381...|\n", "| 1FM0.D|MIKVLFFAQVRELVGTD...| 0.28395063|0.27160493|0.44444445|CEEEEECHHHHHHHTCS...|CEEEEECHHHHHHHCCC...|alpha+beta|[MI, IK, KV, VL, ...|[1.00496734850457...|\n", "| 1FM0.E|MAETKIVVGPQPFSVGE...| 0.36619717|0.35915494| 0.2746479|XCCEEEEEESSCCCHHH...|XCCEEEEEECCCCCHHH...|alpha+beta|[MA, AE, ET, TK, ...|[0.82849135274855...|\n", "+----------------+--------------------+-----------+----------+----------+--------------------+--------------------+----------+--------------------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "encoder = ProteinSequenceEncoder(data)\n", "n = 2 # Create 2-grams\n", "windowSize = 25 # 25-amino residue window size for Word2Vec\n", "vectorSize = 50 # dimension of feature vector\n", "# overlapping_ngram_word2vec_encode uses keyword attributes\n", "data = encoder.overlapping_ngram_word2vec_encode(n = n, windowSize = windowSize, vectorSize=vectorSize).cache()\n", "\n", "data.printSchema()\n", "data.show(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Keep only a subset of relevant fields for futher processing" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------+-----------+----------+----------+----------+--------------------+\n", "|structureChainId| alpha| beta| coil| foldType| features|\n", "+----------------+-----------+----------+----------+----------+--------------------+\n", "| 1FEC.B| 0.32783505|0.27216494| 0.4|alpha+beta|[0.83933154100662...|\n", "| 1FGY.A| 0.13385826|0.37007874| 0.496063| other|[0.71180595034393...|\n", "| 1FHG.A|0.029411765| 0.5980392|0.37254903| beta|[0.72951849662010...|\n", "| 1FIP.B| 0.72602737| 0.0| 0.2739726| alpha|[0.91695108936293...|\n", "| 1FIT.A| 0.34126985|0.25396827| 0.4047619|alpha+beta|[0.95015770865425...|\n", "| 1FIU.D| 0.48951048|0.12937063| 0.3811189|alpha+beta|[0.95298244105325...|\n", "| 1FL2.A| 0.2580645|0.30967742|0.43225807|alpha+beta|[0.88702770774508...|\n", "| 1FLT.Y|0.031914894| 0.4787234| 0.4893617| beta|[0.94899295706381...|\n", "| 1FM0.D| 0.28395063|0.27160493|0.44444445|alpha+beta|[1.00496734850457...|\n", "| 1FM0.E| 0.36619717|0.35915494| 0.2746479|alpha+beta|[0.82849135274855...|\n", "+----------------+-----------+----------+----------+----------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "data = data.select(\"structureChainId\", \"alpha\", \"beta\", \"coil\", \"foldType\", \"features\")\n", "\n", "data.show(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Terminate Spark Context" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }