{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Secondary Structure Property Encoder Demo\n", "\n", "This demo creates a dataset of sequence segments derived from a non-redundant set. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 codes of the center residue in a sequence segment, and a property encoding of the sequence segment.\n", "\n", "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext, SQLContext\n", "from mmtfPyspark.ml import ProteinSequenceEncoder\n", "from mmtfPyspark.mappers import StructureToPolymerChains\n", "from mmtfPyspark.filters import ContainsLProteinChain\n", "from mmtfPyspark.datasets import secondaryStructureSegmentExtractor\n", "from mmtfPyspark.webfilters import Pisces\n", "from mmtfPyspark.io import mmtfReader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure Spark Context" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "conf = SparkConf() \\\n", " .setMaster(\"local[*]\") \\\n", " .setAppName(\"SecondaryStructurePropertyEncoderDemo\")\n", "\n", "sc = SparkContext(conf = conf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " ## Read MMTF Hadoop sequence file and \n", " \n", " Create a non-redundant set (<=20% seq. 
identity) of L-protein chains" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "path = \"../../resources/mmtf_reduced_sample/\"\n", "sequenceIdentity = 20\n", "resolution = 2.0\n", "fraction = 0.1\n", "seed = 123\n", "\n", "pdb = mmtfReader \\\n", " .read_sequence_file(path, sc) \\\n", " .flatMap(StructureToPolymerChains()) \\\n", " .filter(Pisces(sequenceIdentity, resolution)) \\\n", " .filter(ContainsLProteinChain()) \\\n", " .sample(False, fraction, seed)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get content" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "original data : 2149\n" ] } ], "source": [ "segmentLength = 11\n", "data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength).cache()\n", "print(f\"original data : {data.count()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop Q3 and sequence duplicates" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- duplicate Q3/seq : 2149\n" ] } ], "source": [ "data = data.dropDuplicates([\"labelQ3\", \"sequence\"]).cache()\n", "print(f\"- duplicate Q3/seq : {data.count()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop sequence duplicates" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "- duplicate seq : 2149\n" ] } ], "source": [ "data = data.dropDuplicates([\"sequence\"])\n", "print(f\"- duplicate seq : {data.count()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Property Encoding" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- structureChainId: string (nullable = true)\n", " |-- sequence: string (nullable = 
false)\n", " |-- labelQ8: string (nullable = true)\n", " |-- labelQ3: string (nullable = true)\n", " |-- features: vector (nullable = true)\n", "\n", "+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|structureChainId|sequence |labelQ8|labelQ3|features |\n", "+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|1A9X.F |DIDTRKLTRLL|H |H |[1.6,0.11,2.78,-0.77,2.95,0.25,0.2,4.19,0.19,4.0,1.8,6.04,0.3,0.45,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,3.03,0.11,2.6,0.26,5.6,0.21,0.36,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.59,0.19,4.0,1.7,6.04,0.39,0.31,3.03,0.11,2.6,0.26,5.6,0.21,0.36,2.59,0.19,4.0,1.7,6.04,0.39,0.31,2.59,0.19,4.0,1.7,6.04,0.39,0.31] |\n", "|1FO8.A |DLEVAPDFFEY|T |C |[1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,3.67,0.14,3.0,1.22,6.02,0.27,0.49,2.34,0.29,6.13,-1.01,10.74,0.36,0.25,2.67,0.0,2.72,0.72,6.8,0.13,0.34,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.94,0.29,5.89,1.79,5.67,0.3,0.38,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,2.94,0.3,6.47,0.96,5.66,0.25,0.41] |\n", "|1A9X.F |EDLSSYLKRHN|H |H 
|[1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,2.94,0.3,6.47,0.96,5.66,0.25,0.41,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.99,0.23,4.66,0.13,7.69,0.27,0.3,1.6,0.13,2.95,-0.6,6.52,0.21,0.22] |\n", "|1EB6.A |GDESKFEEYFK|H |H |[0.0,0.0,0.0,0.0,6.07,0.13,0.15,1.6,0.11,2.78,-0.77,2.95,0.25,0.2,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,1.89,0.22,4.77,-0.99,9.99,0.32,0.27,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,2.94,0.3,6.47,0.96,5.66,0.25,0.41,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.89,0.22,4.77,-0.99,9.99,0.32,0.27]|\n", "|1C1K.A |IISFETFILLD|H |H |[4.19,0.19,4.0,1.8,6.04,0.3,0.45,4.19,0.19,4.0,1.8,6.04,0.3,0.45,1.31,0.06,1.6,-0.04,5.7,0.2,0.28,2.94,0.29,5.89,1.79,5.67,0.3,0.38,1.56,0.15,3.78,-0.64,3.09,0.42,0.21,3.03,0.11,2.6,0.26,5.6,0.21,0.36,2.94,0.29,5.89,1.79,5.67,0.3,0.38,4.19,0.19,4.0,1.8,6.04,0.3,0.45,2.59,0.19,4.0,1.7,6.04,0.39,0.31,2.59,0.19,4.0,1.7,6.04,0.39,0.31,1.6,0.11,2.78,-0.77,2.95,0.25,0.2] |\n", "+----------------+-----------+-------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "encoder = ProteinSequenceEncoder(data)\n", "data = encoder.property_encode()\n", "\n", "data.printSchema()\n", "data.show(5, False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Terminate Spark Context" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] } ], "metadata": 
{ "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }