{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Secondary Structure Elements Word2Vec Encoder Demo\n", "\n", "This demo creates a dataset by extracting secondary structure elements \"H\", then encodes them as overlapping n-gram Word2Vec feature vectors.\n", "\n", "## Imports" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext, SQLContext\n", "from mmtfPyspark.ml import ProteinSequenceEncoder\n", "from mmtfPyspark.mappers import StructureToPolymerChains\n", "from mmtfPyspark.filters import ContainsLProteinChain\n", "from mmtfPyspark.datasets import secondaryStructureElementExtractor\n", "from mmtfPyspark.webfilters import Pisces\n", "from mmtfPyspark.io import mmtfReader" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure Spark Context" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# local[*]: run Spark locally, using all available cores.\n", "conf = SparkConf() \\\n", "    .setMaster(\"local[*]\") \\\n", "    .setAppName(\"SecondaryStructureElementsWord2VecEncoderDemo\")\n", "\n", "# getOrCreate reuses an already-running context, so re-executing this cell\n", "# does not fail with \"Cannot run multiple SparkContexts at once\".\n", "sc = SparkContext.getOrCreate(conf=conf)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Read MMTF Hadoop sequence file\n", "\n", "Create a non-redundant set (<=20% seq. 
identity) of L-protein chains\n", "\n", "**Note (review):** the cell below does not call the imported `Pisces` filter; it keeps L-protein chains and draws a random 5% sample. TODO: confirm that the sample file is already non-redundant." ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Input location and sampling settings (fixed seed for a reproducible sample).\n", "path = \"../../resources/mmtf_reduced_sample/\"\n", "fraction = 0.05\n", "seed = 123\n", "\n", "# Split structures into polymer chains, keep the chains accepted by\n", "# ContainsLProteinChain, then sample without replacement.\n", "pdb = (\n", "    mmtfReader.read_sequence_file(path, sc)\n", "    .flatMap(StructureToPolymerChains(False, True))\n", "    .filter(ContainsLProteinChain())\n", "    .sample(withReplacement=False, fraction=fraction, seed=seed)\n", ")" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract Element \"H\" from Secondary Structure" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "original data : 3297\n", "+-----------------------------+-----+\n", "|sequence |label|\n", "+-----------------------------+-----+\n", "|ACAGV |H |\n", "|GIGLHLAVRLA |H |\n", "|RLWEAARA |H |\n", "|KSVAAARE |H |\n", "|EDAVASVLDVNVVGTVRMLQAFLPDMKRR|H |\n", "|VYCASKFALEGLCESLAVLLLPF |H |\n", "|IHTFHRFYQYLALSKQVFREA |H |\n", "|EEVAEVFLTALR |H |\n", "|LPLLRMRL |H |\n", "|NYVTAMHREVF |H |\n", "+-----------------------------+-----+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "label = \"H\"\n", "\n", "# Cached because the dataset is counted here and reused by the encoder cell.\n", "data = secondaryStructureElementExtractor.get_dataset(pdb, label).cache()\n", "\n", "print(f\"original data : {data.count()}\")\n", "data.show(n=10, truncate=False)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Word2Vec-encoded feature vector" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----+--------------------+--------------------+\n", "| sequence|label| ngram| features|\n", "+--------------------+-----+--------------------+--------------------+\n", "| ACAGV| H| [AC, CA, AG, GV]|[0.46121373027563...|\n", "| GIGLHLAVRLA| H|[GI, IG, GL, LH, ...|[-0.2606903441250...|\n", "| RLWEAARA| H|[RL, LW, WE, EA, ...|[0.16112836982522...|\n", "| KSVAAARE| H|[KS, SV, VA, AA, ...|[1.15827076775687...|\n", "|EDAVASVLDVNVVGTVR...| H|[ED, DA, AV, VA, ...|[0.37046241248026...|\n", "+--------------------+-----+--------------------+--------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "segmentLength = 11\n", "n = 2\n", "# Integer division keeps windowSize an int (5); \"/\" would give the float 5.0\n", "# under Python 3, while Word2Vec's window size is an integer parameter.\n", "windowSize = (segmentLength - 1) // 2\n", "vectorSize = 50\n", "\n", "encoder = ProteinSequenceEncoder(data)\n", "# overlapping_ngram_word2vec_encode takes keyword arguments.\n", "# The result gets its own name so that `data` is not overwritten and this\n", "# cell can be re-run safely.\n", "data_encoded = encoder.overlapping_ngram_word2vec_encode(n=n, windowSize=windowSize, vectorSize=vectorSize)\n", "\n", "data_encoded.show(5)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Terminate Spark Context" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }