Secondary Structure Word2Vec Encoder

This demo creates a dataset of sequence segments derived from a non-redundant set. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 codes of the center residue in the sequence segment, and a Word2Vec encoding of the sequence segment.

Imports

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader
import time

Configure Spark Context

In [2]:
# Spark configuration: run locally ("local[*]" master) under a descriptive
# application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("secondaryStructureWord2VecEncodeDemo")
)

# getOrCreate returns the already-running SparkContext if there is one, so
# re-executing this cell in a live kernel does not fail the way a second
# SparkContext(...) construction would.
sc = SparkContext.getOrCreate(conf=conf)

Read in, filter and sample Hadoop Sequence Files

In [3]:
# Directory of the Hadoop sequence files to read (relative to this notebook).
path = "../../resources/mmtf_reduced_sample/"

# Filter and sampling parameters.
sequenceIdentity = 20  # first argument of the Pisces filter
resolution = 2.0       # second argument of the Pisces filter
fraction = 0.95        # expected fraction of elements kept by sample()
seed = 123             # fixed seed so the sample is repeatable

# Read the structures, flat-map them through StructureToPolymerChains, keep
# only entries that pass the Pisces and ContainsLProteinChain filters, and
# finally draw a sample without replacement.
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(withReplacement=False, fraction=fraction, seed=seed)
)

Extract Secondary Structure Segments

In [4]:
# Length of each extracted sequence segment; the labels produced for a
# segment refer to its center residue.
segmentLength = 11

# Build the segment dataset from the filtered chains and cache it, since it
# is reused by the cells that follow.
data = (
    secondaryStructureSegmentExtractor
    .get_dataset(pdb, segmentLength)
    .cache()
)

Add Word2Vec encoded feature vector

In [6]:
# Wrap the segment dataset in the sequence encoder.
encoder = ProteinSequenceEncoder(data)

# Word2Vec parameters.
n = 2                                  # n-gram size (2 -> overlapping 2-grams)
windowSize = (segmentLength - 1) // 2  # 5 for segmentLength = 11
vectorSize = 50                        # length of the resulting feature vector

# overlapping_ngram_word2vec_encode takes its parameters as keyword arguments.
data = encoder.overlapping_ngram_word2vec_encode(
    n=n,
    windowSize=windowSize,
    vectorSize=vectorSize,
)

Show the dataset schema and a few rows of data

In [7]:
# Print the column names and types of the encoded dataset.
data.printSchema()

# Display the first 10 rows without truncating long cell values, so the
# full n-gram lists and feature vectors are visible.
data.show(n=10, truncate=False)
root
 |-- structureChainId: string (nullable = false)
 |-- sequence: string (nullable = false)
 |-- labelQ8: string (nullable = false)
 |-- labelQ3: string (nullable = false)
 |-- ngram: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+----------------+-----------+-------+-------+----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|structureChainId|sequence   |labelQ8|labelQ3|ngram                                   |features                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+----------------+-----------+-------+-------+----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1FIP.B          |QVTQKPLRDSV|C      |C      |[QV, VT, TQ, QK, KP, PL, LR, RD, DS, SV]|[0.7218928992748261,-1.9542026817798615,0.6141835749149323,1.1600496664643287,-0.27640406005084517,1.4911903724074365,1.511457195878029,-0.007190728187561036,-0.9487916529178619,-0.4628058046102524,0.6366564273834229,-0.24788947850465776,0.45128672122955327,0.6480263471603394,0.18761155083775521,0.1619911164045334,-0.30771539211273197,0.6427282989025116,-1.341044443845749,-2.816843903809786,1.3873128592967987,0.2423313617706299,0.9503013789653778,0.30704727172851565,0.32808160185813906,0.42719419894274324,-0.7198130667209626,-0.1663548767566681,-0.1287058837711811,0.9187342792749406,0.25574764609336853,-0.5546834290027619,-0.08139598369598389,0.6092532634735108,0.6034509018063545,0.6945261999964715,0.5676412969827652,-0.52536511272192,-0.9364444792270661,1.5990618020296097,-0.05396939516067505,-0.049348920583724976,-1.1273640125989914,-0.4422869816422463,-0.9838919207453728,-0.5735954821109772,0.268616408482194,0.15573585331439974,-0.5042540408670902,0.30428771376609803]    |
|1FIP.B          |VTQKPLRDSVK|H      |H      |[VT, TQ, QK, KP, PL, LR, RD, DS, SV, VK]|[0.45652487874031067,-1.9762823641300202,0.8651541888713837,0.7666460171341897,0.12646145708858966,1.6430284127593042,2.284087720513344,0.34733521938323975,-1.057218524813652,-0.02226594388484955,0.07116093039512635,0.31447787135839467,0.5521076768636703,0.3366988211870194,0.5947444386780262,-0.2880236178636551,0.037387168407440184,0.6817272737622262,-1.2181337743997576,-2.549136293679476,0.9582108676433564,0.40777019709348683,0.6062138259410859,0.4773241370916367,-0.37046635746955875,0.3872552007203922,-0.9327833235263825,0.6253603398799896,-0.8170483835041523,1.2962547689676285,0.565328186750412,-0.4655628755688668,0.4370636761188507,0.8207613229751587,0.7193407371640206,1.0652868136763574,0.3522067219018936,-0.06724023669958115,-0.674774843454361,1.5146269530057908,0.07259597778320313,0.4994284451007843,-0.5849824875593186,-0.268012285977602,-1.4730323597788813,-0.5532087743282318,0.4165456745773554,0.3551592580974102,-0.7105251915752888,0.5414398968219757]               |
|1FIP.B          |TQKPLRDSVKQ|H      |H      |[TQ, QK, KP, PL, LR, RD, DS, SV, VK, KQ]|[-0.018302875757217407,-2.1000656187534332,0.6336446583271027,0.8518256187438965,0.5016914833337068,1.6296857938170435,2.572529545426369,0.4114768266677857,-0.7790118128061295,-0.1031761258840561,0.4647514402866364,0.7176718816161156,0.24660292565822603,0.5589953929185868,0.12458120062947274,0.1586373895406723,0.14718267917633057,0.7307373791933061,-1.1755008727312088,-1.605568803101778,1.3931305825710298,0.26324320286512376,1.1875256955623628,0.1651293784379959,-0.7394503057003021,1.4159013360505925,-0.042308670282363896,1.0027690947055816,-0.45229040160775186,1.2500343948602677,0.5970235407352448,-0.350436969101429,0.7272633731365205,1.096564558148384,0.3461916401982308,1.2782142028212549,0.3137209326028824,-0.7195944532752038,-0.942888218164444,0.6032458513975144,0.3333606243133545,0.6760650098323823,-0.7884602069854737,-0.38885987848043446,-1.0330197259783744,-0.6777424156665802,0.505250832810998,0.22407433316111566,-0.3025555968284607,0.6762225765734912]                |
|1FIP.B          |QKPLRDSVKQA|H      |H      |[QK, KP, PL, LR, RD, DS, SV, VK, KQ, QA]|[-0.09394910931587219,-1.7359864532947542,0.48190932869911196,1.1017611503601075,0.4381964433938265,0.9674800738692284,2.6951666831970216,-0.10795419216156006,-0.21965324729681016,-0.24005975723266604,0.5793093025684357,0.38524354547262196,0.15508871972560884,1.2828427821397783,0.027962434291839602,-0.05348180830478669,-0.2202654480934143,0.9114363759756089,-1.37995522916317,-1.8016631729900838,1.4356273591518403,0.12687805742025376,1.1562089145183563,0.2721506178379059,-0.7024285018444062,1.8116968482499942,-0.4658932149410248,1.2231724083423616,0.02519536241889,1.1606818109750747,0.3845492899417877,-0.03962496668100357,0.6376563966274262,0.7603978663682938,0.39072228074073795,1.0050033673644065,0.3031055599451065,-0.6297420725226403,-0.6094307601451874,0.5603405088186264,0.6178987741470338,1.1913070261478425,-1.0390252113342286,-0.24788850396871567,-1.0312373086810112,-0.35330314040184024,0.5266281932592393,-0.2854208178818226,-0.3319072604179383,0.6453560490161181]       |
|1FIP.B          |KPLRDSVKQAL|H      |H      |[KP, PL, LR, RD, DS, SV, VK, KQ, QA, AL]|[0.015949952602386474,-1.6833390891551971,0.6255883038043977,1.354280686378479,-0.14153966419398786,0.5419679507613182,2.7353835225105287,0.10353403091430664,-0.24075703471899035,0.04468633607029915,0.5111434429883958,0.15788426250219345,0.6207536071538926,0.8924932956695557,0.2500663995742798,-0.08235115110874176,0.04554082825779915,1.005113670229912,-0.7139833897352219,-1.5365664370357992,1.2528764188289643,0.21516154259443285,1.1701266467571259,0.43039557337760925,-1.019785088300705,2.0560802906518805,-1.1486250340938569,0.9742672801017762,-0.12975675836205483,1.126174834370613,0.23164895176887512,0.3583108544349671,0.6194697082042695,0.8349089890718461,0.1860122323036194,0.4633846029639244,-0.09926954209804535,-0.6675917729735374,0.08888607621192933,0.4939302772283554,1.2383000254631042,0.9662425100803376,-0.5642978549003601,-0.5871108695864677,-0.3472769968211651,-0.7674383580684663,-0.2508073002099991,-0.011432189494371414,-0.6407379984855652,1.0408918637782336]       |
|1FIP.B          |PLRDSVKQALK|H      |H      |[PL, LR, RD, DS, SV, VK, KQ, QA, AL, LK]|[0.34722700119018557,-2.061044675111771,0.46629926562309265,1.373120164871216,0.15599417574703695,0.3972497329115868,2.3960951209068297,-0.47557168006896977,-0.09631032198667527,0.28868388310074805,0.40378569066524506,0.2857971951365471,-0.09471794664859773,1.0997533291578294,0.320883983373642,-0.2192752778530121,0.4890629656612873,0.7825126498937607,-0.6501036137342453,-2.1232673771679402,1.7199108541011812,0.16032371670007706,0.4207773864269257,0.18990159258246422,-0.5684510171413422,1.4107022255426274,-1.1511801302433013,1.0176879048347474,0.22808528169989586,1.280660143494606,0.014142298698425294,0.3361297726631165,0.3676656186580658,0.43270951807498936,-0.045315349102020265,0.17734556645154953,-0.3930076748132706,-0.1348002538084984,0.10938120484352112,0.6633503764867783,1.3535306215286256,1.213214421272278,-0.8979302287101746,0.09876762181520463,-1.0234837882220746,-1.040145707130432,-0.46417640745639804,0.1515015356242657,-0.5875134825706482,0.8592647213488818]       |
|1FIP.B          |LRDSVKQALKN|H      |H      |[LR, RD, DS, SV, VK, KQ, QA, AL, LK, KN]|[0.20828620195388795,-2.018749839067459,0.33541299700737004,1.2898306846618652,0.5835982788354158,0.7634967043995857,2.163742458820343,-0.4230422914028168,0.16854635030031206,-0.17809961661696436,0.21859435737133026,0.08038461655378343,-0.2832954078912735,1.2158313084393741,0.20448452234268188,-0.3508767008781433,0.6029431708157063,0.38194607198238373,-0.3441363126039505,-1.6464017160236837,1.1240564346313477,0.025350452959537507,0.23069041967391968,0.1389751188457012,-0.5457875519990921,1.9448135375743734,-0.6778485596179963,0.48493925333023075,-0.39506885781884193,0.6549557000398636,-0.01664799451828003,0.017655837535858154,1.071532315015793,-0.26299986988306046,-0.19998992681503297,0.4313888967037201,-0.39875918552279477,0.02179919481277466,0.16763667464256288,0.7828942835330963,0.8571268856525421,1.2723923563957216,-0.5347977995872498,0.2104453518986702,-0.2612844578921795,-0.5525412559509277,0.25091978013515476,0.17714601680636408,-0.5217572562396526,0.7144922394305468]|
|1FIP.B          |RDSVKQALKNY|H      |H      |[RD, DS, SV, VK, KQ, QA, AL, LK, KN, NY]|[0.17934538125991822,-1.6363632023334505,0.22888252139091492,0.8593286871910095,0.5716567128896713,0.8357971027493477,1.9786513328552247,-0.10931193232536317,0.382646156847477,-0.8973433576524258,0.3014722913503647,-0.09126896411180496,0.14780592620372773,1.5966568041592837,0.4685353964567185,-0.08567768335342407,-0.06229518577456475,0.07647542655467987,-0.13770822882652284,-1.6120177038013936,0.5439030647277833,-0.36308919042348864,-0.007282352447509766,0.02912766858935356,-0.8129350930452347,1.7087860107189046,-0.1884878933429718,0.5515900745987893,-0.36718925908207894,0.31720527708530427,-0.14844588339328765,0.12054680585861206,0.7908448070287705,0.7718727812170982,-0.1849958300590515,0.534985713660717,-0.2009143762290478,0.18014255464076998,0.6151047348976135,0.40380980968475344,0.8676453292369843,1.0095135688781738,-0.6030888438224793,-0.4915942713618279,-1.526147996634245,-0.24818378686904907,0.66266528069973,0.39920665547251705,-0.8926080577075481,0.5296745199710131] |
|1FIP.B          |DSVKQALKNYF|H      |H      |[DS, SV, VK, KQ, QA, AL, LK, KN, NY, YF]|[0.6993545413017274,-1.6296955883502962,-0.19540942311286927,0.7496838688850403,0.9932902187108994,1.119630654156208,1.6919454395771027,-0.017393511533737183,1.142455415427685,-0.7575292326509953,0.20441125333309174,-0.05146206468343735,0.5857458084821702,2.188424639776349,0.033381232619285585,0.24651337862014772,-0.42692609652876856,0.022507238388061526,-0.7141059577465058,-1.4383350729942324,0.6240529894828797,-0.7891646847128868,0.0665635108947754,-0.3616538025438786,-0.7982579916715622,1.674454665160738,-0.11802821159362793,0.329801507294178,0.29182670414447787,0.23444792330265046,-0.33856916427612305,0.32228373885154726,1.2388611763715744,0.12735521867871286,-0.2670617461204529,1.0012074634432793,0.3161179132759571,-0.11882337629795076,0.9837825059890748,0.3947523355484009,0.7789762288331986,0.9882625102996827,-0.6959138661623001,-0.4668252468109131,-1.0430869035422803,0.08239964246749878,0.7844762295484543,0.1619375877082348,-0.8241980187594891,-0.08069768957793713]   |
|1FIP.B          |SVKQALKNYFA|H      |H      |[SV, VK, KQ, QA, AL, LK, KN, NY, YF, FA]|[0.4919814467430115,-0.9431173026561738,-0.2580112755298615,1.5856716990470887,0.8879776515066624,1.0071870923042299,1.126893788576126,-0.2444527566432953,1.1166454896330833,-0.7947338439524174,0.6635253161191941,0.15614522844552994,0.6978812485933304,2.8989720392972234,-0.3602645188570023,1.0545634388923646,-0.18614707812666895,-0.5962471559643746,-0.5550552397966385,-1.0354241728782654,0.4720494568347931,-0.8187196835875512,0.017310690879821778,-0.3344152785837651,-0.49849302470684054,1.5986349225044252,-0.10742241144180298,0.5757028475403786,0.8685306817293168,0.3660833477973938,-0.6401085242629052,0.2863482415676117,0.8527170032262803,-0.14515131637454035,0.052951979637146,1.529894070327282,-0.10164483115077019,-0.18837830126285554,1.0436001420021057,-0.045152938365936285,0.7655945390462876,1.641186797618866,-0.8167492181062699,-0.34609851837158206,-1.127408654242754,0.534778642654419,0.848238703608513,-0.16862570419907572,-1.2945820324122908,-0.4515532594174147]        |
+----------------+-----------+-------+-------+----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 10 rows

Terminate Spark Context

In [8]:
# Shut down the SparkContext created in the configuration cell; this is the
# final step of the notebook.
sc.stop()