Swiss Prot Sequence To Word2Vec

This demo generates Word2Vec models from Swiss-Prot protein sequences in UniProt using overlapping n-grams.

SwissProt

SwissProt

UniProt

Imports

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.datasets import uniProt

Configure Spark Context

In [2]:
# Build the Spark configuration step by step: local master using the
# "local[*]" setting, plus a descriptive application name for this demo.
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("SwissProtSequenceWord2VecEncodeDemo")

# Start the Spark context used by the rest of the notebook.
sc = SparkContext(conf=conf)

Get Swiss Prot dataset from UniProt

In [3]:
# Fetch the Swiss-Prot dataset and drop every row whose "sequence" value is null.
data = (
    uniProt.get_dataset(uniProt.SWISS_PROT)
    .na.drop(subset=["sequence"])
)

# Print the first 10 rows without truncating long column values.
data.show(10, truncate=False)
+---+----------------+----------+------------------------------------------+-------------------------------------+---------+----------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|db |uniqueIdentifier|entryName |proteinName                               |organismName                         |geneName |proteinExistence|sequenceVersion|sequence                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+---+----------------+----------+------------------------------------------+-------------------------------------+---------+----------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|>sp|Q6GZX4          |001R_FRG3G|Putativetranscriptionfactor001R           |Frogvirus3(isolateGoorha)OX=654924   |FV3-001R |4               |1              |MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL                                                                                                                                                                                                          |
|>sp|Q6GZX3          |002L_FRG3G|Uncharacterizedprotein002L                |Frogvirus3(isolateGoorha)OX=654924   |FV3-002L |4               |1              |MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW                                                                                                                                          |
|>sp|Q197F8          |002R_IIV3 |Uncharacterizedprotein002R                |Invertebrateiridescentvirus3OX=345201|IIV3-002R|4               |1              |MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPELQTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAITFEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDDLEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFETYGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMYSTILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSSGEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKIQSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC|
|>sp|Q197F7          |003L_IIV3 |Uncharacterizedprotein003L                |Invertebrateiridescentvirus3OX=345201|IIV3-003L|4               |1              |MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTTPSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPSTTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI                                                                                                                                                                                                                                                                                                              |
|>sp|Q6GZX2          |003R_FRG3G|Uncharacterizedprotein3R                  |Frogvirus3(isolateGoorha)OX=654924   |FV3-003R |3               |1              |MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVDRAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPIFGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQPCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGDAHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMARTLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGALMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKRKAKIQEMFDNMVSRMVTS                    |
|>sp|Q6GZX1          |004R_FRG3G|Uncharacterizedprotein004R                |Frogvirus3(isolateGoorha)OX=654924   |FV3-004R |4               |1              |MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY                                                                                                                                                                                                                                                                                                                                                                                                              |
|>sp|Q197F5          |005L_IIV3 |Uncharacterizedprotein005L                |Invertebrateiridescentvirus3OX=345201|IIV3-005L|3               |1              |MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTLCSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELCKADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYHQPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY                                                                                                                                                                                                                                                 |
|>sp|Q6GZX0          |005R_FRG3G|Uncharacterizedprotein005R                |Frogvirus3(isolateGoorha)OX=654924   |FV3-005R |4               |1              |MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRSNLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGEDQCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKTREFVDKDAQEFQDFLNSLDASLLS                                                                                                                                                                                                                                                              |
|>sp|Q91G88          |006L_IIV6 |PutativeKilA-Ndomain-containingprotein006L|Invertebrateiridescentvirus6OX=176652|IIV6-006L|3               |1              |MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKLIQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNIIINYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRKKDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVEDRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCHPNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV                                                                                                          |
|>sp|Q6GZW9          |006R_FRG3G|Uncharacterizedprotein006R                |Frogvirus3(isolateGoorha)OX=654924   |FV3-006R |4               |1              |MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIEYLGPWVQAEYRRIKG                                                                                                                                                                                                                                                                                                                                                                                               |
+---+----------------+----------+------------------------------------------+-------------------------------------+---------+----------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 10 rows

Generate Word2Vec model

In [4]:
# Encoding parameters.
segmentLength = 11
# n-gram length: 2 produces overlapping pairs such as 'MA', 'AF', 'FS', ...
n = 2
# Floor division keeps windowSize an int (5); plain "/" yields the float 5.0
# on Python 3, which is the wrong type for a window size.
windowSize = (segmentLength - 1) // 2
# Size passed to the encoder as vectorSize for the Word2Vec feature vectors.
vectorSize = 50

# take 10 rows of data for demo
data = data.limit(10)
# add Word2Vec encoded feature vector
encoder = ProteinSequenceEncoder(data)
# overlapping_ngram_word2vec_encode takes keyword arguments
data = encoder.overlapping_ngram_word2vec_encode(
    n=n, windowSize=windowSize, vectorSize=vectorSize
).cache()

Example of output rows

In [5]:
# Show the first two encoded rows (original columns plus `ngram` and `features`).
data.head(n=2)
Out[5]:
[Row(db='>sp', uniqueIdentifier='Q6GZX4', entryName='001R_FRG3G', proteinName='Putativetranscriptionfactor001R', organismName='Frogvirus3(isolateGoorha)OX=654924', geneName='FV3-001R', proteinExistence=4, sequenceVersion=1, sequence='MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL', ngram=['MA', 'AF', 'FS', 'SA', 'AE', 'ED', 'DV', 'VL', 'LK', 'KE', 'EY', 'YD', 'DR', 'RR', 'RR', 'RR', 'RM', 'ME', 'EA', 'AL', 'LL', 'LL', 'LS', 'SL', 'LY', 'YY', 'YP', 'PN', 'ND', 'DR', 'RK', 'KL', 'LL', 'LD', 'DY', 'YK', 'KE', 'EW', 'WS', 'SP', 'PP', 'PR', 'RV', 'VQ', 'QV', 'VE', 'EC', 'CP', 'PK', 'KA', 'AP', 'PV', 'VE', 'EW', 'WN', 'NN', 'NP', 'PP', 'PS', 'SE', 'EK', 'KG', 'GL', 'LI', 'IV', 'VG', 'GH', 'HF', 'FS', 'SG', 'GI', 'IK', 'KY', 'YK', 'KG', 'GE', 'EK', 'KA', 'AQ', 'QA', 'AS', 'SE', 'EV', 'VD', 'DV', 'VN', 'NK', 'KM', 'MC', 'CC', 'CW', 'WV', 'VS', 'SK', 'KF', 'FK', 'KD', 'DA', 'AM', 'MR', 'RR', 'RY', 'YQ', 'QG', 'GI', 'IQ', 'QT', 'TC', 'CK', 'KI', 'IP', 'PG', 'GK', 'KV', 'VL', 'LS', 'SD', 'DL', 'LD', 'DA', 'AK', 'KI', 'IK', 'KA', 'AY', 'YN', 'NL', 'LT', 'TV', 'VE', 'EG', 'GV', 'VE', 'EG', 'GF', 'FV', 'VR', 'RY', 'YS', 'SR', 'RV', 'VT', 'TK', 'KQ', 'QH', 'HV', 'VA', 'AA', 'AF', 'FL', 'LK', 'KE', 'EL', 'LR', 'RH', 'HS', 'SK', 'KQ', 'QY', 'YE', 'EN', 'NV', 'VN', 'NL', 'LI', 'IH', 'HY', 'YI', 'IL', 'LT', 'TD', 'DK', 'KR', 'RV', 'VD', 'DI', 'IQ', 'QH', 'HL', 'LE', 'EK', 'KD', 'DL', 'LV', 'VK', 'KD', 'DF', 'FK', 'KA', 'AL', 'LV', 'VE', 'ES', 'SA', 'AH', 'HR', 'RM', 'MR', 'RQ', 'QG', 'GH', 'HM', 'MI', 'IN', 'NV', 'VK', 'KY', 'YI', 'IL', 'LY', 'YQ', 'QL', 'LL', 'LK', 'KK', 'KH', 'HG', 'GH', 'HG', 'GP', 'PD', 'DG', 'GP', 'PD', 'DI', 'IL', 'LT', 'TV', 'VK', 'KT', 'TG', 'GS', 'SK', 'KG', 'GV', 'VL', 'LY', 'YD', 'DD', 'DS', 'SF', 'FR', 'RK', 'KI', 'IY', 'YT', 'TD', 'DL', 'LG', 'GW', 
'WK', 'KF', 'FT', 'TP', 'PL'], features=DenseVector([0.0127, -0.0083, -0.0085, -0.0045, 0.0038, 0.0048, -0.0109, 0.0048, -0.0009, -0.0023, 0.0229, 0.0086, -0.0107, 0.0072, 0.011, -0.0007, -0.0048, 0.0216, 0.0037, -0.0041, 0.001, 0.0142, -0.0041, -0.006, 0.0033, -0.0057, -0.0039, -0.0004, 0.0004, -0.0169, -0.0091, 0.0082, 0.0162, -0.0027, -0.0083, -0.0118, 0.0006, 0.0131, 0.0002, -0.0105, -0.0055, -0.0046, -0.0072, 0.0095, 0.0014, -0.0033, 0.006, 0.0024, -0.0075, 0.0011])),
 Row(db='>sp', uniqueIdentifier='Q6GZX3', entryName='002L_FRG3G', proteinName='Uncharacterizedprotein002L', organismName='Frogvirus3(isolateGoorha)OX=654924', geneName='FV3-002L', proteinExistence=4, sequenceVersion=1, sequence='MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW', ngram=['MS', 'SI', 'II', 'IG', 'GA', 'AT', 'TR', 'RL', 'LQ', 'QN', 'ND', 'DK', 'KS', 'SD', 'DT', 'TY', 'YS', 'SA', 'AG', 'GP', 'PC', 'CY', 'YA', 'AG', 'GG', 'GC', 'CS', 'SA', 'AF', 'FT', 'TP', 'PR', 'RG', 'GT', 'TC', 'CG', 'GK', 'KD', 'DW', 'WD', 'DL', 'LG', 'GE', 'EQ', 'QT', 'TC', 'CA', 'AS', 'SG', 'GF', 'FC', 'CT', 'TS', 'SQ', 'QP', 'PL', 'LC', 'CA', 'AR', 'RI', 'IK', 'KK', 'KT', 'TQ', 'QV', 'VC', 'CG', 'GL', 'LR', 'RY', 'YS', 'SS', 'SK', 'KG', 'GK', 'KD', 'DP', 'PL', 'LV', 'VS', 'SA', 'AE', 'EW', 'WD', 'DS', 'SR', 'RG', 'GA', 'AP', 'PY', 'YV', 'VR', 'RC', 'CT', 'TY', 'YD', 'DA', 'AD', 'DL', 'LI', 'ID', 'DT', 'TQ', 'QA', 'AQ', 'QV', 'VD', 'DQ', 'QF', 'FV', 'VS', 'SM', 'MF', 'FG', 'GE', 'ES', 'SP', 'PS', 'SL', 'LA', 'AE', 'ER', 'RY', 'YC', 'CM', 'MR', 'RG', 'GV', 'VK', 'KN', 'NT', 'TA', 'AG', 'GE', 'EL', 'LV', 'VS', 'SR', 'RV', 'VS', 'SS', 'SD', 'DA', 'AD', 'DP', 'PA', 'AG', 'GG', 'GW', 'WC', 'CR', 'RK', 'KW', 'WY', 'YS', 'SA', 'AH', 'HR', 'RG', 'GP', 'PD', 'DQ', 'QD', 'DA', 'AA', 'AL', 'LG', 'GS', 'SF', 'FC', 'CI', 'IK', 'KN', 'NP', 'PG', 'GA', 'AA', 'AD', 'DC', 'CK', 'KC', 'CI', 'IN', 'NR', 'RA', 'AS', 'SD', 'DP', 'PV', 'VY', 'YQ', 'QK', 'KV', 'VK', 'KT', 'TL', 'LH', 'HA', 'AY', 'YP', 'PD', 'DQ', 'QC', 'CW', 'WY', 'YV', 'VP', 'PC', 'CA', 'AA', 'AD', 'DV', 'VG', 'GE', 'EL', 'LK', 'KM', 'MG', 'GT', 'TQ', 'QR', 'RD', 'DT', 'TP', 'PT', 'TN', 'NC', 'CP', 'PT', 'TQ', 'QV', 'VC', 'CQ', 'QI', 'IV', 'VF', 'FN', 'NM', 'ML', 'LD', 
'DD', 'DG', 'GS', 'SV', 'VT', 'TM', 'MD', 'DD', 'DV', 'VK', 'KN', 'NT', 'TI', 'IN', 'NC', 'CD', 'DF', 'FS', 'SK', 'KY', 'YV', 'VP', 'PP', 'PP', 'PP', 'PP', 'PP', 'PK', 'KP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PP', 'PT', 'TP', 'PR', 'RP', 'PV', 'VH', 'HN', 'NR', 'RK', 'KV', 'VM', 'MF', 'FF', 'FV', 'VA', 'AG', 'GA', 'AV', 'VL', 'LV', 'VA', 'AI', 'IL', 'LI', 'IS', 'ST', 'TV', 'VR', 'RW'], features=DenseVector([0.0246, -0.016, -0.0079, -0.0186, 0.0054, 0.0162, -0.018, 0.0005, 0.0032, -0.0087, 0.0513, 0.0186, -0.0192, 0.0147, 0.0332, 0.0054, 0.001, 0.0432, 0.0114, -0.0104, -0.0015, 0.0162, -0.0027, -0.0159, 0.0061, -0.0067, -0.0084, -0.0056, 0.0042, -0.0338, -0.0293, 0.0198, 0.0365, -0.0097, -0.0205, -0.0314, -0.0016, 0.0296, -0.0051, -0.0148, -0.0053, -0.0135, -0.0147, 0.0197, 0.0107, -0.0074, 0.012, 0.0125, -0.0103, 0.0138]))]

Terminate Spark Context

In [6]:
# Shut down the SparkContext (`sc`) that was created in the configuration cell.
sc.stop()