Jpred Demo

This demo shows how to create and query a Jpred dataset.

Jpred

Jpred

Imports

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.datasets import jpredDataset

Configure Spark

In [2]:
# Run Spark locally, using all available cores, under the app name "JpredDemo".
conf = SparkConf().setMaster("local[*]").setAppName("JpredDemo")

# getOrCreate reuses an already-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed.
# Note: if a context already exists, it is returned as-is and `conf` is ignored.
sc = SparkContext.getOrCreate(conf=conf)

Get Jpred Datasets

In [3]:
# Load the Jpred dataset. The table displayed by the next cell has the columns
# scopID, sequence, secondaryStructure and trained.
res = jpredDataset.get_dataset()

Display Jpred Dataset

In [4]:
# Print the first 20 rows without truncating the long sequence columns.
res.show(n=20, truncate=False)
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|scopID |sequence                                                                                                                                                                                                                                                                                                                                                                                                                   |secondaryStructure                                                                                                                                                                                                                                                                                                                                                                                                         |trained|
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|d2nnua_|SMETLCQRLNVCQDKILTHYENDSTDLRDHIDYWKHMRLECAIYYKAREMGFKHINHQVVPTLAVSKNKALQAIELQLTLETIYNSQYSNEKWTLQDVSLEVYLTAPTGCIKKHGYTVEVQFDGDICNTMHYTNWTHIYICEEASVTVVEGQVDYYGLYYVHEGIRTYFVQFKDDAEKYSKNKVWEVHAGGQVILCPTSVF                                                                                                                                                                                                                  |CHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCEECCEECCCHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCHHHHHCCCCCCEEEEEEEEEEEECCCCCCEEEEEEEEEEEEEECCEEEEECCEECCCEEEEEECCEEEEEEEHHHHHHHHCCCCCEEEECCCCEECCCCCCC                                                                                                                                                                                                                  |true   |
|d1p9ya_|MQVSVETTQGLGRRVTITIAADSIETAVKSELVNVAKKVRIDGLRKGKVPMNIVAQRYGASVRQDVLGDLMSRNFIDAIIKEKINPAGAPTYVPGEYKLGEDFTYSVEFEVYPEVEL                                                                                                                                                                                                                                                                                                      |CEEEEEECCCCEEEEEEEECHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCEEEEEEEECCCCCCCCEEEEEEEECCCCCCC                                                                                                                                                                                                                                                                                                      |true   |
|d2a9sa1|MSLFPGDIEELARRIITDFTPLGLMVSTAESCTGGLIAGALTEIAGSSAVVDRGFVTYTNDAKRDMLGVGTETLTTFGAVSRQTALQMAHGALYRSRANFAVAVTGIAGPGGGSAEKPVGLVHLATKARNGNVLHHEMRYGDIGRTEIRLATVRTALEMLIALNQAG                                                                                                                                                                                                                                                    |CCCCCHHHHHHHHHHHHHHHHHCCCEEEEECCCCCHHHHHHCCCCCCCCCEEEEEEECCHHHHHHHHCCCHHHHHHHCCCCHHHHHHHHHHHHHCCCCCEEEEEEECCCCCCCCCCCCCCEEEEEEEECCCCEEEEEEECCCCCHHHHHHHHHHHHHHHHHHHHHCC                                                                                                                                                                                                                                                    |true   |
|d1u84a_|GQQLNRLLLEWIGAWDPFGLGKDAYDVEAASVLQAVYETEDARTLAARIQSIYEFAFDEPIPFPHCLKLARRLLELKQAAS                                                                                                                                                                                                                                                                                                                                          |CCHHHHHHHHHHHHHCCCCCCCCCCHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHC                                                                                                                                                                                                                                                                                                                                          |true   |
|d3a02a_|TFTSFQLEELEKAFSRTHYPDVFTREELAMKIGLTEARIQVWFQNRRAKWR                                                                                                                                                                                                                                                                                                                                                                        |CCCHHHHHHHHHHHHHCCCCCHHHHHHHHHHHCCCHHHHHHHHHHHHHHHC                                                                                                                                                                                                                                                                                                                                                                        |false  |
|d4ay0a2|KGTPIQFAENLSWKVDGGKLIAENPSPFYMNIGELTFGGKSIPSHYIPPKSTWAFDLLAGARNVSWRIINDQGGLDRLYSKNVT                                                                                                                                                                                                                                                                                                                                       |CCCCCCCCCCCEEEEECCEEEEEECCCCCEEEEEEEECCEECCCCEECCCEEEEEECCCCCCEEEEEEECCCCCECCCEEEECC                                                                                                                                                                                                                                                                                                                                       |false  |
|d1ux5a_|KYPRPHKKLKQLHWEKLDCTDNSIWGTGKAEKFADDLYEKGVLADLEKAFAAREIKSLASKRKEDLQKITFLSRDISQQFGINLHMYSSLSVADLVKKILNCDRDFLQTPSVVEFLSKSEIIEVSVNLARNYAPYSTDWEGVRNLEDAKPPEKDPNDLQRADQIYLQLMVNLESYWGSRMRALTVVTSYEREYNELLAKLRKVDKAVSALQESDNLRNVFNVILAVGNFMNDTSKQAQGFKLSTLQRLTFIKDTTNSMTFLNYVEKIVRLNYPSFNDFLSELEPVLDVVKVSIEQLVNDCKDFSQSIVNVERSVEIGNLSDSSKFHPLDKVLIKTLPVLPEARKKGDLLEDEVKLTIMEFESLMHTYGEDSGDKFAKISFFKKFADFINEYKKAQAQNLAAEEEERLYIKH|CCCCCCCCECCCCCCCCCCCCCCCCCCCHHHHHHHHHHHCCHHHHHHHHCECCCCHHHHHHHHHCCCCECCCCHHHHHHHHHHCCCCCCCCHHHHHHHHHCCCHHHHCCHHHHHHCCCHHHHCCCHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCECHHHHHHHHCCCCCCCCHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCECCCCCCEHHHHHHHHHHHHCCCCCCHHHHCHHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCC|true   |
|d2v89a_|SPEFGYWITCCPTCDVDINTWVPFYSTELNKPAMIYCSHGDGHWVHAQCMDLEERTLIHLSEGSNKYYCNEHVQIARA                                                                                                                                                                                                                                                                                                                                             |CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCEEEECCCCCEEEECCCCCCCHHHHHHHHHCCCCCCCCCCCCCCCC                                                                                                                                                                                                                                                                                                                                             |true   |
|d1lm8v_|RPVLRSVNSREPSQVIFCNRSPRVVLPVWLNFDGEPQPYPTLPPGTGRRIHSYRGHLWLFRDAGTHDGLLVNQTELFVPSLNVDGQPIFANITLPVYTLKERCLQVVRSLVKPENYRRLDIVRSLYEDLEDHPNVQKDLERLTQERIAHQ                                                                                                                                                                                                                                                                     |CCCCCCCCCCCEEEEEEEECCCCCEEEEEECCCCCEEECCCECCCEEEEEEEECCCEEEEEECCCCCECEECCECCECCCCCECCECEEEEEECCCCCHHHHHHHHHHHHCCCCCCCCCCCCHHHHHHHHCCCCHHHHHHHHHHHHHCCC                                                                                                                                                                                                                                                                     |false  |
|d1w23a_|VKQVFNFNAGPSALPKPALERAQKELLNFNDTQMSVMELSHRSQSYEEVHEQAQNLLRELLQIPNDYQILFLQGGASLQFTMLPMNLLTKGTIGNYVLTGSWSEKALKEAKLLGETHIAASTKANSYQSIPDFSEFQLNENDAYLHITSNNTIYGTQYQNFPEINHAPLIADMSSDILSRPLKVNQFGMIYAGAQKNLGPSGVTVVIVKKDLLNTKVEQVPTMLQYATHIKSDSLYNTPPTFSIYMLRNVLDWIKDLGGAEAIAKQNEEKAKIIYDTIDESNGFYVGHAEKGSRSLMNVTFNLRNEELNQQFLAKAKEQGFVGLNGHRSVGGCRASIYNAVPIDACIALRELMIQFKENA                                                   |CCCCEECCCCCCCCCHHHHHHHHHCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHCCCCCEEEEEECCHHHHHHHHHHHHHCCCCCEEEEEECCHHHHHHHHHHHCCCEEEEEEECCCCCCCCCCCCCCCCCCCCEEEEEEECEECCCCEECCCCCCCCCCCEEEECCCCCCCCCCCCCCCCEEEEECCCCCCCCCCEEEEEEHHHHCCCCCCCCCCCCHHHHHHCCCCCCCCCHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHHHHHCCCCCCEECCCCCCECCCEEEEECCCHHHHHHHHHHHHHCCEECCECCCCCCCEEEECCCCCCHHHHHHHHHHHHHHHHHC                                                   |true   |
|d2nmla1|SHTILLVQPTKRPEGRTYADYESVNECMEGVCKMYEEHLKRMNPNSPSITYDISQLFDFIDDLADLSCLVYRADTQTYQPYNKDWIKEKIYVLLRRQAQQ                                                                                                                                                                                                                                                                                                                       |CCEEEEEECCCCCCCCEEEEECCHHHHHHHHHHHHHHHHHHHCCCCCCCCCCHHHHHHHHHHCCEEEEEEEECCCCEEEEECHHHHHHHHHHHHHHHHHC                                                                                                                                                                                                                                                                                                                       |true   |
|d4e2va_|RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKPETVRATGADIILGNTYHLMLRPGAERIAKLGGLHSFMGWDRPILTDSGGFQVMKQSEEGVTFKSHSRHMLSPERSIEIQHLLGSDIVMAFDEVTPYPATPSRAASSMERSMRWAKRSRDAFDSRKEQAENAALFGIQQGSVFENLRQQSADALAEIGFDGYAVGGLAVGEGQDEMFRVLDFSVPMLPDDKPHYLMGVGKPDDIVGAVERGIDMFDCVLPTRSGRNGQAFTWDGPINIRNARFSEDLKPLDSECHCAVCQKWSRAYIHHLIRAGEILGAMLMTEHNIAFYQQLMQKIRDSISEGRFSQFAQDFRARYFA                                               |CCCCEEEEEEEECCEEEEEEEECCEEEEECEEECECCCCCCCCCCHHHHHHCCCCCEEEEHHHHHHCCCHHHHHHCCCHHHHHCCCCCEEEECCHHHHCCECCCCEEEECCCEEEECHHHHHHHHHHHCCCEEECCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHCEEEEEECCCCCHHHHHHHHHHHHHHCCCEEEECCCCCCCCHHHHHHHHHHHCCCCCCCCCEEECCECCHHHHHHHHHCCCCEEECCHHHHHHHCCEECCCCCCEECCCCCCCCCCCCCCCCCCCHHHHHCCHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHCC                                               |true   |
|d4ac7b_|NYIVPGEYRVAEGEIEINAGREKTTIRVSNTGDRPIQVGSHIHFVEVNKELLFDRAEGIGRRLNIPSGTAARFEPGEEMEVELTELGGNREVFGISDLTNGSVDNKELILQRAKELGYKGVE                                                                                                                                                                                                                                                                                                 |CCCCCCCEECCCCEEECCCCCCEEEEEEEECCCCCEEEECCCCCCCCCCCEECCCCCCCCEEECCCCCCEEEECCCCEEEEEEEECCCCCEECCCCCCCCEECCCHHHHHHHHHHHCCCCCC                                                                                                                                                                                                                                                                                                 |true   |
|d1vyra_|AEKLFTPLKVGAVTAPNRVFMAPLTRLRSIEPGDIPTPLMGEYYRQRASAGLIISEATQISAQAKGYAGAPGLHSPEQIAAWKKITAGVHAEDGRIAVQLWHTGRISHSSIQPGGQAPVSASALNANTRTSLRDENGNAIRVDTTTPRALELDEIPGIVNDFRQAVANAREAGFDLVELHSAHGYLLHQFLSPSSNQRTDQYGGSVENRARLVLEVVDAVCNEWSADRIGIRVSPIGTFQNVDNGPNEEADALYLIEELAKRGIAYLHMSETDLAGGKPYSEAFRQKVRERFHGVIIGAGAYTAEKAEDLIGKGLIDAVAFGRDYIANPDLVARLQKKAELNPQRPESFYGGGAEGYTDYPSL                                                |CCCCCCCEEECCEEECCCEEECCCCCCCCECCCCECCHHHHHHHHHCCCCCEEEEEEEECCCCCCCCCCCCECCCHHHHHHHHHHHHHHHHCCCCEEEEEECCCCCCCCCCCCCCCCCEECCCCCCCCEEEEECCCCCEEEEECCCCEECCCCCHHHHHHHHHHHHHHHHHCCCCEEEEEECCCCHHHHHHCCCCCCCCCCCCCCHHHHCHHHHHHHHHHHHHCCCCCEEEEECCCCCECCECCCCCHHHHHHHHHHHHHHCCCCEEEEECCECCECCCCCHHHHHHHHHHCCCEEEEECCCCHHHHHHHHHCCCCCEEEECHHHHHCCCHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCC                                                |true   |
|d2bbha1|PPGTLVYTGKYREDFEIEVMNYSIEEFREFKTTDVESVLPFRDSSTPTWINITGIHRTDVVQRVGEFFGTHPLVLEDILNVHQRPKVEFFENYVFIVLKMFTYDKHELESEQVSLILTKNCVLMFQEKIGDVFDPVRERIRYNRGIIRKKRADYLLYSLIDALVDDYFVLLEKIDDEIDVLEEEVTVQRTHQLKRNLVELRKTIWPLREVLSSLYRDVPPLIE                                                                                                                                                                                            |CCCCCCCCCCCCCCCEEEEEEEECCEEEEEEECCCCCCCCCCCCCCCEEEEEECCCCHHHHHHHHHHHCCCHHHHHHHHCCCCCCEEEECCCEEEEEEEEEECCCCCCEEEEEEEEEECCEEEEEECCCCCCCHHHHHHHHCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCC                                                                                                                                                                                            |false  |
|d1ozna_|PCPGACVCYNEPKVTTSCPQQGLQAVPVGIPAASQRIFLHGNRISHVPAASFRACRNLTILWLHSNVLARIDAAAFTGLALLEQLDLSDNAQLRSVDPATFHGLGRLHTLHLDRCGLQELGPGLFRGLAALQYLYLQDNALQALPDDTFRDLGNLTHLFLHGNRISSVPERAFRGLHSLDRLLLHQNRVAHVHPHAFRDLGRLMTLYLFANNLSALPTEALAPLRALQYLRLNDNPWVCDCRARPLWAWLQKFRGSSSEVPCSLPQRLAGRDLKRLAANDLQGC                                                                                                                               |CCCCCCEEECCCCCEEECCCCCCCCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCCCCCCCCCCCCCCCCEEECCCCCCCCCCCCCCCCCCCCCEEECCCCCCCCCCCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCCCCHHHHCCCCCCCEEECCCCCEECCCCCHHHHHHHHHCCCEECCCEEEECCCCCCCECCCCCCCCCCCC                                                                                                                               |true   |
|d1gd8a_|SSHRLALYRNQAKSLLTHGRITTTVPKAKELRGFVDHLIHLAKRGDLHARRLVLRDLQDVKLVRKLFDEIAPRYRDRQGGYTRVLKLAERRRGDGAPLALVELVE                                                                                                                                                                                                                                                                                                                  |CHHHHHHHHHHHHHHHHHCEEEEEHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHHHCCCHHHHHHHHHCHHHHCCCCCCCCEEEEEEEEECCCCCCEEEEEEECC                                                                                                                                                                                                                                                                                                                  |true   |
|d3f6ya_|FWRQTWSGPGTTKRFPETVLARCVKYTEIHPEMRHVDCQSVWDAFKGAFISKHPCDITEEDYQPLMKLGTQTVPCNKILLWSRIKDLAHQFTQVQRDMFTLEDTLLGYLADDLTWCGEFDTSKINYQSCPDWRKDCSNNPVSVFWKTVSRRFAEAACDVVHVMLDGSRSKIFDKDSTFGSVEVHNLQPEKVQTLEAWVIHGGREDSRDLCQDPTIKELESIISKRNIQFSCKNIY                                                                                                                                                                                |CCCCCCCEECCCCCHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHCCCCCCCCCCCCCHHHHHHCCCCCCCCCEECCCCCCCCCCCCHHHHHHCECCCCCHHHHHHCCCCCCCCCCCCCCCCCEECCCCCCCCCCHHHHHHHHHHHHHHHHCCEEEEEEEECCCCCCCCCCCHHHHCCCCCCCCCCEEEEEEEEECCCCCCCCCCCCCHHHHHHHHHHHHCCCEEEEEEEC                                                                                                                                                                                |true   |
|d2pjua1|KPVIWTVSVTRLFELFRDISLEFDHLANITPIQLGFEKAVTYIRKKLANERCDAIIAAGSNGAYLKSRLSVPVILIKPSGYDVLQFLAKAGKLTSSIGVVTYQETIPALVAFQKTFNLRLDQRSYITEEDARGQINELKANGTEAVVGAGLITDLAEEAGMTGIFIYSAATVRQAFSDALDMTRMS                                                                                                                                                                                                                                 |CCEEEEECCHHHHHHHHHHHCCCCCCCEEEEECCCHHHHHHHHHHHCCCCCCCEEEEEHHHHHHHHCCCCCCEEEECCCHHHHHHHHHHCCCCCCCEEEEEECCCCHHHHHHHHHHCCCEEEEEECCHHHHHHHHHHHHHCCCCEEEECHHHHHHHHHCCCEEEECCCHHHHHHHHHHHHHHHHHC                                                                                                                                                                                                                                 |true   |
|d4g78a_|SNAMDHLHRKLRDHEAAMFQQGYLDDQFSQLQKLQDDTSPDFVIEVMTMFFDDSEKLLNNMSRALEQVPVNFKQIDAHAHQQKGSSASVGAARVKNVCGTFRNFCEAQNLEGCVRCLQQLQQEYSLLKNNLKYLFKLQQEIKTAGRS                                                                                                                                                                                                                                                                        |CCCCCHHHHHHHHHHHHHHHCCCECHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHCEHHHHHHHHCHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCC                                                                                                                                                                                                                                                                        |true   |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
only showing top 20 rows

Coalesce

In [5]:
# Reduce the dataset to a single partition before it is written out below.
# NOTE(review): presumably done so the JSON output lands in one part file — confirm.
res = res.coalesce(1)

Save to a local JSON file

This line of code will overwrite an existing file or directory.

In [6]:
# Placeholder destination: replace with the directory the JSON output should go to.
output_dir = "Local directory to save your JSON file"

# "overwrite" mode replaces whatever already exists at the destination.
json_writer = res.write.mode("overwrite").format("json")
json_writer.save(output_dir)

Terminate Spark

In [7]:
# Shut down the SparkContext created in the "Configure Spark" cell.
sc.stop()