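This demo shows how to create and query a Jpred dataset with mmtfPyspark: the dataset is loaded, its first 20 rows are displayed, and it is then saved as JSON.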
In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.datasets import jpredDataset
In [2]:
# Configure Spark to run with the "local[*]" master under the app name "JpredDemo".
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("JpredDemo")
)
# getOrCreate reuses an already-running SparkContext, so re-running this cell
# (or running it in a kernel that already has a context) does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
In [3]:
# Build the Jpred dataset through mmtfPyspark's jpredDataset helper.
# The output displayed further down lists the columns scopID, sequence,
# secondaryStructure and trained.
res = jpredDataset.get_dataset()
In [4]:
# Print the first 20 rows without truncating the long sequence strings.
res.show(n=20, truncate=False)
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|scopID |sequence |secondaryStructure |trained|
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|d2nnua_|SMETLCQRLNVCQDKILTHYENDSTDLRDHIDYWKHMRLECAIYYKAREMGFKHINHQVVPTLAVSKNKALQAIELQLTLETIYNSQYSNEKWTLQDVSLEVYLTAPTGCIKKHGYTVEVQFDGDICNTMHYTNWTHIYICEEASVTVVEGQVDYYGLYYVHEGIRTYFVQFKDDAEKYSKNKVWEVHAGGQVILCPTSVF |CHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCEECCEECCCHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCHHHHHCCCCCCEEEEEEEEEEEECCCCCCEEEEEEEEEEEEEECCEEEEECCEECCCEEEEEECCEEEEEEEHHHHHHHHCCCCCEEEECCCCEECCCCCCC |true |
|d1p9ya_|MQVSVETTQGLGRRVTITIAADSIETAVKSELVNVAKKVRIDGLRKGKVPMNIVAQRYGASVRQDVLGDLMSRNFIDAIIKEKINPAGAPTYVPGEYKLGEDFTYSVEFEVYPEVEL |CEEEEEECCCCEEEEEEEECHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCEEEEEEEECCCCCCCCEEEEEEEECCCCCCC |true |
|d2a9sa1|MSLFPGDIEELARRIITDFTPLGLMVSTAESCTGGLIAGALTEIAGSSAVVDRGFVTYTNDAKRDMLGVGTETLTTFGAVSRQTALQMAHGALYRSRANFAVAVTGIAGPGGGSAEKPVGLVHLATKARNGNVLHHEMRYGDIGRTEIRLATVRTALEMLIALNQAG |CCCCCHHHHHHHHHHHHHHHHHCCCEEEEECCCCCHHHHHHCCCCCCCCCEEEEEEECCHHHHHHHHCCCHHHHHHHCCCCHHHHHHHHHHHHHCCCCCEEEEEEECCCCCCCCCCCCCCEEEEEEEECCCCEEEEEEECCCCCHHHHHHHHHHHHHHHHHHHHHCC |true |
|d1u84a_|GQQLNRLLLEWIGAWDPFGLGKDAYDVEAASVLQAVYETEDARTLAARIQSIYEFAFDEPIPFPHCLKLARRLLELKQAAS |CCHHHHHHHHHHHHHCCCCCCCCCCHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHC |true |
|d3a02a_|TFTSFQLEELEKAFSRTHYPDVFTREELAMKIGLTEARIQVWFQNRRAKWR |CCCHHHHHHHHHHHHHCCCCCHHHHHHHHHHHCCCHHHHHHHHHHHHHHHC |false |
|d4ay0a2|KGTPIQFAENLSWKVDGGKLIAENPSPFYMNIGELTFGGKSIPSHYIPPKSTWAFDLLAGARNVSWRIINDQGGLDRLYSKNVT |CCCCCCCCCCCEEEEECCEEEEEECCCCCEEEEEEEECCEECCCCEECCCEEEEEECCCCCCEEEEEEECCCCCECCCEEEECC |false |
|d1ux5a_|KYPRPHKKLKQLHWEKLDCTDNSIWGTGKAEKFADDLYEKGVLADLEKAFAAREIKSLASKRKEDLQKITFLSRDISQQFGINLHMYSSLSVADLVKKILNCDRDFLQTPSVVEFLSKSEIIEVSVNLARNYAPYSTDWEGVRNLEDAKPPEKDPNDLQRADQIYLQLMVNLESYWGSRMRALTVVTSYEREYNELLAKLRKVDKAVSALQESDNLRNVFNVILAVGNFMNDTSKQAQGFKLSTLQRLTFIKDTTNSMTFLNYVEKIVRLNYPSFNDFLSELEPVLDVVKVSIEQLVNDCKDFSQSIVNVERSVEIGNLSDSSKFHPLDKVLIKTLPVLPEARKKGDLLEDEVKLTIMEFESLMHTYGEDSGDKFAKISFFKKFADFINEYKKAQAQNLAAEEEERLYIKH|CCCCCCCCECCCCCCCCCCCCCCCCCCCHHHHHHHHHHHCCHHHHHHHHCECCCCHHHHHHHHHCCCCECCCCHHHHHHHHHHCCCCCCCCHHHHHHHHHCCCHHHHCCHHHHHHCCCHHHHCCCHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCECHHHHHHHHCCCCCCCCHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCECCCCCCEHHHHHHHHHHHHCCCCCCHHHHCHHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCC|true |
|d2v89a_|SPEFGYWITCCPTCDVDINTWVPFYSTELNKPAMIYCSHGDGHWVHAQCMDLEERTLIHLSEGSNKYYCNEHVQIARA |CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCEEEECCCCCEEEECCCCCCCHHHHHHHHHCCCCCCCCCCCCCCCC |true |
|d1lm8v_|RPVLRSVNSREPSQVIFCNRSPRVVLPVWLNFDGEPQPYPTLPPGTGRRIHSYRGHLWLFRDAGTHDGLLVNQTELFVPSLNVDGQPIFANITLPVYTLKERCLQVVRSLVKPENYRRLDIVRSLYEDLEDHPNVQKDLERLTQERIAHQ |CCCCCCCCCCCEEEEEEEECCCCCEEEEEECCCCCEEECCCECCCEEEEEEEECCCEEEEEECCCCCECEECCECCECCCCCECCECEEEEEECCCCCHHHHHHHHHHHHCCCCCCCCCCCCHHHHHHHHCCCCHHHHHHHHHHHHHCCC |false |
|d1w23a_|VKQVFNFNAGPSALPKPALERAQKELLNFNDTQMSVMELSHRSQSYEEVHEQAQNLLRELLQIPNDYQILFLQGGASLQFTMLPMNLLTKGTIGNYVLTGSWSEKALKEAKLLGETHIAASTKANSYQSIPDFSEFQLNENDAYLHITSNNTIYGTQYQNFPEINHAPLIADMSSDILSRPLKVNQFGMIYAGAQKNLGPSGVTVVIVKKDLLNTKVEQVPTMLQYATHIKSDSLYNTPPTFSIYMLRNVLDWIKDLGGAEAIAKQNEEKAKIIYDTIDESNGFYVGHAEKGSRSLMNVTFNLRNEELNQQFLAKAKEQGFVGLNGHRSVGGCRASIYNAVPIDACIALRELMIQFKENA |CCCCEECCCCCCCCCHHHHHHHHHCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHCCCCCEEEEEECCHHHHHHHHHHHHHCCCCCEEEEEECCHHHHHHHHHHHCCCEEEEEEECCCCCCCCCCCCCCCCCCCCEEEEEEECEECCCCEECCCCCCCCCCCEEEECCCCCCCCCCCCCCCCEEEEECCCCCCCCCCEEEEEEHHHHCCCCCCCCCCCCHHHHHHCCCCCCCCCHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHHHHHCCCCCCEECCCCCCECCCEEEEECCCHHHHHHHHHHHHHCCEECCECCCCCCCEEEECCCCCCHHHHHHHHHHHHHHHHHC |true |
|d2nmla1|SHTILLVQPTKRPEGRTYADYESVNECMEGVCKMYEEHLKRMNPNSPSITYDISQLFDFIDDLADLSCLVYRADTQTYQPYNKDWIKEKIYVLLRRQAQQ |CCEEEEEECCCCCCCCEEEEECCHHHHHHHHHHHHHHHHHHHCCCCCCCCCCHHHHHHHHHHCCEEEEEEEECCCCEEEEECHHHHHHHHHHHHHHHHHC |true |
|d4e2va_|RPRFSFSIAAREGKARTGTIEMKRGVIRTPAFMPVGTAATVKALKPETVRATGADIILGNTYHLMLRPGAERIAKLGGLHSFMGWDRPILTDSGGFQVMKQSEEGVTFKSHSRHMLSPERSIEIQHLLGSDIVMAFDEVTPYPATPSRAASSMERSMRWAKRSRDAFDSRKEQAENAALFGIQQGSVFENLRQQSADALAEIGFDGYAVGGLAVGEGQDEMFRVLDFSVPMLPDDKPHYLMGVGKPDDIVGAVERGIDMFDCVLPTRSGRNGQAFTWDGPINIRNARFSEDLKPLDSECHCAVCQKWSRAYIHHLIRAGEILGAMLMTEHNIAFYQQLMQKIRDSISEGRFSQFAQDFRARYFA |CCCCEEEEEEEECCEEEEEEEECCEEEEECEEECECCCCCCCCCCHHHHHHCCCCCEEEEHHHHHHCCCHHHHHHCCCHHHHHCCCCCEEEECCHHHHCCECCCCEEEECCCEEEECHHHHHHHHHHHCCCEEECCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHCEEEEEECCCCCHHHHHHHHHHHHHHCCCEEEECCCCCCCCHHHHHHHHHHHCCCCCCCCCEEECCECCHHHHHHHHHCCCCEEECCHHHHHHHCCEECCCCCCEECCCCCCCCCCCCCCCCCCCHHHHHCCHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHCC |true |
|d4ac7b_|NYIVPGEYRVAEGEIEINAGREKTTIRVSNTGDRPIQVGSHIHFVEVNKELLFDRAEGIGRRLNIPSGTAARFEPGEEMEVELTELGGNREVFGISDLTNGSVDNKELILQRAKELGYKGVE |CCCCCCCEECCCCEEECCCCCCEEEEEEEECCCCCEEEECCCCCCCCCCCEECCCCCCCCEEECCCCCCEEEECCCCEEEEEEEECCCCCEECCCCCCCCEECCCHHHHHHHHHHHCCCCCC |true |
|d1vyra_|AEKLFTPLKVGAVTAPNRVFMAPLTRLRSIEPGDIPTPLMGEYYRQRASAGLIISEATQISAQAKGYAGAPGLHSPEQIAAWKKITAGVHAEDGRIAVQLWHTGRISHSSIQPGGQAPVSASALNANTRTSLRDENGNAIRVDTTTPRALELDEIPGIVNDFRQAVANAREAGFDLVELHSAHGYLLHQFLSPSSNQRTDQYGGSVENRARLVLEVVDAVCNEWSADRIGIRVSPIGTFQNVDNGPNEEADALYLIEELAKRGIAYLHMSETDLAGGKPYSEAFRQKVRERFHGVIIGAGAYTAEKAEDLIGKGLIDAVAFGRDYIANPDLVARLQKKAELNPQRPESFYGGGAEGYTDYPSL |CCCCCCCEEECCEEECCCEEECCCCCCCCECCCCECCHHHHHHHHHCCCCCEEEEEEEECCCCCCCCCCCCECCCHHHHHHHHHHHHHHHHCCCCEEEEEECCCCCCCCCCCCCCCCCEECCCCCCCCEEEEECCCCCEEEEECCCCEECCCCCHHHHHHHHHHHHHHHHHCCCCEEEEEECCCCHHHHHHCCCCCCCCCCCCCCHHHHCHHHHHHHHHHHHHCCCCCEEEEECCCCCECCECCCCCHHHHHHHHHHHHHHCCCCEEEEECCECCECCCCCHHHHHHHHHHCCCEEEEECCCCHHHHHHHHHCCCCCEEEECHHHHHCCCHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCC |true |
|d2bbha1|PPGTLVYTGKYREDFEIEVMNYSIEEFREFKTTDVESVLPFRDSSTPTWINITGIHRTDVVQRVGEFFGTHPLVLEDILNVHQRPKVEFFENYVFIVLKMFTYDKHELESEQVSLILTKNCVLMFQEKIGDVFDPVRERIRYNRGIIRKKRADYLLYSLIDALVDDYFVLLEKIDDEIDVLEEEVTVQRTHQLKRNLVELRKTIWPLREVLSSLYRDVPPLIE |CCCCCCCCCCCCCCCEEEEEEEECCEEEEEEECCCCCCCCCCCCCCCEEEEEECCCCHHHHHHHHHHHCCCHHHHHHHHCCCCCCEEEECCCEEEEEEEEEECCCCCCEEEEEEEEEECCEEEEEECCCCCCCHHHHHHHHCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCC |false |
|d1ozna_|PCPGACVCYNEPKVTTSCPQQGLQAVPVGIPAASQRIFLHGNRISHVPAASFRACRNLTILWLHSNVLARIDAAAFTGLALLEQLDLSDNAQLRSVDPATFHGLGRLHTLHLDRCGLQELGPGLFRGLAALQYLYLQDNALQALPDDTFRDLGNLTHLFLHGNRISSVPERAFRGLHSLDRLLLHQNRVAHVHPHAFRDLGRLMTLYLFANNLSALPTEALAPLRALQYLRLNDNPWVCDCRARPLWAWLQKFRGSSSEVPCSLPQRLAGRDLKRLAANDLQGC |CCCCCCEEECCCCCEEECCCCCCCCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCCCCCCCCCCCCCCCCEEECCCCCCCCCCCCCCCCCCCCCEEECCCCCCCCCCCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCEECCCCCCCCCCCCEEECCCCCCCCCCHHHHCCCCCCCEEECCCCCEECCCCCHHHHHHHHHCCCEECCCEEEECCCCCCCECCCCCCCCCCCC |true |
|d1gd8a_|SSHRLALYRNQAKSLLTHGRITTTVPKAKELRGFVDHLIHLAKRGDLHARRLVLRDLQDVKLVRKLFDEIAPRYRDRQGGYTRVLKLAERRRGDGAPLALVELVE |CHHHHHHHHHHHHHHHHHCEEEEEHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHHHCCCHHHHHHHHHCHHHHCCCCCCCCEEEEEEEEECCCCCCEEEEEEECC |true |
|d3f6ya_|FWRQTWSGPGTTKRFPETVLARCVKYTEIHPEMRHVDCQSVWDAFKGAFISKHPCDITEEDYQPLMKLGTQTVPCNKILLWSRIKDLAHQFTQVQRDMFTLEDTLLGYLADDLTWCGEFDTSKINYQSCPDWRKDCSNNPVSVFWKTVSRRFAEAACDVVHVMLDGSRSKIFDKDSTFGSVEVHNLQPEKVQTLEAWVIHGGREDSRDLCQDPTIKELESIISKRNIQFSCKNIY |CCCCCCCEECCCCCHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHHHHHCCCCCCCCCCCCCHHHHHHCCCCCCCCCEECCCCCCCCCCCCHHHHHHCECCCCCHHHHHHCCCCCCCCCCCCCCCCCEECCCCCCCCCCHHHHHHHHHHHHHHHHCCEEEEEEEECCCCCCCCCCCHHHHCCCCCCCCCCEEEEEEEEECCCCCCCCCCCCCHHHHHHHHHHHHCCCEEEEEEEC |true |
|d2pjua1|KPVIWTVSVTRLFELFRDISLEFDHLANITPIQLGFEKAVTYIRKKLANERCDAIIAAGSNGAYLKSRLSVPVILIKPSGYDVLQFLAKAGKLTSSIGVVTYQETIPALVAFQKTFNLRLDQRSYITEEDARGQINELKANGTEAVVGAGLITDLAEEAGMTGIFIYSAATVRQAFSDALDMTRMS |CCEEEEECCHHHHHHHHHHHCCCCCCCEEEEECCCHHHHHHHHHHHCCCCCCCEEEEEHHHHHHHHCCCCCCEEEECCCHHHHHHHHHHCCCCCCCEEEEEECCCCHHHHHHHHHHCCCEEEEEECCHHHHHHHHHHHHHCCCCEEEECHHHHHHHHHCCCEEEECCCHHHHHHHHHHHHHHHHHC |true |
|d4g78a_|SNAMDHLHRKLRDHEAAMFQQGYLDDQFSQLQKLQDDTSPDFVIEVMTMFFDDSEKLLNNMSRALEQVPVNFKQIDAHAHQQKGSSASVGAARVKNVCGTFRNFCEAQNLEGCVRCLQQLQQEYSLLKNNLKYLFKLQQEIKTAGRS |CCCCCHHHHHHHHHHHHHHHCCCECHHHHHHHCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHCEHHHHHHHHCHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCC |true |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
only showing top 20 rows
In [5]:
# Collapse the dataset into a single partition before it is written out.
# NOTE(review): presumably so the JSON export ends up in one part file — confirm.
res = res.coalesce(1)
In [6]:
# Save the dataset as JSON, overwriting any existing output at the target path.
# The path string below is a placeholder: replace it with a real local directory.
(
    res.write
    .mode("overwrite")
    .format("json")
    .save("Local directory to save your JSON file")
)
In [7]:
# Shut down the SparkContext `sc` now that the demo is finished.
sc.stop()