Creates an MMTF-Hadoop Sequence file for a PISCES representative set of protein chains.
Please cite the following in any work that uses lists provided by PISCES: G. Wang and R. L. Dunbrack, Jr. PISCES: a protein sequence culling server. Bioinformatics, 19:1589-1591, 2003. PISCES
In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import mmtfReader, mmtfWriter
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import PolymerComposition
from mmtfPyspark.webfilters import Pisces
In [2]:
# Run Spark locally ("local[*]" master) under a descriptive application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("CreateRepresentativeSetDemo")
)
# getOrCreate() reuses an already-running SparkContext, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
# NOTE: when a context already exists, it is returned as-is and `conf` is not
# re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
In [6]:
# Relative path to the sample MMTF-Hadoop sequence file directory.
path = "../../resources/mmtf_full_sample/"
# Read the sequence file through the SparkContext `sc`; the cells below call
# filter/flatMap/top on the returned collection.
pdb = mmtfReader.read_sequence_file(path, sc)
In [7]:
# Cutoffs handed to the Pisces filter.
# NOTE(review): presumably percent sequence identity and resolution in
# Angstrom -- confirm against the Pisces filter documentation.
sequenceIdentity = 40
resolution = 2.0

pdb = (
    # First pass: apply the Pisces filter to the entries as read.
    pdb.filter(Pisces(sequenceIdentity, resolution))
    # Map each entry to its polymer chains (keys in the output look like '1FYE.A').
    .flatMap(StructureToPolymerChains())
    # Second pass: apply the same Pisces filter to the individual chains.
    .filter(Pisces(sequenceIdentity, resolution))
    # Keep only chains matching the AMINO_ACIDS_20 polymer composition.
    .filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20))
)
In [8]:
# Peek at 10 entries of the filtered set; each is a (chain key, encoder) tuple
# as shown in the output below.
pdb.top(10)
Out[8]:
[('1FYE.A', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef707b1a58>),
('1FXL.A', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef702727f0>),
('1FVI.A', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef701a12b0>),
('1FV1.F', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef5d39d0f0>),
('1FTR.D', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef5d341048>),
('1FT5.A', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef43e000b8>),
('1FSG.C', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef43caf0b8>),
('1FS1.C', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef43ad1438>),
('1FR3.L', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef43aa2c18>),
('1FPZ.C', <mmtf.api.mmtf_writer.MMTFEncoder at 0x7fef43a19358>)]
In [9]:
# Output location; the sequence identity cutoff is embedded in the name so
# sets built with different cutoffs do not share a path.
write_path = f'./pdb_representatives_{sequenceIdentity}'
# Write the filtered chains out as a sequence file.
mmtfWriter.write_sequence_file(write_path, sc, pdb)
In [3]:
# Shut down the SparkContext now that the notebook is done with it.
sc.stop()