In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces
In [2]:
# Run Spark in local mode ("local[*]") under a descriptive application name.
conf = SparkConf().setMaster("local[*]") \
                  .setAppName("advancedZincInteractionDemo")
# getOrCreate() hands back the already-running SparkContext when there is one,
# so re-executing this cell no longer fails with
# "ValueError: Cannot run multiple SparkContexts at once".
# Note: when a context already exists, it is reused as-is and `conf` is ignored.
sc = SparkContext.getOrCreate(conf=conf)
In [3]:
# Relative path to the directory holding the sample MMTF data.
path = "../../resources/mmtf_full_sample/"
# Read the structures at `path` using the SparkContext `sc`.
pdb = mmtfReader.read_sequence_file(path, sc)
In [4]:
# Cutoffs handed to the Pisces web filter.
# NOTE(review): presumably the maximum sequence identity (percent) and the
# resolution cutoff (Angstrom) -- confirm against the mmtfPyspark Pisces docs.
seqId = 40
resolution = 2.0
# Keep only the entries accepted by the filter; `pdb` is rebound to the
# filtered collection, so the unfiltered data is no longer reachable by name.
pdb = pdb.filter(Pisces(seqId, resolution))
In [5]:
# Build an interaction extractor for the "ZN" group.
# NOTE(review): the second argument (3) looks like a distance cutoff -- every
# distance in the output printed below is under 3 -- confirm meaning and units
# against the groupInteractionExtractor documentation.
finder = groupInteractionExtractor("ZN",3)
# Extract the interactions as a dataset and cache it, because the following
# cells query the same dataset several times.
interactions = finder.get_dataset(pdb).cache()
In [6]:
# Show the column layout and the first 20 rows of the interaction dataset.
interactions.printSchema()
interactions.show(20)
# Total number of interaction rows; later cells divide group counts by `n`
# to express them as frequencies.
n = interactions.count()
print(f"Number of interactions: {n}")
root
|-- structureId: string (nullable = false)
|-- residue1: string (nullable = false)
|-- atom1: string (nullable = false)
|-- element1: string (nullable = false)
|-- index1: integer (nullable = false)
|-- residue2: string (nullable = false)
|-- atom2: string (nullable = false)
|-- element2: string (nullable = false)
|-- index2: integer (nullable = false)
|-- distance: float (nullable = false)
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
| 1FN9| ZN| ZN| Zn| 730| CYS| SG| S| 50|2.3709755|
| 1FN9| ZN| ZN| Zn| 730| CYS| SG| S| 53|2.3940797|
| 1FN9| ZN| ZN| Zn| 730| HIS| NE2| N| 70|2.2196307|
| 1FN9| ZN| ZN| Zn| 730| CYS| SG| S| 72|2.3465357|
| 1FN9| ZN| ZN| Zn| 731| CYS| SG| S| 415|2.3747551|
| 1FN9| ZN| ZN| Zn| 731| CYS| SG| S| 418|2.3680198|
| 1FN9| ZN| ZN| Zn| 731| HIS| NE2| N| 435|2.1647959|
| 1FN9| ZN| ZN| Zn| 731| CYS| SG| S| 437|2.3763454|
| 1E4M| ZN| ZN| Zn| 519| HIS| CE1| C| 53|2.9807622|
| 1E4M| ZN| ZN| Zn| 519| HIS| NE2| N| 53| 2.040789|
| 1E4M| ZN| ZN| Zn| 519| ASP| CG| C| 67| 2.754825|
| 1E4M| ZN| ZN| Zn| 519| ASP| OD1| O| 67|2.8967845|
| 1E4M| ZN| ZN| Zn| 519| ASP| OD2| O| 67|1.9672809|
| 1BF6| ZN| ZN| Zn| 582| HIS| NE2| N| 10|2.2776458|
| 1BF6| ZN| ZN| Zn| 582| HIS| NE2| N| 12|2.1644206|
| 1BF6| ZN| ZN| Zn| 582| GLU| OE2| O| 123|2.3778422|
| 1BF6| ZN| ZN| Zn| 582| ASP| OD1| O| 241| 2.41581|
| 1BF6| ZN| ZN| Zn| 583| GLU| CD| C| 123|2.7811828|
| 1BF6| ZN| ZN| Zn| 583| GLU| OE1| O| 123|2.1997967|
| 1BF6| ZN| ZN| Zn| 583| HIS| ND1| N| 156|2.2733805|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
only showing top 20 rows
Number of interactions: 238
In [7]:
# Count interactions per (residue, atom) partner, ignoring carbon atoms.
topGroupsAndAtoms = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2", "atom2")
    .count()
)
In [8]:
# Express each (residue, atom) count as a fraction of all interactions and
# list the partners above 1%, most frequent first.
# `n` is the total row count of `interactions`, which still includes the
# carbon contacts excluded from `topGroupsAndAtoms`, so the fractions shown
# are relative to every interaction rather than to the non-carbon subset.
(
    topGroupsAndAtoms
    .withColumn("frequency", topGroupsAndAtoms["count"] / n)
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(20)
)
+--------+-----+-----+--------------------+
|residue2|atom2|count| frequency|
+--------+-----+-----+--------------------+
| CYS| SG| 43| 0.18067226890756302|
| HOH| O| 37| 0.15546218487394958|
| HIS| NE2| 30| 0.12605042016806722|
| HIS| ND1| 24| 0.10084033613445378|
| ASP| OD2| 11|0.046218487394957986|
| GLU| OE1| 11|0.046218487394957986|
| GLU| OE2| 11|0.046218487394957986|
| ASP| OD1| 9|0.037815126050420166|
| ACT| O| 4| 0.01680672268907563|
| ACT| OXT| 4| 0.01680672268907563|
+--------+-----+-----+--------------------+
In [9]:
# Count interactions per partner element, ignoring carbon atoms.
topElements = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("element2")
    .count()
)
In [10]:
# Express each element count as a fraction of all interactions and list the
# elements above 1%, most frequent first.
# `n` is the total row count of `interactions` (carbon contacts included), so
# the fractions are relative to every interaction, not just the non-carbon ones.
(
    topElements
    .withColumn("frequency", topElements["count"] / n)
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(10)
)
+--------+-----+--------------------+
|element2|count| frequency|
+--------+-----+--------------------+
| O| 91| 0.38235294117647056|
| N| 56| 0.23529411764705882|
| S| 43| 0.18067226890756302|
| H| 3|0.012605042016806723|
+--------+-----+--------------------+
In [11]:
# Mean interaction distance per partner element, shortest first.
(
    interactions
    .groupBy("element2")
    .avg("distance")
    .sort("avg(distance)")
    .show(10)
)
+--------+------------------+
|element2| avg(distance)|
+--------+------------------+
| N| 2.247671846832548|
| Cl|2.3399999141693115|
| O| 2.340171109189044|
| S|2.3423283100128174|
| C| 2.727002328092402|
| H|2.8938498497009277|
+--------+------------------+
In [12]:
# Per-element distance statistics: number of interactions, mean, minimum,
# maximum and kurtosis of the distance column.
# The aggregate functions are referenced through the `F` alias rather than the
# bare names injected by `from pyspark.sql.functions import *`: the bare
# `min`/`max`/`count` shadow Python builtins, so the qualified form makes it
# explicit that these are Spark column aggregates and keeps the cell working
# even if one of those names is rebound elsewhere in the notebook.
(
    interactions
    .groupBy("element2")
    .agg(
        F.count("distance"),
        F.avg("distance"),
        F.min("distance"),
        F.max("distance"),
        F.kurtosis("distance"),
    )
    .show(10)
)
+--------+---------------+------------------+-------------+-------------+-------------------+
|element2|count(distance)| avg(distance)|min(distance)|max(distance)| kurtosis(distance)|
+--------+---------------+------------------+-------------+-------------+-------------------+
| O| 91| 2.340171109189044| 1.8502038| 2.9841056|-0.5095228492389405|
| C| 44| 2.727002328092402| 1.8144855| 2.9990435| 2.050274417960135|
| N| 56| 2.247671846832548| 1.9923105| 2.9953997| 2.470076287060217|
| Cl| 1|2.3399999141693115| 2.34| 2.34| NaN|
| S| 43|2.3423283100128174| 2.2196188| 2.4604716| 0.3902514824014989|
| H| 3|2.8938498497009277| 2.844304| 2.979628|-1.4999999999999993|
+--------+---------------+------------------+-------------+-------------+-------------------+
In [13]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()