Advanced Zinc Interaction Analysis Example

Imports

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces

Configure Spark

In [2]:
# Run Spark locally on all available cores ("local[*]") under a descriptive
# application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("advancedZincInteractionDemo")
)

# getOrCreate reuses an already-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed
# in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)

Read PDB in MMTF format

In [3]:
# Location of the sample MMTF data, given relative to this notebook's directory.
path = "../../resources/mmtf_full_sample/"

# Read the structures found under `path`, using the SparkContext `sc`.
pdb = mmtfReader.read_sequence_file(path, sc)

Use only representative structures

In [4]:
# Cutoffs handed to the Pisces filter.
# NOTE(review): presumably percent sequence identity and resolution in
# Angstrom -- confirm against the Pisces filter documentation.
seqId = 40
resolution = 2.0

# Keep only the structures accepted by the Pisces filter; `pdb` is rebound to
# the filtered collection.
pdb = pdb.filter(Pisces(seqId, resolution))

Extract proteins with Zn interactions

In [5]:
# Extractor for interactions involving the zinc group ("ZN").
# NOTE(review): the second argument (3) looks like a distance cutoff -- every
# distance in the output below is under 3 -- confirm meaning and units.
finder = groupInteractionExtractor("ZN",3)

# Cache the extracted dataset because several later cells run actions on it.
interactions = finder.get_dataset(pdb).cache()

Inspect the Zn interaction dataset: schema, first 20 rows, and total number of interactions

In [6]:
# Print the column names and types of the interaction dataset.
interactions.printSchema()

# Preview the first 20 rows.
interactions.show(20)

# Total number of interaction rows; reused later as the denominator when
# computing frequencies.
n = interactions.count()

print(f"Number of interactions: {n}")
root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    50|2.3709755|
|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    53|2.3940797|
|       1FN9|      ZN|   ZN|      Zn|   730|     HIS|  NE2|       N|    70|2.2196307|
|       1FN9|      ZN|   ZN|      Zn|   730|     CYS|   SG|       S|    72|2.3465357|
|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   415|2.3747551|
|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   418|2.3680198|
|       1FN9|      ZN|   ZN|      Zn|   731|     HIS|  NE2|       N|   435|2.1647959|
|       1FN9|      ZN|   ZN|      Zn|   731|     CYS|   SG|       S|   437|2.3763454|
|       1E4M|      ZN|   ZN|      Zn|   519|     HIS|  CE1|       C|    53|2.9807622|
|       1E4M|      ZN|   ZN|      Zn|   519|     HIS|  NE2|       N|    53| 2.040789|
|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|   CG|       C|    67| 2.754825|
|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|  OD1|       O|    67|2.8967845|
|       1E4M|      ZN|   ZN|      Zn|   519|     ASP|  OD2|       O|    67|1.9672809|
|       1BF6|      ZN|   ZN|      Zn|   582|     HIS|  NE2|       N|    10|2.2776458|
|       1BF6|      ZN|   ZN|      Zn|   582|     HIS|  NE2|       N|    12|2.1644206|
|       1BF6|      ZN|   ZN|      Zn|   582|     GLU|  OE2|       O|   123|2.3778422|
|       1BF6|      ZN|   ZN|      Zn|   582|     ASP|  OD1|       O|   241|  2.41581|
|       1BF6|      ZN|   ZN|      Zn|   583|     GLU|   CD|       C|   123|2.7811828|
|       1BF6|      ZN|   ZN|      Zn|   583|     GLU|  OE1|       O|   123|2.1997967|
|       1BF6|      ZN|   ZN|      Zn|   583|     HIS|  ND1|       N|   156|2.2733805|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
only showing top 20 rows

Number of interactions: 238

Show the top 10 interacting group/atom types

Exclude Carbon Interactions

In [7]:
# Count interactions per (residue2, atom2) pair, skipping rows whose partner
# element is carbon.
topGroupsAndAtoms = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2", "atom2")
    .count()
)

Add column with frequency of occurrence

Filter out occurrences < 1%

Sort descending

In [8]:
# Turn each count into a fraction of n, drop pairs at or below 1%, and list
# the remainder from most to least frequent.
# NOTE(review): n is the count of *all* interactions, including the carbon
# contacts filtered out when topGroupsAndAtoms was built, so these fractions
# are relative to the full dataset -- confirm that is intended.
(
    topGroupsAndAtoms
    .withColumn("frequency", topGroupsAndAtoms["count"] / n)
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(20)
)
+--------+-----+-----+--------------------+
|residue2|atom2|count|           frequency|
+--------+-----+-----+--------------------+
|     CYS|   SG|   43| 0.18067226890756302|
|     HOH|    O|   37| 0.15546218487394958|
|     HIS|  NE2|   30| 0.12605042016806722|
|     HIS|  ND1|   24| 0.10084033613445378|
|     ASP|  OD2|   11|0.046218487394957986|
|     GLU|  OE1|   11|0.046218487394957986|
|     GLU|  OE2|   11|0.046218487394957986|
|     ASP|  OD1|    9|0.037815126050420166|
|     ACT|    O|    4| 0.01680672268907563|
|     ACT|  OXT|    4| 0.01680672268907563|
+--------+-----+-----+--------------------+

Aggregate multiple statistics

In [12]:
# Per-element summary of the interaction distances: number of rows, mean,
# minimum, maximum, and kurtosis.
# The aggregates are referenced through the `F` namespace so it is explicit
# that `min`/`max`/`count` are Spark column functions, not the Python builtins
# that the star import shadows.
(
    interactions
    .groupBy("element2")
    .agg(
        F.count("distance"),
        F.avg("distance"),
        F.min("distance"),
        F.max("distance"),
        F.kurtosis("distance"),
    )
    .show(10)
)
+--------+---------------+------------------+-------------+-------------+-------------------+
|element2|count(distance)|     avg(distance)|min(distance)|max(distance)| kurtosis(distance)|
+--------+---------------+------------------+-------------+-------------+-------------------+
|       O|             91| 2.340171109189044|    1.8502038|    2.9841056|-0.5095228492389405|
|       C|             44| 2.727002328092402|    1.8144855|    2.9990435|  2.050274417960135|
|       N|             56| 2.247671846832548|    1.9923105|    2.9953997|  2.470076287060217|
|      Cl|              1|2.3399999141693115|         2.34|         2.34|                NaN|
|       S|             43|2.3423283100128174|    2.2196188|    2.4604716| 0.3902514824014989|
|       H|              3|2.8938498497009277|     2.844304|     2.979628|-1.4999999999999993|
+--------+---------------+------------------+-------------+-------------+-------------------+

Terminate Spark

In [13]:
# Shut down the SparkContext created in the configuration cell.
sc.stop()