This demo shows how to create and query a dataset. The dataset in this case is generated by running an RCSB PDB web service to create a custom report of PDB annotations.
In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.datasets import customReportService
import time
In [2]:
# Configure a Spark application that runs locally ("local[*]" master).
# The application name identifies this demo (custom report query).
conf = SparkConf().setMaster("local[*]") \
                  .setAppName("CustomReportDemo")
# getOrCreate returns the already-running SparkContext if there is one, so
# re-running this cell in the same kernel does not raise an error about
# multiple SparkContexts. Note: when a context already exists, `conf` is
# not re-applied to it.
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)
Retrieve a dataset with binding affinities (Ki, Kd), the group name of the ligand (hetId), and the Enzyme Classification number (ecNo).
In [3]:
# Field names to request from the custom report service.
report_columns = ["Ki", "Kd", "hetId", "ecNo"]
ds = customReportService.get_dataset(report_columns)
In [4]:
# Print the column names and types of the dataset returned by the report service.
ds.printSchema()
root
|-- structureChainId: string (nullable = true)
|-- structureId: string (nullable = true)
|-- chainId: string (nullable = true)
|-- Ki: string (nullable = true)
|-- Kd: string (nullable = true)
|-- hetId: string (nullable = true)
|-- ecNo: string (nullable = true)
In [5]:
# Filter with the DataFrame API: keep rows that have a Ki or a Kd value and
# whose ecNo starts with "2.7.11.".
# The result is bound to a new name so that `ds` keeps referring to the
# full, unfiltered report.
ds_filtered = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'")
ds_filtered.show(10)
+----------------+-----------+-------+--------------------+------------+-----+---------+
|structureChainId|structureId|chainId| Ki| Kd|hetId| ecNo|
+----------------+-----------+-------+--------------------+------------+-----+---------+
| 2CLQ.A| 2CLQ| A| null|11-120 (BDB)| STU|2.7.11.25|
| 2CLQ.B| 2CLQ| B| null|11-120 (BDB)| STU|2.7.11.25|
| 2E9N.A| 2E9N| A| 6.3 (BDB)| null| 76A| 2.7.11.1|
| 2E9O.A| 2E9O| A| 20 (BDB)| null| A58| 2.7.11.1|
| 2E9P.A| 2E9P| A| 20 (BDB)| null| 77A| 2.7.11.1|
| 2E9U.A| 2E9U| A|7.94 (PDBbind)#7....| null| A25| 2.7.11.1|
| 2E9V.A| 2E9V| A|12.59 (PDBbind)#1...| null| 85A| 2.7.11.1|
| 2E9V.B| 2E9V| B|12.59 (PDBbind)#1...| null| 85A| 2.7.11.1|
| 2GNF.A| 2GNF| A|6000 (BMOAD_9806)...| null| Y27|2.7.11.11|
| 2GNH.A| 2GNH| A|149 (BMOAD_9880)#...| null| H52|2.7.11.11|
+----------------+-----------+-------+--------------------+------------+-----+---------+
only showing top 10 rows
In [6]:
# Express the selection in SQL: register the dataset as a temporary view,
# then run the query against that view through the SQLContext.
ds.createOrReplaceTempView("table")
query = "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'"
ds = sqlContext.sql(query)
ds.show(10)
+----------------+-----------+-------+--------------------+------------+-----+---------+
|structureChainId|structureId|chainId| Ki| Kd|hetId| ecNo|
+----------------+-----------+-------+--------------------+------------+-----+---------+
| 2CLQ.A| 2CLQ| A| null|11-120 (BDB)| STU|2.7.11.25|
| 2CLQ.B| 2CLQ| B| null|11-120 (BDB)| STU|2.7.11.25|
| 2E9N.A| 2E9N| A| 6.3 (BDB)| null| 76A| 2.7.11.1|
| 2E9O.A| 2E9O| A| 20 (BDB)| null| A58| 2.7.11.1|
| 2E9P.A| 2E9P| A| 20 (BDB)| null| 77A| 2.7.11.1|
| 2E9U.A| 2E9U| A|7.94 (PDBbind)#7....| null| A25| 2.7.11.1|
| 2E9V.A| 2E9V| A|12.59 (PDBbind)#1...| null| 85A| 2.7.11.1|
| 2E9V.B| 2E9V| B|12.59 (PDBbind)#1...| null| 85A| 2.7.11.1|
| 2GNF.A| 2GNF| A|6000 (BMOAD_9806)...| null| Y27|2.7.11.11|
| 2GNH.A| 2GNH| A|149 (BMOAD_9880)#...| null| H52|2.7.11.11|
+----------------+-----------+-------+--------------------+------------+-----+---------+
only showing top 10 rows
In [7]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()