# Source code for mmtfPyspark.ml.sequenceNgrammer

'''sequenceNgrammer.py

This class contains methods for creating overlapping and non-overlapping
n-grams of one-letter code sequence
(e.g., protein sequences)

'''
__author__ = "Mars (Shih-Cheng) Huang"
__maintainer__ = "Mars (Shih-Cheng) Huang"
__email__ = "marshuang80@gmail.com"
__version__ = "0.2.0"
__status__ = "Done"

from pyspark.sql import SparkSession, types


def ngram(data, n, outputCol):
    '''Splits a one-letter sequence column (e.g., protein sequence) into
    array of overlapping n-grams.

    Examples
    --------
    2-gram: IDCGH ... => [ID, DC, CG, GH, ...]

    Parameters
    ----------
    data : dataset
       input dataset with column "sequence"
    n : int
       size of the n-gram, must be at least 1
    outputCol : str
       name of the output column

    Returns
    -------
    dataset
       output dataset with appended ngram column

    Raises
    ------
    ValueError
       if n is smaller than 1
    '''
    # Reject sizes that cannot form an n-gram; n == 0 would otherwise
    # silently fill the output column with empty strings.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    session = SparkSession.builder.getOrCreate()

    # Encoder function to be passed as User Defined Function (UDF)
    def _ngrammer(s):
        # Null or empty sequences have no n-grams
        if not s:
            return []
        # One window per start position; the range is empty when the
        # sequence is shorter than n
        return [s[i: i + n] for i in range(len(s) - n + 1)]

    # The UDF is registered under the name "ngrammer" and the input dataset
    # is exposed as the temporary view "table" so both can be used from SQL.
    session.udf.register("ngrammer", _ngrammer,
                         types.ArrayType(types.StringType()))
    data.createOrReplaceTempView("table")

    sql = f"SELECT *, ngrammer(sequence) AS {outputCol} from table"
    data = session.sql(sql)

    return data
def shifted_ngram(data, n, shift, outputCol):
    '''Splits a one-letter sequence column (e.g., protein sequence) into
    array of non-overlapping n-grams. To generate all possible n-grams,
    this method needs to be called n times with shift parameters
    {0, ..., n-1}.

    Examples
    --------
    3-gram(shift=0) : IDCGHTVEDQR ... => [IDC, GHT, VED, ...]
    3-gram(shift=1) : IDCGHTVEDQR ... => [DCG, HTV, EDQ, ...]
    3-gram(shift=2) : IDCGHTVEDQR ... => [CGH, TVE, DQR, ...]

    References
    ----------
    For an application of shifted n-grams see:
    E Asgari, MRK Mofrad, PLoS One. 2015; 10(11): e0141287,
    doi: https://dx.doi.org/10.1371/journal.pone.0141287

    Parameters
    ----------
    data : dataset
       input dataset with column "sequence"
    n : int
       size of the n-gram, must be at least 1
    shift : int
       start index for the n-gram, must not be negative
    outputCol : str
       name of the output column

    Returns
    -------
    dataset
       output dataset with appended ngram column

    Raises
    ------
    ValueError
       if n is smaller than 1 or shift is negative
    '''
    # Fail early instead of letting n == 0 surface as a ZeroDivisionError
    # inside the UDF, or a negative shift silently slice from the tail.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    if shift < 0:
        raise ValueError(f"shift must be non-negative, got {shift}")

    session = SparkSession.builder.getOrCreate()

    # Encoder function to be passed as User Defined Function (UDF)
    def _ngrammer(s):
        # Null sequences, or sequences shorter than the shift, have no n-grams
        if s is None or len(s) < shift:
            return []

        s = s[shift:]

        # Step through the shifted sequence n characters at a time. The
        # upper bound is derived from the length AFTER the shift, so only
        # complete n-grams are emitted (no truncated or empty trailing item).
        return [s[i: i + n] for i in range(0, len(s) - n + 1, n)]

    # The UDF is registered under the name "ngrammer" and the input dataset
    # is exposed as the temporary view "table" so both can be used from SQL.
    session.udf.register("ngrammer", _ngrammer,
                         types.ArrayType(types.StringType()))
    data.createOrReplaceTempView("table")

    sql = f"SELECT *, ngrammer(sequence) AS {outputCol} from table"
    data = session.sql(sql)

    return data