Source code for fast.multi_grep

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from collections import defaultdict
from pybgl.automaton import BOTTOM


[docs]def multi_grep( w: str, map_name_dfa: list, callback: callable = lambda name, j, k, w: None, ): """ Searches sub-strings of a string matched by multiple patterns. Args: w (str): A ``str`` containing the word to process. map_name_dfa (dict): A ``dict{Name: Automaton}`` mapping each pattern name with its corresponding DFA. callback (callable): A ``callable(Name, int, int, str)`` called whenever ``w[j:k]`` is matched by pattern ``name``. """ def make_map_name_q_js_next(): return { name: defaultdict(set) for (name, g) in map_name_dfa.items() } m = len(w) map_name_q_js = { name: {g.initial(): {0}} for (name, g) in map_name_dfa.items() } map_name_q_js_next = make_map_name_q_js_next() for k in range(m): a = w[k] for (name, g) in map_name_dfa.items(): map_q_js = map_name_q_js[name] for (q, js) in map_q_js.items(): if not js: continue r = g.delta(q, a) if r is BOTTOM: map_name_q_js_next[name][q] = set() else: map_name_q_js_next[name][r] |= map_q_js[q] if g.is_final(r): for j in js: callback(name, j, k + 1, w) map_name_q_js_next[name][g.initial()] |= {k + 1} map_name_q_js = map_name_q_js_next map_name_q_js_next = make_map_name_q_js_next()
[docs]def multi_grep_with_delimiters( w: str, map_name_dfa: dict, callback: callable = lambda i, j, k, w: None, is_pattern_separator: callable = lambda name: False, is_pattern_left_separated: callable = lambda name: True, is_pattern_right_separated: callable = lambda name: True ): """ Searches sub-strings of a string matched by multiple patterns, possibly delimited by a predefined collection of separator patterns. Args: w (str): A ``str`` containing the word to process. map_name_dfa (dict): A ``dict{Name: Automaton}`` mapping each pattern name with its corresponding DFA. callback (callable): A ``callable(Name, int, int, str)`` called whenever ``w[j:k]`` is matched by pattern ``name``. is_pattern_separator (callable): A ``callable(Name) -> bool`` return ``True`` if the pattern ``name`` is a separator. is_pattern_left_separated (callable): A ``callable(Name) -> bool`` return ``True`` if the pattern ``name`` must be preceeded by a separator pattern or located at the beginning of ``w``. is_pattern_right_separated (callable): A ̀``callable(Name) -> bool`` return ``True`` if the pattern ``name`` must be followed by a separator pattern or located at the end of ``w``. """ # multi_grep on separating patterns map_name_dfa_separator = { name: dfa for (name, dfa) in map_name_dfa.items() if is_pattern_separator(name) } functor_delims = MultiGrepFonctorLargest() multi_grep(w, map_name_dfa_separator, functor_delims) js = {0} | { k for (i, jks) in functor_delims.indices().items() for (j, k) in jks } ks = {len(w)} | { j for (i, jks) in functor_delims.indices().items() for (j, k) in jks } # multi_grep on other patterns def filtered_callback(name, j, k, w): if is_pattern_separator(name): return elif j not in js and is_pattern_left_separated(name): return elif k not in ks and is_pattern_right_separated(name): return else: callback(name, j, k, w) multi_grep( w, map_name_dfa, callback=lambda name, j, k, w: filtered_callback(name, j, k, w) )
[docs]class MultiGrepFonctor: """ Base class used to customize the behavior of the :py:func:`multi_grep` function. """ def __call__(self, i, j, k, w): """ Args: i (int): The DFA index. j (int): The index of the beginning of a substring catched by the DFA. k (int): The index of o the end of a substring catched by the DFA. w (str): The input string. """ raise NotImplementedError
[docs] def indices(self) -> dict: """ Returns: A ``dict{i: [(j, k)]}`` where: ``i`` is an ``int`` corresponding to a DFA identifier. ``j`` is an ``int`` corresponding to the beginning of substring catched by the DFA ``i``. ``k`` is an ``int`` corresponding to the end of a substring catched by the DFA ``i``. """ raise NotImplementedError
[docs]class MultiGrepFonctorAll(MultiGrepFonctor): """ The :py:class:`MultiGrepFonctorAll` class catches (for each pattern Pi and for each index j) each substring w[j:k] matching Pi. """ def __init__(self): self.map_i_jk = defaultdict(list) def __call__(self, i, j, k, w): self.map_i_jk[i].append((j, k))
[docs] def indices(self) -> dict: return self.map_i_jk
[docs]class MultiGrepFonctorLargest(MultiGrepFonctor): """ The :py:class:`MultiGrepFonctorLargest` class catches (for each pattern Pi and for each index j) the largest w[j:k] matching Pi. """ def __init__(self): self.map_i_j_k = defaultdict(lambda: defaultdict(lambda: None)) def __call__(self, i, j, k, w): # As we read w from left to right, k > self.map_pattern_indices[i][j] self.map_i_j_k[i][j] = k
[docs] def indices(self) -> dict: # Rebuild {i: [(j, k)]} for each (i, j) pair result = defaultdict(list) for i in self.map_i_j_k: for k in set(self.map_i_j_k[i].values()): result[i] += [( min( j for j in self.map_i_j_k[i].keys() if self.map_i_j_k[i][j] == k ), k )] result[i] = sorted(result[i]) return result
[docs]class MultiGrepFonctorGreedy(MultiGrepFonctorLargest): """ The :py:class:`MultiGrepFonctorGreedy` class catches (for each pattern Pi and for each index j) the largest w[j′:k] matching Pi and s.t. j′ < j. """ def __init__(self): super().__init__() self.map_i_k_j = defaultdict(lambda: defaultdict(lambda: None)) def __call__(self, i, j, k, w): j_ = self.map_i_k_j[i][k] if j_ is None or j < j_: self.map_i_k_j[i][k] = j super().__call__(i, j, k, w)