Source code for fast.multi_grep

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from collections import defaultdict
from pybgl.automaton import BOTTOM


[docs]def multi_grep(
    w: str,
    map_name_dfa: list,
    callback: callable = lambda name, j, k, w: None,
):
    """
    Searches sub-strings of a string matched by multiple patterns.

    Args:
        w (str): A ``str`` containing the word to process.
        map_name_dfa (dict): A ``dict{Name:  Automaton}`` mapping each pattern
            name with its corresponding DFA.
        callback (callable): A ``callable(Name, int, int, str)`` called whenever
            ``w[j:k]`` is matched by pattern ``name``.
    """
    def make_map_name_q_js_next():
        return {
            name:  defaultdict(set)
            for (name, g) in map_name_dfa.items()
        }

    m = len(w)
    map_name_q_js = {
        name:  {g.initial():  {0}}
        for (name, g) in map_name_dfa.items()
    }

    map_name_q_js_next = make_map_name_q_js_next()
    for k in range(m):
        a = w[k]
        for (name, g) in map_name_dfa.items():
            map_q_js = map_name_q_js[name]
            for (q, js) in map_q_js.items():
                if not js:
                    continue
                r = g.delta(q, a)
                if r is BOTTOM:
                    map_name_q_js_next[name][q] = set()
                else:
                    map_name_q_js_next[name][r] |= map_q_js[q]
                    if g.is_final(r):
                        for j in js:
                            callback(name, j, k + 1, w)
            map_name_q_js_next[name][g.initial()] |= {k + 1}
        map_name_q_js = map_name_q_js_next
        map_name_q_js_next = make_map_name_q_js_next()


[docs]def multi_grep_with_delimiters(
    w: str,
    map_name_dfa: dict,
    callback: callable = lambda i, j, k, w: None,
    is_pattern_separator: callable = lambda name: False,
    is_pattern_left_separated: callable = lambda name: True,
    is_pattern_right_separated: callable = lambda name: True
):
    """
    Searches sub-strings of a string matched by multiple patterns, possibly
    delimited by a predefined collection of separator patterns.

    Args:
        w (str): A ``str`` containing the word to process.
        map_name_dfa (dict): A ``dict{Name:  Automaton}`` mapping each pattern
            name with its corresponding DFA.
        callback (callable): A ``callable(Name, int, int, str)`` called whenever
            ``w[j:k]`` is matched by pattern ``name``.
        is_pattern_separator (callable): A ``callable(Name) -> bool`` return ``True`` if
            the pattern ``name`` is a separator.
        is_pattern_left_separated (callable):  A ``callable(Name) -> bool`` return ``True`` if
            the pattern ``name`` must be preceeded by a separator pattern or
            located at the beginning of ``w``.
        is_pattern_right_separated (callable): A ̀``callable(Name) -> bool`` return ``True`` if
            the pattern ``name`` must be followed by a separator pattern or
            located at the end of ``w``.
    """
    # multi_grep on separating patterns
    map_name_dfa_separator = {
        name:  dfa
        for (name, dfa) in map_name_dfa.items()
        if is_pattern_separator(name)
    }
    functor_delims = MultiGrepFonctorLargest()
    multi_grep(w, map_name_dfa_separator, functor_delims)

    js = {0} | {
        k
        for (i, jks) in functor_delims.indices().items()
        for (j, k) in jks
    }

    ks = {len(w)} | {
        j
        for (i, jks) in functor_delims.indices().items()
        for (j, k) in jks
    }

    # multi_grep on other patterns
    def filtered_callback(name, j, k, w):
        if is_pattern_separator(name):
            return
        elif j not in js and is_pattern_left_separated(name):
            return
        elif k not in ks and is_pattern_right_separated(name):
            return
        else:
            callback(name, j, k, w)

    multi_grep(
        w,
        map_name_dfa,
        callback=lambda name, j, k, w: filtered_callback(name, j, k, w)
    )


[docs]class MultiGrepFonctor:
    """
    Base class used to customize the behavior of the :py:func:`multi_grep` function.
    """

    def __call__(self, i, j, k, w):
        """
        Args:
            i (int): The DFA index.
            j (int): The index of the beginning of a substring catched by the DFA.
            k (int): The index of o the end of a substring catched by the DFA.
            w (str): The input string.
        """
        raise NotImplementedError

[docs]    def indices(self) -> dict:
        """
        Returns:
            A ``dict{i:  [(j, k)]}`` where:
            ``i`` is an ``int`` corresponding to a DFA identifier.
            ``j`` is an ``int`` corresponding to the beginning of
            substring catched by the DFA ``i``.
            ``k`` is an ``int`` corresponding to the end of
            a substring catched by the DFA ``i``.
        """
        raise NotImplementedError


[docs]class MultiGrepFonctorAll(MultiGrepFonctor):
    """
    The :py:class:`MultiGrepFonctorAll` class catches
    (for each pattern Pi and for each index j)
    each substring w[j:k] matching Pi.
    """
    def __init__(self):
        self.map_i_jk = defaultdict(list)

    def __call__(self, i, j, k, w):
        self.map_i_jk[i].append((j, k))

[docs]    def indices(self) -> dict:
        return self.map_i_jk


[docs]class MultiGrepFonctorLargest(MultiGrepFonctor):
    """
    The :py:class:`MultiGrepFonctorLargest` class
    catches (for each pattern Pi and for each index j)
    the largest w[j:k] matching Pi.
    """
    def __init__(self):
        self.map_i_j_k = defaultdict(lambda: defaultdict(lambda: None))

    def __call__(self, i, j, k, w):
        # As we read w from left to right, k > self.map_pattern_indices[i][j]
        self.map_i_j_k[i][j] = k

[docs]    def indices(self) -> dict:
        # Rebuild {i:  [(j, k)]} for each (i, j) pair
        result = defaultdict(list)
        for i in self.map_i_j_k:
            for k in set(self.map_i_j_k[i].values()):
                result[i] += [(
                    min(
                        j for j in self.map_i_j_k[i].keys()
                        if self.map_i_j_k[i][j] == k
                    ),
                    k
                )]
            result[i] = sorted(result[i])
        return result


[docs]class MultiGrepFonctorGreedy(MultiGrepFonctorLargest):
    """
    The :py:class:`MultiGrepFonctorGreedy` class
    catches (for each pattern Pi and for each index j)
    the largest  w[j′:k] matching Pi and s.t.  j′ < j.
    """
    def __init__(self):
        super().__init__()
        self.map_i_k_j = defaultdict(lambda: defaultdict(lambda: None))

    def __call__(self, i, j, k, w):
        j_ = self.map_i_k_j[i][k]
        if j_ is None or j < j_:
            self.map_i_k_j[i][k] = j
            super().__call__(i, j, k, w)