Source code for fast.objective_func

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from .density import ast_density
from .pattern_automaton import PatternAutomaton
from .regexp_ast import RegexpAst


def make_additive_objective_func_for_str(
    examples: list,
    alphabet: list,
    map_pa_infix_re: dict = None,
    size_factor: float = 0.5,
    density_factor: float = 0.5,
) -> callable:
    """
    Makes an additive objective function, finding a tradeoff between
    accuracy and shortness.

    The returned function evaluates a candidate ``ast`` as
    ``size_factor * ast.num_nodes + density_factor * density``, where
    ``density`` is the value returned by :py:func:`ast_density`.
    Every word length from ``1`` to the length of the longest example
    receives the same weight ``1 / max_len``.

    Args:
        examples (list): A `list` of :py:class:`PatternAutomaton` instances
            (measured through their ``w`` attribute) or of plain sized
            objects such as `str` (measured with ``len``). May be empty,
            in which case no word length receives any weight.
        alphabet (list): The `list` of `str`, where each `str` is a symbol
            of the alphabet (possibly a metacharacter identifying a
            `PatternAutomaton`, like `"$date"`). Only its length is used.
        map_pa_infix_re (dict): A ``dict{PatternAutomaton : str}`` that maps
            each :py:class:`PatternAutomaton` to its corresponding regular
            expression. Forwarded unchanged to :py:func:`ast_density`.
        size_factor (float): The importance of the shortness, between `0.0`
            and `1.0`. The higher `size_factor`, the more important the size
            of the inferred regular expression.
        density_factor (float): The importance of the accuracy, between `0.0`
            and `1.0`. The higher `density_factor`, the more important the
            density of the inferred regular expression.
            Should be set to `1 - size_factor`.

    Returns:
        The corresponding `callable(ast, examples) -> float` objective
        function where `ast` is a candidate solution; `examples` is the set
        of positive examples (ignored by the returned function); the
        returned value is the objective function value given `ast`.

    Raises:
        ZeroDivisionError: If `alphabet` is empty.
    """
    # TODO Merge size_factor and density_factor
    # `default=0` keeps an empty `examples` list from raising, like the
    # other objective factories of this module which accept it.
    max_len = max(
        (
            len(example.w) if isinstance(example, PatternAutomaton)
            else len(example)
            for example in examples
        ),
        default=0,
    )
    # Lengths are weighted uniformly here, not by their empirical frequency
    # among the examples. The mapping is empty when max_len is 0, so the
    # division is never evaluated in that case.
    map_len_proba = {
        length: 1 / max_len
        for length in range(1, max_len + 1)
    }
    char_proba = 1 / len(alphabet)

    def objective_function(ast: "RegexpAst", examples: list) -> float:
        # TODO remove `examples` parameter which is useless
        size = ast.num_nodes
        density = ast_density(ast, map_len_proba, char_proba, map_pa_infix_re)
        return size_factor * size + density_factor * density

    return objective_function
def make_normalized_additive_objective_func_for_str(
    examples: list,
    alphabet: list,
    map_pa_infix_re: dict = None,
    size_factor: float = 0.5,
    density_factor: float = 0.5,
):
    """
    Makes a normalized additive objective function, finding a tradeoff
    between accuracy and shortness.

    The returned function evaluates a candidate ``ast`` as
    ``size_factor * (ast.num_nodes / total_examples_size)
    + density_factor * density``, where ``density`` is the value returned
    by :py:func:`ast_density` and each word length is weighted by its
    empirical frequency among the examples.

    Args:
        examples (list): A `list` of :py:class:`PatternAutomaton` instances
            (measured through their ``w`` attribute) or of plain sized
            objects such as `str` (measured with ``len``).
        alphabet (list): The `list` of `str`, where each `str` is a symbol
            of the alphabet (possibly a metacharacter identifying a
            `PatternAutomaton`, like `"$date"`). Only its length is used.
        map_pa_infix_re (dict): A ``dict{PatternAutomaton : str}`` that maps
            each :py:class:`PatternAutomaton` to its corresponding regular
            expression. Forwarded unchanged to :py:func:`ast_density`.
        size_factor (float): The importance of the shortness, between `0.0`
            and `1.0`. The higher `size_factor`, the more important the size
            of the inferred regular expression.
        density_factor (float): The importance of the accuracy, between `0.0`
            and `1.0`. The higher `density_factor`, the more important the
            density of the inferred regular expression.
            Should be set to `1 - size_factor`.

    Returns:
        The corresponding `callable(ast, examples) -> float` objective
        function where `ast` is a candidate solution; `examples` is the set
        of positive examples (ignored by the returned function); the
        returned value is the objective function value given `ast`.

    Raises:
        ZeroDivisionError: If `alphabet` is empty.
    """
    from collections import Counter

    # TODO Merge size_factor and density_factor
    examples_sizes = [
        len(example.w) if isinstance(example, PatternAutomaton)
        else len(example)
        for example in examples
    ]
    num_examples = len(examples_sizes)
    # Normalization constant: total length of the examples, plus one per
    # example, plus one. Presumably an upper bound on the size of an AST
    # enumerating every example -- TODO confirm.
    total_examples_size = sum(examples_sizes) + num_examples + 1
    # Count every length in a single pass instead of calling `list.count`
    # once per distinct length.
    size_counts = Counter(examples_sizes)
    # Iterating over `set(examples_sizes)` keeps the key order of the
    # mapping identical to the previous implementation.
    map_len_proba = {
        length: size_counts[length] / num_examples
        for length in set(examples_sizes)
    }
    char_proba = 1 / len(alphabet)

    def objective_function(ast: "RegexpAst", examples: list) -> float:
        # TODO remove `examples` parameter which is useless
        size = ast.num_nodes
        density = ast_density(ast, map_len_proba, char_proba, map_pa_infix_re)
        return (
            size_factor * (size / total_examples_size)
            + density_factor * density
        )

    return objective_function
def make_multiplicative_objective_func_for_str(
    examples,
    alphabet,
    map_pa_infix_re: dict = None,
    size_exponent=1,
    density_exponent=1,
):
    """
    Makes a multiplicative objective function, finding a tradeoff between
    accuracy and shortness.

    The returned function evaluates a candidate ``ast`` as
    ``max(EPSILON, ast.num_nodes ** size_exponent)
    * density ** density_exponent``, where ``density`` is the value returned
    by :py:func:`ast_density`, ``EPSILON`` is ``1e-6`` and each word length
    is weighted by its empirical frequency among the examples.

    Args:
        examples (list): A `list` of :py:class:`PatternAutomaton` instances
            or, more generally, of objects exposing their word through a
            ``w`` attribute. Objects without a ``w`` attribute (e.g. plain
            `str`) are measured directly with ``len``.
        alphabet (list): The `list` of `str`, where each `str` is a symbol
            of the alphabet (possibly a metacharacter identifying a
            `PatternAutomaton`, like `"$date"`). Only its length is used.
        map_pa_infix_re (dict): A ``dict{PatternAutomaton : str}`` that maps
            each :py:class:`PatternAutomaton` to its corresponding regular
            expression. Forwarded unchanged to :py:func:`ast_density`.
        size_exponent (float): The importance of the shortness. The higher
            `size_exponent`, the more important the size of the inferred
            regular expression. Defaults to `1`.
        density_exponent (float): The importance of the accuracy. The higher
            `density_exponent`, the more important the density of the
            inferred regular expression. Defaults to `1`.

    Returns:
        The corresponding `callable(ast, examples) -> float` objective
        function where `ast` is a candidate solution; `examples` is the set
        of positive examples (ignored by the returned function); the
        returned value is the objective function value given `ast`.

    Raises:
        ZeroDivisionError: If `alphabet` is empty.
    """
    from collections import Counter

    # TODO Merge size_exponent and density_exponent
    EPSILON = 1E-6
    # Duck-typed on the `w` attribute: every input that exposes `w` is
    # measured exactly as before, while plain strings (which the additive
    # factories of this module accept too) no longer raise AttributeError.
    examples_sizes = [
        len(getattr(example, "w", example))
        for example in examples
    ]
    num_examples = len(examples_sizes)
    # Count every length in a single pass instead of calling `list.count`
    # once per distinct length.
    size_counts = Counter(examples_sizes)
    # Iterating over `set(examples_sizes)` keeps the key order of the
    # mapping identical to the previous implementation.
    map_len_proba = {
        length: size_counts[length] / num_examples
        for length in set(examples_sizes)
    }
    char_proba = 1 / len(alphabet)

    def objective_function(ast: "RegexpAst", examples: list) -> float:
        # TODO remove `examples` parameter which is useless
        size = ast.num_nodes
        density = ast_density(ast, map_len_proba, char_proba, map_pa_infix_re)
        # The size term is floored at EPSILON so that it never cancels the
        # density term entirely.
        return max(EPSILON, size ** size_exponent) * density ** density_exponent

    return objective_function
def make_tuple_based_objective_func_for_str(
    examples: list,
    alphabet: set,
    map_pa_infix_re: dict = None,
):
    """
    Makes a lexicographic objective function, finding a tradeoff between
    accuracy and shortness.

    The returned function evaluates a candidate ``ast`` as the pair
    ``(ast.num_nodes, density)``, where ``density`` is the value returned by
    :py:func:`ast_density` and each word length is weighted by its empirical
    frequency among the examples. As tuples compare lexicographically, the
    size is compared first and the density only breaks ties.

    Args:
        examples (list): A `list` of :py:class:`PatternAutomaton` instances
            or, more generally, of objects exposing their word through a
            ``w`` attribute. Objects without a ``w`` attribute (e.g. plain
            `str`) are measured directly with ``len``.
        alphabet (set): The collection of `str`, where each `str` is a symbol
            of the alphabet (possibly a metacharacter identifying a
            `PatternAutomaton`, like `"$date"`). Only its length is used.
        map_pa_infix_re (dict): A ``dict{PatternAutomaton : str}`` that maps
            each :py:class:`PatternAutomaton` to its corresponding regular
            expression. Forwarded unchanged to :py:func:`ast_density`.

    Returns:
        The corresponding `callable(ast, examples) -> tuple` objective
        function where `ast` is a candidate solution; `examples` is the set
        of positive examples (ignored by the returned function); the
        returned value is the ``(size, density)`` pair given `ast`.

    Raises:
        ZeroDivisionError: If `alphabet` is empty.
    """
    from collections import Counter

    # Duck-typed on the `w` attribute: every input that exposes `w` is
    # measured exactly as before, while plain strings (which the additive
    # factories of this module accept too) no longer raise AttributeError.
    examples_sizes = [
        len(getattr(example, "w", example))
        for example in examples
    ]
    num_examples = len(examples_sizes)
    # Count every length in a single pass instead of calling `list.count`
    # once per distinct length.
    size_counts = Counter(examples_sizes)
    # Iterating over `set(examples_sizes)` keeps the key order of the
    # mapping identical to the previous implementation.
    map_len_proba = {
        length: size_counts[length] / num_examples
        for length in set(examples_sizes)
    }
    char_proba = 1 / len(alphabet)

    def objective_function(ast: "RegexpAst", examples: list) -> tuple:
        # TODO remove `examples` parameter which is useless
        size = ast.num_nodes
        density = ast_density(ast, map_len_proba, char_proba, map_pa_infix_re)
        return (size, density)

    return objective_function