Source code for miraiml.pipeline

"""
:mod:`miraiml.pipeline` contains a function that lets you build your own
pipeline classes. It also contains a few pre-defined pipelines for baselines.
"""

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression

from miraiml.util import is_valid_pipeline_name
from miraiml.core import BasePipelineClass


def compose(steps):
    """
    A function that defines pipeline classes dynamically. It builds a pipeline
    class that can be instantiated with particular parameters for each of its
    transformers/estimator without needing to call ``set_params`` as you would
    do with scikit-learn's Pipeline when performing hyperparameters optimizations.

    Similarly to scikit-learn's Pipeline, ``steps`` is a list of tuples
    containing an alias and the respective pipeline element. Although, since
    this function is a class factory, you shouldn't instantiate the
    transformer/estimator as you would do with scikit-learn's Pipeline. Thus,
    this is how :func:`compose` should be called:

    ::

        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.preprocessing import StandardScaler

        >>> from miraiml.pipeline import compose

        >>> MyPipelineClass = compose(
        ...     steps = [
        ...         ('scaler', StandardScaler), # StandardScaler instead of StandardScaler()
        ...         ('rfc', RandomForestClassifier) # No instantiation either
        ...     ]
        ... )

    And then, in order to instantiate ``MyPipelineClass`` with the desired
    parameters, you just need to refer to them as a concatenation of their
    respective class aliases and their names, separated by ``'__'``.

    ::

        >>> pipeline = MyPipelineClass(scaler__with_mean=False, rfc__max_depth=3)

    If you want to know which parameters you're allowed to play with, just call
    ``get_params``:

    ::

        >>> params = pipeline.get_params()
        >>> print("\\n".join(params))
        scaler__with_mean
        scaler__with_std
        rfc__bootstrap
        rfc__class_weight
        rfc__criterion
        rfc__max_depth
        rfc__max_features
        rfc__max_leaf_nodes
        rfc__min_impurity_decrease
        rfc__min_impurity_split
        rfc__min_samples_leaf
        rfc__min_samples_split
        rfc__min_weight_fraction_leaf
        rfc__n_estimators
        rfc__n_jobs
        rfc__oob_score
        rfc__random_state
        rfc__verbose
        rfc__warm_start

    You can check the available methods for your instantiated pipelines on the
    documentation for :class:`miraiml.core.BasePipelineClass`, which is the
    class from which the composed classes inherit.

    **The intended purpose** of such pipeline classes is that they can work as
    base models to build instances of :class:`miraiml.SearchSpace`.

    ::

        >>> from miraiml import SearchSpace

        >>> search_space = SearchSpace(
        ...     id='MyPipelineClass',
        ...     model_class=MyPipelineClass,
        ...     parameters_values=dict(
        ...         scaler__with_mean=[True, False],
        ...         scaler__with_std=[True, False],
        ...         rfc__max_depth=[3, 4, 5, 6]
        ...     )
        ... )

    :type steps: list
    :param steps: The list of pairs (alias, class) to define the pipeline. Any
        iterable of such pairs is accepted; it is copied into a new list, so
        changing the original object afterwards does not affect the composed
        class.

        .. warning::
            Repeated aliases are not allowed and none of the aliases can start
            with numbers or contain ``'__'``.

            The classes used to compose a pipeline **must** implement ``get_params``
            and ``set_params``, such as scikit-learn's classes, or :func:`compose`
            **will break**.

    :rtype: type
    :returns: The composed pipeline class

    :raises: ``TypeError`` if an alias is not a string.

    :raises: ``ValueError`` if an alias has an invalid name or if an alias is
        repeated.

    :raises: ``NotImplementedError`` if some class of the pipeline does not implement
        the required methods: ``fit`` for every step, ``transform`` for every
        step but the last one and ``predict`` or ``predict_proba`` for the
        last step.
    """

    # Take a private copy: the composed class must not change when the caller
    # later mutates the list it passed in, and this also lets any iterable
    # (not only sequences supporting len()) be used.
    steps = list(steps)
    n_steps = len(steps)

    aliases = []

    for alias, class_type in steps:
        if not isinstance(alias, str):
            raise TypeError('{} is not a string'.format(alias))

        if not is_valid_pipeline_name(alias):
            raise ValueError('{} is not allowed for an alias'.format(alias))

        class_content = dir(class_type)
        # Fall back to repr() so that an object without __name__ still yields
        # the intended NotImplementedError instead of an AttributeError.
        class_name = getattr(class_type, '__name__', repr(class_type))

        if 'fit' not in class_content:
            raise NotImplementedError('{} must implement fit'.format(class_name))

        aliases.append(alias)

        if len(aliases) < n_steps:
            # Every step but the last one acts as a transformer.
            if 'transform' not in class_content:
                raise NotImplementedError(
                    '{} must implement transform'.format(class_name)
                )
        else:
            # The last step acts as the estimator.
            if 'predict' not in class_content and 'predict_proba' not in class_content:
                raise NotImplementedError(
                    '{} must implement predict or predict_proba'.format(class_name)
                )

    if len(set(aliases)) != len(aliases):
        raise ValueError('Repeated aliases are not allowed')

    return type('MiraiPipeline', (BasePipelineClass,), dict(steps=steps))


# Preprocessing steps shared by the baseline pipelines defined below, given as
# (alias, class) pairs in the format expected by compose(). The classes are
# deliberately not instantiated. Each baseline appends its own final estimator
# to this list.
__initial_steps__ = [
    ('ohe', OneHotEncoder),
    ('impute', SimpleImputer),
    ('min_max', MinMaxScaler)
]


class NaiveBayesBaseliner(compose(__initial_steps__ + [('naive', GaussianNB)])):
    """
    This is a baseline pipeline for classification problems. It's composed of
    the following transformers/estimator:

    1. ``sklearn.preprocessing.OneHotEncoder``
    2. ``sklearn.impute.SimpleImputer``
    3. ``sklearn.preprocessing.MinMaxScaler``
    4. ``sklearn.naive_bayes.GaussianNB``

    The available parameters to tweak are:

    ::

        >>> from miraiml.pipeline import NaiveBayesBaseliner

        >>> for param in NaiveBayesBaseliner().get_params():
        ...     print(param)
        ...
        ohe__categorical_features
        ohe__categories
        ohe__drop
        ohe__dtype
        ohe__handle_unknown
        ohe__n_values
        ohe__sparse
        impute__add_indicator
        impute__fill_value
        impute__missing_values
        impute__strategy
        impute__verbose
        min_max__feature_range
        naive__priors
        naive__var_smoothing

    :param params: Optional keyword parameters named ``<alias>__<parameter>``,
        in the same form accepted by the classes built with :func:`compose`.
        They are forwarded unchanged to the composed base class. Calling the
        class with no arguments keeps the default configuration.
    """
    def __init__(self, **params):
        # Forward the keyword parameters so the listed parameters can be set at
        # construction time, as with any other composed pipeline class.
        super().__init__(**params)


class LinearRegressionBaseliner(compose(__initial_steps__ + [('lin_reg', LinearRegression)])):
    """
    This is a baseline pipeline for regression problems. It's composed of the
    following transformers/estimator:

    1. ``sklearn.preprocessing.OneHotEncoder``
    2. ``sklearn.impute.SimpleImputer``
    3. ``sklearn.preprocessing.MinMaxScaler``
    4. ``sklearn.linear_model.LinearRegression``

    The available parameters to tweak are:

    ::

        >>> from miraiml.pipeline import LinearRegressionBaseliner

        >>> for param in LinearRegressionBaseliner().get_params():
        ...     print(param)
        ...
        ohe__categorical_features
        ohe__categories
        ohe__drop
        ohe__dtype
        ohe__handle_unknown
        ohe__n_values
        ohe__sparse
        impute__add_indicator
        impute__fill_value
        impute__missing_values
        impute__strategy
        impute__verbose
        min_max__feature_range
        lin_reg__fit_intercept
        lin_reg__n_jobs
        lin_reg__normalize

    :param params: Optional keyword parameters named ``<alias>__<parameter>``,
        in the same form accepted by the classes built with :func:`compose`.
        They are forwarded unchanged to the composed base class. Calling the
        class with no arguments keeps the default configuration.
    """
    def __init__(self, **params):
        # Forward the keyword parameters so the listed parameters can be set at
        # construction time, as with any other composed pipeline class.
        super().__init__(**params)