# Source code for MDMC.common.df_operations

"""
Contains some utility functions related to pd.DataFrames, including filtering functions.
"""

from typing import Sequence, overload

import pandas as pd


@overload
def filter_dataframe(values: Sequence,
                     dataframe: pd.DataFrame,
                     column_names: list[str]) -> pd.DataFrame: ...


# ``column_regex`` is keyword-only in this overload because the third
# positional parameter of the implementation is ``column_names``.
@overload
def filter_dataframe(values: Sequence,
                     dataframe: pd.DataFrame,
                     *,
                     column_regex: str) -> pd.DataFrame: ...


def filter_dataframe(values: Sequence,
                     dataframe: pd.DataFrame,
                     column_names: list[str] = None,
                     column_regex: str = None) -> pd.DataFrame:
    """
    Filter a ``pd.DataFrame`` with an iterable of unordered values.

    A row is kept if, for every value in ``values``, at least one of the
    selected columns equals that value. Each value is checked independently of
    the others, so a value which is repeated in ``values`` can be satisfied by
    a single column.

    This filter ignores rows which are duplicated (i.e. it only returns the
    first occurrence of any duplicated row). Rows are grouped by the column in
    which a value was found, so they are not necessarily returned in their
    original order.

    Parameters
    ----------
    values : Sequence
        The values for which to filter. Every one of these values must occur
        in at least one of the columns defined by ``column_names`` or
        ``column_regex`` for a row to be included in the filtered return. If
        ``values`` is empty, every row passes the filter.
    dataframe : pandas.DataFrame
        The ``pd.DataFrame`` object to be filtered.
    column_names : list[str], optional
        A `list` of `str` specifying the names of the columns which will be used
        to filter the ``DataFrame``.

        This cannot be passed if ``column_regex`` is also passed.
    column_regex : str, optional
        A regular expression matching one or more column names. This specifies
        which columns will be used to filter the ``DataFrame``.

        This cannot be passed if ``column_names`` is also passed.

    Returns
    -------
    pandas.DataFrame
        A ``DataFrame`` which has been filtered so that each value in ``values``
        must occur in one of the columns of ``DataFrame`` that are specified by
        ``column_names`` or matched by ``column_regex``, with duplicated rows
        removed.

    Raises
    ------
    ValueError
        If both `column_names` and `column_regex` were passed.
        If there are fewer `column_names` than values.
    TypeError
        If neither `column_names` nor `column_regex` was passed.
    """

    if column_names and column_regex:
        raise ValueError('Only one of column_names and column_regex can be'
                         ' passed')
    if column_names is None and column_regex is None:
        raise TypeError('One of column_names and column_regex must be passed')
    # Use column names or regex to set column names
    column_names = (column_names if column_names is not None
                    else list(dataframe.filter(regex=column_regex)))

    # Raise an error if there are more values than columns (as every value must
    # be found in a column)
    if len(column_names) < len(values):
        raise ValueError(f'There must be at least as many columns ({len(column_names)}) as'
                         f' values ({len(values)})')

    # Narrow the frame down one value at a time. With no values the loop does
    # not run and every row passes.
    filtered_dataframe = dataframe
    for value in values:
        # One filtered frame per column: the rows where that column equals value
        matches_per_column = [
            filtered_dataframe[filtered_dataframe[col_name] == value]
            for col_name in column_names
        ]
        # A row matching in several columns appears once per matching column
        # after the concat. Dropping the repeats here (keeping the first) gives
        # the same rows in the same order as dropping them only at the end,
        # without the row count multiplying for every further value.
        filtered_dataframe = pd.concat(matches_per_column).drop_duplicates()
    return filtered_dataframe.drop_duplicates()



# ``wildcard`` and ``column_regex`` are keyword-only in the overloads because
# the implementation's third and fourth positional parameters are
# ``column_names`` and ``column_regex`` respectively.
@overload
def filter_ordered_dataframe(values: Sequence,
                             dataframe: pd.DataFrame,
                             column_names: list[str],
                             *,
                             wildcard: str = None) -> pd.DataFrame: ...


@overload
def filter_ordered_dataframe(values: Sequence,
                             dataframe: pd.DataFrame,
                             *,
                             column_regex: str,
                             wildcard: str = None) -> pd.DataFrame: ...


def filter_ordered_dataframe(values: Sequence,
                             dataframe: pd.DataFrame,
                             column_names: list[str] = None,
                             column_regex: str = None,
                             wildcard: str = None) -> pd.DataFrame:
    """
    Filter a ``pd.DataFrame`` with an iterable of ordered values.

    The values must occur in the columns in the correct order: the first value
    is compared with the first column, the second value with the second column,
    and so on. The column order is the one given by ``column_names``, or the
    order in which the columns matched by ``column_regex`` occur in
    ``dataframe``.

    This filter ignores rows which are duplicated (i.e. it only returns the
    first occurrence of any duplicated rows).

    Parameters
    ----------
    values : Sequence
        The values for which to filter. A row is included in the filtered
        return if, for every position ``i``, the ``i``-th selected column
        holds either ``values[i]`` or ``wildcard``. If there are more selected
        columns than values, only the leading ``len(values)`` columns are
        compared.
    dataframe : pandas.DataFrame
        The ``pd.DataFrame`` object to be filtered.
    column_names : list[str], optional
        A `list` of `str` specifying the names of the columns which will be used
        to filter the ``DataFrame``.

        This cannot be passed if ``column_regex`` is also passed.
    column_regex : str, optional
        A regular expression matching one or more column names. This specifies
        which columns will be used to filter the ``DataFrame``.

        This cannot be passed if ``column_names`` is also passed.
    wildcard : str, optional
        A `str` which will be a match in any column.

    Returns
    -------
    pandas.DataFrame
        A ``DataFrame`` containing the rows of ``dataframe``, in their original
        order, in which each value in ``values`` (or ``wildcard``) occurs in
        the corresponding column specified by ``column_names`` or matched by
        ``column_regex``, with duplicated rows removed.

    Raises
    ------
    ValueError
        If both `column_names` and `column_regex` were passed.
        If there are fewer `column_names` than values.
    TypeError
        If neither `column_names` nor `column_regex` was passed.
    """

    if column_names and column_regex:
        raise ValueError('Only one of column_names and column_regex can be'
                         ' passed')
    if column_names is None and column_regex is None:
        raise TypeError('One of column_names and column_regex must be passed')
    # Use column names or regex to set column names
    column_names = (column_names if column_names is not None
                    else list(dataframe.filter(regex=column_regex)))

    # Raise an error if there are more values than columns (as every value must
    # be found in a column)
    if len(column_names) < len(values):
        raise ValueError(f'There must be at least as many columns ({len(column_names)}) as'
                         f' values ({len(values)})')

    # Whether each row still matches, built up one (column, value) pair at a
    # time. zip stops at the shorter of the two, so surplus columns are never
    # used to index past the end of values. Columns are looked up by label, so
    # no positional indexing into a label-indexed row is needed.
    keep = [True] * len(dataframe)
    for col_name, value in zip(column_names, values):
        accepted = (value, wildcard)
        keep = [kept and cell in accepted
                for kept, cell in zip(keep, dataframe[col_name])]

    # Reuse the frame's own index so the mask lines up row for row
    bool_rows = pd.Series(keep, index=dataframe.index, dtype=bool)
    filtered_dataframe = dataframe.loc[bool_rows]

    return filtered_dataframe.drop_duplicates()