# Source code for edo.individual

""" A collection of objects to facilitate an individual representation. """

import json
import pickle
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd

from .family import Family


class Individual:
    """ A class to represent an individual in the EA.

    An individual can be unpacked as ``dataframe, metadata = individual``
    since iterating over it yields those two parts in that order.

    Parameters
    ----------
    dataframe : pd.DataFrame or dd.DataFrame
        The dataframe of the individual.
    metadata : list
        A list of distributions that are associated with the respective column
        of ``dataframe``.
    random_state : np.random.RandomState, optional
        The PRNG for the individual. If not provided, the default PRNG is used.

    Attributes
    ----------
    fitness : float
        The fitness of the individual. Initialises as ``None``.
    """

    def __init__(self, dataframe, metadata, random_state=None):

        self.dataframe = dataframe
        self.metadata = metadata

        if random_state is None:
            # Fall back on NumPy's module-level ``RandomState`` instance.
            random_state = np.random.mtrand._rand

        self.random_state = random_state
        self.fitness = None

    def __repr__(self):

        return (
            f"Individual(dataframe={self.dataframe}, metadata={self.metadata})"
        )

    def __iter__(self):

        yield self.dataframe
        yield self.metadata

    @classmethod
    def from_file(
        cls, path, distributions, family_root=".edocache", method="pandas"
    ):
        """ Create an instance from the files at ``path`` and ``family_root``
        using either ``pandas`` or ``dask`` to read in the dataframe. Always
        fall back on ``pandas``.

        Parameters
        ----------
        path : str or pathlib.Path
            Directory holding ``main.csv``, ``main.meta`` and ``main.state``.
        distributions : iterable
            Objects with a ``name`` attribute. They are looked up by the
            ``"name"`` entry of each metadata record and handed to
            ``Family.load``.
        family_root : str, optional
            Location passed to ``Family.load`` when a family has to be loaded.
        method : str, optional
            ``"dask"`` reads the CSV with ``dask.dataframe``; any other value
            reads it with ``pandas``.

        Returns
        -------
        individual : cls
            An instance of the class this method was called on.
        """

        path = Path(path)
        distributions = {dist.name: dist for dist in distributions}

        reader = dd if method == "dask" else pd

        dataframe = reader.read_csv(path / "main.csv")
        # CSV headers are read back as strings; restore integer labels.
        dataframe.columns = [int(column) for column in dataframe.columns]

        with open(path / "main.meta", "r") as meta_file:
            meta_dicts = json.load(meta_file)

        metadata = []
        for meta in meta_dicts:
            name = meta["name"]
            # A ``<name>Family`` object found in this module's namespace takes
            # precedence over loading the family from ``family_root``.
            family = globals().get(f"{name}Family", None)
            if family is None:
                family = Family.load(distributions[name], family_root)

            subtype = family.subtypes[meta["subtype_id"]]

            # Rebuild the distribution without running ``__init__`` and
            # restore its stored parameters directly as attributes.
            pdf = subtype.__new__(subtype)
            pdf.__dict__.update(meta["params"])
            metadata.append(pdf)

        # NOTE(security): unpickling can execute arbitrary code, so only load
        # individuals from locations that are trusted.
        with open(path / "main.state", "rb") as state:
            random_state = pickle.load(state)

        # Use ``cls`` so that subclasses get an instance of their own type.
        return cls(dataframe, metadata, random_state)

    def to_file(self, path, family_root=".edocache"):
        """ Write self to file.

        The dataframe is written to ``main.csv`` (without its index), the
        dictionary form of each column distribution to ``main.meta`` as JSON
        and the pickled PRNG to ``main.state``. The family of every
        distribution is saved under ``family_root`` along the way.

        Parameters
        ----------
        path : str or pathlib.Path
            Directory to write to. It is created, with any missing parents,
            if it does not exist.
        family_root : str, optional
            Location passed to the ``save`` method of each family.

        Returns
        -------
        path : pathlib.Path
            The directory that was written to.
        """

        path = Path(path)
        path.mkdir(exist_ok=True, parents=True)

        self.dataframe.to_csv(path / "main.csv", index=False)

        meta_dicts = []
        for pdf in self.metadata:
            pdf.family.save(family_root)
            meta_dicts.append(pdf.to_dict())

        with open(path / "main.meta", "w") as meta_file:
            json.dump(meta_dicts, meta_file)

        with open(path / "main.state", "wb") as state:
            pickle.dump(
                self.random_state, state, protocol=pickle.HIGHEST_PROTOCOL
            )

        return path


def _sample_ncols(col_limits, random_state):
    """ Sample a valid number of columns from the column limits. """

    integer_limits = []
    for lim in col_limits:
        try:
            integer_lim = sum(lim)
        except TypeError:
            integer_lim = lim
        integer_limits.append(integer_lim)

    return random_state.randint(integer_limits[0], integer_limits[1] + 1)


def _get_minimum_columns(
    nrows, col_limits, families, family_counts, random_state
):
    """ If ``col_limits`` has a tuple lower limit then sample columns of the
    corresponding element of ``families`` as needed to satisfy this bound. """

    columns, metadata = [], []
    for family, min_limit in zip(families, col_limits[0]):
        for _ in range(min_limit):
            meta = family.make_instance(random_state)
            columns.append(meta.sample(nrows, random_state))
            metadata.append(meta)
            family_counts[family.name] += 1

    return columns, metadata, family_counts


def _get_remaining_columns(
    columns,
    metadata,
    nrows,
    ncols,
    col_limits,
    families,
    weights,
    family_counts,
    random_state,
):
    """ Sample all remaining columns for the current individual. If
    ``col_limits`` has a tuple upper limit then sample all remaining
    columns for the individual without exceeding the bounds. """

    while len(columns) < ncols:
        family = random_state.choice(families, p=weights)
        idx = families.index(family)
        try:
            if family_counts[family.name] < col_limits[1][idx]:
                meta = family.make_instance(random_state)
                columns.append(meta.sample(nrows, random_state))
                metadata.append(meta)
                family_counts[family.name] += 1

        except TypeError:
            meta = family.make_instance(random_state)
            columns.append(meta.sample(nrows, random_state))
            metadata.append(meta)

    return columns, metadata


def create_individual(row_limits, col_limits, families, weights, random_state):
    """ Create an individual within the limits provided.

    Parameters
    ----------
    row_limits : list
        Lower and upper bounds on the number of rows a dataset can have.
    col_limits : list
        Lower and upper bounds on the number of columns a dataset can have.
        Tuples can be used to indicate limits on the number of columns needed
        from each family in ``families``.
    families : list
        A list of ``edo.Family`` instances handling the column distributions
        that can be selected from.
    weights : list
        A sequence of relative weights with which to sample from ``families``.
        If ``None``, then sampling is uniform.
    random_state : numpy.random.RandomState
        The PRNG associated with the individual to use for its random sampling.

    Returns
    -------
    individual : Individual
        An individual whose dataframe columns are labelled ``0, 1, ...`` in
        the order they were sampled, with the matching column metadata and
        ``random_state`` attached.
    """

    # ``randint`` excludes its upper argument, hence the ``+ 1``.
    nrows = random_state.randint(row_limits[0], row_limits[1] + 1)
    ncols = _sample_ncols(col_limits, random_state)

    columns, metadata = [], []
    family_counts = {family.name: 0 for family in families}

    if isinstance(col_limits[0], tuple):
        # ``family_counts`` is updated in place, so the returned reference to
        # it is not needed here.
        columns, metadata, _ = _get_minimum_columns(
            nrows, col_limits, families, family_counts, random_state
        )

    columns, metadata = _get_remaining_columns(
        columns,
        metadata,
        nrows,
        ncols,
        col_limits,
        families,
        weights,
        family_counts,
        random_state,
    )

    dataframe = pd.DataFrame(dict(enumerate(columns)))
    return Individual(dataframe, metadata, random_state)