
    Q/Ph                     `    d dl mZ d dlZd dlmZ d dlmZ dej        de	dee
e	f         fdZdS )	    )UnionN)log2)entropyvalue_counts	n_classesreturnc                     |dk    r?|                      t                    } dt          | d          t          |          z  z
  S dS )u]  column_imbalance_score

    The class balance score for categorical and boolean variables uses entropy to calculate a  bounded score between 0 and 1.
    A perfectly uniform distribution would return a score of 0, and a perfectly imbalanced distribution would return a score of 1.

    When dealing with probabilities with finite values (e.g categorical), entropy is maximised the ‘flatter’ the distribution is. (Jaynes: Probability Theory, The Logic of Science)
    To calculate the class imbalance, we calculate the entropy of that distribution and the maximum possible entropy for that number of classes.
    To calculate the entropy of the 'distribution' we use value counts (e.g frequency of classes) and we can determine the maximum entropy as log2(number of classes).
    We then divide the entropy by the maximum possible entropy to get a value between 0 and 1 which we then subtract from 1.

    Args:
        value_counts (pd.Series): frequency of each category
        n_classes (int): number of classes

    Returns:
        Union[float, int]: float or integer bounded between 0 and 1 inclusively
       )dtype   )baser   )to_numpyfloatr   r   )r   r   s     m/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/ydata_profiling/model/pandas/imbalance_pandas.pycolumn_imbalance_scorer      sM    , 1}} $,,5,99GLq111DOOCDD1    )typingr   pandaspdnumpyr   scipy.statsr   Seriesintr   r    r   r   <module>r      s                         )(+
5#:     r   