
    hMh                         d Z ddlZddlZddlmZ ddlZddlZ	 ddej	        de
dedeej	        e
f         fd	Zd
ej        defdZdS )a}  Project: PhiK - correlation analyzer library

Created: 2018/12/28

Description:
    A set of functions to check for data quality issues in input data.

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
    N)TupleTdfinterval_colsdropnareturnc                      fd|D             }t          t          t           j                  t          |          z
                      D ]`} |                                         dk    r@t          j        d                    | |                                                              ag |D ]u} |                                         dk     rU                    |           t          j        d                    | |                                                              vt          t          t           j                  t          |          z
                      D ]} |                                         dk    s  |                                         dk    rW|rU                    |           t          j        d                    | |                                                               	                                }t          j	        |          }t                    dk    r%|                    d	           fd
|D             }||fS )a  
    Basic data quality checks per column in a DataFrame.

    The following checks are done:

    1. For all non-interval variables, if the number of unique values per variable is larger than 100 a warning is printed.
    When the number of unique values is large, the variable is likely to be an interval variable. Calculation of phik
    will be slow(ish) for pairs of variables where one (or two) have many different values (i.e. many bins).

    2. For all interval variables, the number of unique values must be at least two. If the number of unique values is
    zero (i.e. all NaN) the column is removed. If the number of unique values is one, it is not possible to
    automatically create a binning for this variable (as min and max are the same). The variable is therefore dropped,
    irrespective of whether dropna is True or False.

    3. For all non-interval variables, the number of unique values must be at least either
    a) 1 if dropna=False (NaN is now also considered a valid category), or
    b) 2 if dropna=True

    The function returns a DataFrame where all columns with invalid data are removed. Also the list of interval_cols
    is updated and returned.

    :param pd.DataFrame df: input data
    :param list interval_cols: column names of columns with interval variables.
    :param bool dropna: remove NaN values when True
    :returns: cleaned data, updated list of interval columns
    c                 &    g | ]}|j         v |S  )columns).0colr   s     Q/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/phik/data_quality.py
<listcomp>z+dq_check_nunique_values.<locals>.<listcomp>6   s%    GGGSSBJ5F5FS5F5F5F      zThe number of unique values of variable {0:s} is large: {1:d}. Are you sure this is not an interval variable? Analysis for pairs of variables including {0:s} can be slow.   zSNot enough unique value for variable {0:s} for analysis {1:d}. Dropping this columnr      T)r   inplacec                     g | ]}|v|	S r
   r
   )r   r   	drop_colss     r   r   z+dq_check_nunique_values.<locals>.<listcomp>]   s#    TTTss)?S?Ss?S?S?Sr   )sortedlistsetr   nuniquewarningswarnformatappendcopylendrop)r   r   r   r   df_cleaninterval_cols_cleanr   s   `     @r   dq_check_nunique_valuesr$      sV   < HGGGMGGGM d3rz??S-?-??@@AA  c7??t##MiioioC**j j   I   c7??q  S!!!MellC**    d3rz??S-?-??@@AA  c7??!!bgoo&7&71&<&<&<S!!!MellC**    wwyyH)M22
9~~i666TTTTmTTT(((r   hist2dc                    d| j         v s	d| j         v r@t          j        d                    | j         d         | j         d                              dS | j         d         dk    r2t          j        d                    | j         d                              | j         d         dk    r2t          j        d                    | j         d                              dS )	aI  Basic data quality checks for a contingency table

    The Following checks are done:

    1. There must be at least two bins in both the x and y direction.

    2. If the number of bins in the x and/or y direction is larger than 100 a warning is printed.

    :param hist2d: contingency table
    :return: bool passed_check
    r   r   z9Too few unique values for variable x ({0:d}) or y ({1:d})Fr   zThe number of unique values of variable x is large: {0:d}. Are you sure this is not an interval variable? Analysis might be slow.zThe number of unique values of variable y is large: {0:d}. Are you sure this is not an interval variable? Analysis might be slow.T)shaper   r   r   )r%   s    r   dq_check_hist2dr(   b   s     	FLA--GNNQa 	
 	
 	

 u|AUU[U[QV V	
 	
 	
 |AUU[U[QV V	
 	
 	
 4r   )T)__doc__r   r   typingr   pandaspdnumpynp	DataFramer   boolr$   ndarrayr(   r
   r   r   <module>r2      s                     ;?G) G)
G)%)G)37G)
2<G) G) G) G)T#BJ #4 # # # # # #r   