
    hMh+                        d Z ddlZddlmZmZmZmZ ddlZddl	Z
ddlmZ ddlmZ ddlmZmZ 	 d&deej        ee
j        f         d	ed
edej        fdZdeej        ef         deej        ef         deej        ef         fdZ	 	 	 	 d'de
j        deeej        ef         deeeej        ef         d
edef
dZ	 	 	 	 	 d(de
j        dee         deeeej        ef         d
ededede
j        fdZdeeeee f                  de
j        fdZ!	 	 	 d)de
j        deded ede
j        f
d!Z"	 	 	 	 	 	 	 	 d*de
j        deeeej        f                  deee eej        ef         d
ededed edededee
j        ee
j        ef         f         fd"Z#d#ee
j        eej        f         d$e
j        eej        gdee
j        ee
j        ef         f         fd%Z$dS )+a  Project: PhiK - correlation analyzer library

Created: 2018/09/06

Description:
    A set of rebinning functions, to help rebin two lists into a 2d histogram.

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
    N)ListOptionalTupleUnion)definitions)dq_check_nunique_values)array_like_to_dataframeguess_interval_colsFarrnbinsquantilereturnc           
      f   |rt          j        dd|dz             }t          j        | t          j        |                     |          }|dxx         t	          dt          |d                   z  t          j        j                  z  cc<   nt          j        | t          j        |                               }t	          dt          |          z  t          j        j                  }t          j        ||z
  t          j        | t          j        |                               |dz             }|S )a  
    Create uniform or quantile bin-edges for the input array.

    :param arr: array like object with input data
    :param int nbins: the number of bin
    :param bool quantile: uniform bins (False) or bins based on quantiles (True)
    :returns: array with bin edges
    r      g+=)	nplinspacer   isnanmaxabssys
float_infomin)r   r   r   	quantilesxbins	min_valueconstants          L/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/phik/binning.py	bin_edgesr      s      	
K1eai00	C#/;;aCE!H-s~/ABBBF3~.//	us9~~-s~/ABB "&bhsmm^)<"="=uqy
 
 L    r   c                 f   t          j        ||                               t                    }g }t	          j        |                                          j        }t          dt          |                    D ],}||v r&|
                    ||dz
           ||         f           -t           j        |t          j        t          j        |                     <   t          j        |t          j        |dk              <   t          j        |t          j        |t          |          k              <   ||fS )z
    Index the data given the bin_edges.

    Underflow and overflow values are indicated.

    :param arr: array like object with input data
    :param bin_edges: list with bin edges.
    :returns: indexed data
    r   r   )r   searchsortedastypeobjectpdSeriesvalue_countsindexrangelenappendnanargwherer   defsUFOF)r   r   
binned_arr
bin_labelsbin_indicesis         r   	bin_arrayr4   4   s    C0077??J J)J''4466<K1c)nn%% @ @yQ/1>??? .0VJr{28C==))* 04wJr{:?++, =AGJr{:Y7889z!!r    
   datacolsbinsretbinsc           	         d}t          |t                    r,|D ](}||vr"t          d                    |                    )n#t          |t          t
          j        f          r|}|                                 }i }|D ]}t          j        t          |          t
          j
                  s,t          j        t          |          t
          j                  r?t          | |                             t                    t          |          |          }nt          |t                    rt          j        t          ||                   t
          j
                  s2t          j        t          ||                   t
          j                  rDt          | |                             t                    t          ||                   |          }n^t          ||         t          t
          j        f          r||         }n.|,t          dt!          t          |                    z            t#          | |                             t                    j        |          \  ||<   }	|r|	||<   |r||fS |S )aP  
    Index the input DataFrame given the bin_edges for the columns specified in cols.

    :param DataFrame data: input data
    :param list cols: list of columns with numeric data which needs to be indexed
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :returns: rebinned DataFrame
    :rtype: pandas.DataFrame
    Nz.column {0} is not included in bins dictionary.)r   z1Unexpected type for bins. The found type was '%s')
isinstancedict
ValueErrorformatlistr   ndarraycopy
issubdtypetypeintegerfloatingr   r"   floatintstrr4   values)
r7   r8   r9   r   r:   r   colbinned_data	bins_dictr1   s
             r   bin_datarN   W   sP   $ E$  	 	C$ DKKCPP   	
 
D4,	-	-  ))++KI ( (=dRZ00 	BMJJ5
 5
 	 d3i..u55s4yy8TTTEEd## 	}T$s)__bj99 "R]T#Y> > " "I$$U++Sc^^h   DIbj'9:: "S	]Cc$t**ooU   (1c1A1A%1H1H1OQV'W'W$C* 	('IcN &I%%r   Tdfinterval_colsdropnaverbosec                     |t          | |          }t          | ||          \  }}t          ||||d          \  }}	||	fS )a  
    Index the input DataFrame with automatic bin_edges and interval columns.

    :param pd.DataFrame data_binned: input data
    :param list interval_cols: column names of columns with interval variables.
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column
        the bins are specified. (default=10)        E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param bool dropna: remove NaN values with True
    :param bool verbose: if False, do not print all interval columns that are guessed
    :return: phik correlation matrix
    N)rQ   T)r8   r9   r   r:   )r
   r   rN   )
rO   rP   r9   r   rQ   rR   df_cleaninterval_cols_cleandata_binnedbinning_dicts
             r   auto_bin_datarX      sq    , +B88 %<
M&% % %!H!
 !)*RV! ! !K $$r   valsc                    g }| D ]6\  }}}|                     |||g           |                     |||g           7t          j        |g d                              ddd          }d|j        _        d|j        _        |S )a  
    Create overview table of phik/significance data.

    :param list vals: list holding tuples of data for each variable pair formatted as ('var1', 'var2', value)
    :returns: symmetric table with phik/significances of all variable pairs
    :rtype: pandas.DataFrame
    )var1var2rY   )columnsr[   r\   rY   )r'   r]   rJ   N)r*   r$   	DataFramepivot_tabler]   namer'   )rY   llc0c1vcorr_matrixs         r   !create_correlation_overview_tablerf      s     
B  	B
		2r1+
		2r1+,r+C+C+CDDDPPfV Q  K  $K!Kr   rV   drop_underflowdrop_overflowc                     | j         \  }}|s!|                     t          j        d           |r,|                     t          j        t          j        d           |r,|                     t          j        t          j        d           | 	                    ||g          |         
                                                                                                                    d          }|j                                         |_         |S )a  
    Give binned 2d DataFrame of two columns of rebinned input DataFrame

    :param df: input data. DataFrame must contain exactly two columns
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :returns: histogram DataFrame
    T)inplacer   )r]   fillnar-   NaNreplacer.   r   r+   r/   groupbycountto_frameunstack	droplevel)rV   rQ   rg   rh   rb   rc   df_datahists          r   hist2d_from_rebinned_dfrt      s    $  FB 348T222 ;DGRVT::: ;DGRVT::: 	RH%%b)//11::<<DDFFMMaPP  &-7799Kr   c	                     t          | j                  dk    rt          d          |t          | |          }t	          | |d||          \  }	}
t          |	|||          }|r||
fS |S )a  
    Give binned 2d DataFrame of two columns of input DataFrame

    :param df: input data. DataFrame must contain exactly two columns
    :param interval_cols: columns with interval variables which need to be binned
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True)
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param bool verbose: if False, do not print all interval columns that are guessed
    :returns: histogram DataFrame
       z)DataFrame should contain only two columnsNT)r:   r9   r   )rQ   rg   rh   )r)   r]   r>   r
   rN   rt   )rO   rP   r9   r   rQ   rg   rh   r:   rR   rV   rW   datahists               r   hist2drx      s    8 2:!DEEE+B88 (
M4dX! ! !K '%#	  H  &%%Or   xyc                 <    t          | |          }t          |fi |S )a  
    Give binned 2d DataFrame of two input arrays

    :param x: input data. First array-like.
    :param y: input data. Second array-like.
    :param interval_cols: columns with interval variables which need to be binned
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True)
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :returns: histogram DataFrame
    )r	   rx   )ry   rz   kwargsrO   s       r   hist2d_from_arrayr}   (  s)    ( 
!A	&	&B"r   )F)r5   r6   FF)Nr6   FTT)TTT)Nr6   FTTTFT)%__doc__r   typingr   r   r   r   numpyr   pandasr$   phikr   r-   phik.data_qualityr   
phik.utilsr	   r
   rA   r@   r%   rH   boolr   r4   r^   tupler=   rN   rX   rI   rG   rf   rt   rx   r}   r5   r   r   <module>r      s    


 / / / / / / / / / / / /         $ $ $ $ $ $ 5 5 5 5 5 5 C C C C C C C C KP 	rz4*	+47CGZ   4 "	rz4	  "-22:t3C-D "
2:t "  "  "  "J ,./1; ;
,;
bj%'
(; T2:t+
,; 	;
 ; ; ; ;@ %)/1"% "%
"%D>"% T2:t+
,"% 	"%
 "% "% \"% "% "% "%J
uS#u_%
&\   4 	! !!! ! 	!
 \! ! ! !L 8<68/ /
/E$
"234/ UD"*d2
3/ 	/
 / / / / / 2<r|T1223/ / / /d RYbj() /1y$
.K 
2<r|T1223           r   