
    hMha                        d Z ddlmZmZmZ ddlZddlZddlZ	ddl
mZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZ ddlmZ ddlm Z m!Z! de"fdZ#dee$ej%        f         dee$e$e&f         fdZ'	 d3dej%        de(dej%        de&fdZ)	 	 	 	 	 d4de	j*        de(de(de(de(de+de	j*        fdZ,de-de	j*        de(dee$e$e&f         fdZ.	 	 	 	 	 	 	 	 	 d5d"e	j*        d#ee/         d$ee+e/ej%        e"f         d%e(de(de(de(de(d&e(de+de	j*        fd'Z0	 	 	 	 	 d4de	j*        de(de(de(de(de+deej%        ej%        f         fd(Z1	 	 	 	 	 	 	 	 	 d5d"e	j*        d#e/d$ee+e/ej%        e"f         d%e(de(de(de(de(d&e(de+deej%        ej%        f         fd)Z2	 	 	 	 	 	 	 d6d*eej%        e	j3        f         d+eej%        e	j3        f         d,ee$e/f         d$ee+e"e/ej%        f         d%e(de(de(de(de(de&fd-Z4	 	 	 	 d7d*eej%        e	j3        f         d+eej%        e	j3        f         de(de(de(de(de&fd.Z5	 	 	 	 	 d4d/e	j*        d0e	j*        de(de(de(de(de+de	j*        fd1Z6de-d/e	j*        d0e	j*        de(dee$e$e&f         f
d2Z7dS )8ai  Project: PhiK - correlation analyzer library

Created: 2018/09/05

Description:
    Functions for the Phik correlation calculation

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
    )TupleUnionOptionalN)Paralleldelayed)invpinv)definitions   )phik_from_chi2),get_chi2_using_dependent_frequency_estimatesestimate_simple_ndofget_pearson_chi_square)!create_correlation_overview_tablebin_dataauto_bin_data)dq_check_hist2d)array_like_to_dataframemake_shapes_equal	hist_dictc                    |                                 D ]?\  }}t          |t          j                  s |j        d         dk    rt          d          @t          |                                           }|                     |          }|                    t                    }|
                                }t          |          }|S )a.  Correlation matrix of bivariate gaussian using spark parallelization over variable-pair 2d histograms

    See spark notebook phik_tutorial_spark.ipynb as example.

    Each input histogram gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    :param spark_context: spark context
    :param hist_dict: dict of 2d numpy grids with value-counts. keys are histogram names.
    :return: phik correlation matrix
    r      z4hist_dict should be a dictionary of 2d numpy arrays.)items
isinstancenpndarrayshape	TypeErrorlistparallelizemap_phik_from_rowcollectr   )	spark_contextr   kv	hist_listhist_rddphik_rdd	phik_listphik_overviews	            I/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/phik/phik.py"spark_phik_matrix_from_hist2d_dictr-   #   s     !! T T1!RZ(( 	TQWQZ1__RSSSY__&&''I((33H||N++H  ""I5i@@M    rowreturnc                    | \  }}|                     d          }t          |          dk    r#|d         |d         k    r|d         |d         dfS 	 t          |          }n# t          $ r t          j        }Y nw xY w|d         |d         |fS )zHelper function for spark parallel processing

    :param row: rdd row, where row[0] is key and rdd[1]
    :return: union of key, phik-value
    :r   r   r         ?)splitlenphik_from_hist2dr   r   nan)r/   keygridc
phik_values        r,   r"   r"   =   s     IC		#A
1vv{{qtqt||tQqT3%d++

   V


Q41z!!s   A" "A;:A;Tobservednoise_correctionexpectedc                 R   t          | t          j                  r| j        } t          |t          j                  r|j        }|t	          | d          nt          | |          }|rt          |           nd}|dk     rd}t          ||                                 g| j	        R d|iS )a  
    correlation coefficient of bivariate gaussian derived from chi2-value

    Chi2-value gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    Bivariate gaussian's range is set to [-5,5] by construction.

    :param observed: 2d-array observed values
    :param bool noise_correction: apply noise correction in phik calculation
    :param expected: 2d-array expected values. Optional, default is None, otherwise evaluated automatically.
    :returns float: correlation coefficient phik
    Npearson)lambda_r   pedestal)
r   pd	DataFramevaluesr   r   r   r   sumr   )r<   r=   r>   chi2rB   s        r,   r6   r6   O   s    " (BL)) #?(BL)) #?
  	5XyQQQQ#Hh77 	 2BH#H---qH!|| $SSSS(SSSr.   data_binneddropnadrop_underflowdrop_overflownjobsc                     |s,                      t          j        t          j        d           |r,                      t          j        t          j        d           |r,                      t          j        t          j        d            j        }|dk    r- fdt          j	         j        j
        d          D             }nC t          |           fdt          j	         j        j
        d          D                       }t          |          dk    r!t          j        t          j        ||	          S t          |          }|                    |
          }|                    |          }|S )a  
    Correlation matrix of bivariate gaussian derived from chi2-value

    Chi2-value gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    Bivariate gaussian's range is set to [-5,5] by construction.

    :param pd.DataFrame data_binned: input data where interval variables have been binned
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param int njobs: number of parallel jobs used for calculation of phik. default is -1. 1 uses no parallel jobs.
    :return: phik correlation matrix
    Tinplacer   c           	      X    g | ]&}t          |t          |                             'S  )
_calc_phikr   .0corI   r=   s     r,   
<listcomp>z)phik_from_rebinned_df.<locals>.<listcomp>   sA     
 
 
 r;tBxx02BCC
 
 
r.   r   n_jobsc              3   |   K   | ]6} t          t                    |t          |                             V  7d S N)r   rS   r   rT   s     r,   	<genexpr>z(phik_from_rebinned_df.<locals>.<genexpr>   sY       +
 +
  GJKR$9;KLL+
 +
 +
 +
 +
 +
r.   r   indexcolumnsr_   r^   )replacer   r7   defsNaNUFOFr_   	itertoolscombinations_with_replacementrE   r   r5   rC   rD   r   reindex)	rI   r=   rJ   rK   rL   rM   column_orderr*   r+   s	   ``       r,   phik_from_rebinned_dfrk   u   s   8  < 	BFDHd;;; ;DGRVT::: ;DGRVT::: &Lzz
 
 
 
 
=#*A 
 
 
		 +HE*** +
 +
 +
 +
 +
=#*A +
 +
 +
 
 
	 9~~|BF,MMMM5i@@M ")),)??M!)))==Mr.   combc                    | \  }}||k    r||dfS |                     ||gd          |                                                                                                                             d          t          fddD                       r||t          j        fS j        	                                _        t          j        |          }|||fS )aN  Split off calculation of phik for parallel processing

    :param tuple comb: union of two string columns
    :param pd.DataFrame data_binned: input data where interval variables have been binned
    :param bool noise_correction: apply noise correction in phik calculation
    :return: tuple of variable-x, variable-y, phik-value
    r3   F)r<   r   c                 "    g | ]}|j         v S rR   r   )rU   r&   datahists     r,   rW   z_calc_phik.<locals>.<listcomp>        000AA000r.   r   r   r=   )groupbycountto_frameunstackfillnaanyr   r7   r_   	droplevelr6   rE   )rl   rI   r=   c0c1	phikvaluerp   s         @r,   rS   rS      s     FB	Rxx2s{""B8e"<<R@FFHHQQSS[[]]ddefggH 000000011 2rv~'1133H CSTTTIr9r.   
   Fdfinterval_colsbinsquantileverbosec
                 \    t          | |||||          \  }
}t          |
|||||	          S )a  
    Correlation matrix of bivariate gaussian derived from chi2-value

    Chi2-value gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    Bivariate gaussian's range is set to [-5,5] by construction.

    :param pd.DataFrame data_binned: input data
    :param list interval_cols: column names of columns with interval variables.
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the    bins are specified (default=10). E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param bool verbose: if False, do not print all interval columns that are guessed
    :param int njobs: number of parallel jobs used for calculation of phik. default is -1. 1 uses no parallel jobs.
    :return: phik correlation matrix
    r   r   r   r   rJ   r   rJ   rK   rL   rM   )r   rk   r   r   r   r   r=   rJ   rK   rL   r   rM   rI   binning_dicts               r,   phik_matrixr      s[    J !.#! ! !K !%#   r.   c                    t          | |||||          }|j        }t          j                            |          t          j        |j                  j        k    rt          |          }nt          |          }t          j
        ddt          j        |          t          j        |          z  z  z
            dddf         }	d|	|	dk    <   d|	|	dk     <   |	|j        j        fS )a  
    Global correlation values of variables, obtained from the PhiK correlation matrix.

    A global correlation value is a simple approximation of how well one feature can be modeled in terms of all others.

    The global correlation coefficient is a number between zero and one, obtained from the PhiK correlation matrix,
    that gives the highest possible correlation between a variable and the linear combination of all other variables.
    See PhiK paper or for original definition: https://inspirehep.net/literature/101965

    Global PhiK uses two important simplifications / approximations:
    - The variables are assumed to belong to a multinormal distribution, which is typically not the case.
    - The correlation should be a Pearson correlation matrix, allowing for negative values, which is not the case
      with PhiK correlations (which are positive by construction).
    To correct for these, the Global PhiK values are artificially capped between 0 and 1.

    Still, the Global PhiK values are useful, quick, simple estimates that are interesting in an exploratory study.
    For a solid, trustworthy estimate be sure to use a classifier or regression model instead.

    :param pd.DataFrame data_binned: rebinned input data
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param int njobs: number of parallel jobs used for calculation of phik. default is -1. 1 uses no parallel jobs.
    :return: global correlations array
    r   r   Nr3   g        )rk   rE   r   linalgcondfinfodtypeepsr   r	   sqrtdiagonalr^   )
rI   r=   rJ   rK   rL   rM   r+   VVinvglobal_correlationss
             r,   global_phik_from_rebinned_dfr   
  s    J *%#  M 	A	y~~a28AG,,0001vv Aww'	Q"+a..2;t#4#4456 aag
 69+c1258+c12 3 :::r.   c
                 \    t          | |||||          \  }
}t          |
|||||	          S )a  
    Global correlation values of variables, obtained from the PhiK correlation matrix.

    A global correlation value is a simple approximation of how well one feature can be modeled in terms of all others.

    The global correlation coefficient is a number between zero and one, obtained from the PhiK correlation matrix,
    that gives the highest possible correlation between a variable and the linear combination of all other variables.
    See PhiK paper or for original definition: https://inspirehep.net/literature/101965

    Global PhiK uses two important simplifications / approximations:
    - The variables are assumed to belong to a multinormal distribution, which is typically not the case.
    - The correlation should be a Pearson correlation matrix, allowing for negative values, which is not the case
      with PhiK correlations (which are positive by construction).
    To correct for these, the Global PhiK values are artificially capped between 0 and 1.

    Still, the Global PhiK values are useful, quick, simple estimates that are interesting in an exploratory study.
    For a solid, trustworthy estimate be sure to use a classifier or regression model instead.

    :param pd.DataFrame data_binned: input data
    :param list interval_cols: column names of columns with interval variables.
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the    bins are specified (default=10). E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param bool verbose: if False, do not print all interval columns that are guessed
    :param int njobs: number of parallel jobs used for calc of global phik. default is -1. 1 uses no parallel jobs.
    :return: global correlations array
    r   )r=   rJ   rK   rL   rM   )r   r   r   s               r,   global_phik_arrayr   I  s[    \ !.#! ! !K ()%#   r.   xynum_varsc	                     |g }nt          |t                    r|g}t          |          dk    r0t          | |          }	t	          |	|||          j        j        \  } }t          | |||||          S )a  
    Correlation matrix of bivariate gaussian derived from chi2-value

    Chi2-value gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    Bivariate gaussian's range is set to [-5,5] by construction.

    :param x: array-like input
    :param y: array-like input
    :param num_vars: list of numeric variables which need to be binned, e.g. ['x'] or ['x','y']
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the    bins are specified (default=10). E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :return: phik correlation coefficient
    Nr   )r   r   )r=   rJ   rK   rL   )r   strr5   r   r   TrE   phik_from_binned_array)
r   r   r   r   r   r=   rJ   rK   rL   r   s
             r,   phik_from_arrayr     s    D 	Hc	"	" :
8}}q$Q**H4(CCCEL1!		)%#   r.   c                    |st          j        |                               t          j                                      t                    j        } t          j        |                              t          j                                      t                    j        }|s|r"t          j        |                               t                    j        } t          j        |                              t                    j        }|r^t          j	        | t          j
        | t          j        k              <   t          j	        |t          j
        |t          j        k              <   |r^t          j	        |t          j
        |t          j        k              <   t          j	        | t          j
        | t          j        k              <   t          j        | |          j        }t          |          }|st          j	        S t          ||          S )a  
    Correlation matrix of bivariate gaussian derived from chi2-value

    Chi2-value gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    Bivariate gaussian's range is set to [-5,5] by construction.

    :param x: array-like input. Interval variables need to be binned beforehand.
    :param y: array-like input. Interval variables need to be binned beforehand.
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :return: phik correlation coefficient
    rs   )rC   Seriesrx   rc   rd   astyper   rE   r   r7   wherere   rf   crosstabr   r6   )r   r   r=   rJ   rK   rL   hist2ddq_okays           r,   r   r     sw   8  =IaLL))0055<IaLL))0055< / /IaLL$$+IaLL$$+ 	/(*AbhqDG|$$%(*AbhqDG|$$% 	/(*AbhqDG|$$%(*AbhqDG|$$%[A%Ff%%G vF5EFFFFr.   
obs_binned
exp_binnedc                     t           t          j                  rt          j                    t          t          j                  rt          j                  t           j                  t          j                  k    sJ |sX                     t          j        t          j
        d                               t          j        t          j
        d           |rX                     t          j        t          j        d                               t          j        t          j        d           |rX                     t          j        t          j        d                               t          j        t          j        d            j        }|dk    r. fdt          j         j        j        d          D             }nD t!          |           fdt          j         j        j        d          D                       }t#          |          dk    r!t          j        t          j        ||	          S t%          |          }	|	                    |
          }	|	                    |          }	|	S )a  
    PhiK correlation matrix of comparing observed with expected dataset

    Chi2-value gets converted into correlation coefficient of bivariate gauss
    with correlation value rho, assuming giving binning and number of records.
    Correlation coefficient value is between 0 and 1.

    :param pd.DataFrame obs_binned: observed input data where interval variables have been binned
    :param pd.DataFrame exp_binned: expected input data where interval variables have been binned
    :param bool noise_correction: apply noise correction in phik calculation
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param int njobs: number of parallel jobs used for calculation of phik. default is -1. 1 uses no parallel jobs.
    :return: phik correlation matrix
    TrO   r   c           
          g | ]:}t          |t          |                   t          |                             ;S rR   )_calc_phik_obs_vs_expr   rU   rV   r   r=   r   s     r,   rW   z>phik_observed_vs_expected_from_rebinned_df.<locals>.<listcomp>%  sU     
 
 
  "JtBxx(*T"XX*>@P 
 
 
r.   r   rX   c           	   3      K   | ]J} t          t                    |t          |                   t          |                             V  Kd S r[   )r   r   r   r   s     r,   r\   z=phik_observed_vs_expected_from_rebinned_df.<locals>.<genexpr>.  sn       +
 +
  +G)**JtBxx(*T"XX*>@P +
 +
 +
 +
 +
 +
r.   r   r]   r`   ra   )r   r   r   rC   rD   setr_   rb   r7   rc   rd   re   rf   rg   rh   rE   r   r5   r   ri   )
r   r   r=   rJ   rK   rL   rM   rj   r*   r+   s
   ```       r,   *phik_observed_vs_expected_from_rebinned_dfr     s~   8 *bj)) .\*--
*bj)) .\*--
z!""c**<&=&===== ; 	2648T:::2648T::: :47BFD99947BFD999 :47BFD99947BFD999 %Lzz
 
 
 
 
 
  =")1 	
 
 
		 +HE*** +
 +
 +
 +
 +
 +
  =")1 	+
 +
 +
 
 
	 9~~|BF,MMMM5i@@M ")),)??M!)))==Mr.   c                    | \  }}||k    r||dfS |                     ||g          |                                                                                                                             d          t          fddD                       r||t          j        fS |                     ||g          |                                                                                                                             d          t          fddD                       r||t          j        fS t                    t          |          }|||fS )a  Split off calculation of phik for parallel processing

    :param tuple comb: union of two string columns
    :param pd.DataFrame obs_binned: observed data where interval variables have been binned
    :param pd.DataFrame exp_binned: expected data where interval variables have been binned
    :param bool noise_correction: apply noise correction in phik calculation
    :return:
    r3   r   c                 "    g | ]}|j         v S rR   ro   )rU   r&   r<   s     r,   rW   z)_calc_phik_obs_vs_exp.<locals>.<listcomp>Y  rq   r.   rr   c                 "    g | ]}|j         v S rR   ro   )rU   r&   r>   s     r,   rW   z)_calc_phik_obs_vs_exp.<locals>.<listcomp>_  rq   r.   )r<   r=   r>   )
rt   ru   rv   rw   rx   ry   r   r7   r   r6   )	rl   r   r   r=   r{   r|   r;   r>   r<   s	          @@r,   r   r   C  sh    FB	Rxx2s{!!2r(++B/5577@@BBJJLLSSTUVVH 000000011 2rv~!!2r(++B/5577@@BBJJLLSSTUVVH 000000011 2rv~ !844H!,<x  J r:r.   )TN)TTTTrH   )	Nr~   FTTTTTrH   )Nr~   FTTTT)TTTT)8__doc__typingr   r   r   numpyr   rg   pandasrC   joblibr   r   scipy.linalgr   r	   phikr
   rc   	bivariater   
statisticsr   r   r   binningr   r   r   data_qualityr   utilsr   r   dictr-   r   r   floatr"   boolr6   rD   intrk   tuplerS   r   r   r   r   r   r   r   r   r   rR   r.   r,   <module>r      s    * ) ) ) ) ) ) ) ) )             $ $ $ $ $ $ $ $ " " " " " " " " $ $ $ $ $ $ % % % % % %         
 P O O O O O O O O O ) ) ) ) ) ) = = = = = = = =    4"c2:o. "5c53I " " " "& QU#T #Tj#T,0#TCE:#T
#T #T #T #TP "A AAA A 	A
 A A \A A A AH
 l>B
3U?   8 %)/1!4 4
4D>4 T2:t+
,4 	4
 4 4 4 4 4 4 \4 4 4 4r "<; <;<;<; <; 	<;
 <; <; 2:rz!"<; <; <; <;B /1!> >
>> T2:t+
,> 	>
 > > > > > > 2:rz!"> > > >H "&/1!2 2RZ"#2RZ"#2 CI2 T4+
,	2
 2 2 2 2 2 2 2 2 2p "0G 0GRZ"#0GRZ"#0G 0G 	0G
 0G 0G 0G 0G 0G 0Gl "N NNN N 	N
 N N N \N N N Nb&
&& & 	&
 3U?& & & & & &r.   