
    hMhzR                        d Z ddlmZmZ ddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZm Z  	 d<dee!ej"        f         de#dee$e$e$e$f         fdZ%de$de$de$de$de$de$fdZ&de$de$dee$e$f         fdZ'dej"        de$dee$e$f         fdZ(	 	 	 	 	 d=de$dej"        d"e#d#e)d$e)d%e#dee$e$f         fd&Z*	 	 	 	 	 d=de$dej"        d"e#d#e)d$e)d%e#dee$e$f         fd'Z+	 	 	 	 	 d>dej"        d"e#d#e)d$e)d)e)d%e#dee$e$f         fd*Z,	 	 	 	 	 	 	 	 d?d,ej-        d#e)d$e)d"e#d)e)d-e.d.e.d/e.d%e#dej-        fd0Z/	 	 	 	 	 	 	 	 	 	 	 d@d2ej-        d3e!d#e)d$e)d"e#d)e)d4ee#e!ej"        e0f         d-e.d.e.d/e.d5e.d%e#dej-        fd6Z1	 	 	 	 	 	 	 	 	 	 	 dAdeej"        ej2        f         d8eej"        ej2        f         d4ee#e!ej"        e0f         d9e.d#e)d"e#d)e)d$e)d-e.d.e.d/e.d%e#dee$e$f         fd:Z3	 	 	 	 	 	 	 	 dBdeej"        ej2        f         d8eej"        ej2        f         d#e)d)e)d"e#d$e)d-e.d.e.d/e.d%e#dee$e$f         fd;Z4dS )Ca  Project: PhiK - correlation analyzer library

Created: 2018/09/05

Description:
    Functions for doing the significance evaluation of an hypothesis test of variable independence
    using a contingency table.

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
    )TupleUnionN)stats)specialoptimize)definitions   )bin_data!create_correlation_overview_table),get_chi2_using_dependent_frequency_estimates)estimate_ndoftheoretical_ndof)sim_chi2_distribution)dq_check_nunique_valuesdq_check_hist2d)array_like_to_dataframeguess_interval_cols2   chi2snbinsreturnc                   	
 d 
fd		fd}t          j        |           
t          j        
          t          |           t          j        | |          \  }}|d         |d         z
  |dd         t          j        |          d	z  z   }d
}t          j        ||d||f          }|j        d         
fS )a5  
    Fit the hybrid chi2-distribution to the data to find f.

    Perform a binned likelihood fit to the data to find the optimal value for the fraction f in
    h(x|f) = N * (f * chi2(x, ndof) + (1-f) * gauss(x, ndof, sqrt(ndof))
    The parameter ndof is fixed in the fit using ndof = mean(x). The total number of datapoints N is also fixed.

    :param list chi2s: input data - a list of chi2 values
    :param int nbins: in order to fit the data a histogram is created with nbins number of bins
    :returns: f, ndof, sigma (width of gauss), bw (bin width)
    c                     ||t           j                            | |          z  d|z
  t           j                            | ||          z  z   z  S Nr	   r   chi2pdfnormxNfksigmas        Q/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/phik/significance.pymyfuncz/fit_test_statistic_distribution.<locals>.myfunc3   sB    A
q!,,,A1e9T9T/TTUU    c                 (     | z  |          S )N )r    r"   bwkmeanlsigmar&   nsims     r%   <lambda>z1fit_test_statistic_distribution.<locals>.<lambda>6   s    4"9a?? r'   c                      |g| R  }|t          j        ||          z
  t          j        |dz             z   }t          j        |          S r   )r   xlogygammalnnpsqrt)pr    yr"   llffuncs        r%   gtestz.fit_test_statistic_distribution.<locals>.gtest8   sM    E!LaLLLq!$$$wq1u'='==wr{{r'   binsr	   r   N   )      ?))g        )r	   )boundsargs)	r2   meanr3   len	histogramdiffr   least_squaresr    )r   r   r8   yhistxboundsxhist	initGuessresr*   r7   r+   r,   r&   r-   s           @@@@@@r%   fit_test_statistic_distributionrJ   $   s   V V V @???????E    
 GENNEWU^^Fu::D\%e444NE7	gaj	 BCRCL277++a//EI

 yuen  C 58UFB&&r'   r    r!   r"   r#   r$   c                     ||t           j                            | |          z  d|z
  t           j                            | ||          z  z   z  S )aX  
    Definition of the combined probability density function h(x)

    h(x|f) = N * (f * chi2(x, k) + (1-f) * gauss(x, k, sigma)

    :param float x: x
    :param float N: normalisation
    :param float f: fraction [0,1]
    :param float k: ndof of chi2 function and mean of gauss
    :param float sigma: width of gauss
    :return: h(x|f)
    r	   r   r   s        r%   hfuncrL   M   sD     EJNN1a(((AEUZ^^Aq%5P5P+PPQQr'   r   ndofc                 x   t           j                            | |          }t           j                            |           }|dk    rq| |z  }t          j        dt
          j        z             |t          j        |          z  z
  ||dz
  z  z   }t          j        |t          j        |          z
            }||fS )a  
    Convert a chi2 into significance using knowledge about the number of degrees of freedom

    Conversion is done using asymptotic approximation.

    :param float chi2: chi2 value
    :param float ndof: number of degrees of freedom
    :returns: p_value, significance
    r   r<   r	   )	r   r   sfr   ppfmathlogpir3   )r   rM   p_valuez_valuezus         r%   significance_from_chi2_ndofrX   ]   s     jmmD$''Gz~~g&&&G
 !||4KXa$'k"""TDHQKK%77$!a%.H)AO,,Gr'   valuesc                 N    t          |           }t          ||          \  }}||fS )a  
    Convert a chi2 into significance using knowledge about the number of degrees of freedom

    Convention is done using asymptotic approximation.

    :param float chi2: chi2 value
    :param float ndof: number of degrees of freedom
    :returns: p_value, significance
    )r   rX   )rY   r   rM   rT   rU   s        r%   !significance_from_chi2_asymptoticr[   u   s0     F##D24>>GWGr'     log-likelihoodmultinominalr;   r-   lambda_simulation_methodnjobsc                     |t          |||||          }dt          j        ||           dz  z
  }t          j                            |           }||fS )a  
    Convert a chi2 into significance using knowledge about the shape of the chi2 distribution of simulated data

    Calculate significance based on simulation (MC method), using a simple percentile.

    :param float chi2: chi2 value
    :param list chi2s: provide your own chi2s values (optional)
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :returns: pvalue, significance
    Nr-   r_   r`   ra   r=   g      Y@)r   r   percentileofscorer   rP   )	r   rY   r-   r_   r`   r   ra   empirical_p_valueempirical_z_values	            r%   significance_from_chi2_MCrg      so    * }%/
 
 
 e5eTBBUJJ(9:::///r'   c                 .   |t          |||||          }|                                |j        d         z  |j        d         z  }|dk    r`t          |          \  }}	}
}|t          j                            | |	          z  d|z
  t          j                            | |	|
          z  z   }n/t          |          }	t          j                            | |	          }t          j        	                    |           }|dk    r| |	z  }t          j        dt          j        z             |	t          j        |          z  z
  |	|dz
  z  z   }|dk    r|dt          j        |          z  z  }t          j        |t          j        |          z
            }||fS )a   
    Convert a chi2 into significance using a hybrid method

    This method combines the asymptotic method with the MC method, but applies several corrections:

    * use effective number of degrees of freedom instead of number of degrees of freedom. The effective number of      degrees of freedom is measured as mean(chi2s), with chi2s a list of simulated chi2 values.
    * for low statistics data sets, with on average less than 4 data points per bin, the distribution of chi2-values      is better described by h(x|f) then by the usual chi2-distribution. Use h(x|f) to convert the chi2 value to       the pvalue and significance.

    h(x|f) = N * (f * chi2(x, ndof) + (1-f) * gauss(x, ndof, sqrt(ndof))

    :param float chi2: chi2 value
    :param list chi2s: provide your own chi2s values (optional)
    :param float avg_per_bin: average number of data points per bin
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :returns: p_value, significance
    Nrc   r   r	      r<   )r   sumshaperJ   r   r   rO   r   r   rP   rQ   rR   rS   r3   )r   rY   r-   r_   r`   r   ra   avg_per_binr"   endofr,   r*   pvalue_hzvalue_hrV   rW   s                   r%   significance_from_chi2_hybridrq      s   < }%/
 
 
 **,,a06<?BKa>uEE5&"uz}}T5111QUejmm%?
 ?
 5
 
 e$$:==u--
x(((H1}} 5LXa$'k"""UTXa[[%885AE?J!dhqkk!!A9Q!_--Xr'   hybridsignificance_methodc                     t          | |          }|dk    rt          | |          \  }}n`|dk    rt          || ||||          \  }}nA|dk    rt          || ||||          \  }}n"t	          d                    |                    ||fS )a  
    Calculate the significance of correlation of two variables based on the contingency table

    :param values: contingency table
    :param int nsim: number of simulations
    :param str lambda_: test statistic. Available options are [pearson, log-likelihood]
    :param str simulation_method: simulation method. Options: [multinominal, row_product_multinominal,      col_product_multinominal, hypergeometric].
    :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid]
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :return: pvalue, significance
    )r_   
asymptoticMCrc   rr   z"simulation_method {0:s} is unknown)r   r[   rg   rq   NotImplementedErrorformat)	rY   r-   r_   r`   rs   ra   r   pvaluezvalues	            r%   significance_from_hist2dr{      s    , 8PPPDl**:64HH		$	$2/
 
 
 
	(	( 7/
 
 
 "0778IJJ
 
 	
 6>r'   Tdata_binneddropnadrop_underflowdrop_overflowc	           
         |s,|                      t          j        t          j        d           |r,|                      t          j        t          j        d           |r,|                      t          j        t          j        d           | j        }	g }
t          t          j
        | j        j        d                    D ]9\  }\  }}|                     ||g          |                                                                                                                             d          }d|j        v s	d|j        v rc|
                    ||t          j        f           t'          j        d                    ||j        d         ||j        d                              |j                                        |_        |j        }t/          ||||||          \  }}|
                    |||f           ;t1          |
          dk    r!t3          j        t          j        |	|	          S t7          |
          }|                    |		          }|                    |	
          }|S )a  
    Calculate significance of correlation of all variable combinations in the DataFrame

    :param data_binned: input binned DataFrame
    :param int nsim: number of simulations
    :param str lambda_: test statistic. Available options are [pearson, log-likelihood]
    :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal,      col_product_multinominal, hypergeometric].
    :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid]
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param int njobs: number of parallel jobs used for simulation. default is -1.
    :return: significance matrix
    T)inplacer<   r   r	   z[Too few unique values for variable {0:s} ({1:d}) or {2:s} ({3:d}) to calculate significance)r-   r_   r`   rs   ra   )indexcolumns)r   )r   )replacer2   nandefsNaNUFOFr   	enumerate	itertoolscombinations_with_replacementrY   groupbycountto_frameunstackfillnarl   appendwarningswarnrx   	droplevelr{   rA   pd	DataFramer   reindex)r|   r_   r`   r-   rs   r}   r~   r   ra   column_ordersignifsic0c1datahistry   rz   significance_overviews                     r%   significance_from_rebinned_dfr   &  sb   :  < 	BFDHd;;; ;DGRVT::: ;DGRVT::: &LG /0C0JANN  ) )8B R))"-3355>>@@HHJJQQRSTT 	 !x~"5"5NNBBF+,,,Mmttq)2x~a/@   
 #+5577?1/ 3
 
 
 	B'((((
7||q|BF,MMMM=gFF 299,9OO1999MM  r'   
   dfinterval_colsr:   verbosec                     |t          | |
          }t          | ||          \  }}t          |||          }t          ||||||||	|	  	        S )a  
    Calculate significance of correlation of all variable combinations in the dataframe

    :param pd.DataFrame df: input data
    :param list interval_cols: column names of columns with interval variables.
    :param int nsim: number of simulations
    :param str lambda_: test statistic. Available options are [pearson, log-likelihood]
    :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal,      col_product_multinominal, hypergeometric].
    :param int nsim: number of simulated datasets
    :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid]
        :param bool dropna: remove NaN values with True
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param bool verbose: if False, do not print all interval columns that are guessed
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :return: significance matrix
    N)r}   r9   )r_   r`   r-   rs   r}   r~   r   ra   )r   r   r
   r   )r   r   r_   r`   r-   rs   r:   r}   r~   r   r   ra   df_cleaninterval_cols_cleanr|   s                  r%   significance_matrixr   v  s    L +B88$;
M&% % %!H! 8%8tDDDK(+/%#
 
 
 
r'   Fr5   quantilec                     |g }nt          |t                    r|g}t          |          dk    r0t          | |          }t	          ||||          j        j        \  } }t          | ||||||	|
||
  
        S )a  
    Calculate the significance of correlation

    Calculate the significance of correlation for two variables which can be of interval, oridnal or categorical type.    Interval variables will be binned.

    :param x: array-like input
    :param y: array-like input
    :param num_vars: list of numeric variables which need to be binned, e.g. ['x'] or ['x','y']
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param str lambda_: test statistic. Available options are [pearson, log-likelihood]
    :param int nsim: number of simulated datasets
    :param str simulation_method: simulation method. Options: [mutlinominal, row_product_multinominal,     col_product_multinominal, hypergeometric].
    :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid]
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :return: p-value, significance
    Nr   )r:   r   )r_   rs   r-   r`   r}   r~   r   ra   )
isinstancestrrA   r   r
   TrY   significance_from_binned_array)r    r5   num_varsr:   r   r_   r-   rs   r`   r}   r~   r   ra   r   s                 r%   significance_from_arrayr     s    P 	Hc	"	" :
8}}q$Q**H4(CCCEL1)		/+%#   r'   c
                 8   |st          j        |                               t          j                                      t                    j        } t          j        |                              t          j                                      t                    j        }|s|r"t          j        |                               t                    j        } t          j        |                              t                    j        }|r^t          j	        | t          j
        | t          j        k              <   t          j	        |t          j
        |t          j        k              <   |r^t          j	        |t          j
        |t          j        k              <   t          j	        | t          j
        | t          j        k              <   t          j        | |          j        }
t          |
          st          j	        t          j	        fS t          |
|||||	          S )ac  
    Calculate the significance of correlation

    Calculate the significance of correlation for two variables which can be of interval, oridnal or categorical type.     Interval variables need to be binned.

    :param x: array-like input
    :param y: array-like input
    :param str lambda_: test statistic. Available options are [pearson, log-likelihood]
    :param str simulation_method: simulation method. Options: [multinominal, row_product_multinominal,     col_product_multinominal, hypergeometric].
    :param int nsim: number of simulated datasets
    :param str significance_method: significance_method. Options: [asymptotic, MC, hybrid]
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning    a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning    a numeric variable)
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :return: p-value, significance
    )r_   rs   r`   r-   ra   )r   Seriesr   r   r   astyper   rY   r2   r   wherer   r   crosstabr   r{   )r    r5   r_   rs   r-   r`   r}   r~   r   ra   hist2ds              r%   r   r     s   D  
IaLL))0055<IaLL))0055< 	
  / /IaLL$$+IaLL$$+ 	/(*AbhqDG|$$%(*AbhqDG|$$% 	/(*AbhqDG|$$%(*AbhqDG|$$%[A%F6"" vrv~#/+   r'   )r   )r\   r]   r^   Nr;   )r\   r]   r^   rr   r;   )r]   r^   r\   rr   TTTr;   )Nr]   r^   r\   rr   r   TTTTr;   )Nr   Fr]   r\   rr   r^   TTTr;   )r]   rr   r\   r^   TTTr;   )5__doc__typingr   r   numpyr2   pandasr   rQ   r   r   scipyr   r   r   phikr   r   binningr
   r   
statisticsr   r   r   
simulationr   data_qualityr   r   utilsr   r   listndarrayintfloatrJ   rL   rX   r[   r   rg   rq   r{   r   boolr   dictr   r   r   r   r)   r'   r%   <module>r      s                                 # # # # # # # # $ $ $ $ $ $ @ @ @ @ @ @ @ @ D D D D D D 7 7 7 7 7 7 7 7 - - - - - - B B B B B B B B ? ? ? ? ? ? ? ? 24&' &'rz!"&'+.&'
5%%&&' &' &' &'RRU Ru R R5 R R5 R R R R e 5 U5%<=P    0J#
5%<   , #+
"0 "0
"0J"0 "0 	"0
 "0 "0 5%<"0 "0 "0 "0P #+
> >
>J> > 	>
 > > 5%<> > > >F #+'5 5J5
5 5 	5
 5 5 5%<5 5 5 5t $+'M! M!M!M! M! 	M!
 M! M! M! M! M! \M! M! M! M!d #+'/19 9
99 9 	9
 9 9 T2:t+
,9 9 9 9 9 9 \9 9 9 9~ /1#'+< <RZ"#<RZ"#< T2:t+
,	<
 < < < < < < < < < 5%<< < < <D $'+> >RZ"#>RZ"#> > 	>
 > > > > > > 5%<> > > > > >r'   