
    hMhH              /       V   d Z ddlZddlZddlmZmZmZmZ ddlm	Z
 ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ 	 	 	 	 	 	 	 dEdee ej!        ej"        f         defdZ#	 	 	 	 	 	 	 	 	 	 	 	 	 	 dFdej!        de de de$de$de%de%de$d e$d!e$d"e&d#ej!        d$e'd%e(d&e'd'e'd(e%d)df$d*Z)dd+dddd,d-ddd.d/d0d1d2di i i i fdej*        d3e d4e'd5e'de$d6e%d7e%d8e'd9e'd:e$d;e$d<e&d=e$d>e$d?e'd@e+dAe+dBe+dCe+d)eej*        ej*        ee$ej*        f         ee$e$f         f         f(dDZ,dS )Gay  Project: PhiK - correlation analyzer library

Created: 2018/09/06

Description:
    Functions to create nice correlation overview and matrix plots

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
    N)CallableDictTupleUnion)colors)PdfPages   )bin_data)dq_check_nunique_values),outlier_significance_matrix_from_rebinned_df)global_phik_from_rebinned_dfphik_from_rebinned_df)significance_from_rebinned_df)guess_interval_colsF datafuncc
                 &   |ddg}t          j        |          sc|sat          j        |           }
t          j        |           }t	          t          |           dz  dz             }t          j        |
||          }nt          |          t          k    st          |          t          k    rdt          j        |           }
|r|d         }
t          j        |           }|r|d         }t	          |dz             }t          j        |
||          }t          j
        | ||d         |	           |dd         t          j        |          d         dz  z   }|d         |d         z
  }t          j        |t          |           |z   ||g|R  z  d|d         	           |rt          j        |           |rt          j        |           |rt          j        |           t          |d                   dk    rt          j                     dS dS )
a  
    Create a histogram of the provided data and overlay with a function.

    :param list data: data
    :param function func: function of the type f(x, a, b, c) where parameters a, b, c are optional
    :param list funcparams: parameter values to be given to the function, to be specified as [a, b, c]
    :param xbins: specify binning of histogram, either by giving the number of bins or a list of bin edges
    :param labels: labels of histogram and function to be used in the legend
    :param xlabel: figure xlabel
    :param ylabel: figure ylabel
    :param title: figure title
    :param xlimit: x limits figure
    :param alpha: alpha histogram
    :return:
    Nr   2   r	   r   )binslabelalpha   )	linewidthr   )npanyminmaxintlenlinspacetypefloatplthistdiffplotxlabelylabeltitlelegend)r   r   
funcparamsxbinslabelsr)   r*   r+   xlimitr   xminxmaxxnbinsxvalsbws                  K/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/phik/report.pyplot_hist_and_funcr7   !   s
   6 ~b 6%== 0 0vd||vd||SYY^a'((D$//	e		tE{{e33vd|| 	!9Dvd|| 	!9DUQYD$// HTVAYe<<<< #2#J*Q..E	qE!H	BHs4yy2~U 8Z 8 8 88AVTUY     
6 
6 	%
6!9~~
     correlationr   RdYlGn   T      matrix_colorsx_labelsy_labelspdf_file_namer+   vminvmax	color_mapx_labely_labeltopmatrix_numbersprint_both_numbersfigsizeusetexidentity_layoutfontsize_factorreturnc                 F  
! t          | t          j                  st          d          | j        d         t          |          k    s+| j        d         dz   t          |          k    s
J d            | j        d         t          |          k    s+| j        d         dz   t          |          k    s
J d            || }d}nP|j        d         t          |          k    s
J d            |j        d         t          |          k    s
J d	            |rIt          j        d
 | D                       } |ddd         }|t          j        d |D                       }t          j        d|           t          j	        |          \  }}t          j        ||          }|                    | |dd|          }
fd!t          |          | j        d         dz   k    r5|                    t          j        t          |                               n7|                    t          j        t          |                    dz              |                    !fd|D             dd|z             t          |          | j        d         dz   k    r5|                    t          j        t          |                               n7|                    t          j        t          |                    dz              |                    !fd|D             dd|z             t          |          dk    r2t          |d                   dk    rt          j        ddddd           t          |          dk    r2t          |d                   dk    rt          j        ddddd           |                    |d |z  !           |r|                    |d"|z  !           |	r|                    |	d"|z  !           |                    |           |s|gn|| g}t/          |j        d                   D ]}t/          |j        d                   D ]}t1          | |         |                   }|d#|z  k     p|d#|z  k    pt          j        |          }d}t5          |          D ]\  }}|r|dk    rd#}n|dk    rd$}t1          ||         |                   }t          j        |          rd%nd&                    |          }|rdnd'}|                    ||dz   ||z   f|d(d(d|z  )           t          j                     |rPt=          |          } t          j        | d*d+d,           t          j                      |                                   dS dS )-a  Create and plot correlation matrix.

    Copied with permission from the eskapade package (pip install eskapade)

    :param matrix_colors: input correlation matrix
    :param list x_labels: Labels for histogram x-axis bins
    :param list y_labels: Labels for histogram y-axis bins
    :param str pdf_file_name: if set, will store the plot in a pdf file
    :param str title: if set, title of the plot
    :param float vmin: minimum value of color legend (default is -1)
    :param float vmax: maximum value of color legend (default is +1)
    :param str x_label: Label for histogram x-axis
    :param str y_label: Label for histogram y-axis
    :param str color_map: color map passed to matplotlib pcolormesh. (default is 'RdYlGn')
    :param int top: only print the top 20 characters of x-labels and y-labels. (default is 20)
    :param matrix_numbers: input matrix used for plotting numbers. (default it matrix_colors)
    :param identity_layout: Plot diagonal from right top to bottom left (True) or bottom left to top right (False)
    z#matrix_colors is not a numpy array.r   r	   z8matrix_colors shape inconsistent with number of y-labelsz8matrix_colors shape inconsistent with number of x-labelsNFz9matrix_numbers shape inconsistent with number of y-labelsz9matrix_numbers shape inconsistent with number of x-labelsc                 $    g | ]}|d d d         S Nr    .0as     r6   
<listcomp>z+plot_correlation_matrix.<locals>.<listcomp>   s"    !A!A!Aa!DDbD'!A!A!Ar8   r   c                 $    g | ]}|d d d         S rR   rS   rT   s     r6   rW   z+plot_correlation_matrix.<locals>.<listcomp>   s"    &G&G&G1q2w&G&G&Gr8   text)rL   )rK   )rC   rD   w)cmap	edgecolorr   normc                     t          | t          t          f          r+t          j        |           rdnd                    |           } t          |           } t          |           k    r| dd         dz   } | S )z	Get tick.NaNz{0:.0f}N   z...)
isinstancer$   r    r   isnanformatstrr!   )labrH   s    r6   tickz%plot_correlation_matrix.<locals>.tick   so    cE3<(( 	D8C==C%%i.>.>s.C.CC#hhs88c>>crc(U"C
r8         ?c                 &    g | ]} |          S rS   rS   rU   re   rf   s     r6   rW   z+plot_correlation_matrix.<locals>.<listcomp>   !    '''sc'''r8   vertical
   )rotationfontsizec                 &    g | ]} |          S rS   rS   ri   s     r6   rW   z+plot_correlation_matrix.<locals>.<listcomp>   rj   r8   
horizontalxboth)axiswhichbottomrH   labelbottomy)rs   rt   leftrightrv      )rn      gffffff?g      ?r_   z{0:.2f}kcenter)xycolorhorizontalalignmentverticalalignmentrn   pdftightrc   bbox_inches
pad_inches)!ra   r   ndarray	TypeErrorshaper!   arrayr%   rcsubplotsr   	Normalize
pcolormesh
set_xticksarangeset_xticklabels
set_yticksset_yticklabelstick_params	set_title
set_xlabel
set_ylabelcolorbarranger$   rb   	enumeraterc   annotatetight_layoutr   savefigclose)"r?   r@   rA   rB   r+   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   figaxr]   imgnumbers_setijpoint_color
white_condy_offsetmmatrixpointr   r   pdf_filerf   s"             `                      @r6   plot_correlation_matrixr   e   s}   J mRZ00 ?=>>> "c(mm33A"c(mm333A 	433"c(mm33A"c(mm333A 	433&"#A&#+
 +
 
 
 
F
 
 
 #A&#+
 +
 
 
 
F
 
 
  I!A!A=!A!A!ABBDDbD>%X&G&G&G&G&GHHNF6&!!!!l7+++GCD111D
--I   C
     8}}+A.222
biH..////
biH..4555''''h'''o%     8}}+A.222
biH..////
biH..4555''''h'''o%     8}}c(1+..!33	
 	
 	
 	
 8}}c(1+..!33	
 	
 	
 	
 LLo!5L666 >
g_(<=== >
g_(<===LL !3W8W  >'*++  ~+A.// 	 	Aa 0 344KsTz) )3:-)8K(( 
 H&{33  	6% (Avv#&a#'fQil++!#%Mi6F6Fu6M6M)2sCX.(0&./1     	4   M**HUANNNN		 r8   rl      rg   zlog-likelihoodmultinominali  
asymptoticpoissoninterval_colsquantiledo_outlierssignificance_thresholdcorrelation_thresholdnoise_correctionstore_each_plotlambda_significancesimulation_method	nsim_chi2significance_method	CI_methodverboseplot_phik_matrix_kwsplot_global_phik_kwsplot_significance_matrix_kwsplot_outlier_significance_kwsc                    |t          | |          }t          | |          \  }}t                      }d}|	r*t          j                            |          }||rdndz  }d}|rt          |          }t          ||||d          \  }}|	r
|dz   }||d<   t          ||          }t          |j	        |j
        d	d
dddd|	  	        }|                    |           t          |j        fi | |r+t          j        |ddd	           t          j                     |	r
|dz   }||d<   t#          ||          \  }}t          dg|d	d
dddd|	  	        } |                     |           t          |fi |  |r+t          j        |ddd	           t          j                     |	r
|dz   }||d<   t%          ||
|||          }!t          |!j	        |!j
        dddddd|	  	        }"|"                    |           t          |!                    d	          j        fi |" |r+t          j        |ddd	           t          j                     i }#|rtt)          t+          j        |j	        d                    D ]K\  }$}%|%\  }&}'t/          |!j        |&|'f                   |k     s|j        |&|'f         |k     r?t3          ||&|'g                                         ||           }(d!                    |%                              d"d#          })|(j	        }*|(j
        }+|(j	        j        },|(j
        j        }-|	r|d$                    |)          z   }|||)<   t          |*|+|,|-ddd%dd&|'
  
        }.|.                    |           t          |(j        fi |. |(|#|)<   |r+t          j        |ddd	           t          j                     M|r,||d(<   t          j                     |                                 |||!|#|fS ))a  
    Create a correlation report for the given dataset.

    The following quantities are calculated:

    * The phik correlation matrix
    * The significance matrix
    * The outlier significances measured in pairs of variables. (optional)

    :param data: input dataframe
    :param interval_cols: list of columns names of columns containing interval data
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)    E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param do_outliers: Evaluate outlier significances of variable pairs (when True)
    :param pdf_file_name: file name of the pdf where the results are stored
    :param store_each_plot: store each plot in folder derived from pdf_file_name. If true, single pdf is no longer stored. Default is false.
    :param significance_threshold: evaluate outlier significance for all variable pairs with a significance of      uncorrelation higher than this threshold
    :param correlation_threshold: evaluate outlier significance for all variable pairs with a phik correlation      higher than this threshold
    :param noise_correction: Apply noise correction in phik calculation
    :param lambda_significance: test statistic used in significance calculation. Options: [pearson, log-likelihood]
    :param simulation_method: sampling method using in significance calculation. Options: [mutlinominal,     row_product_multinominal, col_product_multinominal, hypergeometric]
    :param nsim_chi2: number of simulated datasets in significance calculation.
    :param significance_method: method for significance calculation. Options: [asymptotic, MC, hybrid]
    :param CI_method: method for uncertainty calculation for outlier significance calculation. Options: [poisson,     exact_poisson]
    :param bool verbose: if False, do not print all interval columns that are guessed
    :param dict plot_phik_matrix_kws: kwargs passed to plot_correlation_matrix() to plot the phik matrix.     updates the default plotting values.
    :param dict plot_global_phik_kws: kwargs passed to plot_correlation_matrix() to plot the global-phik vector.     updates the default plotting values.
    :param dict plot_significance_matrix_kws: kwargs passed to plot_correlation_matrix() to plot significance matrix.     updates the default plotting values.
    :param dict plot_outlier_significance_kws: kwargs passed to plot_correlation_matrix() to plot the outlier     significances. updates the default plotting values.
    :returns: phik_matrix (pd.DataFrame), global_phik (np.array), significance_matrix (pd.DataFrame),     outliers_overview (dictionary), output_files (dictionary)
    Nr   /z./T)r   r   retbinszphik_matrix.pdfphik_matrixr   r	   Blueszcorrelation $\phi_K$g      ?)r=   g      @)	r@   rA   rC   rD   rE   r+   rN   rK   rB   r   r   r   zglobal_phik.pdfglobal_phik)g      @   z$g_k$)	r@   rA   rC   rD   rK   rE   r+   rN   rB   zsignificance_matrix.pdfsignificance_matrixr>   significanceF)	r@   rA   rC   rD   r+   rL   rN   rK   rB   r   )r   : _zpulls_{0:s}.pdfzoutlier significanceg333333?)
r@   rA   rF   rG   rC   rD   r+   rM   rN   rB   all) r   r   dictospathdirnamer   r
   r   columnsindexupdater   valuesr%   r   showr   r   fillnar   	itertoolscombinationsabslocr   copyjoinreplacenamerc   r   )/r   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
data_cleaninterval_cols_cleanoutput_filesplot_file_namefolderr   data_binnedbinning_dictr   default_plot_phik_matrixr   global_labelsdefault_plot_global_phikr    default_plot_significance_matrixoutliers_overviewr   combc0c1
zvalues_dfcombixlabelsylabelsr)   r*   !default_plot_outlier_significances/                                                  r6   correlation_reportr     s;   @ +D'::&=dM&R&R#J# 66LN //)##T)  +M** ('dXt! ! !K
  5"33&4]#'5EFFK#$"%$
  
  
  ##$8999K.KK2JKKK HUANNNN


  5"33&4]#!=%" "K  $$
  
  
  ##$8999KDD+CDDD HUANNNN


  =";;.<*+7  (,$,$*$
( 
( 
($ %++,HIII""1%%, 0P    HUANNNN


  , !78KQ!O!OPP +	 +	GAtFB'+BF3447MMM?2r6*-BBBERH%**,,li  J HHTNN**344E (G &G',F%*F 5!'*;*B*B5*I*I!I&4U#04  , % #,1 1 1- .445RSSS#! %F   (2e$ HUTUVVVV


  +U	 	 r8   )FNr   r   r   Nr	   )r   r9   r   r	   r:   r   r   r;   NTr<   FTr	   )-__doc__r   r   typingr   r   r   r   matplotlib.pyplotpyplotr%   numpyr   pandaspd
matplotlibr   matplotlib.backends.backend_pdfr   binningr
   data_qualityr   outliersr   phikr   r   r   r   utilsr   listr   Seriesr7   rd   r$   r    booltupler   	DataFramer   r   rS   r8   r6   <module>r
     s        				 / / / / / / / / / / / /                     4 4 4 4 4 4       1 1 1 1 1 1 B B B B B B E E E E E E E E 7 7 7 7 7 7 & & & & & & 

A A
bj")+
,A
A A A AP !%# #l l:ll l 	l
 l l l l l l 
l Jl l l l  !l" #l$ 
%l l l lb 	$%#&!!/++!#!#)+*,)a a
,aa 	a
 a a "a !a a a a a a a a  !a" #a$ %a& #''a( $()a* 2<tC,='>S#XNO+a a a a a ar8   