
    hMh~                        d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZ  ej        ej                  j        dz
  Zd!dej        d	ed
ej        fdZd"dej        ded
ej        fdZdej        ded
ej        fdZd#dej        ded
ej        fdZ	 	 d$dej        dededededed
efdZ	 	 d%dej        deded
efd ZdS )&ac  Project: PhiK - correlation analyzer library

Created: 2018/09/05

Description:
    Helper functions to simulate 2D datasets

Authors:
    KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands

Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
    N)Paralleldelayed   )!get_dependent_frequency_estimates),get_chi2_using_dependent_frequency_estimates)CPP_SUPPORT_sim_2d_data_patefieldhistndatareturnc                 ~   |dk    r3t          t          j        |                                                     }|dk    rt	          d          | dd         |                                 z  }|                                }t          j                            ||          }t          j        ||j	                  }|S )a<  
    Simulate a 2 dimensional dataset given a 2 dimensional pdf

    :param array-like hist: contingency table, which contains the observed number of occurrences in each category.
        This table is used as probability density function.
    :param int ndata: number of simulations
    :return: simulated data
    r   z(ndata (or hist.sum()) has to be positiveN)npvals)
intnprintsum
ValueErrorravelrandommultinomialreshapeshape)r
   r   hchcrhouthout2ds         O/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/phik/simulation.pysim_2d_datar      s     zzBGDHHJJ''((zzCDDD 
aaa488::	B
((**C9  5 44DZbh''FM    dataseedc                 H   t           st          d          | j        \  }}t          j        |                     d                                        t          j                  }t          j        |                     d                                        t          j                  }|p$t          j        	                    dt                    }t          j        ||z  t          j                  }t          ||||||           |                    ||          j        S )a/  
    Simulate a two dimensional dataset with fixed row and column totals.

    Simulation algorithm by Patefield:
    W. M. Patefield, Applied Statistics 30, 91 (1981)
    Python implementation inspired by (C version):
    https://people.sc.fsu.edu/~jburkardt/c_src/asa159/asa159.html

    :param data: contingency table, which contains the observed number of occurrences in each category.    :param seed: optional seed for the simulation, primarily for testing purposes.    This table is used as probability density function.
    :return: simulated data
    z;Patefield requires a compiled extension that was not found.r   )axisr   )dtype)r   NotImplementedErrorr   r   r   r   astypeint32r   randintNUMPY_INT_MAXemptyr	   r   T)r!   r"   nrowsncolsnrowtncoltmatrixs          r   sim_2d_data_patefieldr2   3   s      
!I
 
 	

 :LE5 GDHH!H$$%%,,RX66EGDHH!H$$%%,,RX66E 629$$Q66D Xeem28444F 5%tVDDD>>%''))r    r$   c                 $    |dk    r8t          j         fdt           j        d                   D                       S |dk    r=t          j         fdt           j        d                   D                       j        S t          d          )aC  
    Simulate 2 dimensional data with either row or column totals fixed.

    :param data: contingency table, which contains the observed number of occurrences in each category.    This table is used as probability density function.
    :param axis: fix row totals (0) or column totals (1).
    :return: simulated data
    r   c                 T    g | ]$}t          t          |                             %S  )listr   .0ir!   s     r   
<listcomp>z/sim_2d_product_multinominal.<locals>.<listcomp>e   s-    RRRk$q'2233RRRr    r   c                 ^    g | ])}t          t          j        |                             *S r5   )r6   r   r,   r7   s     r   r:   z/sim_2d_product_multinominal.<locals>.<listcomp>g   s/    TTT!k$&)4455TTTr    z%Axis should be 0 (row) or 1 (column).)r   arrayranger   r,   r&   )r!   r$   s   ` r   sim_2d_product_multinominalr>   Z   s     qyyxRRRRU4:a==Q=QRRRSSS	xTTTTuTZPQ]?S?STTTUUWW!"IJJJr    multinominalmethodc                     |dk    rt          |           S |dk    rt          |           S |dk    rt          | d          S |dk    rt          | d          S t          d          )a  
    Simulate a 2 dimensional dataset given a 2 dimensional pdf

    Several simulation methods are provided:

     - multinominal: Only the total number of records is fixed.
     - row_product_multinominal: The row totals fixed in the sampling.
     - col_product_multinominal: The column totals fixed in the sampling.
     - hypergeometric: Both the row or column totals are fixed in the sampling. Note that this type of sampling is    only available when row and column totals are integers.

    :param data: contingency table
    :param str method: sampling method. Options: [multinominal, hypergeometric, row_product_multinominal,     col_product_multinominal]
    :return: simulated data
    r?   hypergeometricrow_product_multinominalr   col_product_multinominalr   zselected method not recognized.)r   r2   r>   r&   )r!   r@   s     r   sim_datarE   l   s~    $ 4   	#	#	#$T***	-	-	-*4333	-	-	-*4333!"CDDDr      log-likelihoodFvaluesnsimlambda_simulation_methodalt_hypothesisnjobsc                     |st          |           n| |dk    rfdt          |          D             }n4 t          |          fdt          |          D                       }|S )a  
    Simulate 2D data and calculate the chi-square statistic for each simulated dataset.

    :param values: The contingency table. The table contains the observed number of occurrences in each category
    :param int nsim: number of simulations (optional, default=1000)
    :param str simulation_method: sampling method. Options: [multinominal, hypergeometric, row_product_multinominal,
        col_product_multinominal]
    :param str lambda_: test statistic. Available options are [pearson, log-likelihood].
    :param bool alt_hypothesis: if True, simulate values directly, and not its dependent frequency estimates.
    :param int njobs: number of parallel jobs used for simulation. default is -1. 1 uses no parallel jobs.
    :returns chi2s: list of chi2 values for each simulated dataset
    r   c                 2    g | ]}t                    S r5   )_simulate_and_fitr8   _exp_deprK   rL   s     r   r:   z)sim_chi2_distribution.<locals>.<listcomp>   s'    ]]]A"7,=wGG]]]r    )n_jobsc              3   V   K   | ]#} t          t                              V  $d S N)r   rQ   rR   s     r   	<genexpr>z(sim_chi2_distribution.<locals>.<genexpr>   sQ       '> '>,- (Bw/@'A'A'K\^e'f'f '> '> '> '> '> '>r    )r   r=   r   )rI   rJ   rK   rL   rM   rN   chi2srT   s     ``   @r   sim_chi2_distributionrZ      s     @NY/777SYGzz]]]]]]QVW[Q\Q\]]]&&&& '> '> '> '> '> '>16t'> '> '> > > Lr    rT   c                 H    t          | |          }t          ||          }|S )z9split off simulate function to allow for parallellization)r@   )rE   r   )rT   rL   rK   simdatasimchi2s        r   rQ   rQ      s+     w'8999G:7GLLGNr    )r   rW   )r?   )rF   rG   r?   FrH   )r?   rG   )__doc__numpyr   joblibr   r   
statisticsr   r   phik.simcorer   r	   iinfor(   maxr*   ndarrayr   r   r2   r>   strrE   boolr6   rZ   floatrQ   r5   r    r   <module>ri      s        $ $ $ $ $ $ $ $ 9 9 9 9 9 9 D D D D D D < < < < < < < < ""&* RZ s     0$* $*
 $*3 $*"* $* $* $* $*NKRZ Ks Krz K K K K$E E"* ES EBJ E E E E< JZeg "* 3 S ,/OSadlp   2 CQ"2 rz c !7<     r    