
    M/PhŬ                         d Z ddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ d Z G d d	          Z G d
 d          Z G d de          Z G d de          Z G d d          ZddZddZdS )a  
Methods for analyzing two-way contingency tables (i.e. frequency
tables for observations that are cross-classified with respect to two
categorical variables).

The main classes are:

  * Table : implements methods that can be applied to any two-way
  contingency table.

  * SquareTable : implements methods that can be applied to a square
  two-way contingency table.

  * Table2x2 : implements methods that can be applied to a 2x2
  contingency table.

  * StratifiedTable : implements methods that can be applied to a
  collection of 2x2 contingency tables.

Also contains functions for conducting McNemar's test and Cochran's q
test.

Note that the inference procedures may depend on how the data were
sampled.  In general the observed units are independent and
identically distributed.
    N)stats)iolib)sm_exceptions)cache_readonlyc                 t   t          | t          j                  s| S | j                            | j                  sbt          t          | j                  t          | j                  z            }|                                 | 	                    ||d          } | 	                    | j                  } | S )z
    Reindex a pandas DataFrame so that it becomes square, meaning that
    the row and column indices contain the same values, in the same
    order.  The row and column index are extended to achieve this.
    r   )indexcolumns
fill_value)

isinstancepd	DataFramer   equalsr	   listsetsortreindex)tableixs     d/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/statsmodels/stats/contingency_tables.py_make_df_squarer   '   s     eR\**  ;em,, B#ek""S%7%7788
			BqAA MM%-((EL    c                       e Zd Zd Zd ZdS )_Bunchc                     dS )Nz1<bunch containing results, print to see contents> selfs    r   __repr__z_Bunch.__repr__?   s    BBr   c                 l   d | j                                         D             }|                                 t          d |D                       }g }dt	          |          z   dz   }|D ]6}|                    |                    || j         |                              7d                    |          S )Nc                     g | ]\  }}|S r   r   ).0k_s      r   
<listcomp>z"_Bunch.__str__.<locals>.<listcomp>C   s    222DAqa222r   c                 ,    g | ]}t          |          S r   )len)r!   r"   s     r   r$   z"_Bunch.__str__.<locals>.<listcomp>E   s    $$$AQ$$$r   z{:z}   {}
)__dict__itemsr   maxstrappendformatjoin)r   kymtabfr"   s         r   __str__z_Bunch.__str__B   s    22DM//11222
			$$$$$%%3q66MH$ 	6 	6AJJqxx4=#3445555yy~~r   N)__name__
__module____qualname__r   r3   r   r   r   r   r   =   s5        C C C    r   r   c                   "   e Zd ZdZddZd Zedd            Zd ZddZ	e
d	             Ze
d
             Ze
d             Ze
d             Ze
d             Ze
d             Ze
d             Ze
d             Ze
d             Ze
d             ZdS )Tablea0  
    A two-way contingency table.

    Parameters
    ----------
    table : array_like
        A contingency table.
    shift_zeros : bool
        If True and any cell count is zero, add 0.5 to all values
        in the table.

    Attributes
    ----------
    table_orig : array_like
        The original table is cached as `table_orig`.

    See Also
    --------
    statsmodels.graphics.mosaicplot.mosaic
    scipy.stats.chi2_contingency

    Notes
    -----
    The inference procedures used here are all based on a sampling
    model in which the units are independent and identically
    distributed, with each unit being classified with respect to two
    categorical variables.

    References
    ----------
    Definitions of residuals:
        https://onlinecourses.science.psu.edu/stat504/node/86
    Tc                     || _         t          j        |t          j                  | _        |r2| j                                        dk    rd| j        | j        dk    <   d S d S d S )Ndtyper         ?)
table_orignpasarrayfloat64r   min)r   r   shift_zeross      r   __init__zTable.__init__p   sg    ZRZ888
 	.DJNN,,11*-DJtzQ'''	. 	.11r   c                 v    dt          | j        j                  z  }|t          j        | j                  z  }|S )Nz'A %dx%d contingency table with counts:
)tupler   shaper>   	array_str)r   ss     r   r3   zTable.__str__x   s7    74:#$$%	R\$*%%%r   c                    t          |t          j                  r4t          j        |j        dddf         |j        dddf                   }n)t          j        |dddf         |dddf                   } | ||          S )a  
        Construct a Table object from data.

        Parameters
        ----------
        data : array_like
            The raw data, from which a contingency table is constructed
            using the first two columns.
        shift_zeros : bool
            If True and any cell count is zero, add 0.5 to all values
            in the table.

        Returns
        -------
        A Table instance.
        Nr      r   r   r   crosstabilocclsdatarB   r   s       r   	from_datazTable.from_data~   s    & dBL)) 	8K	!!!Q$111a4AAEEKQQQT
DAJ77Es5+&&&r   c                 N   t          j        | j                                                  }t          j        t          j        | j        j                  dz
            }dt          j        	                    ||          z
  }t                      }||_        ||_        ||_        |S )a  
        Assess independence for nominal factors.

        Assessment of independence between rows and columns using
        chi^2 testing.  The rows and columns are treated as nominal
        (unordered) categorical variables.

        Returns
        -------
        A bunch containing the following attributes:

        statistic : float
            The chi^2 test statistic.
        df : int
            The degrees of freedom of the reference distribution
        pvalue : float
            The p-value for the test.
        rJ   )r>   r?   chi2_contribssumprodr   rF   r   chi2cdfr   	statisticdfpvalue)r   rX   rY   rZ   bs        r   test_nominal_associationzTable.test_nominal_association   s    ( Jt1226688	WRZ
 011A566UZ^^Ir222HHr   Nc                 d   |$t          j        | j        j        d                   }|$t          j        | j        j        d                   }t	          |          | j        j        d         k    rd}t          |          t	          |          | j        j        d         k    rd}t          |          t          j        |t          j        | j        |                    }| j                                        }| j                            d          }t          j        ||          }t          j        |dz  |          }| j                            d          }	t          j        ||	          }
t          j        |dz  |	          }||
z  |z  }||dz  |z  z
  ||
dz  |z  z
  z  |dz
  z  }t          j        |          }||z
  |z  }dt          j
                            t          j        |                     z  }t                      }||_        ||_        ||_        ||_        ||_        |S )ar  
        Assess independence between two ordinal variables.

        This is the 'linear by linear' association test, which uses
        weights or scores to target the test to have more power
        against ordered alternatives.

        Parameters
        ----------
        row_scores : array_like
            An array of numeric row scores
        col_scores : array_like
            An array of numeric column scores

        Returns
        -------
        A bunch with the following attributes:

        statistic : float
            The test statistic.
        null_mean : float
            The expected value of the test statistic under the null
            hypothesis.
        null_sd : float
            The standard deviation of the test statistic under the
            null hypothesis.
        zscore : float
            The Z-score for the test statistic.
        pvalue : float
            The p-value for the test.

        Notes
        -----
        The scores define the trend to which the test is most sensitive.

        Using the default row and column scores gives the
        Cochran-Armitage trend test.
        Nr   rJ   zEThe length of `row_scores` must match the first dimension of `table`.zFThe length of `col_scores` must match the second dimension of `table`.   )r>   aranger   rF   r&   
ValueErrordotrT   sqrtr   normrW   absr   rX   	null_meannull_sdzscorerZ   )r   
row_scores
col_scoresmsgrX   n_obsrtotumu2mctotvnv2ne_statv_statsd_statrg   rZ   r[   s                     r   test_ordinal_associationzTable.test_ordinal_association   s   P 4:#3A#677J4:#3A#677Jz??dj.q111+CS//!z??dj.q111+CS//! F:rvdj*'E'EFF	 
  z~~a  VJ%%fZ]D))z~~a  VJ%%fZ]D)) b5A%#A*=>%!)L'&//f$/UZ^^RVF^^O444HH	r   c                 j   | j                                         }| j                             d          |z  }| j                             d          |z  }t          | j        t          j                  r>t	          j        || j        j                  }t	          j        || j        j                  }||fS )z
        Estimate marginal probability distributions for the rows and columns.

        Returns
        -------
        row : ndarray
            Marginal row probabilities
        col : ndarray
            Marginal column probabilities
        rJ   r   )	r   rT   r   r=   r   r   Seriesr   r	   )r   nrowcols       r   marginal_probabilitieszTable.marginal_probabilities	  s     JNNjnnQ!#jnnQ!#dor|44 	:)C!677C)C!899CCxr   c                     | j         \  }}t          j        ||          }t          | j        t
          j                  r*t          j        || j        j        | j        j                  }|S )z
        Returns fitted joint probabilities under independence.

        The returned table is outer(row, column), where row and
        column are the estimated marginal distributions
        of the rows and columns.
        )	r{   r>   outerr   r=   r   r   r   r	   )r   ry   rz   itabs       r   independence_probabilitiesz Table.independence_probabilities   s`     .SxS!!dor|44 	9<do&; $ 79 9D r   c                 L    | j         }| j                                        |z  }|S )z
        Returns fitted cell counts under independence.

        The returned cell counts are estimates under a model
        where the rows and columns of the table are independent.
        )r   r   rT   )r   probsfits      r   fittedvalueszTable.fittedvalues3  s'     /jnn&
r   c                 R    | j         }| j        |z
  t          j        |          z  }|S )z
        Returns Pearson residuals.

        The Pearson residuals are calculated under a model where
        the rows and columns of the table are independent.
        )r   r   r>   rb   )r   r   residss      r   resid_pearsonzTable.resid_pearson@  s*     *s"bgcll2r   c                     | j         \  }}| j        t          j        t          j        d|z
  d|z
                      z  }|S )zD
        Returns standardized residuals under independence.
        rJ   )r{   r   r>   rb   r}   )r   ry   rz   sresidss       r   standardized_residszTable.standardized_residsM  s@     .S$rwrxCS/I/I'J'JJr   c                     | j         dz  S )a  
        Returns the contributions to the chi^2 statistic for independence.

        The returned table contains the contribution of each cell to the chi^2
        test statistic for the null hypothesis that the rows and columns
        are independent.
        r^   )r   r   s    r   rS   zTable.chi2_contribsW  s     !1$$r   c                 h   | j                                         }|ddddf         }|ddddf         }|ddddf         }|ddddf         }t          j        |          t          j        |          z   t          j        |          z
  t          j        |          z
  }t          j        | j         j        t          j                  }|t          j        z  }||ddddf<   t          | j	        t          j                  r+t          j        || j	        j        | j	        j                  }|S )z
        Returns local log odds ratios.

        The local log odds ratios are the log odds ratios
        calculated for contiguous 2x2 sub-tables.
        r   rJ   Nr   r	   )r   copyr>   logemptyrF   r@   nanr   r=   r   r   r   r	   r   taar[   cdr1   rslts           r   local_log_oddsratioszTable.local_log_oddsratiosc  s    Z__qtQrTzNqtQRRxLqrr1R4xLqrr122vJfQii"&))#bfQii/"&));x
("*55QrT1R4Zdor|44 	A<DO,A(,(?A A AD r   c                 4    t          j        | j                  S )za
        Returns local odds ratios.

        See documentation for local_log_oddsratios.
        )r>   expr   r   s    r   local_oddsratioszTable.local_oddsratios|  s     vd/000r   c                    | j                             d                              d          }|ddddf         }|ddddf         |z
  }|ddddf         |z
  }|d         ||z   |z   z
  }t          j        |          t          j        |          z   t          j        |          z
  t          j        |          z
  }t          j        | j         j        t          j                  }|t          j        z  }||ddddf<   t          | j	        t          j                  r+t          j        || j	        j        | j	        j                  }|S )aP  
        Returns cumulative log odds ratios.

        The cumulative log odds ratios for a contingency table
        with ordered rows and columns are calculated by collapsing
        all cells to the left/right and above/below a given point,
        to obtain a 2x2 table from which a log odds ratio can be
        calculated.
        r   rJ   r   N)r   r   r   )r   cumsumr>   r   r   rF   r@   r   r   r=   r   r   r   r	   r   s           r   cumulative_log_oddsratioszTable.cumulative_log_oddsratios  s;    Zq!!((++qtQrTzNqtRSSyMArssAbDyMAvJ!a%!)$fQii"&))#bfQii/"&));x
("*55QrT1R4Zdor|44 	A<DO,A(,(?A A AD r   c                 4    t          j        | j                  S )z
        Returns the cumulative odds ratios for a contingency table.

        See documentation for cumulative_log_oddsratio.
        )r>   r   r   r   s    r   cumulative_oddsratioszTable.cumulative_oddsratios  s     vd4555r   T)NN)r4   r5   r6   __doc__rC   r3   classmethodrQ   r\   ru   r   r{   r   r   r   r   rS   r   r   r   r   r   r   r   r8   r8   M   s          D. . . .   ' ' ' ['2  :R R R Rh   ^,   ^$ 
 
 ^
 
 
 ^
   ^ 	% 	% ^	%   ^0 1 1 ^1   ^: 6 6 ^6 6 6r   r8   c                   <     e Zd ZdZd fd	ZddZddZdd
Z xZS )SquareTablea  
    Methods for analyzing a square contingency table.

    Parameters
    ----------
    table : array_like
        A square contingency table, or DataFrame that is converted
        to a square form.
    shift_zeros : bool
        If True and any cell count is zero, add 0.5 to all values
        in the table.

    Notes
    -----
    These methods should only be used when the rows and columns of the
    table have the same categories.  If `table` is provided as a
    Pandas DataFrame, the row and column indices will be extended to
    create a square table, inserting zeros where a row or column is
    missing.  Otherwise the table should be provided in a square form,
    with the (implicit) row and column categories appearing in the
    same order.
    Tc                     t          |          }|j        \  }}||k    rt          d          t                                          ||           d S )Nztable must be square)r   rF   r`   superrC   )r   r   rB   k1k2	__class__s        r   rC   zSquareTable.__init__  sS    &&B883444,,,,,r   bowkerc                    |                                 dk    rt          d          | j        j        d         }t	          j        |d          }| j        j        |         }| j        |         }||z
  dz  ||z   dz   z                                  }||dz
  z  dz  }t          j	        
                    ||          }t                      }	||	_        ||	_        ||	_        |	S )a  
        Test for symmetry of a joint distribution.

        This procedure tests the null hypothesis that the joint
        distribution is symmetric around the main diagonal, that is

        .. math::

        p_{i, j} = p_{j, i}  for all i, j

        Returns
        -------
        Bunch
            A bunch with attributes

            * statistic : float
                chisquare test statistic
            * p-value : float
                p-value of the test statistic based on chisquare distribution
            * df : int
                degrees of freedom of the chisquare distribution

        Notes
        -----
        The implementation is based on the SAS documentation. R includes
        it in `mcnemar.test` if the table is not 2 by 2.  However a more
        direct generalization of the McNemar test to larger tables is
        provided by the homogeneity test (TableSymmetry.homogeneity).

        The p-value is based on the chi-square distribution which requires
        that the sample size is not very small to be a good approximation
        of the true distribution. For 2x2 contingency tables the exact
        distribution can be obtained with `mcnemar`

        See Also
        --------
        mcnemar
        homogeneity
        r   z,method for symmetry testing must be 'bowker'r   rJ   r^   g#B;g       @)lowerr`   r   rF   r>   triu_indicesTrT   r   rV   sfr   rX   rZ   rY   )
r   methodr"   upp_idxtriltriurX   rY   rZ   r[   s
             r   symmetryzSquareTable.symmetry  s    R <<>>X%%KLLLJQ/!Q''z|G$z'"TkA%u)<=BBDD	!A#Y^y"--HHr   stuart_maxwellc                 t   | j         j        d         dk     rt          d          | j         j        d         dk    r%t                      }d|_        d|_        d|_        |S |                                }|dvrt          d|z            | j                                         }| j         	                    t          j                  |z  }|                    d          dd         }|                    d          dd         }|ddddf         }||z
  }|j        d         }|dk    rZ||j        z    t          j        ||          z
  }	||z   dt          j        |          z  z
  |dz  z
  }
t          j        |	|
           nC|d	k    r=||j        z    }	||z   dt          j        |          z  z
  }
t          j        |	|
           	 |t          j        |t          j                            |	|                    z  }nr# t          j        j        $ r[ t)          j        d
t,          j                   t                      }t          j        |_        t          j        |_        ||_        |cY S w xY wdt2          j                            ||          z
  }t                      }||_        ||_        ||_        |S )a(  
        Compare row and column marginal distributions.

        Parameters
        ----------
        method : str
            Either 'stuart_maxwell' or 'bhapkar', leading to two different
            estimates of the covariance matrix for the estimated
            difference between the row margins and the column margins.

        Returns
        -------
        Bunch
            A bunch with attributes:

            * statistic : float
                The chi^2 test statistic
            * pvalue : float
                The p-value of the test statistic
            * df : int
                The degrees of freedom of the reference distribution

        Notes
        -----
        For a 2x2 table this is equivalent to McNemar's test.  More
        generally the procedure tests the null hypothesis that the
        marginal distribution of the row factor is equal to the
        marginal distribution of the column factor.  For this to be
        meaningful, the two factors must have the same sample space
        (i.e. the same categories).
        r   rJ   ztable is empty)bhapkarr   z%method '%s' for homogeneity not knownr   r   r^   r   z"Unable to invert covariance matrix)r   rF   r`   r   rX   rZ   rY   r   rT   astyper>   r@   r   r}   diagfill_diagonalra   linalgsolveLinAlgErrorwarningswarnr   SingularMatrixWarningr   r   rV   rW   )r   r   r[   rk   prry   rz   r   rY   vmatdvrX   rZ   s                r   homogeneityzSquareTable.homogeneity  s   B :A""-...Za A%%AAKAHADH666DvMNNN
  Zrz**U2 ffQii"offQii"o"ad
^ #I Xa[Y"$Y<"(1a..0DsQrwr{{]*QT1BT2&&&&'''"$Y<DsQrwr{{]*BT2&&&		q")//$*B*B C CCIIy$ 	 	 	M>'=? ? ?A&AKvAHADHHH	 UZ^^Ir222HHs   	6H   A,I/.I/皙?%.3fc                    |}g d}ddg}|                                  }|                                 }||j        z  ||j        z  d|j        z  g||j        z  ||j        z  d|j        z  gg}t          j        |||dd          }	|	S )a  
        Produce a summary of the analysis.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the interval.
        float_format : str
            Used to format numeric values in the table.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        )	StatisticP-valueDFSymmetryHomogeneity%dr data_alignstable_dec_above)r   r   rX   rZ   rY   r   SimpleTable)
r   alphafloat_formatfmtheadersstubssyhmrP   r1   s
             r   summaryzSquareTable.summaryf  s     000]+]]__r|#S29_dRUlCr|#S29_dRUlCEgu#024 4 4 
r   r   )r   )r   )r   r   )	r4   r5   r6   r   rC   r   r   r   __classcell__r   s   @r   r   r     s         .- - - - - -; ; ; ;zX X X Xt       r   r   c                       e Zd ZdZd fd	Zedd            Zed             Zed             Z	ed             Z
dd	ZddZddZddZed             Zed             Zed             ZddZddZddZddZddZ xZS )Table2x2a  
    Analyses that can be performed on a 2x2 contingency table.

    Parameters
    ----------
    table : array_like
        A 2x2 contingency table
    shift_zeros : bool
        If true, 0.5 is added to all cells of the table if any cell is
        equal to zero.

    Notes
    -----
    The inference procedures used here are all based on a sampling
    model in which the units are independent and identically
    distributed, with each unit being classified with respect to two
    categorical variables.

    Note that for the risk ratio, the analysis is not symmetric with
    respect to the rows and columns of the contingency table.  The two
    rows define population subgroups, column 0 is the number of
    'events', and column 1 is the number of 'non-events'.
    Tc                    t          |          t          u rt          j        |          }|j        dk    s"|j        d         dk    s|j        d         dk    rt          d          t                                          ||           d S )Nr^   r   rJ   z$Table2x2 takes a 2x2 table as input.)	typer   r>   r?   ndimrF   r`   r   rC   )r   r   rB   r   s      r   rC   zTable2x2.__init__  s}    ;;$Ju%%EJ!OOQ1!4!4%+a.A:M:MCDDD,,,,,r   c                    t          |t          j                  r4t          j        |j        dddf         |j        dddf                   }n)t          j        |dddf         |dddf                   } | ||          S )a  
        Construct a Table object from data.

        Parameters
        ----------
        data : array_like
            The raw data, the first column defines the rows and the
            second column defines the columns.
        shift_zeros : bool
            If True, and if there are any zeros in the contingency
            table, add 0.5 to all four cells of the table.
        Nr   rJ   rK   rN   s       r   rQ   zTable2x2.from_data  s     dBL)) 	8K	!!!Q$111a4AAEEKQQQT
DAJ77Es5+&&&r   c                     | j                                         }t          j        t          j        |          t          j        d                   S )z=
        Returns the log odds ratio for a 2x2 table.
        )rJ   r   r   rJ   )r   flattenr>   ra   r   r_)r   r2   s     r   log_oddsratiozTable2x2.log_oddsratio  s8     J  vbfQii|!4555r   c                 p    | j         d         | j         d         z  | j         d         | j         d         z  z  S )z9
        Returns the odds ratio for a 2x2 table.
        )r   r   )rJ   rJ   r   rJ   rJ   r   )r   r   s    r   	oddsratiozTable2x2.oddsratio  s:     
4 4:d#33D!DJt$446 	7r   c                 ^    t          j        t          j        d| j        z                      S )zD
        Returns the standard error for the log odds ratio.
        rJ   )r>   rb   rT   r   r   s    r   log_oddsratio_sezTable2x2.log_oddsratio_se  s$     wrva$*n--...r   rJ   c                 P    |                      t          j        |                    S )z
        P-value for a hypothesis test about the odds ratio.

        Parameters
        ----------
        null : float
            The null value of the odds ratio.
        )log_oddsratio_pvaluer>   r   r   nulls     r   oddsratio_pvaluezTable2x2.oddsratio_pvalue        ((666r   r   c                     | j         |z
  | j        z  }dt          j                            t          j        |                     z  }|S )z
        P-value for a hypothesis test about the log odds ratio.

        Parameters
        ----------
        null : float
            The null value of the log odds ratio.
        r^   )r   r   r   rc   rW   r>   rd   r   r   rg   rZ   s       r   r   zTable2x2.log_oddsratio_pvalue  @     $t+t/DDUZ^^RVF^^O444r   r   normalc                     t           j                            |dz             }| j        }| j        }|||z  z
  }|||z  z   }||fS )a}  
        A confidence level for the log odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        r^   )r   rc   ppfr   r   )r   r   r   r2   lorselcbucbs           r   log_oddsratio_confintzTable2x2.log_oddsratio_confint  sQ     Z^^EAI&&& "AFlAFlCxr   c                     |                      ||          \  }}t          j        |          t          j        |          fS )a|  
        A confidence interval for the odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        r   )r   r>   r   r   r   r   r   r   s        r   oddsratio_confintzTable2x2.oddsratio_confint  ;     --eF-CCSvc{{BF3KK''r   c                 |    | j         dddf         | j                             d          z  }|d         |d         z  S )zy
        Returns the risk ratio for a 2x2 table.

        The risk ratio is calculated with respect to the rows.
        Nr   rJ   )r   rT   )r   ps     r   	riskratiozTable2x2.riskratio  s=     Jqqq!ttz~~a000tad{r   c                 4    t          j        | j                  S )z4
        Returns the log of the risk ratio.
        )r>   r   r  r   s    r   log_riskratiozTable2x2.log_riskratio  s     vdn%%%r   c                     | j                             d          }| j         dddf         |z  }t          j        d|z
  ||z  z            }t          j        |          S )zJ
        Returns the standard error of the log of the risk ratio.
        rJ   Nr   )r   rT   r>   rb   )r   rx   r  vas       r   log_riskratio_sezTable2x2.log_riskratio_se'  sY     JNN1Jqqq!tq VQUqsO$$wr{{r   c                 P    |                      t          j        |                    S )z
        p-value for a hypothesis test about the risk ratio.

        Parameters
        ----------
        null : float
            The null value of the risk ratio.
        )log_riskratio_pvaluer>   r   r   s     r   riskratio_pvaluezTable2x2.riskratio_pvalue2  r   r   c                     | j         |z
  | j        z  }dt          j                            t          j        |                     z  }|S )z
        p-value for a hypothesis test about the log risk ratio.

        Parameters
        ----------
        null : float
            The null value of the log risk ratio.
        r^   )r	  r  r   rc   rW   r>   rd   r   s       r   r  zTable2x2.log_riskratio_pvalue>  r   r   c                     t           j                            |dz             }| j        }| j        }|||z  z
  }|||z  z   }||fS )a  
        A confidence interval for the log risk ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        r^   )r   rc   r   r	  r  )r   r   r   r2   lrrr   r   r   s           r   log_riskratio_confintzTable2x2.log_riskratio_confintL  sQ     Z^^EAI&&& "AFlAFlCxr   c                     |                      ||          \  }}t          j        |          t          j        |          fS )a|  
        A confidence interval for the risk ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        r  )r  r>   r   r  s        r   riskratio_confintzTable2x2.riskratio_confint`  r  r   r   c           
      ~   fdg d}g d}|                      ||          \  }}|                     ||          \  }}	|                     ||          \  }
}|                     ||          \  }}fd| j        d|||                                 fD             fd| j        | j        ||	|                                 fD             fd| j        d|
|| 	                                fD             fd| j
        | j        ||| 	                                fD             g}t          j        |||d	d
          }|S )a  
        Summarizes results for a 2x2 table analysis.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the confidence
            intervals.
        float_format : str
            Used to format the numeric values in the table.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        c                 <    t          | t                    r| S | z  S Nr   r+   xr   s    r   r   zTable2x2.summary.<locals>.fmt  %    !S!! !##r   )EstimateSELCBUCBzp-value)z
Odds ratiozLog odds ratioz
Risk ratiozLog risk ratioc                 &    g | ]} |          S r   r   r!   r  r   s     r   r$   z$Table2x2.summary.<locals>.<listcomp>  -     < < <AQ < < <r   r   c                 &    g | ]} |          S r   r   r"  s     r   r$   z$Table2x2.summary.<locals>.<listcomp>  3     H H HAQ H H Hr   c                 &    g | ]} |          S r   r   r"  s     r   r$   z$Table2x2.summary.<locals>.<listcomp>  r#  r   c                 &    g | ]} |          S r   r   r"  s     r   r$   z$Table2x2.summary.<locals>.<listcomp>  r%  r   r   r   )r  r   r  r  r   r   r   r   r  r  r	  r  r   r   )r   r   r   r   r   r   lcb1ucb1lcb2ucb2lcb3ucb3lcb4ucb4rP   r1   r   s     `             @r   r   zTable2x2.summaryp  s    	$ 	$ 	$ 	$ 	$
 >==# # # ++E6::
d//v>>
d++E6::
d//v>>
d< < < <$."dD"&"7"7"9"9"; < < <H H H H$"4d6K"&d.C.C.E.E"G H H H< < < <$."dD"&"7"7"9"9"; < < <H H H H$"4d6K"&d.C.C.E.E"G H H HI gu#024 4 4
r   r   )rJ   )r   r   r   r   r   r   )r4   r5   r6   r   rC   r   rQ   r   r   r   r   r   r   r   r  r  r	  r  r  r  r  r  r   r   r   s   @r   r   r     s        0- - - - - - ' ' ' ['( 6 6 ^6 7 7 ^7 / / ^/
7 
7 
7 
7      *( ( ( (    ^ & & ^&   ^
7 
7 
7 
7      (( ( ( ( ' ' ' ' ' ' ' 'r   r   c                       e Zd ZdZddZed             ZddZed             Z	ed             Z
ed             Zed	             ZddZddZddZddZdS )StratifiedTablea  
    Analyses for a collection of 2x2 contingency tables.

    Such a collection may arise by stratifying a single 2x2 table with
    respect to another factor.  This class implements the
    'Cochran-Mantel-Haenszel' and 'Breslow-Day' procedures for
    analyzing collections of 2x2 contingency tables.

    Parameters
    ----------
    tables : list or ndarray
        Either a list containing several 2x2 contingency tables, or
        a 2x2xk ndarray in which each slice along the third axis is a
        2x2 contingency table.

    Notes
    -----
    This results are based on a sampling model in which the units are
    independent both within and between strata.
    Fc                    t          |t          j                  rG|j        }t	          |          dk    s|d         dk    s|d         dk    rt          d          |dz  }n[t          d |D                       rd}t          |          t          j        |                              t          j	                  }|r|dk    
                    d          
                    d          }t          j        |dk              }t	          |          dk    r+|                                }|d d d d |fxx         d	z  cc<   || _        i | _        |ddd d f         |ddd d f         z   | _        |ddd d f         |ddd d f         z   | _        |ddd d f         |ddd d f         z   | _        |ddd d f         |ddd d f         z   | _        |ddd d f         |ddd d f         z  | _        |ddd d f         |ddd d f         z  | _        |ddd d f         |ddd d f         z   | _        |ddd d f         |ddd d f         z
  | _        |
                    d          
                    d          | _        d S )
N   r   r^   rJ   z%If an ndarray, argument must be 2x2xn      ?c                 H    g | ]}t          j        |          j        d k     S )r^   r^   )r>   r?   rF   )r!   r  s     r   r$   z,StratifiedTable.__init__.<locals>.<listcomp>  s(    BBBaBJqMM'61BBBr   z8If `tables` is a list, all of its elements should be 2x2r<   )r   r>   ndarrayrF   r&   r`   anydstackr   r@   rT   flatnonzeror   r   _cache_apb_apc_bpd_cpd_ad_bc_apd_dma_n)r   tablesrB   spr   r0   zxr   s           r   rC   zStratifiedTable.__init__  s   fbj)) 	9BB1"Q%1**"Q%1** !HIIIRKEEBB6BBBCC $N mm# If%%,,RZ88E 	'1*!!!$$((++BQ''B2ww{{

aaaBh3&

 !Q'NU1a7^3	!Q'NU1a7^3	!Q'NU1a7^3	!Q'NU1a7^3	Aqqq>E!Q'N2Aqqq>E!Q'N2!Q'NU1a7^3	!Q'NU1a7^3	))A,,""1%%r   c                    t          |t          j                  st          j        t          j        |j        d                   |||g          }|dd|f         ||j        |         <   |dd|f         ||j        |         <   |dd|f         ||j        |         <   n||||g         }|                    |          j        }g }|D ]}||         }	t          j	        |j
        |	|f         |j
        |	|f                   }
|
j        t          j        d         k                                    rd}t          |          |                    t          j        |
                      | |          S )ad  
        Construct a StratifiedTable object from data.

        Parameters
        ----------
        var1 : int or string
            The column index or name of `data` specifying the variable
            defining the rows of the contingency table.  The variable
            must have only two distinct values.
        var2 : int or string
            The column index or name of `data` specifying the variable
            defining the columns of the contingency table.  The variable
            must have only two distinct values.
        strata : int or string
            The column index or name of `data` specifying the variable
            defining the strata.
        data : array_like
            The raw data.  A cross-table for analysis is constructed
            from the first two columns.

        Returns
        -------
        StratifiedTable
        r   r   Nr8  zInvalid table dimensions)r   r   r   r>   r_   rF   r	   groupbygroupsrL   locr   r:  r`   r,   r?   )rO   var1var2stratarP   data1gbrG  giir1   rj   s               r   rQ   zStratifiedTable.from_data  sj   6 $-- 	/LryA'?'?*.f)=? ? ?E)-aaagE%-%&)-aaagE%-%&+/6	?E%-'(($f-.E]]6"") 	+ 	+AAB+eiD159RX3FGGC	RU4[(--// &0 oo%MM"*S//****s6{{r   c                    t          j        | j        ddddf         | j        | j        z  | j        z  z
            }t          j        |          }|r|dz  }|dz  }| j        | j        z  | j        z  | j        z  }|| j        dz  | j        dz
  z  z  }t          j        |          }||z  }dt          j
                            |d          z
  }t                      }||_        ||_        |S )a  
        Test that all tables have odds ratio equal to 1.

        This is the 'Mantel-Haenszel' test.

        Parameters
        ----------
        correction : bool
            If True, use the continuity correction when calculating the
            test statistic.

        Returns
        -------
        Bunch
            A bunch containing the chi^2 test statistic and p-value.
        r   Nr<   r^   rJ   )r>   rT   r   r>  r?  rF  rd   r@  rA  r   rV   rW   r   rX   rZ   )r   
correctionrX   denomrZ   r[   s         r   test_null_oddszStratifiedTable.test_null_odds  s    $ F4:aAAAg.9ty047:; < <	F9%%	 	IqL		DI%	1DI=$'1*!,-uU	 UZ^^Iq111HHr   c                     t          j        | j        | j        z            t          j        | j        | j        z            z  }|S )z
        The pooled odds ratio.

        The value is an estimate of a common odds ratio across all of the
        stratified tables.
        )r>   rT   rB  rF  rC  )r   
odds_ratios     r   oddsratio_pooledz StratifiedTable.oddsratio_pooled-  s:     VDHtw.//"&DG9K2L2LL
r   c                 4    t          j        | j                  S )zu
        Returns the logarithm of the pooled odds ratio.

        See oddsratio_pooled for more information.
        )r>   r   r[  r   s    r   logodds_pooledzStratifiedTable.logodds_pooled8  s     vd+,,,r   c                     | j         ddddf         | j        z  }| j         ddddf         | j        z  }t          j        || j        z            t          j        || j        z            z  }|S )z4
        Estimate of the pooled risk ratio.
        r   NrJ   )r   rA  r>  r>   rT   rF  )r   acdcabrrs       r   riskratio_pooledz StratifiedTable.riskratio_pooledA  sm     jAqqq!DI-jAqqq!DI-VC$'M""RVC$'M%:%::	r   c                 Z   t          j        | j        | j        z            }t          j        | j        | j        z            }t          j        | j        | j        z  | j        dz  z            |dz  z  }| j        | j        z  | j        dz  z  }|d| j        | j        z  z
  | j        z  | j        z  z  }t          j        |          }|||z  z  }||z  }|t          j        d| j        | j        z  z
  | j        z  | j        z            |dz  z  z  }|dz  }t          j        |          }|S )a>  
        Estimated standard error of the pooled log odds ratio

        References
        ----------
        J. Robins, N. Breslow, S. Greenland. "Estimators of the
        Mantel-Haenszel Variance Consistent in Both Sparse Data and
        Large-Strata Limiting Models." Biometrics 42, no. 2 (1986): 311-23.
        r^   rJ   )r>   rT   rB  rF  rC  rD  rb   )r   adnsbcnslor_vamidlor_ses         r   logodds_pooled_sez!StratifiedTable.logodds_pooled_seM  s&    vdh())vdh())	DH,twz9::T1WDi$("TWaZ/DI''483dg==fSkkt#"&!di$'11"$(G, - -/3Qw7 	7!r   r   r   c                     t          j        | j                  }| j        }t          j                            |dz             }|||z  z
  }|||z  z   }||fS )a  
        A confidence interval for the pooled log odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.

        Returns
        -------
        lcb : float
            The lower confidence limit.
        ucb : float
            The upper confidence limit.
        r^   )r>   r   r[  ri  r   rc   r   )r   r   r   r   rh  r2   r   r   s           r   logodds_pooled_confintz&StratifiedTable.logodds_pooled_confintg  s]    * fT*++'Z^^EAI&&&AJAJCxr   c                     |                      ||          \  }}t          j        |          }t          j        |          }||fS )a  
        A confidence interval for the pooled odds ratio.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            interval.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.

        Returns
        -------
        lcb : float
            The lower confidence limit.
        ucb : float
            The upper confidence limit.
        r  )rk  r>   r   r  s        r   oddsratio_pooled_confintz(StratifiedTable.oddsratio_pooled_confint  sC    * ..uV.DDSfSkkfSkkCxr   c                    | j         }| j        }d|z
  }|| j        | j        z   z  | j        z   }| | j        z  | j        z  }t          j        |dz  d|z  |z  z
            }| |z   d|z  z  }d|z  d| j        |z
  z  z   d| j        |z
  z  z   d| j        |z   z  z   }	d|	z  }	t          j        |ddddf         |z
  dz  |	z            }
|rU|ddddf                                         |                                z
  }|dz  }|t          j        |	          z  }|
|z  }
dt          j	        
                    |
|j        d         dz
            z
  }t                      }|
|_        ||_        |S )a  
        Test that all odds ratios are identical.

        This is the 'Breslow-Day' testing procedure.

        Parameters
        ----------
        adjust : bool
            Use the 'Tarone' adjustment to achieve the chi^2
            asymptotic distribution.

        Returns
        -------
        A bunch containing the following attributes:

        statistic : float
            The chi^2 test statistic.
        p-value : float
            The p-value for the test.
        rJ   r^      r   N)r   r[  r>  r?  rE  r>   rb   rT   r   rV   rW   rF   r   rX   rZ   )r   adjustr   r   r   r[   r   dre11v11rX   adjrZ   s                r   test_equal_oddszStratifiedTable.test_equal_odds  s   , 
!ETY&'$)3BNTY& WQTAaCE\""rBw1Q3 3wdi#o..di#o1FFDIO$%#gFE!Q'NS014s:;;	 	1aaa.$$&&2Cq&C26#;;CIUZ^^Iu{1~/ABBBHHr   r   c                    fd|                      ||          \  }}|                     ||          \  }}g d}g d}	fd| j        ||fD             fd| j        ||fD             fd| j        ddfD             g d	g}
t          j        |
||	d
d          }g d}ddg}	|                                 }|                                 }fd|j	        |j
        dfD             fd|j	        |j
        dfD             g}
t          j        |
||	d
          }|                    |           g d	}g d}	| j                            d                              d          }d| j        j        d         z  ddgdt          |          z  ddgdt!          |          z  ddgdt#          j        |          z  ddgdt          |          z  dddgg}
t          j        |
||	d
          }|                    |           |S )a  
        A summary of all the main results.

        Parameters
        ----------
        alpha : float
            `1 - alpha` is the nominal coverage probability of the
            confidence intervals.
        float_format : str
            Used for formatting numeric values in the summary.
        method : str
            The method for producing the confidence interval.  Currently
            must be 'normal' which uses the normal approximation.
        c                 <    t          | t                    r| S | z  S r  r  r  s    r   r   z$StratifiedTable.summary.<locals>.fmt  r  r   )r   r   )r  r  r   )zPooled oddszPooled log oddszPooled risk ratior   c                 &    g | ]} |          S r   r   r"  s     r   r$   z+StratifiedTable.summary.<locals>.<listcomp>  !    IIIAQIIIr   c                 &    g | ]} |          S r   r   r"  s     r   r$   z+StratifiedTable.summary.<locals>.<listcomp>  ry  r   c                 &    g | ]} |          S r   r   r"  s     r   r$   z+StratifiedTable.summary.<locals>.<listcomp>  s!    AAAAQAAAr   r   )r   r   r   r   r   )r   r   r   zTest of OR=1zTest constant ORc                 &    g | ]} |          S r   r   r"  s     r   r$   z+StratifiedTable.summary.<locals>.<listcomp>  !    EEEAQEEEr   c                 &    g | ]} |          S r   r   r"  s     r   r$   z+StratifiedTable.summary.<locals>.<listcomp>  r}  r   )r   )zNumber of tableszMin nzMax nzAvg nzTotal nr   r   r^   z%.0f)rm  rk  r[  r]  rb  r   r   rX  ru  rX   rZ   extendr   rT   rF   rA   r*   r>   mean)r   r   r   r   co_lcbco_ucbclo_lcbclo_ucbr   r   rP   tab1rslt1rslt2tab2sstab3r   s     `              @r   r   zStratifiedTable.summary  s    	$ 	$ 	$ 	$ 	$
 66 7 ( (66 7 ( (,,,KKKIIII$"7!HIIIIIII$"5w!HIIIAAAA$"7R!@AAA  w3135 5 5 /..!34##%%$$&&EEEE%/5<!DEEEEEEE%/5<!DEEEG w3GGGD,,JJJZ^^A""1%%
(++R4BR(BR("'"++%r2.BR,	.
  w3GGGDr   N)Fr0  r1  )r4   r5   r6   r   rC   r   rQ   rX  r   r[  r]  rb  ri  rk  rm  ru  r   r   r   r   r3  r3    s!        *%& %& %& %&N - - [-^$ $ $ $L   ^ - - ^- 	 	 ^	   ^2   >   44 4 4 4l6 6 6 6 6 6r   r3  Tc                 `   t          |           } t          j        | t          j                  } | d         | d         }}|ryt          j        ||          }t          ||z             }|||z   k    rt          d          t          j        	                    ||d          dz  }t          j        |d          }nWt          |          }t          j
        ||z
            |z
  dz  d||z   z  z  }d}	t          j                            ||	          }t                      }
||
_        ||
_        |
S )	a  
    McNemar test of homogeneity.

    Parameters
    ----------
    table : array_like
        A square contingency table.
    exact : bool
        If exact is true, then the binomial distribution will be used.
        If exact is false, then the chisquare distribution will be
        used, which is the approximation to the distribution of the
        test statistic for large sample sizes.
    correction : bool
        If true, then a continuity correction is used for the chisquare
        distribution (if exact is false.)

    Returns
    -------
    A bunch with attributes:

    statistic : float or int, array
        The test statistic is the chisquare statistic if exact is
        false. If the exact binomial distribution is used, then this
        contains the min(n1, n2), where n1, n2 are cases that are zero
        in one sample but one in the other sample.
    pvalue : float or array
        p-value of the null hypothesis of equal marginal distributions.

    Notes
    -----
    This is a special case of Cochran's Q test, and of the homogeneity
    test. The results when the chisquare distribution is used are
    identical, except for continuity correction.
    r:   r   r   z7exact can only be used with tables containing integers.r<   r^   rJ   r6  )r   r>   r?   r@   minimumintr`   r   binomrW   rd   rV   r   r   rX   rZ   )r   exactrV  n1n2rX   int_sumrZ   corrrY   r[   s              r   mcnemarr    s!   H E""EJuBJ///E4[%+B .Jr2&&	 b2g,,rBwI   GS99A=FA&&:VBG__t+a/2b>B	y"--AAKAHHr   c                    t          j        | t           j                  } t          j        |           }| j        \  }}| |d         k                        dt                    }| |d         k                        dt                    }|                                }|                                }||k    sJ |dz
  |t          j        |dz            z  |dz  z
  z  ||z  t          j        |dz            z
  z  }	|dz
  }
t          j        	                    |	|
          }|r%t                      }|	|_        |
|_        ||_        |S |	||
fS )a  
    Cochran's Q test for identical binomial proportions.

    Parameters
    ----------
    x : array_like, 2d (N, k)
        data with N cases and k variables
    return_object : bool
        Return values as bunch instead of as individual values.

    Returns
    -------
    Returns a bunch containing the following attributes, or the
    individual values according to the value of `return_object`.

    statistic : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution

    Notes
    -----
    Cochran's Q is a k-sample extension of the McNemar test. If there
    are only two groups, then Cochran's Q test and the McNemar test
    are equivalent.

    The procedure tests that the probability of success is the same
    for every group.  The alternative hypothesis is that at least two
    groups have a different probability of success.

    In Wikipedia terminology, rows are blocks and columns are
    treatments.  The number of rows N, should be large for the
    chisquare distribution to be a good approximation.

    The Null hypothesis of the test is that all treatments have the
    same effect.

    References
    ----------
    https://en.wikipedia.org/wiki/Cochran_test
    SAS Manual for NPAR TESTS
    r:   r   rJ   r   r^   )r>   r?   r@   uniquerF   rT   floatr   rV   r   r   rX   rY   rZ   )r  return_objectgruniNr"   count_row_successcount_col_successcount_row_sscount_col_ssq_statrY   rZ   r[   s                r   
cochrans_qr  N  sK   X 	
1BJ'''AIaLLE7DAqeBi,,Q66eBi,,Q66$((**L$((**L<'''' sq26"3Q"6777,/IJ\!BF+<a+?$@$@@BF 
QBZ]]62&&F HH62r   )TTr   )r   r   numpyr>   pandasr   scipyr   statsmodelsr   statsmodels.toolsr   statsmodels.tools.decoratorsr   r   r   r8   r   r   r3  r  r  r   r   r   <module>r     s   6                      + + + + + + 7 7 7 7 7 7  ,        _6 _6 _6 _6 _6 _6 _6 _6DQ Q Q Q Q% Q Q QhT T T T T{ T T Tnr r r r r r r rj< < < <~I I I I I Ir   