
"""runstest

formulas for mean and var of runs taken from SAS manual NPAR tests, also idea
for runstest_1samp and runstest_2samp

Description in NIST handbook and dataplot does not explain their expected
values, or variance

Note:
There are (at least) two definitions of runs used in literature. The classical
definition which is also used here, is that runs are sequences of identical
observations separated by observations with different realizations.
The second definition allows for overlapping runs, or runs where counting a
run is also started after a run of a fixed length of the same kind.
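
For example, under the classical definition the sequence
1, 1, 1, 0, 0, 1 contains three runs: (1, 1, 1), (0, 0) and (1,).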


TODO
* add one-sided tests where possible or where it makes sense

    N)stats)comb)
array_likec                        e Zd ZdZd ZddZdS )Runsa=  class for runs in a binary sequence


    Parameters
    ----------
    x : array_like, 1d
        data array,


    Notes
    -----
    This was written as a more general class for runs. This has some redundant
    calculations when only the runs_test is used.

    TODO: make it lazy

    The runs test could be generalized to more than 1d if there is a use case
    for it.

    This should be extended once I figure out what the distribution of runs
    of any length k is.

    The exact distribution for the runs test is also available but not yet
    verified.
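
    A minimal illustrative example with arbitrary data:

    >>> import numpy as np
    >>> r = Runs(np.array([1, 1, 1, 0, 0, 1, 0, 1, 1, 0]))
    >>> r.n_runs
    6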

    """

    def __init__(self, x):
        self.x = np.asarray(x)

        self.runstart = runstart = np.nonzero(np.diff(
            np.r_[[-np.inf], x, [np.inf]]))[0]
        self.runs = runs = np.diff(runstart)
        self.runs_sign = runs_sign = x[runstart[:-1]]
        self.runs_pos = runs[runs_sign == 1]
        self.runs_neg = runs[runs_sign == 0]
        self.runs_freqs = np.bincount(runs)
        self.n_runs = len(self.runs)
        self.n_pos = (x == 1).sum()

    def runs_test(self, correction=True):
        """basic version of runs test

        Parameters
        ----------
        correction : bool
            Following the SAS manual, for sample sizes below 50, the test
            statistic is corrected by 0.5. This can be turned off with
            correction=False, and was included to match R, tseries, which
            does not use any correction.

        Returns
        -------
        z_stat : float
            test statistic, asymptotically normally distributed
        p-value : float
            p-value based on the normal distribution, with a continuity
            correction of 0.5 applied for small samples
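
        Notes
        -----
        The moments of the total number of runs R follow the SAS manual for
        the NPAR tests; with n1 observations equal to 1, n2 observations
        equal to 0 and n = n1 + n2:

            E(R)   = 2 * n1 * n2 / n + 1
            Var(R) = 2 * n1 * n2 * (2 * n1 * n2 - n) / (n**2 * (n - 1))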

        """
        self.npo = npo = self.runs_pos.sum()
        self.nne = nne = self.runs_neg.sum()

        n = npo + nne
        npn = npo * nne
        rmean = 2. * npn / n + 1
        rvar = 2. * npn * (2. * npn - n) / n**2. / (n - 1.)
        rstd = np.sqrt(rvar)
        rdemean = self.n_runs - rmean

        if n >= 50 or not correction:
            z = rdemean
        elif rdemean > 0.5:
            z = rdemean - 0.5
        elif rdemean < -0.5:
            z = rdemean + 0.5
        else:
            z = 0.

        z /= rstd
        pval = 2 * stats.norm.sf(np.abs(z))
        return z, pval


def runstest_1samp(x, cutoff='mean', correction=True):
    """use runs test on binary discretized data above/below cutoff

    Parameters
    ----------
    x : array_like
        data, numeric
    cutoff : {'mean', 'median'} or number
        This specifies the cutoff to split the data into large and small
        values.
    correction : bool
        Following the SAS manual, for sample sizes below 50, the test
        statistic is corrected by 0.5. This can be turned off with
        correction=False, and was included to match R, tseries, which
        does not use any correction.

    Returns
    -------
    z_stat : float
        test statistic, asymptotically normally distributed
    p-value : float
        p-value, reject the null hypothesis if it is below a type 1 error
        level, alpha.
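
    Examples
    --------
    Illustrative usage only, with arbitrary data:

    >>> import numpy as np
    >>> x = np.array([1, 2, 3, 2, 1, 2, 3, 4, 3, 2, 1, 2])
    >>> z_stat, pvalue = runstest_1samp(x, cutoff='mean', correction=False)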

    """
    x = array_like(x, 'x')
    if cutoff == 'mean':
        cutoff = np.mean(x)
    elif cutoff == 'median':
        cutoff = np.median(x)
    else:
        cutoff = float(cutoff)
    xindicator = (x >= cutoff).astype(int)
    return Runs(xindicator).runs_test(correction=correction)


def runstest_2samp(x, y=None, groups=None, correction=True):
    """Wald-Wolfowitz runstest for two samples

    This tests whether two samples come from the same distribution.

    Parameters
    ----------
    x : array_like
        data, numeric, contains either one group, if y is also given, or
        both groups, if additionally a group indicator is provided
    y : array_like (optional)
        data, numeric
    groups : array_like
        group labels or indicator, used when the data for both groups is
        given in a single 1-dimensional array, x. The group labels need to
        be 0 and 1 for the test statistic to be computed correctly.
    correction : bool
        Following the SAS manual, for sample sizes below 50, the test
        statistic is corrected by 0.5. This can be turned off with
        correction=False, and was included to match R, tseries, which
        does not use any correction.

    Returns
    -------
    z_stat : float
        test statistic, asymptotically normally distributed
    p-value : float
        p-value, reject the null hypothesis if it is below a type 1 error
        level, alpha.


    Notes
    -----
    Wald-Wolfowitz runs test.

    If there are ties, then the test statistic and p-value that are
    reported are based on the ordering of the tied observations that gives
    the higher p-value.


    This test is intended for continuous distributions
    SAS has treatment for ties, but not clear, and sounds more complicated
    (minimum and maximum possible runs prevent use of argsort)
    (maybe it's not so difficult, idea: add small positive noise to first
    one, run test, then to the other, run test, take max(?) p-value - DONE
    This gives not the minimum and maximum of the number of runs, but should
    be close. Not true, this is close to minimum but far away from maximum.
    maximum number of runs would use alternating groups in the ties.)
    Maybe adding random noise would be the better approach.

    SAS has exact distribution for sample size <=30, does not look standard
    but should be easy to add.

    currently two-sided test only

    This has not been verified against a reference implementation. In a short
    Monte Carlo simulation where both samples are normally distributed, the test
    seems to be correctly sized for larger number of observations (30 or
    larger), but conservative (i.e. reject less often than nominal) with a
    sample size of 10 in each group.

    See Also
    --------
    runstest_1samp
    Runs
    RunsProb
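
    Examples
    --------
    Illustrative usage only, with arbitrary random data for two groups:

    >>> import numpy as np
    >>> x = np.random.randn(30)
    >>> y = np.random.randn(30) + 0.5
    >>> z_stat, pvalue = runstest_2samp(x, y)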

    """
    x = np.asarray(x)
    if y is not None:
        y = np.asarray(y)
        groups = np.concatenate((np.zeros(len(x)), np.ones(len(y))))
        # note: x is reassigned to the stacked data
        x = np.concatenate((x, y))
        gruni = np.arange(2)
    elif groups is not None:
        gruni = np.unique(groups)
        if gruni.size != 2:
            raise ValueError('not exactly two groups specified')
    else:
        raise ValueError('either y or groups is necessary')

    xargsort = np.argsort(x)
    # check for ties
    x_sorted = x[xargsort]
    x_diff = np.diff(x_sorted)
    if x_diff.min() == 0:
        print('ties detected')   # replace with warning
        x_mindiff = x_diff[x_diff > 0].min()
        eps = x_mindiff / 2.
        xx = x.copy()  # do not change the original, just in case

        xx[groups == gruni[0]] += eps
        xargsort = np.argsort(xx)
        xindicator = groups[xargsort]
        z0, p0 = Runs(xindicator).runs_test(correction=correction)

        xx[groups == gruni[0]] -= eps   # restore xx = x
        xx[groups == gruni[1]] += eps
        xargsort = np.argsort(xx)
        xindicator = groups[xargsort]
        z1, p1 = Runs(xindicator).runs_test(correction=correction)

        idx = np.argmax([p0, p1])
        return [z0, z1][idx], [p0, p1][idx]
    else:
        xindicator = groups[xargsort]
        return Runs(xindicator).runs_test(correction=correction)


class TotalRunsProb:
    """class for the probability distribution of total runs

    This is the exact probability distribution for the (Wald-Wolfowitz)
    runs test. The random variable is the total number of runs if the
    sample has (n0, n1) observations of groups 0 and 1.


    Notes
    -----
    Written as a class so I can store temporary calculations, but I do not
    think it matters much.

    Formulas taken from SAS manual for one-sided significance level.

    Could be converted to a full univariate distribution, subclassing
    scipy.stats.distributions.

    *Status*
    Not verified yet except for mean.
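
    A minimal illustrative check with arbitrary sample sizes:

    >>> import numpy as np
    >>> trp = TotalRunsProb(n0=10, n1=10)
    >>> total_prob = trp.pdf(np.arange(2, 21)).sum()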


    """

    def __init__(self, n0, n1):
        self.n0 = n0
        self.n1 = n1
        self.n = n = n0 + n1
        self.comball = comb(n, n0)

    def runs_prob_even(self, r):
        n0, n1 = self.n0, self.n1
        tmp0 = comb(n0 - 1, r // 2 - 1)
        tmp1 = comb(n1 - 1, r // 2 - 1)
        return tmp0 * tmp1 * 2. / self.comball

    def runs_prob_odd(self, r):
        n0, n1 = self.n0, self.n1
        k = (r + 1) // 2
        tmp0 = comb(n0 - 1, k - 1)
        tmp1 = comb(n1 - 1, k - 2)
        tmp2 = comb(n0 - 1, k - 2)
        tmp3 = comb(n1 - 1, k - 1)
        return (tmp0 * tmp1 + tmp2 * tmp3) / self.comball

    def pdf(self, r):
        r = np.asarray(r)
        r_isodd = np.mod(r, 2) > 0
        r_odd = r[r_isodd]
        r_even = r[~r_isodd]
        runs_pdf = np.zeros(r.shape)
        runs_pdf[r_isodd] = self.runs_prob_odd(r_odd)
        runs_pdf[~r_isodd] = self.runs_prob_even(r_even)
        return runs_pdf

    def cdf(self, r):
        r_ = np.arange(2, r + 1)
        cdfval = self.runs_prob_even(r_[::2]).sum()
        cdfval += self.runs_prob_odd(r_[1::2]).sum()
        return cdfval


class RunsProb:
    """distribution of success runs of length k or more (classical definition)

    The underlying process is assumed to be a sequence of Bernoulli trials
    of a given length n.

    not sure yet, how to interpret or use the distribution for runs
    of length k or more.

    Muselli also has longest success run, and waiting time distribution
    negative binomial of order k and geometric of order k

    need to compare with Godpole

    need a MonteCarlo function to do some quick tests before doing more


    """

    def pdf(self, x, k, n, p):
        """distribution of success runs of length k or more

        Parameters
        ----------
        x : float
            count of runs of length n
        k : int
            length of runs
        n : int
            total number of observations or trials
        p : float
            probability of success in each Bernoulli trial

        Returns
        -------
        pdf : float
            probability that x runs of length k or more are observed

        Notes
        -----
        not yet vectorized

        References
        ----------
        Muselli 1996, theorem 3
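
        Examples
        --------
        Illustrative usage only, with arbitrary arguments:

        >>> rp = RunsProb()
        >>> prob = rp.pdf(x=2, k=3, n=20, p=0.5)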
        """
        q = 1 - p
        m = np.arange(x, (n + 1) // (k + 1) + 1)[:, None]
        terms = ((-1)**(m - x) * comb(m, x) * p**(m * k) * q**(m - 1)
                 * (comb(n - m * k, m - 1) + q * comb(n - m * k, m)))
        return terms.sum(0)

    def pdf_nb(self, x, k, n, p):
        # negative binomial of order k, not implemented yet
        pass


def median_test_ksample(x, groups):
    """chisquare test for equality of median/location

    This tests whether all groups have the same fraction of observations
    above the median.

    Parameters
    ----------
    x : array_like
        data values stacked for all groups
    groups : array_like
        group labels or indicator

    Returns
    -------
    stat : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution
    others ????
       currently some test output, table and expected

    """
    x = np.asarray(x)
    gruni = np.unique(groups)
    xli = [x[groups == group] for group in gruni]
    xmedian = np.median(x)
    counts_larger = np.array([(xg > xmedian).sum() for xg in xli])
    counts = np.array([len(xg) for xg in xli])
    counts_smaller = counts - counts_larger
    nobs = counts.sum()
    n_larger = (x > xmedian).sum()
    n_smaller = nobs - n_larger
    table = np.vstack((counts_smaller, counts_larger))

    # expected frequencies if all groups have the same fraction of
    # observations above the overall median
    expected = np.vstack((counts * 1. / nobs * n_smaller,
                          counts * 1. / nobs * n_larger))

    if (expected < 5).any():
        print('Warning: There are cells with less than 5 expected '
              'observations. The chisquare distribution might not be a good '
              'approximation for the true distribution.')

    n_groups = len(gruni)
    return (stats.chisquare(table.ravel(), expected.ravel(),
                            ddof=n_groups - 1),
            table, expected)


def cochrans_q(x):
    """Cochran's Q test for identical effect of k treatments

    Cochran's Q is a k-sample extension of the McNemar test. If there are only
    two treatments, then Cochran's Q test and McNemar test are equivalent.

    Test that the probability of success is the same for each treatment.
    The alternative is that at least two treatments have a different
    probability of success.

    Parameters
    ----------
    x : array_like, 2d (N,k)
        data with N cases and k variables

    Returns
    -------
    q_stat : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution

    Notes
    -----
    In Wikipedia terminology, rows are blocks and columns are treatments.
    The number of rows N, should be large for the chisquare distribution to be
    a good approximation.
    The Null hypothesis of the test is that all treatments have the
    same effect.

    References
    ----------
    https://en.wikipedia.org/wiki/Cochran_test
    SAS Manual for NPAR TESTS
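
    Examples
    --------
    Illustrative usage only, with arbitrary random binary data:

    >>> import numpy as np
    >>> x = np.random.randint(0, 2, size=(100, 4))
    >>> q_stat, pvalue = cochrans_q(x)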

    """
    warnings.warn("Deprecated, use stats.cochrans_q instead", FutureWarning)

    x = np.asarray(x)
    gruni = np.unique(x)
    N, k = x.shape
    count_row_success = (x == gruni[-1]).sum(1, float)
    count_col_success = (x == gruni[-1]).sum(0, float)
    count_row_ss = count_row_success.sum()
    count_col_ss = count_col_success.sum()
    assert count_row_ss == count_col_ss  # just a calculation check

    # from the SAS manual
    q_stat = ((k - 1) * (k * np.sum(count_col_success**2) - count_col_ss**2)
              / (k * count_row_ss - np.sum(count_row_success**2)))

    return q_stat, stats.chi2.sf(q_stat, k - 1)


def mcnemar(x, y=None, exact=True, correction=True):
    """McNemar test

    Parameters
    ----------
    x, y : array_like
        two paired data samples. If y is None, then x can be a 2 by 2
        contingency table. x and y can have more than one dimension, then
        the results are calculated under the assumption that axis zero
        contains the observation for the samples.
    exact : bool
        If exact is true, then the binomial distribution will be used.
        If exact is false, then the chisquare distribution will be used, which
        is the approximation to the distribution of the test statistic for
        large sample sizes.
    correction : bool
        If true, then a continuity correction is used for the chisquare
        distribution (if exact is false).

    Returns
    -------
    stat : float or int, array
        The test statistic is the chisquare statistic if exact is false. If the
        exact binomial distribution is used, then this contains the min(n1, n2),
        where n1, n2 are cases that are zero in one sample but one in the other
        sample.
    pvalue : float or array
        p-value of the null hypothesis of equal effects.

    Notes
    -----
    This is a special case of Cochran's Q test. The results when the chisquare
    distribution is used are identical, except for continuity correction.

    """
    warnings.warn("Deprecated, use stats.TableSymmetry instead", FutureWarning)

    x = np.asarray(x)
    if y is None and x.shape[0] == x.shape[1]:
        if x.shape[0] != 2:
            raise ValueError('table needs to be 2 by 2')
        n1, n2 = x[1, 0], x[0, 1]
    else:
        n1 = np.sum(x < y, 0)
        n2 = np.sum(x > y, 0)

    if exact:
        stat = np.minimum(n1, n2)
        # binomial is symmetric with p=0.5
        pval = stats.binom.cdf(stat, n1 + n2, 0.5) * 2
        pval = np.minimum(pval, 1)  # limit to 1 if n1 == n2
    else:
        corr = int(correction) * np.sign(n1 - n2) * 0.5
        stat = (n1 - n2 - corr)**2 / (1. * (n1 + n2))
        df = 1
        pval = stats.chi2.sf(stat, df)
    return stat, pval


def symmetry_bowker(table):
    """Test for symmetry of a (k, k) square contingency table

    This is an extension of the McNemar test to test the Null hypothesis
    that the contingency table is symmetric around the main diagonal, that is

    n_{i, j} = n_{j, i}  for all i, j

    Parameters
    ----------
    table : array_like, 2d, (k, k)
        a square contingency table that contains the count for k categories
        in rows and columns.

    Returns
    -------
    statistic : float
        chisquare test statistic
    p-value : float
        p-value of the test statistic based on chisquare distribution
    df : int
        degrees of freedom of the chisquare distribution

    Notes
    -----
    Implementation is based on the SAS documentation, R includes it in
    `mcnemar.test` if the table is not 2 by 2.

    The pvalue is based on the chisquare distribution which requires that the
    sample size is not very small to be a good approximation of the true
    distribution. For 2x2 contingency tables exact distribution can be
    obtained with `mcnemar`

    See Also
    --------
    mcnemar


    """
    warnings.warn("Deprecated, use stats.TableSymmetry instead", FutureWarning)

    table = np.asarray(table)
    k, k2 = table.shape
    if k != k2:
        raise ValueError('table needs to be square')

    upp_idx = np.triu_indices(k, 1)

    tril = table.T[upp_idx]   # lower triangle in column order
    triu = table[upp_idx]     # upper triangle in row order

    stat = ((tril - triu)**2 / (tril + triu + 1e-20)).sum()
    df = k * (k - 1) / 2.
    pval = stats.chi2.sf(stat, df)

    return stat, pval, df
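

if __name__ == '__main__':
    # Minimal smoke-test sketch mirroring the kinds of calls made in the
    # original script section; the example data below is arbitrary.
    x1 = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])  # arbitrary

    print(Runs(x1).runs_test())
    print(runstest_1samp(x1, cutoff='mean'))
    print(runstest_2samp(np.random.randn(30), np.random.randn(30)))
    print(TotalRunsProb(7, 9).cdf(11))
    print(median_test_ksample(np.random.randn(100),
                              np.random.randint(0, 2, 100)))
    print(cochrans_q(np.random.randint(0, 2, (100, 8))))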