
    M/PhD                         d Z ddlmZ ddlZddlZddlmZmZ ddl	m
Z
 ddlmZ  edg d          Zdd
Zd Zd Zd Zd ZddZd Zd Zd ZdS )u   Distance dependence measure and the dCov test.

Implementation of Székely et al. (2007) calculation of distance
dependence statistics, including the Distance covariance (dCov) test
for independence of random vectors of arbitrary length.

Author: Ron Itzikovitch

References
----------
.. Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
   "Measuring and testing dependence by correlation of distances".
   Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    )
namedtupleN)pdist
squareform)norm)HypothesisTestWarningDistDependStattest_statisticdistance_correlationdistance_covariancedvar_xdvar_ySautoc                    t          | |          \  } }| j        d         }t          | |          }|dk    r|dk    s|dk    rd}t          | ||||          \  }}n9|dk    r|dk    s|dk    rd}t	          |          \  }}nt          d|           |dk    r6|dv r2d| d	}	t          j        |	t                     t	          |          \  }
}|||fS )
a  The Distance Covariance (dCov) test

    Apply the Distance Covariance (dCov) test of independence to `x` and `y`.
    This test was introduced in [1]_, and is based on the distance covariance
    statistic. The test is applicable to random vectors of arbitrary length
    (see the notes section for more details).

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    B : int, optional, default=`None`
        The number of iterations to perform when evaluating the null
        distribution of the test statistic when the `emp` method is
        applied (see below). if `B` is `None` than as in [1]_ we set
        `B` to be ``B = 200 + 5000/n``, where `n` is the number of
        observations.
    method : {'auto', 'emp', 'asym'}, optional, default=auto
        The method by which to obtain the p-value for the test.

        - `auto` : Default method. The number of observations will be used to
          determine the method.
        - `emp` : Empirical evaluation of the p-value using permutations of
          the rows of `y` to obtain the null distribution.
        - `asym` : An asymptotic approximation of the distribution of the test
          statistic is used to find the p-value.

    Returns
    -------
    test_statistic : float
        The value of the test statistic used in the test.
    pval : float
        The p-value.
    chosen_method : str
        The method that was used to obtain the p-value. Mostly relevant when
        the function is called with `method='auto'`.

    Notes
    -----
    The test applies to random vectors of arbitrary dimensions, i.e., `x`
    can be a 1-D vector of observations for a single random variable while
    `y` can be a `k` by `n` 2-D array (where `k > 1`). In other words, it
    is also possible for `x` and `y` to both be 2-D arrays and have the
    same number of rows (observations) while differing in the number of
    columns.

    As noted in [1]_ the statistics are sensitive to all types of departures
    from independence, including nonlinear or nonmonotone dependence
    structure.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------
    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_covariance_test
    >>> data = np.random.rand(1000, 10)
    >>> x, y = data[:, :3], data[:, 3:]
    >>> x.shape
    (1000, 3)
    >>> y.shape
    (1000, 7)
    >>> distance_covariance_test(x, y)
    (1.0426404792714983, 0.2971148340813543, 'asym')
    # (test_statistic, pval, chosen_method)

    r   r   i  empasymzUnknown 'method' parameter: )r      zp-value was zS when using the empirical method. The asymptotic approximation will be used instead)	_validate_and_tranform_x_and_yshapedistance_statistics_empirical_pvalue_asymptotic_pvalue
ValueErrorwarningswarnr   )xyBmethodnstatschosen_methodr
   pvalmsg_s              j/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/statsmodels/stats/dist_dependence_measures.pydistance_covariance_testr(       s$   b *!Q//DAq	
A1%%EAHH%0Aq!UCC	6		a#gg6)9)91%88 @@@AAA
 $&..@4 @ @ @ 	 	c0111$U++44..    c                    t          j        |           } t          j        |          }| j        d         |j        d         k    rt          d          t	          | j                  dk    r"|                     | j        d         df          } t	          |j                  dk    r"|                    |j        d         df          }| |fS )a  Ensure `x` and `y` have proper shape and transform/reshape them if
    required.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.

    Returns
    -------
    x : array_like, 1-D or 2-D
    y : array_like, 1-D or 2-D

    Raises
    ------
    ValueError
        If `x` and `y` have a different number of observations.

    r   z9x and y must have the same number of observations (rows).r   )np
asanyarrayr   r   lenreshaper   r   s     r'   r   r      s    : 	aA
aAwqzQWQZG
 
 	
 17||qIIqwqz1o&&
17||qIIqwqz1o&&a4Kr)   c                    |rt          |          n&t          t          j        dd|z  z                       }t          | ||          }dt          j        t          |          |j                  t          |          z  z
  }|j        }||fS )ap  Calculate the empirical p-value based on permutations of `y`'s rows

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    B : int
        The number of iterations when evaluating the null distribution.
    n : Number of observations found in each of `x` and `y`.
    stats: namedtuple
        The result obtained from calling ``distance_statistics(x, y)``.

    Returns
    -------
    test_statistic : float
        The empirical test statistic.
    pval : float
        The empirical p-value.

       i  r   )intr+   floor _get_test_statistic_distributionsearchsortedsortedr
   r-   )r   r   r   r!   r"   empirical_distr$   r
   s           r'   r   r      s    < 6ARXcD1Hn5566A5aA>>Nr~ 4 N D )N4r)   c                     t          j        | j        | j        z            }dt	          j        |          z
  dz  }||fS )aq  Calculate the p-value based on an approximation of the distribution of
    the test statistic under the null.

    Parameters
    ----------
    stats: namedtuple
        The result obtained from calling ``distance_statistics(x, y)``.

    Returns
    -------
    test_statistic : float
        The test statistic.
    pval : float
        The asymptotic p-value.

    r      )r+   sqrtr
   r   r   cdf)r"   r
   r$   s      r'   r   r      sA    " WU1EG;<<N(((A-D4r)   c                 &   |                                 }t          j        |          }t          t	          | d                    }t          |          D ];}t          j                            |           t          | ||          j	        ||<   <|S )a  
    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    B : int
        The number of iterations to perform when evaluating the null
        distribution.

    Returns
    -------
    emp_dist : array_like
        The empirical distribution of the test statistic.

    	euclidean)x_dist)
copyr+   zerosr   r   rangerandomshuffler   r
   )r   r   r   emp_distr>   is         r'   r4   r4      s    2 	
Ax{{Ha--..F1XX N N
	!)!Qv>>>MOr)   c                    t          | |          \  } }| j        d         }||nt          t          | d                    }||nt          t          |d                    }|                    dd          }|                    dd          }|                    dd          }	|                    dd          }
|                                }|                                }||z
  |	z
  |z   }||z
  |
z
  |z   }||z  }t          j        t          j        ||                                                    }t          j        t          j        ||                                                    }t          j        t          j        ||                                                    }|t          j        ||z            z  }||dz  z  }t          ||||||          S )	a	  Calculate various distance dependence statistics.

    Calculate several distance dependence statistics as described in [1]_.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.
    x_dist : array_like, 2-D, optional
        A square 2-D array_like object whose values are the euclidean
        distances between `x`'s rows.
    y_dist : array_like, 2-D, optional
        A square 2-D array_like object whose values are the euclidean
        distances between `y`'s rows.

    Returns
    -------
    namedtuple
        A named tuple of distance dependence statistics (DistDependStat) with
        the following values:

        - test_statistic : float - The "basic" test statistic (i.e., the one
          used when the `emp` method is chosen when calling
          ``distance_covariance_test()``
        - distance_correlation : float - The distance correlation
          between `x` and `y`.
        - distance_covariance : float - The distance covariance of
          `x` and `y`.
        - dvar_x : float - The distance variance of `x`.
        - dvar_y : float - The distance variance of `y`.
        - S : float - The mean of the euclidean distances in `x` multiplied
          by those of `y`. Mostly used internally.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_statistics
    >>> distance_statistics(np.random.random(1000), np.random.random(1000))
    DistDependStat(test_statistic=0.07948284320205831,
    distance_correlation=0.04269511890990793,
    distance_covariance=0.008915315092696293,
    dvar_x=0.20719027438266704, dvar_y=0.21044934264957588,
    S=0.10892061635588891)

    r   Nr=   T)axiskeepdimsr   r9   r	   )	r   r   r   r   meanr+   r:   multiplyr   )r   r   r>   y_distr!   aba_row_meansb_row_meansa_col_meansb_col_meansa_meanb_meanAr   r   dcovr   r   dcorr
   s                        r'   r   r      s   | *!Q//DAq	
A$*U1k5J5J*K*KA$*U1k5J5J*K*KA&&a$&//K&&a$&//K&&a$&//K&&a$&//KVVXXFVVXXF	K+%.A	K+%.AA72;q!$$))++,,DWR[A&&++--..FWR[A&&++--..F"'&6/***D]N%! 
   r)   c                 ,    t          | |          j        S )a.  Distance covariance.

    Calculate the empirical distance covariance as described in [1]_.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.

    Returns
    -------
    float
        The empirical distance covariance between `x` and `y`.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_covariance
    >>> distance_covariance(np.random.random(1000), np.random.random(1000))
    0.007575063951951362

    )r   r   r/   s     r'   r   r     s    N q!$$88r)   c                 "    t          | |           S )a  Distance variance.

    Calculate the empirical distance variance as described in [1]_.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.

    Returns
    -------
    float
        The empirical distance variance of `x`.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_variance
    >>> distance_variance(np.random.random(1000))
    0.21732609190659702

    )r   )r   s    r'   distance_variancerY     s    D q!$$$r)   c                 ,    t          | |          j        S )a?  Distance correlation.

    Calculate the empirical distance correlation as described in [1]_.
    This statistic is analogous to product-moment correlation and describes
    the dependence between `x` and `y`, which are random vectors of
    arbitrary length. The statistics' values range between 0 (implies
    independence) and 1 (implies complete dependence).

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        If `x` is 1-D than it is assumed to be a vector of observations of a
        single random variable. If `x` is 2-D than the rows should be
        observations and the columns are treated as the components of a
        random vector, i.e., each column represents a different component of
        the random vector `x`.
    y : array_like, 1-D or 2-D
        Same as `x`, but only the number of observation has to match that of
        `x`. If `y` is 2-D note that the number of columns of `y` (i.e., the
        number of components in the random vector) does not need to match
        the number of columns in `x`.

    Returns
    -------
    float
        The empirical distance correlation between `x` and `y`.

    References
    ----------
    .. [1] Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007)
       "Measuring and testing dependence by correlation of distances".
       Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.

    Examples
    --------

    >>> from statsmodels.stats.dist_dependence_measures import
    ... distance_correlation
    >>> distance_correlation(np.random.random(1000), np.random.random(1000))
    0.04060497840149489

    )r   r   r/   s     r'   r   r     s    V q!$$99r)   )Nr   )NN)__doc__collectionsr   r   numpyr+   scipy.spatial.distancer   r   scipy.statsr   statsmodels.tools.sm_exceptionsr   r   r(   r   r   r   r4   r   r   rY   r    r)   r'   <module>rb      sA    # " " " " "      4 4 4 4 4 4 4 4       A A A A A A5 5 5 l/ l/ l/ l/^+ + +\%  %  % P     .! ! !H^ ^ ^ ^B'9 '9 '9T"% "% "%J+: +: +: +: +:r)   