
    M/Phm'                     	   d Z ddlmZ ddlZd Zd ZddZ G d d          Ze	d	k    rddl
mZ d
Zej                            e          ZdgZdev r} ee            e eed                      e eed                      ej        e                                e                                          Z eee          Z ej        ee            ej         eee          edz               ej         ej        e           ej         ee                    d          Z ej        d eD                       Z ej                      ej        ee            ee          Z ej                      ej        ee            ej                      ej         edd          ej!        e           ej!        e          z              ej        e                                e                                d          Z" ej                      ej         e"dd          ej!         ee"                     ej!        e"          z              ej        e          Z#e#ddedz           Z$ ej                      ej         e$dd          ej!         ee$                     ej!        e$          z              ee          Z% ee%&                                            ee%'                    e%j(                              ee%)                    g d                      ee%'                    g d                      ej        e                                e                                d          Z ej         ej        e           ej         ee                    d          Z ee          Z ej                      ej        ee            ej        eed          Z* e*e          Z+ ej        e+e            ej,         ej         ee                     ej        e          dd          Z- e-e          Z. ej        e.e            ed            ed ej!        e+                                                      ed ej!        e                                                     dS dS )a 
  
from David Huard's scipy sandbox, also attached to a ticket and
in the matplotlib-user mailinglist  (links ???)


Notes
=====

out of bounds interpolation raises exception and would not be completely
defined ::

>>> scoreatpercentile(x, [0,25,50,100])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.
>>> percentileofscore(x, [-50, 50])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.


idea
====

histogram and empirical interpolated distribution
-------------------------------------------------

dual constructor
* empirical cdf : cdf on all observations through linear interpolation
* binned cdf : based on histogram
both should work essentially the same, although pdf of empirical has
many spikes, fluctuates a lot
- alternative: binning based on interpolated cdf : example in script
* ppf: quantileatscore based on interpolated cdf
* rvs : generic from ppf
* stats, expectation ? how does integration wrt cdf work - theory?

Problems
* limits, lower and upper bound of support
  does not work or is undefined with empirical cdf and interpolation
* extending bounds ?
  matlab has pareto tails for empirical distribution, breaks linearity

empirical distribution with higher order interpolation
------------------------------------------------------

* should work easily enough with interpolating splines
* not piecewise linear
* can use pareto (or other) tails
* ppf how do I get the inverse function of a higher order spline?
  Chuck: resample and fit spline to inverse function
  this will have an approximation error in the inverse function
* -> does not work: higher order spline does not preserve monotonicity
  see mailing list for response to my question
* pmf from derivative available in spline

-> forget this and use kernel density estimator instead


bootstrap/empirical distribution:
---------------------------------

discrete distribution on real line given observations
what's defined?
* cdf : step function
* pmf : points with equal weight 1/nobs
* rvs : resampling
* ppf : quantileatscore on sample?
* moments : from data ?
* expectation ? sum_{all observations x} [func(x) * pmf(x)]
* similar for discrete distribution on real line
* References : ?
* what's the point? most of it is trivial, just for the record ?


Created on Monday, May 03, 2010, 11:47:03 AM
Author: josef-pktd, parts based on David Huard
License: BSD

    Nc                     t          j        |          }t          |           }t          j        t          j        |          t          j        |                     } ||dz            S )zReturn the score at the given percentile of the data.

    Example:
        >>> data = randn(100)
            >>> scoreatpercentile(data, 50)

        will return the median of sample `data`.
          Y@)nparrayempiricalcdfinterpolateinterp1dsort)data
percentilepercdfinterpolators        f/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/statsmodels/sandbox/stats/stats_dhuard.pyscoreatpercentiler   V   sV     (:

C
t

C'bgdmmDDL<D!!!    c                     t          |           }t          j        t          j        |           t          j        |                    } ||          dz  S )aD  Return the percentile-position of score relative to data.

    score: Array of scores at which the percentile is computed.

    Return percentiles (0-100).

    Example
            r = randn(50)
        x = linspace(-2,2,100)
        percentileofscore(r,x)

    Raise an error if the score is outside the range of data.
    r   )r   r   r	   r   r
   )r   scorer   r   s       r   percentileofscorer   d   sH     t

C'rws||DDL<t##r   Hazenc                    t          j        t          j        |                     dz   }t          |           }|                                }|dk    r	|dz
  |z  }nc|dk    r	||dz   z  }nT|dk    r	|dz
  |z  }nE|dk    r|dz
  |dz   z  }n3|d	k    r|dz
  |d
z   z  }n!|dk    r|dz
  |dz   z  }nt	          d          |S )a  Return the empirical cdf.

    Methods available:
        Hazen:       (i-0.5)/N
            Weibull:     i/(N+1)
        Chegodayev:  (i-.3)/(N+.4)
        Cunnane:     (i-.4)/(N+.2)
        Gringorten:  (i-.44)/(N+.12)
        California:  (i-1)/N

    Where i goes from 1 to N.
          ?hazen      ?weibull
california
chegodayev333333?皙?cunnane皙?
gringorten)\(?Q?[Unknown method. Choose among Weibull, Hazen,Chegodayev, Cunnane, Gringorten and California.)r   argsortlenlower
ValueError)r   methodiNr   s        r   r   r   v   s    	
2:d##$$r)AD		A\\^^Fuai	9		2h	<		tQh	<		tadm	9		tadm	<		uquo K L L 	L Jr   c                   4    e Zd ZdZd Zd
dZd Zd Zdd	ZdS )HistDistzDistribution with piecewise linear cdf, pdf is step function

    can be created from empiricial distribution or from a histogram (not done yet)

    work in progress, not finished


    c                 (   t          j        |          | _        t          j        | j                                        | j                                        g          | _        t          j        |          }||         | _        t          j        |          | _	        | 
                                }t          j        |          | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )N)r   
atleast_1dr   r   minmaxbinlimitr&   _datasortedrankingr   r
   _empcdfsortedr   r	   cdfintpppfintp)selfr   sortindr   s       r   __init__zHistDist.__init__   s    M$''	$)--//49==??!CDD*T""=z'**!!WS\\"+D,<d>PQQ"+D,>@PQQr   Nr   c                    || j         }| j        }n)t          j        t          j        |                    dz   }t	          |          }|                                }|dk    r	|dz
  |z  }nc|dk    r	||dz   z  }nT|dk    r	|dz
  |z  }nE|dk    r|dz
  |d	z   z  }n3|d
k    r|d	z
  |dz   z  }n!|dk    r|dz
  |dz   z  }nt          d          |S )aA  Return the empirical cdf.

        Methods available:
            Hazen:       (i-0.5)/N
                Weibull:     i/(N+1)
            Chegodayev:  (i-.3)/(N+.4)
            Cunnane:     (i-.4)/(N+.2)
            Gringorten:  (i-.44)/(N+.12)
            California:  (i-1)/N

        Where i goes from 1 to N.
        Nr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   )r   r5   r   r&   r'   r(   r)   )r9   r   r*   r+   r,   r   s         r   r   zHistDist.empiricalcdf   s    <9DAA
2:d++,,r1AIIWS5!)CCy  QrT(CC|##R4(CC|##R4!B$-CCy  R4!B$-CC|##S51S5/CC O P P P 
r   c                 ,    |                      |          S z&
        this is score in dh

        )r7   )r9   r   s     r   cdf_empzHistDist.cdf_emp   s    
 ||E"""r   c                 ,    |                      |          S r>   )r8   )r9   quantiles     r   ppf_empzHistDist.ppf_emp   s    
 ||H%%%r   Freedmanc                 D   t          | j                  }|dk    r7|                     d          |                     d          z
  }d|z  |dz  z  }n(|dk    r"dt          j        | j                  z  |dz  z  }t          j        | j                  |z  | _        | j        S )zFind the optimal number of bins and update the bin countaccordingly.
        Available methods : Freedman
                            Scott
        rC         ?      ?   gUUUUUUտScottgQ@)r'   r   rB   r   stdptpr3   nbin)r9   r*   nobsIQRwidths        r   optimize_binningzHistDist.optimize_binning   s     49~~:,,t$$t||D'9'99CsFD5M)EEW__26$),,,te}<EVDM**50	yr   )Nr   )rC   )	__name__
__module____qualname____doc__r;   r   r?   rB   rO    r   r   r.   r.      sw         
R 
R 
R& & & &R# # #& & &     r   r.   __main__d   rG      r   2   )kc                 N    g | ]"}t                               |          d          #S )rW   )empderivatives).0xis     r   
<listcomp>r_     s)    BBBb3??2..q1BBBr      )rF   r   rE   )g      g      пr   rF   r   i     gQ?)rY   sznegative densityz(np.diff(ppfs)).min()z(np.diff(cdf_ongrid)).min())r   )/rS   scipy.interpolater   numpyr   r   r   r   r.   rP   matplotlib.pyplotpyplotpltrL   randomrandnxexamplesprintlinspacer1   r2   xsuppposplotInterpolatedUnivariateSpliner
   r[   r   pdfempfigure
cdf_ongridstepdiffxsupp2xsoxshistdrO   r?   r3   rB   r8   ppfsUnivariateSplineppfempppferT   r   r   <module>r      s  Q Qd ( ' ' ' ' '    " " "$ $ $$! ! ! !H` ` ` ` ` ` ` `H z######D
	AsHH}}Q3''(((2&&'''AEEGGQUUWW--5)) 	""1c**CE222 5K4WRWQZZUV@X@X[\]]]BBEBBBCC
vSZZ


### 	
ssGBGJ//>??? QUUWWaeeggr22
WRWSS[[11'"'&//ABBB bgajj47^
CRCR))'"'"++5666 HQKKE	E%
 
 
"
"###	E%--
'
'(((	E%--)))
*
*+++	E%--333
4
4555 BK#..E00GBGLLQROO<T<TWXYYYC UJCJLLLCHUJ6k6z%!LLLG7:DCHT: (;'Q(@(@aSWXXXF6*DCHT:	E
	E
!GBGDMM#6#6#8#8999	E
''"'**=*=)B)B)D)DEEEEES r   