
    M/PhI                         d Z ddlZddlZddlZddlmZ ddZd Z	d Z
d Zd	 Zd
 Zd Zd Zd Zddddej        ej        dfdZdS )aS  
Implementation of Regression on Order Statistics for imputing left-
censored (non-detect data)

Method described in *Nondetects and Data Analysis* by Dennis R.
Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
values of a dataset.

Author: Paul M. Hobson
Company: Geosyntec Consultants (Portland, OR)
Date: 2016-06-14

    N)statsFc                    | | |                                       |d          }| | |                                        |d          }||                                         ||                                         k    rB|||         ||                                         k             }|rd}t          j        |           t	          j        ||gd          }|||g                             d          S )a  
    This function prepares a dataframe for ROS.

    It sorts ascending with
    left-censored observations first. Censored observations larger than
    the maximum uncensored observations are removed from the dataframe.

    Parameters
    ----------
    df : DataFrame

    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    Returns
    ------
    sorted_df : DataFrame
        The sorted dataframe with all columns dropped except the
        observation and censorship columns.
    r   axiszKDropping censored observations greater than the max uncensored observation.T)drop)sort_valuesmaxwarningswarnpdconcatreset_index)dfobservations
censorshipr   censored
uncensoredmsgcombineds           Z/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/statsmodels/imputation/ros.py	_ros_sortr      s    < "Z.!--l-CCHR
^O$00A0FFJ!!##j&>&B&B&D&DDDH\2j6N6R6R6T6TTU 	5CM#y(J/a888H\:./;;;FFF    c                      fd} fd} fd}d }d }          }t          j         j        |f                   }	|	                                 |	j        d         dk    rJ                                          |	                                k     r.t          j                                                  |	g          }	t          j        |	dg          }
 ||
          |
j        d	d	d
f<   |
	                    |d          |
j        d	d	df<   |
	                    |d          |
j        d	d	df<   |
	                    |d          |
j        d	d	df<   |

                    t          |	j        d         dz                       }
 ||
d         |
d                   |
j        d	d	df<   n;g d}t          j        t          j        dt          |          f          |          }
|
S )aM  
    Computes the Cohn numbers for the detection limits in the dataset.

    The Cohn Numbers are:

        - :math:`A_j =` the number of uncensored obs above the jth
          threshold.
        - :math:`B_j =` the number of observations (cen & uncen) below
          the jth threshold.
        - :math:`C_j =` the number of censored observations at the jth
          threshold.
        - :math:`\mathrm{PE}_j =` the probability of exceeding the jth
          threshold
        - :math:`\mathrm{DL}_j =` the unique, sorted detection limits
        - :math:`\mathrm{DL}_{j+1} = \mathrm{DL}_j` shifted down a
          single index (row)

    Parameters
    ----------
    dataframe : DataFrame

    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    Returns
    -------
    cohn : DataFrame
    c                              | d         k    }         | d         k     }          }||z  |z           j         d         S )zD A, the number of uncensored obs above the given threshold.
        lower_dlupper_dlr   shape)rowabovebelowdetectr   r   r   s       r   nuncen_abovez"cohn_numbers.<locals>.nuncen_aboveh   sX    
 < C
O3 < 3z?2 Z. %%-&()/22r   c                     	         | d         k     }	         | d         k    }          }         }||z           j         d         }||z           j         d         }||z   S )zW B, the number of observations (cen & uncen) below the given
        threshold
        r   r   r   )
r   	less_thanless_thanequalr   r   LTE_censoredLT_uncensoredr   r   r   s
          r   
nobs_belowz cohn_numbers.<locals>.nobs_belowx   s     |$s:6	 L)S_< n_
j> .834:1= 9z128; m++r   c                 p             }         |         }|| d         k    }|                                 S )zP C, the number of censored observations at the given
        threshold.
        r   )sum)r   censored_indexcensored_datacensored_belowr   r   r   s       r   
ncen_equalz cohn_numbers.<locals>.ncen_equal   s=    
 J<(8&#j/9!!###r   c                     | j         d         dk    r9| d                             d                              t          j                  S t          j        gS )z: Sets the upper_dl DL for each row of the Cohn dataframe. r      r   )value)r   shiftfillnanpinf)cohns    r   set_upper_limitz%cohn_numbers.<locals>.set_upper_limit   sH    :a=1
#))"--44264BBBF8Or   c                     t          |           }t          j        |d          }d|d<   t          |dz
  dd          D ]:}||dz            d||dz            z
  | |         z  | |         ||         z   z  z   ||<   ;|S )zS Computes the probability of excedance for each row of the
        Cohn dataframe. float64)dtypeg        r2      r1   )lenr6   emptyrange)ABNPEjs        r   
compute_PEz cohn_numbers.<locals>.compute_PE   s     FFXay)))2qsB## 	C 	CAqsGq2ac7{ad2adQqTkBBBqEE	r   r   r   )columnsNr   r1   r   r#   r)   r/   prob_exceedance)r   r   r#   r)   r/   rH   )r   uniquelocsortr   minr6   hstack	DataFrameapplyreindexr@   r?   r>   )r   r   r   r#   r)   r/   r9   rF   r-   DLsr8   dl_colss   ```         r   cohn_numbersrS   C   sT   J3 3 3 3 3 3 3 , , , , , , ,0$ $ $ $ $ $ $  	 	 	 zNM
)BF=,67
8
8CHHJJJ y|al!!CGGII--)R-1133S9::C
 |C*666"1/$"7"7J&*jjAj&F&FN"#$(JJzJ$B$BL!$(JJzJ$B$BL!||E#)A,"23344)3D4H$|J\)])]%%&&B B B|BHaW%677IIIKr   c                 ~    |j         d         dk    r)t          j        |d         | k              \  }|d         }nd}|S )a  
    Locates the corresponding detection limit for each observation.

    Basically, creates an array of indices for the detection limits
    (Cohn numbers) corresponding to each data point.

    Parameters
    ----------
    obs : float
        A single observation from the larger dataset.

    cohn : DataFrame
        DataFrame of Cohn numbers.

    Returns
    -------
    det_limit_index : int
        The index of the corresponding detection limit in `cohn`

    See Also
    --------
    cohn_numbers
    r   r   r2   )r   r6   where)obsr8   indexdet_limit_indexs       r   _detection_limit_indexrY      sF    2 z!}q$z*c122)r   c                     |                                  }d|j        dddf<   |                    ||g          d                             d           }|S )a  
    Ranks each observation within the data groups.

    In this case, the groups are defined by the record's detection
    limit index and censorship status.

    Parameters
    ----------
    df : DataFrame

    dl_idx : str
        Name of the column in the dataframe the index of the
        observations' corresponding detection limit in the `cohn`
        dataframe.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    Returns
    -------
    ranks : ndarray
        Array of ranks for the dataset.
    r1   Nrank)byc                 *    |                                  S N)cumsum)gs    r   <lambda>z!_ros_group_rank.<locals>.<lambda>  s    !((** r   )copyrJ   groupby	transform)r   dl_idxr   rankss       r   _ros_group_rankrg      s]    : GGIIEEIaaai&*-..v6i,,-- 
 Lr   c                     | d         }| d         }| |         }|j         |         }|j         |dz            }|rd|d         z
  |z  |d         dz   z  S d|d         z
  |d         |d         z
  |z  |d         dz   z  z   S )a  
    ROS-specific plotting positions.

    Computes the plotting position for an observation based on its rank,
    censorship status, and detection limit index.

    Parameters
    ----------
    row : {Series, dict}
        Full observation (row) from a censored dataset. Requires a
        'rank', 'detection_limit', and `censorship` column.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    cohn : DataFrame
        DataFrame of Cohn numbers.

    Returns
    -------
    plotting_position : float

    See Also
    --------
    cohn_numbers
    rX   r[   r1   rH   r/   r#   )iloc)r   r   r8   DL_indexr[   r   dl_1dl_2s           r   _ros_plot_posrm     s    < $%Hv;D:H9XD9X\"D 0D*++t3tL7I!7KLLD*++5F0G$O`Ja0a0^,Q.00 0 	0r   c                 r    t          j        | d          \  }}t           j                            |          S )z
    Computes standard normal (Gaussian) plotting positions using scipy.

    Parameters
    ----------
    observations : array_like
        Sequence of observed quantities.

    Returns
    -------
    plotting_position : array of floats
    F)fit)r   probplotnormcdf)r   ppos
sorted_ress      r   _norm_plot_posru   =  s1     ~l>>>D*:>>$r   c                     |                      fdd          }||                   }t          j        |d          }|                                 ||j        |          j        |                   <   |S )aP  
    Compute the plotting positions for the observations.

    The ROS-specific plotting postions are based on the observations'
    rank, censorship status, and corresponding detection limit.

    Parameters
    ----------
    df : DataFrame

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    cohn : DataFrame
        DataFrame of Cohn numbers.

    Returns
    -------
    plotting_position : array of float

    See Also
    --------
    cohn_numbers
    c                 &    t          |           S r^   )rm   )rr   r8   s    r   ra   z$plotting_positions.<locals>.<lambda>j  s    -:t"D"D r   r1   r   W)requirements)rO   r6   requirerK   rJ   rW   )r   r   r8   plot_pos
ND_plotposND_plotpos_arrs    ``   r   plotting_positionsr   N  s    8 xxDDDDD1xMMH "Z.)JZ
===N9GHLJ%bn56Or   c                 h   | |          }| |         }t          j        | d         |          || |         |                             }|dd         \  }}	 ||| d         |         z  |	z             | j        dddf<   t          j        | |         | d         | |                   | j        dddf<   | S )a  
    Executes the basic regression on order stat (ROS) proceedure.

    Uses ROS to impute censored from the best-fit line of a
    probability plot of the uncensored values.

    Parameters
    ----------
    df : DataFrame
    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.
    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)
    transform_in, transform_out : callable
        Transformations to be applied to the data prior to fitting
        the line and after estimated values from that line. Typically,
        `np.log` and `np.exp` are used, respectively.

    Returns
    -------
    estimated : DataFrame
        A new dataframe with two new columns: "estimated" and "final".
        The "estimated" column contains of the values inferred from the
        best-fit line. The "final" column contains the estimated values
        only where the original observations were censored, and the original
        observations everwhere else.
    ZprelimNr=   	estimatedfinal)r   
linregressrJ   r6   rU   )
r   r   r   transform_intransform_outuncensored_maskcensored_mask
fit_paramsslope	intercepts
             r   _imputer   u  s    D *~oOzNM !
9o&R%o677 J ""1"~E9
 +]52i=3O+OR[+[\\BF111k>"Z."[/2lCSTTBF111g:Ir   c                    t          | ||          }t          | ||          }||                             t          |f          |j        dddf<   t          |d|          |j        dddf<   t          |||          |j        dddf<   t          j        	                    |d                   |j        dddf<   t          |||||          S )a  
    DataFrame-centric function to impute censored valies with ROS.

    Prepares a dataframe for, and then esimates the values of a censored
    dataset using Regression on Order Statistics

    Parameters
    ----------
    df : DataFrame

    observations : str
        Name of the column in the dataframe that contains observed
        values. Censored values should be set to the detection (upper)
        limit.

    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)

    transform_in, transform_out : callable
        Transformations to be applied to the data prior to fitting
        the line and after estimated values from that line. Typically,
        `np.log` and `np.exp` are used, respectively.

    Returns
    -------
    estimated : DataFrame
        A new dataframe with two new columns: "estimated" and "final".
        The "estimated" column contains of the values inferred from the
        best-fit line. The "final" column contains the estimated values
        only where the original observations were censored, and the original
        observations everwhere else.
    )r   r   )argsNrX   r[   r|   r   )rS   r   rO   rY   rJ   rg   r   r   rq   ppfr   )r   r   r   r   r   r8   modeleds          r   _do_rosr     s    J *MMMD *MMMG(/(=(C(CDZbfah(C(i(iGK$$%,W6GTTGK6	!3GZ!N!NGK: %
wz/B C CGK97L*lMRRRr   r=   g?g      ?Tc	                 8   |t          j        | |d          }d} d}|j        d         }	||                             t                                                    }
|	|
z
  }|
|	z  }|
dk    r1|| |g                                         }||          |j        dddf<   nm||k     s||k    rN|| |g                                         }||          |j        dddf<   |j        ||         dfxx         |z  cc<   nt          || |||          }|r|d         j	        }|S )a)	  
    Impute censored dataset using Regression on Order Statistics (ROS).

    Method described in *Nondetects and Data Analysis* by Dennis R.
    Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
    values of a dataset. When there is insufficient non-censorded data,
    simple substitution is used.

    Parameters
    ----------
    observations : str or array-like
        Label of the column or the float array of censored observations

    censorship : str
        Label of the column or the bool array of the censorship
        status of the observations.

          * True if censored,
          * False if uncensored

    df : DataFrame, optional
        If `observations` and `censorship` are labels, this is the
        DataFrame that contains those columns.

    min_uncensored : int (default is 2)
        The minimum number of uncensored values required before ROS
        can be used to impute the censored observations. When this
        criterion is not met, simple substituion is used instead.

    max_fraction_censored : float (default is 0.8)
        The maximum fraction of censored data below which ROS can be
        used to impute the censored observations. When this fraction is
        exceeded, simple substituion is used instead.

    substitution_fraction : float (default is 0.5)
        The fraction of the detection limit to be used during simple
        substitution of the censored values.

    transform_in : callable (default is np.log)
        Transformation to be applied to the values prior to fitting a
        line to the plotting positions vs. uncensored values.

    transform_out : callable (default is np.exp)
        Transformation to be applied to the imputed censored values
        estimated from the previously computed best-fit line.

    as_array : bool (default is True)
        When True, a numpy array of the imputed observations is
        returned. Otherwise, a modified copy of the original dataframe
        with all of the intermediate calculations is returned.

    Returns
    -------
    imputed : {ndarray, DataFrame}
        The final observations where the censored values have either been
        imputed through ROS or substituted as a fraction of the
        detection limit.

    Notes
    -----
    This function requires pandas 0.14 or more recent.
    N)rV   cenrV   r   r   r   )
r   rN   r   astypeintr+   rb   rJ   r   values)r   r   r   min_uncensoredmax_fraction_censoredsubstitution_fractionr   r   as_arrayN_observations
N_censoredN_uncensoredfraction_censoredoutputs                 r   
impute_rosr     s\   H 
z\,zBBCC
 Xa[NJ&&s++//11J!J.L"^3
 Q\:./4466!#L!1
111g: 
'
'->AV-V-V\:./4466!#L!1
111g:
2j>7*+++/DD++++
 \:|]SS  ('Mr   )F)__doc__r
   numpyr6   pandasr   scipyr   r   rS   rY   rg   rm   ru   r   r   r   logexpr    r   r   <module>r      s                  *G *G *G *GZD D DN  D# # #L(0 (0 (0V     "$ $ $N4 4 4n/S /S /Sd -1%(F"&h h h h h hr   