
    0Phe                        d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZ dd	lmZmZmZmZmZmZ  G d
 d          Z G d de          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& G d de          Z' G d  d!e          Z(eee e!e"e#e$e&e'e(d"
Z)dS )#z
This module contains loss classes suitable for fitting.

It is not part of the public API.
Specific losses are used for regression, binary classification or multiclass
classification.
    Nxlogy   )check_scalar)_weighted_percentile   )CyAbsoluteErrorCyExponentialLossCyHalfBinomialLossCyHalfGammaLossCyHalfMultinomialLossCyHalfPoissonLossCyHalfSquaredErrorCyHalfTweedieLossCyHalfTweedieLossIdentityCyHuberLossCyPinballLoss)HalfLogitLinkIdentityLinkInterval	LogitLinkLogLinkMultinomialLogitc                       e Zd ZdZdZdZdZddZd Zd Z		 	 	 dd	Z
	 	 	 	 dd
Z	 	 	 ddZ	 	 	 	 ddZddZddZddZej        dfdZdS )BaseLossa  Base class for a loss function of 1-dimensional targets.

    Conventions:

        - y_true.shape = sample_weight.shape = (n_samples,)
        - y_pred.shape = raw_prediction.shape = (n_samples,)
        - If is_multiclass is true (multiclass classification), then
          y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
          Note that this corresponds to the return value of decision_function.

    y_true, y_pred, sample_weight and raw_prediction must either be all float64
    or all float32.
    gradient and hessian must be either both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.
    n_classes : {None, int}
        The number of classes for classification, else None.

    Attributes
    ----------
    closs: CyLossFunction
    link : BaseLink
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid Interval for y_pred
    differentiable : bool
        Indicates whether or not loss function is differentiable in
        raw_prediction everywhere.
    need_update_leaves_values : bool
        Indicates whether decision trees in gradient boosting need to uptade
        leave values after having been fit to the (negative) gradients.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If,
        approximated, it should be larger or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
    TFNc                     || _         || _        d| _        d| _        || _        t          t          j         t          j        dd          | _        | j        j	        | _	        d S )NF)
closslinkapprox_hessianconstant_hessian	n_classesr   npinfinterval_y_trueinterval_y_pred)selfr   r   r!   s       R/var/www/html/test/jupyter/venv/lib/python3.11/site-packages/sklearn/_loss/loss.py__init__zBaseLoss.__init__   sV    
	# %"'FF#y8    c                 6    | j                             |          S zuReturn True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        )r$   includesr&   ys     r'   in_y_true_rangezBaseLoss.in_y_true_range        #,,Q///r)   c                 6    | j                             |          S )zuReturn True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        )r%   r,   r-   s     r'   in_y_pred_rangezBaseLoss.in_y_pred_range   r0   r)   r   c                     |t          j        |          }|j        dk    r&|j        d         dk    r|                    d          }| j                            |||||           |S )aJ  Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        Nr   r   y_trueraw_predictionsample_weightloss_out	n_threads)r"   
empty_likendimshapesqueezer   loss)r&   r5   r6   r7   r8   r9   s         r'   r>   zBaseLoss.loss   s    < }V,,H!##(<Q(?1(D(D+33A66N
)' 	 	
 	
 	
 r)   c                    |G|)t          j        |          }t          j        |          }n9t          j        ||j                  }n|t          j        ||j                  }|j        dk    r&|j        d         dk    r|                    d          }|j        dk    r&|j        d         dk    r|                    d          }| j                            ||||||           ||fS )a  Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        Ndtyper   r   )r5   r6   r7   r8   gradient_outr9   )r"   r:   rA   r;   r<   r=   r   loss_gradient)r&   r5   r6   r7   r8   rB   r9   s          r'   rC   zBaseLoss.loss_gradient   s   L #=00!}^<<=|7IJJJ!=x~NNNL !##(<Q(?1(D(D+33A66N!!l&8&;q&@&@'//22L
  )'% 	! 	
 	
 	
 %%r)   c                 4   |t          j        |          }|j        dk    r&|j        d         dk    r|                    d          }|j        dk    r&|j        d         dk    r|                    d          }| j                            |||||           |S )a  Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        Nr   r   )r5   r6   r7   rB   r9   )r"   r:   r;   r<   r=   r   gradient)r&   r5   r6   r7   rB   r9   s         r'   rE   zBaseLoss.gradient  s    > =88L !##(<Q(?1(D(D+33A66N!!l&8&;q&@&@'//22L
)'% 	 	
 	
 	
 r)   c                     |@|)t          j        |          }t          j        |          }n+t          j        |          }n|t          j        |          }|j        dk    r&|j        d         dk    r|                    d          }|j        dk    r&|j        d         dk    r|                    d          }|j        dk    r&|j        d         dk    r|                    d          }| j                            ||||||           ||fS )a  Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        Nr   r   )r5   r6   r7   rB   hessian_outr9   )r"   r:   r;   r<   r=   r   gradient_hessian)r&   r5   r6   r7   rB   rG   r9   s          r'   rH   zBaseLoss.gradient_hessian=  s0   N "!}^<< mN;;!}[99 -55K !##(<Q(?1(D(D+33A66N!!l&8&;q&@&@'//22Lq  [%6q%9Q%>%>%--a00K
##)'%# 	$ 	
 	
 	
 [((r)   c           	      ^    t          j        |                     ||dd|          |          S )a{  Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        Nr4   weights)r"   averager>   )r&   r5   r6   r7   r9   s        r'   __call__zBaseLoss.__call__  sG    ( zII-"#    "	
 	
 	
 		
r)   c                     t          j        ||d          }dt          j        |j                  j        z  }| j        j        t           j         k    rd}n(| j        j        r| j        j        }n| j        j        |z   }| j        j	        t           j        k    rd}n(| j        j
        r| j        j	        }n| j        j	        |z
  }||| j                            |          S | j                            t          j        |||                    S )a#  Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        r   rK   axis
   N)r"   rL   finforA   epsr%   lowr#   low_inclusivehighhigh_inclusiver   clip)r&   r5   r7   y_predrS   a_mina_maxs          r'   fit_intercept_onlyzBaseLoss.fit_intercept_only  s    ( FMBBB28FL))--#w..EE!/ 	3(,EE(,s2E$..EE!0 	4(-EE(-3E=U]9>>&)))9>>"'&%"?"?@@@r)   c                 *    t          j        |          S )zpCalculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.
        )r"   
zeros_liker&   r5   r7   s      r'   constant_to_optimal_zeroz!BaseLoss.constant_to_optimal_zero  s    
 }V$$$r)   Fc                 $   |t           j        t           j        fvrt          d| d          | j        r
|| j        f}n|f}t          j        |||          }| j        rt          j        d|          }nt          j        |||          }||fS )au  Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape             (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.
        zCValid options for 'dtype' are np.float32 and np.float64. Got dtype=z	 instead.)r<   rA   order)r   )r<   rA   )	r"   float32float64
ValueErroris_multiclassr!   emptyr    ones)r&   	n_samplesrA   rc   r<   rE   hessians          r'   init_gradient_and_hessianz"BaseLoss.init_gradient_and_hessian  s    8 RZ000.". . .  
  	!/EELE8%uEBBB  	F
 gD666GGhU%uEEEG  r)   N)NNr   NNNr   Nr   )__name__
__module____qualname____doc__differentiableneed_update_leaves_valuesrg   r(   r/   r2   r>   rC   rE   rH   rM   r\   r`   r"   re   rl    r)   r'   r   r   F   sK       / /t N %M9 9 9 90 0 00 0 0 + + + +b =& =& =& =&F / / / /j @) @) @) @)D
 
 
 
>(A (A (A (AT% % % % :<3 1! 1! 1! 1! 1! 1!r)   r   c                   $     e Zd ZdZd fd	Z xZS )HalfSquaredErrora  Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.
    Nc                     t                                          t                      t                                 |d u | _        d S )Nr   r   )superr(   r   r   r    r&   r7   	__class__s     r'   r(   zHalfSquaredError.__init__  s>    133,..III - 5r)   rm   rp   rq   rr   rs   r(   __classcell__r}   s   @r'   rx   rx     sG         "6 6 6 6 6 6 6 6 6 6r)   rx   c                   4     e Zd ZdZdZdZd fd	ZddZ xZS )AbsoluteErrora  Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.
    FTNc                     t                                          t                      t                                 d| _        |d u | _        d S )Nrz   T)r{   r(   r	   r   r   r    r|   s     r'   r(   zAbsoluteError.__init__3  sE    00|~~FFF" - 5r)   c                 T    |t          j        |d          S t          ||d          S )Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        Nr   rP   2   )r"   medianr   r_   s      r'   r\   z AbsoluteError.fit_intercept_only8  s1      9V!,,,,'rBBBr)   rm   	rp   rq   rr   rs   rt   ru   r(   r\   r   r   s   @r'   r   r     sp         " N $6 6 6 6 6 6
	C 	C 	C 	C 	C 	C 	C 	Cr)   r   c                   4     e Zd ZdZdZdZd fd	Zd	dZ xZS )
PinballLossa  Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u *(1 - quantile)  if u < 0
                             u * quantile       if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level of the quantile to be estimated. Must be in range (0, 1).
    FTN      ?c                     t          |dt          j        ddd           t                                          t          t          |                    t                                 d| _        |d u | _	        d S )	Nquantiler   r   neithertarget_typemin_valmax_valinclude_boundaries)r   rz   T)
r   numbersRealr{   r(   r   floatr   r   r    )r&   r7   r   r}   s      r'   r(   zPinballLoss.__init__e  s    (	
 	
 	
 	
 	x999 	 	
 	
 	
 # - 5r)   c                     |$t          j        |d| j        j        z  d          S t	          ||d| j        j        z            S )r   Nd   r   r   )r"   
percentiler   r   r   r_   s      r'   r\   zPinballLoss.fit_intercept_onlyu  sN      =tz/B)BKKKK'sTZ-@'@  r)   )Nr   rm   r   r   s   @r'   r   r   D  sh         : N $6 6 6 6 6 6        r)   r   c                   4     e Zd ZdZdZdZd	 fd	Zd
dZ xZS )	HuberLossa  Huber loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the Huber loss is defined as::

        loss(x_i) = 1/2 * abserr**2            if abserr <= delta
                    delta * (abserr - delta/2) if abserr > delta

        abserr = |y_true_i - raw_prediction_i|
        delta = quantile(abserr, self.quantile)

    Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
    equals delta * (AbsoluteError() - delta/2).

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level which defines the breaking point `delta` to distinguish
        between absolute error and squared error. Must be in range (0, 1).

     Reference
    ---------
    .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
      boosting machine <10.1214/aos/1013203451>`.
      Annals of Statistics, 29, 1189-1232.
    FTN?r   c                    t          |dt          j        ddd           || _        t	                                          t          t          |                    t                                 d| _	        d	| _
        d S )
Nr   r   r   r   r   )deltarz   TF)r   r   r   r   r{   r(   r   r   r   r   r    )r&   r7   r   r   r}   s       r'   r(   zHuberLoss.__init__  s    (	
 	
 	
 	
 !E%LL111 	 	
 	
 	
 # %r)   c                     |t          j        |dd          }nt          ||d          }||z
  }t          j        |          t          j        | j        j        t          j        |                    z  }|t          j        ||          z   S )r   Nr   r   r   rJ   )	r"   r   r   signminimumr   r   absrL   )r&   r5   r7   r   diffterms         r'   r\   zHuberLoss.fit_intercept_only  s      ]62A666FF)&-DDFwt}}rz$**:BF4LLIII
4?????r)   )Nr   r   rm   r   r   s   @r'   r   r     sq         B N $& & & & & &"@ @ @ @ @ @ @ @r)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfPoissonLossa  Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    Nc                     t                                          t                      t                                 t	          dt
          j        dd          | _        d S )Nrz   r   TF)r{   r(   r   r   r   r"   r#   r$   r|   s     r'   r(   zHalfPoissonLoss.__init__  sI    022CCC'264??r)   c                 :    t          ||          |z
  }|||z  }|S rm   r   r&   r5   r7   r   s       r'   r`   z(HalfPoissonLoss.constant_to_optimal_zero  s+    VV$$v-$M!Dr)   rm   rp   rq   rr   rs   r(   r`   r   r   s   @r'   r   r     sa         (@ @ @ @ @ @       r)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfGammaLossaV  Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    Nc                     t                                          t                      t                                 t	          dt
          j        dd          | _        d S )Nrz   r   F)r{   r(   r   r   r   r"   r#   r$   r|   s     r'   r(   zHalfGammaLoss.__init__   sH    00wyyAAA'265%@@r)   c                 D    t          j        |           dz
  }|||z  }|S ro   )r"   logr   s       r'   r`   z&HalfGammaLoss.constant_to_optimal_zero  s+    v"$M!Dr)   rm   r   r   s   @r'   r   r     sa         &A A A A A A       r)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfTweedieLossa  Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    N      ?c                    t                                          t          t          |                    t	                                 | j        j        dk    r.t          t          j	         t          j	        dd          | _
        d S | j        j        dk     r#t          dt          j	        dd          | _
        d S t          dt          j	        dd          | _
        d S N)powerrz   r   Fr   T)r{   r(   r   r   r   r   r   r   r"   r#   r$   r&   r7   r   r}   s      r'   r(   zHalfTweedieLoss.__init__*  s    #%,,777 	 	
 	
 	
 :q  #+RVGRVUE#J#JD   Z!!#+ArvtU#C#CD   #+Arvue#D#DD   r)   c                    | j         j        dk    r#t                                          ||          S | j         j        dk    r#t	                                          ||          S | j         j        dk    r#t                                          ||          S | j         j        }t          j        t          j        |d          d|z
            d|z
  z  d|z
  z  }|||z  }|S )Nr   )r5   r7   r   r   )r   r   rx   r`   r   r   r"   maximum)r&   r5   r7   pr   s        r'   r`   z(HalfTweedieLoss.constant_to_optimal_zero6  s    :q  #%%>>] ?    Z"""$$==] >    Z"" ??;;] <    
 A8BJvq111q599QUCq1uMD(%Kr)   Nr   rm   r   r   s   @r'   r   r     sa         <
E 
E 
E 
E 
E 
E       r)   r   c                   $     e Zd ZdZd fd	Z xZS )HalfTweedieLossIdentityan  Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    Nr   c                 z   t                                          t          t          |                    t	                                 | j        j        dk    r-t          t          j	         t          j	        dd          | _
        nS| j        j        dk     r"t          dt          j	        dd          | _
        n!t          dt          j	        dd          | _
        | j        j        dk    r.t          t          j	         t          j	        dd          | _        d S t          dt          j	        dd          | _        d S r   )r{   r(   r   r   r   r   r   r   r"   r#   r$   r%   r   s      r'   r(   z HalfTweedieLossIdentity.__init__g  s   +%,,??? 	 	
 	
 	
 :q  #+RVGRVUE#J#JD  Z!!#+ArvtU#C#CD  #+Arvue#D#DD :q  #+RVGRVUE#J#JD   #+Arvue#D#DD   r)   r   r~   r   s   @r'   r   r   K  sQ         6E E E E E E E E E Er)   r   c                   2     e Zd ZdZd fd	ZddZd Z xZS )HalfBinomialLossaY  Half Binomial deviance loss with logit link, for binary classification.

    This is also know as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.

    More details: Inserting the predicted probability y_pred = expit(raw_prediction)
    in the loss gives the well known::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    Nc                     t                                          t                      t                      d           t	          dddd          | _        d S Nr   r   r   r!   r   r   T)r{   r(   r   r   r   r$   r|   s     r'   r(   zHalfBinomialLoss.__init__  sU    $&& 	 	
 	
 	

  (1dD99r)   c                 b    t          ||          t          d|z
  d|z
            z   }|||z  }|S ro   r   r   s       r'   r`   z)HalfBinomialLoss.constant_to_optimal_zero  s=    VV$$uQZV'D'DD$M!Dr)   c                 &   |j         dk    r&|j        d         dk    r|                    d          }t          j        |j        d         df|j                  }| j                            |          |dddf<   d|dddf         z
  |dddf<   |S a=  Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        r   r   r   r@   Nr;   r<   r=   r"   rh   rA   r   inverser&   r6   probas      r'   predict_probazHalfBinomialLoss.predict_proba       !##(<Q(?1(D(D+33A66N..q115^=QRRRi''77aaad%1+oaaadr)   rm   rp   rq   rr   rs   r(   r`   r   r   r   s   @r'   r   r   y  sj         >: : : : : :         r)   r   c                   L     e Zd ZdZdZd fd	Zd ZddZd Z	 	 	 	 dd
Z	 xZ
S )HalfMultinomialLossa  Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution, it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    TN   c                     t                                          t                      t                      |           t	          dt
          j        dd          | _        t	          dddd          | _        d S )Nr   r   TFr   )	r{   r(   r   r   r   r"   r#   r$   r%   )r&   r7   r!   r}   s      r'   r(   zHalfMultinomialLoss.__init__  so    '))!## 	 	
 	
 	

  (264??'1eU;;r)   c                     | j                             |          o/t          j        |                    t
                    |k              S r+   )r$   r,   r"   allastypeintr-   s     r'   r/   z#HalfMultinomialLoss.in_y_true_range  s9     #,,Q//NBF188C==A;M4N4NNr)   c                    t          j        | j        |j                  }t          j        |j                  j        }t          | j                  D ]B}t          j        ||k    |d          ||<   t          j        ||         |d|z
            ||<   C| j	        	                    |dddf                   
                    d          S )zCompute raw_prediction of an intercept-only model.

        This is the softmax of the weighted average of the target, i.e. over
        the samples axis=0.
        r@   r   rO   r   N)r"   zerosr!   rA   rR   rS   rangerL   rX   r   reshape)r&   r5   r7   outrS   ks         r'   r\   z&HalfMultinomialLoss.fit_intercept_only  s     ht~V\:::hv|$$(t~&& 	3 	3AZ!]KKKCFWSVS!c'22CFFy~~c$'l++33B777r)   c                 6    | j                             |          S )a=  Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        )r   r   )r&   r6   s     r'   r   z!HalfMultinomialLoss.predict_proba  s     y  000r)   r   c                     |@|)t          j        |          }t          j        |          }n+t          j        |          }n|t          j        |          }| j                            ||||||           ||fS )aK  Compute gradient and class probabilities fow raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        N)r5   r6   r7   rB   	proba_outr9   )r"   r:   r   gradient_proba)r&   r5   r6   r7   rB   r   r9   s          r'   r   z"HalfMultinomialLoss.gradient_proba  s    H  !}^<<M.99		!}Y77l33I
!!)'% 	" 	
 	
 	
 Y&&r)   )Nr   rm   rn   )rp   rq   rr   rs   rg   r(   r/   r\   r   r   r   r   s   @r'   r   r     s           D M< < < < < <O O O8 8 8 81 1 1& 5' 5' 5' 5' 5' 5' 5' 5'r)   r   c                   2     e Zd ZdZd fd	ZddZd Z xZS )ExponentialLossa"  Exponential loss with (half) logit link, for binary classification.

    This is also know as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i)) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    Note that the formulation works for classification, y = {0, 1}, as well as
    "exponential logistic" regression, y = [0, 1].
    Note that this is a proper scoring rule, but without it's canonical link.

    More details: Inserting the predicted probability
    y_pred = expit(2 * raw_prediction) in the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
            + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    Nc                     t                                          t                      t                      d           t	          dddd          | _        d S r   )r{   r(   r
   r   r   r$   r|   s     r'   r(   zExponentialLoss.__init__m  sU    #%% 	 	
 	
 	

  (1dD99r)   c                 N    dt          j        |d|z
  z            z  }|||z  }|S )Nr   )r"   sqrtr   s       r'   r`   z(ExponentialLoss.constant_to_optimal_zerou  s4    BGFa&j1222$M!Dr)   c                 &   |j         dk    r&|j        d         dk    r|                    d          }t          j        |j        d         df|j                  }| j                            |          |dddf<   d|dddf         z
  |dddf<   |S r   r   r   s      r'   r   zExponentialLoss.predict_proba|  r   r)   rm   r   r   s   @r'   r   r   I  sk        ! !F: : : : : :         r)   r   )
squared_errorabsolute_errorpinball_loss
huber_losspoisson_loss
gamma_losstweedie_lossbinomial_lossmultinomial_lossexponential_loss)*rs   r   numpyr"   scipy.specialr   utilsr   utils.statsr   _lossr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rx   r   r   r   r   r   r   r   r   r   r   _LOSSESrv   r)   r'   <module>r      s   *                        . . . . . .                                        8z! z! z! z! z! z! z! z!B6 6 6 6 6x 6 6 6.#C #C #C #C #CH #C #C #CL< < < < <( < < <~F@ F@ F@ F@ F@ F@ F@ F@R    h   @    H   >= = = = =h = = =@+E +E +E +E +Eh +E +E +E\B B B B Bx B B BJH' H' H' H' H'( H' H' H'VF F F F Fh F F FT &###%+' r)   