"""
Generalized Additive Models

Author: Luca Puggini
Author: Josef Perktold

created on 08/07/2015
"""

from collections.abc import Iterable
import copy

import numpy as np
from scipy import optimize
import pandas as pd

import statsmodels.base.wrapper as wrap

from statsmodels.discrete.discrete_model import Logit
from statsmodels.genmod.generalized_linear_model import (
    GLM, GLMResults, GLMResultsWrapper, _check_convergence)
import statsmodels.regression.linear_model as lm
# import statsmodels.regression._tools as reg_tools  # TODO: use this for pirls
from statsmodels.tools.sm_exceptions import (PerfectSeparationError,
                                             ValueWarning)
from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.data import _is_using_pandas
from statsmodels.tools.linalg import matrix_sqrt

from statsmodels.base._penalized import PenalizedMixin
from statsmodels.gam.gam_penalties import MultivariateGamPenalty
from statsmodels.gam.gam_cross_validation.gam_cross_validation import (
    MultivariateGAMCVPath)
from statsmodels.gam.gam_cross_validation.cross_validators import KFold


def _transform_predict_exog(model, exog, design_info=None):
    """transform exog for predict using design_info

    Note: this is copied from base.model.Results.predict and converted to
    standalone function with additional options.
    """

    is_pandas = _is_using_pandas(exog, None)

    exog_index = exog.index if is_pandas else None

    if design_info is None:
        design_info = getattr(model.data, 'design_info', None)

    if design_info is not None and (exog is not None):
        from patsy import dmatrix
        if isinstance(exog, pd.Series):
            # we are guessing whether it should be column or row
            if (hasattr(exog, 'name') and isinstance(exog.name, str) and
                    exog.name in design_info.describe()):
                # assume we need one column
                exog = pd.DataFrame(exog)
            else:
                # assume we need a row
                exog = pd.DataFrame(exog).T
        orig_exog_len = len(exog)
        is_dict = isinstance(exog, dict)
        exog = dmatrix(design_info, exog, return_type="dataframe")
        if orig_exog_len > len(exog) and not is_dict:
            import warnings
            if exog_index is None:
                warnings.warn('nan values have been dropped', ValueWarning)
            else:
                exog = exog.reindex(exog_index)
        exog_index = exog.index

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (model.exog.ndim == 1 or
                               model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    return exog, exog_index


class GLMGamResults(GLMResults):
    """Results class for generalized additive models, GAM.

    This inherits from GLMResults.

    Warning: some inherited methods might not correctly take account of the
    penalization.

    All methods related to the loglikelihood function return the penalized
    values.

    Attributes
    ----------

    edf
        list of effective degrees of freedom for each column of the design
        matrix.
    hat_matrix_diag
        diagonal of hat matrix
    gcv
        generalized cross-validation criterion computed as
        ``gcv = scale / (1. - hat_matrix_trace / self.nobs)**2``
    cv
        cross-validation criterion computed as
        ``cv = ((resid_pearson / (1 - hat_matrix_diag))**2).sum() / nobs``

    Notes
    -----
    status: experimental
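
    Examples
    --------
    A usage sketch; ``res`` stands for the wrapped results instance returned
    by ``GLMGam(...).fit()`` and the smooth term index 0 is illustrative.

    >>> wt = res.test_significance(0)  # Wald test for smooth term 0
    >>> total_edf = res.edf.sum()  # total effective degrees of freedom
    >>> gcv = res.gcv  # generalized cross-validation criterion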
    """

    def __init__(self, model, params, normalized_cov_params, scale, **kwds):

        # this is a messy way to compute edf and update scale
        # need several attributes to compute edf
        self.model = model
        self.params = params
        self.normalized_cov_params = normalized_cov_params
        self.scale = scale
        edf = self.edf.sum()
        self.df_model = edf - 1  # assume constant
        # need to use nobs or wnobs attribute
        self.df_resid = self.model.endog.shape[0] - edf

        # we are setting the model df for the case when super is using it
        # df in the model will be in an incorrect state when alpha/pen_weight
        # changes
        self.model.df_model = self.df_model
        self.model.df_resid = self.df_resid
        mu = self.fittedvalues
        self.scale = scale = self.model.estimate_scale(mu)
        super().__init__(
            model, params, normalized_cov_params, scale, **kwds
        )

    def _tranform_predict_exog(self, exog=None, exog_smooth=None,
                               transform=True):
        """Transform original explanatory variables for prediction

        Parameters
        ----------
        exog : array_like, optional
            The values for the linear explanatory variables.
        exog_smooth : array_like
            values for the variables in the smooth terms
        transform : bool, optional
            If transform is False, then no basis transformation is applied;
            ``exog`` and ``exog_smooth`` are used as provided and, if both
            are given, column-stacked. It is assumed that they already
            contain the full design matrix for the predict observations.
            If transform is True, then the basis representation of the smooth
            terms will be constructed from the provided ``exog_smooth`` and,
            if the model was created with a formula, ``exog`` is transformed
            using the linear design information.

        Returns
        -------
        exog_transformed : ndarray
            design matrix for the prediction
        """
        if exog_smooth is not None:
            exog_smooth = np.asarray(exog_smooth)
        exog_index = None
        if transform is False:
            # the following allows that either or both exog are not None
            if exog_smooth is None:
                # exog could be None or array
                ex = exog
            else:
                if exog is None:
                    ex = exog_smooth
                else:
                    ex = np.column_stack((exog, exog_smooth))
        else:
            # transform exog_linear if needed
            if exog is not None and hasattr(self.model, 'design_info_linear'):
                exog, exog_index = _transform_predict_exog(
                    self.model, exog, self.model.design_info_linear)

            # create smooth basis
            if exog_smooth is not None:
                ex_smooth = self.model.smoother.transform(exog_smooth)
                if exog is None:
                    ex = ex_smooth
                else:
                    # TODO: there might be problems if exog_smooth is 1-D
                    ex = np.column_stack((exog, ex_smooth))
            else:
                ex = exog

        return ex, exog_index

    def predict(self, exog=None, exog_smooth=None, transform=True, **kwargs):
        """"
        compute prediction

        Parameters
        ----------
        exog : array_like, optional
            The values for the linear explanatory variables
        exog_smooth : array_like
            values for the variables in the smooth terms
        transform : bool, optional
            If transform is True, then the basis representation of the smooth
            terms will be constructed from the provided ``exog_smooth``.
        kwargs :
            Some models can take additional arguments or keywords, see the
            predict method of the model for the details.

        Returns
        -------
        prediction : ndarray, pandas.Series or pandas.DataFrame
            predicted values
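
        Examples
        --------
        A usage sketch, continuing the example in the ``GLMGam`` class
        docstring; ``x_new`` is illustrative and must have the same number
        of columns as the data used to create the smoother.

        >>> x_new = np.random.uniform(0, 1, (10, 2))
        >>> pred = res.predict(exog_smooth=x_new)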
        """
        ex, exog_index = self._tranform_predict_exog(exog=exog,
                                                     exog_smooth=exog_smooth,
                                                     transform=transform)
        predict_results = super().predict(ex, transform=False, **kwargs)
        if exog_index is not None and not hasattr(
                predict_results, 'predicted_values'):
            if predict_results.ndim == 1:
                return pd.Series(predict_results, index=exog_index)
            else:
                return pd.DataFrame(predict_results, index=exog_index)
        else:
            return predict_results

    def get_prediction(self, exog=None, exog_smooth=None, transform=True,
                       **kwargs):
        """compute prediction results

        Parameters
        ----------
        exog : array_like, optional
            The values for the linear explanatory variables.
        exog_smooth : array_like
            values for the variables in the smooth terms
        transform : bool, optional
            If transform is True, then the basis representation of the smooth
            terms will be constructed from the provided ``exog_smooth``.
        kwargs :
            Some models can take additional arguments or keywords, see the
            predict method of the model for the details.

        Returns
        -------
        prediction_results : generalized_linear_model.PredictionResults
            The prediction results instance contains prediction and prediction
            variance and can on demand calculate confidence intervals and
            summary tables for the prediction of the mean and of new
            observations.
        """
        ex, exog_index = self._tranform_predict_exog(exog=exog,
                                                     exog_smooth=exog_smooth,
                                                     transform=transform)
        return super().get_prediction(ex, transform=False, **kwargs)

    def partial_values(self, smooth_index, include_constant=True):
        """contribution of a smooth term to the linear prediction

        Warning: This will be replaced by a predict method

        Parameters
        ----------
        smooth_index : int
            index of the smooth term within list of smooth terms
        include_constant : bool
            If true, then the estimated intercept is added to the prediction
            and its standard errors. This avoids the confidence interval
            having zero width at the imposed identification constraint, e.g.
            either at a reference point or at the mean.

        Returns
        -------
        predicted : ndarray
            predicted value of the linear term.
            This is not the expected response if the link function is not
            linear.
        se_pred : ndarray
            standard error of the linear prediction
        """
        variable = smooth_index
        smoother = self.model.smoother
        mask = smoother.mask[variable]

        start_idx = self.model.k_exog_linear
        idx = start_idx + np.nonzero(mask)[0]

        # smoother has only smooth parts, not exog_linear
        exog_part = smoother.basis[:, mask]

        const_idx = self.model.data.const_idx
        if include_constant and const_idx is not None:
            idx = np.concatenate(([const_idx], idx))
            exog_part = self.model.exog[:, idx]

        linpred = np.dot(exog_part, self.params[idx])
        # select the submatrix corresponding to a single variable
        partial_cov_params = self.cov_params(column=idx)

        covb = partial_cov_params
        var = (exog_part * np.dot(covb, exog_part.T).T).sum(1)
        se = np.sqrt(var)

        return linpred, se

    def plot_partial(self, smooth_index, plot_se=True, cpr=False,
                     include_constant=True, ax=None):
        """plot the contribution of a smooth term to the linear prediction

        Parameters
        ----------
        smooth_index : int
            index of the smooth term within list of smooth terms
        plot_se : bool
            If plot_se is true, then the confidence interval for the linear
            prediction will be added to the plot.
        cpr : bool
            If cpr (component plus residual) is true, then a scatter plot of
            the partial working residuals will be added to the plot.
        include_constant : bool
            If true, then the estimated intercept is added to the prediction
            and its standard errors. This avoids the confidence interval
            having zero width at the imposed identification constraint, e.g.
            either at a reference point or at the mean.
        ax : None or matplotlib axis instance
           If ax is not None, then the plot will be added to it.

        Returns
        -------
        Figure
            If `ax` is None, the created figure. Otherwise, the Figure to which
            `ax` is connected.
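
        Examples
        --------
        A usage sketch, assuming ``res`` is a fitted ``GLMGamResults``
        instance and matplotlib is installed:

        >>> fig = res.plot_partial(0, cpr=True, include_constant=True)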
        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_ax
        _import_mpl()

        variable = smooth_index
        y_est, se = self.partial_values(variable,
                                        include_constant=include_constant)
        smoother = self.model.smoother
        x = smoother.smoothers[variable].x
        sort_index = np.argsort(x)
        x = x[sort_index]
        y_est = y_est[sort_index]
        se = se[sort_index]

        fig, ax = create_mpl_ax(ax)

        if cpr:
            # TODO: resid_response does not make sense with nonlinear link
            # use resid_working ?
            residual = self.resid_working[sort_index]
            cpr_ = y_est + residual
            ax.scatter(x, cpr_, s=4)

        ax.plot(x, y_est, c='blue', lw=2)
        if plot_se:
            ax.plot(x, y_est + 1.96 * se, '-', c='blue')
            ax.plot(x, y_est - 1.96 * se, '-', c='blue')

        ax.set_xlabel(smoother.smoothers[variable].variable_name)

        return fig

    def test_significance(self, smooth_index):
        """hypothesis test that a smooth component is zero.

        This calls `wald_test` to compute the hypothesis test, but uses
        effective degrees of freedom.

        Parameters
        ----------
        smooth_index : int
            index of the smooth term within list of smooth terms

        Returns
        -------
        wald_test : ContrastResults instance
            the results instance created by `wald_test`
        """

        variable = smooth_index
        smoother = self.model.smoother
        start_idx = self.model.k_exog_linear

        k_params = len(self.params)
        # a bit messy, we need first index plus length of smooth term
        mask = smoother.mask[variable]
        k_constraints = mask.sum()
        idx = start_idx + np.nonzero(mask)[0][0]
        constraints = np.eye(k_constraints, k_params, idx)
        df_constraints = self.edf[idx: idx + k_constraints].sum()

        return self.wald_test(constraints, df_constraints=df_constraints)

    def get_hat_matrix_diag(self, observed=True, _axis=1):
        """
        Compute the diagonal of the hat matrix

        Parameters
        ----------
        observed : bool
            If true, then the observed Hessian is used in the hat matrix
            computation. If false, then the expected Hessian is used.
            In the case of a canonical link function both are the same.
            This is only relevant for models that implement both observed
            and expected Hessian, which is currently only GLM. Other
            models only use the observed Hessian.
        _axis : int
            This is mainly for internal use. By default it returns the usual
            diagonal of the hat matrix. If _axis is zero, then the result
            corresponds to the effective degrees of freedom, ``edf`` for each
            column of exog.

        Returns
        -------
        hat_matrix_diag : ndarray
            The diagonal of the hat matrix computed from the observed
            or expected hessian.
        """
        weights = self.model.hessian_factor(self.params, scale=self.scale,
                                            observed=observed)
        wexog = np.sqrt(weights)[:, None] * self.model.exog

        # we can use inverse hessian directly instead of computing it from
        # WLS/IRLS as in GLM

        # TODO: does `normalized_cov_params * scale` work in all cases?
        # this avoids recomputing hessian, check when used for other models.
        hess_inv = self.normalized_cov_params * self.scale
        # this is in GLM equivalent to the more generic and direct
        # hess_inv = np.linalg.inv(-self.model.hessian(self.params))
        hd = (wexog * hess_inv.dot(wexog.T).T).sum(axis=_axis)
        return hd

    @cache_readonly
    def edf(self):
        return self.get_hat_matrix_diag(_axis=0)

    @cache_readonly
    def hat_matrix_trace(self):
        return self.hat_matrix_diag.sum()

    @cache_readonly
    def hat_matrix_diag(self):
        return self.get_hat_matrix_diag(observed=True)

    @cache_readonly
    def gcv(self):
        return self.scale / (1. - self.hat_matrix_trace / self.nobs)**2

    @cache_readonly
    def cv(self):
        cv_ = ((self.resid_pearson / (1. - self.hat_matrix_diag))**2).sum()
        cv_ /= self.nobs
        return cv_


class GLMGamResultsWrapper(GLMResultsWrapper):
    pass


wrap.populate_wrapper(GLMGamResultsWrapper, GLMGamResults)


class GLMGam(PenalizedMixin, GLM):
    """
    Generalized Additive Models (GAM)

    This inherits from `GLM`.

    Warning: Some inherited methods might not correctly take account of the
    penalization. Not all options, including offset and exposure, have been
    verified yet.

    Parameters
    ----------
    endog : array_like
        The response variable.
    exog : array_like or None
        These explanatory variables are treated as linear. The model in this
        case is a partially linear model.
    smoother : instance of additive smoother class
        Examples of smoother instances include BSplines or CyclicCubicSplines.
    alpha : float or list of floats
        Penalization weights for smooth terms. The length of the list needs
        to be the same as the number of smooth terms in the ``smoother``.
    family : instance of GLM family
        See GLM.
    offset : None or array_like
        See GLM.
    exposure : None or array_like
        See GLM.
    missing : 'none'
        Missing value handling is not supported in this class.
    **kwargs
        Extra keywords are used in call to the super classes.

    Notes
    -----
    Status: experimental. This has full unit test coverage for the core
    results with Gaussian and Poisson (without offset and exposure). Other
    options and additional results might not be correctly supported yet.
    (Binomial with counts, i.e. with n_trials, is most likely wrong in pirls.
    User specified var or freq weights are most likely also not correct for
    all results.)
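
    Examples
    --------
    A minimal usage sketch with simulated data; ``BSplines`` is assumed to
    be importable from ``statsmodels.gam.api`` and all names and values are
    illustrative.

    >>> import numpy as np
    >>> from statsmodels.gam.api import GLMGam, BSplines
    >>> np.random.seed(0)
    >>> x = np.random.uniform(0, 1, (200, 2))
    >>> noise = np.random.normal(size=200)
    >>> y = np.sin(2 * np.pi * x[:, 0]) + x[:, 1] + noise
    >>> bs = BSplines(x, df=[6, 6], degree=[3, 3])
    >>> gam = GLMGam(y, smoother=bs, alpha=[0.1, 0.1])
    >>> res = gam.fit()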
    """

    _results_class = GLMGamResults
    _results_class_wrapper = GLMGamResultsWrapper

    def __init__(self, endog, exog=None, smoother=None, alpha=0, family=None,
                 offset=None, exposure=None, missing='none', **kwargs):

        # TODO: check usage of hasconst
        hasconst = kwargs.get('hasconst', None)
        xnames_linear = None
        if hasattr(exog, 'design_info'):
            self.design_info_linear = exog.design_info
            xnames_linear = self.design_info_linear.column_names

        is_pandas = _is_using_pandas(exog, None)

        # TODO: handle data is experimental, see #5469
        # This is a bit wasteful because we need to call `_handle_data` twice
        self.data_linear = self._handle_data(endog, exog, missing, hasconst)
        if xnames_linear is None:
            xnames_linear = self.data_linear.xnames
        if exog is not None:
            exog_linear = self.data_linear.exog
            k_exog_linear = exog_linear.shape[1]
        else:
            exog_linear = None
            k_exog_linear = 0
        self.k_exog_linear = k_exog_linear
        # We need exog_linear for k-fold cross validation
        # TODO: alternative is to take columns from combined exog
        self.exog_linear = exog_linear

        self.smoother = smoother
        self.k_smooths = smoother.k_variables
        self.alpha = self._check_alpha(alpha)
        penal = MultivariateGamPenalty(smoother, alpha=self.alpha,
                                       start_idx=k_exog_linear)
        kwargs.pop('penal', None)
        if exog_linear is not None:
            exog = np.column_stack((exog_linear, smoother.basis))
        else:
            exog = smoother.basis

        # TODO: check: xnames_linear will be None instead of empty list
        #       if no exog_linear
        # can the smoother be empty? Probably not allowed.
        if xnames_linear is None:
            xnames_linear = []
        xnames = xnames_linear + self.smoother.col_names

        if is_pandas and exog_linear is not None:
            # we have a DataFrame, so we can get a PandasData instance
            # for wrapping
            exog = pd.DataFrame(exog, index=self.data_linear.row_labels,
                                columns=xnames)

        super().__init__(endog, exog=exog, family=family,
                         offset=offset, exposure=exposure,
                         penal=penal, missing=missing, **kwargs)

        if not is_pandas:
            # set exog names if not given by pandas DataFrame
            self.exog_names[:] = xnames

        # TODO: the generic data handling might attach the design_info from the
        #       linear part, but this is incorrect for the full model and
        #       causes problems in wald_test_terms

        if hasattr(self.data, 'design_info'):
            del self.data.design_info
        # formula also might be attached which causes problems in predict
        if hasattr(self, 'formula'):
            self.formula_linear = self.formula
            self.formula = None
            del self.formula

    def _check_alpha(self, alpha):
        """check and convert alpha to required list format

        Parameters
        ----------
        alpha : scalar, list or array_like
            penalization weight

        Returns
        -------
        alpha : list
            penalization weight, list with length equal to the number of
            smooth terms
        """
        if not isinstance(alpha, Iterable):
            alpha = [alpha] * len(self.smoother.smoothers)
        elif not isinstance(alpha, list):
            # we want alpha to be a list
            alpha = list(alpha)
        return alpha

    def fit(self, start_params=None, maxiter=1000, method='pirls', tol=1e-8,
            scale=None, cov_type='nonrobust', cov_kwds=None, use_t=None,
            full_output=True, disp=False, max_start_irls=3, **kwargs):
        """estimate parameters and create instance of GLMGamResults class

        Parameters
        ----------
        Most parameters are the same as for GLM.
        method : str
            The optimization method. The special method is "pirls", which
            uses a penalized version of IRLS. Other methods are the gradient
            optimizers used in base.model.LikelihoodModel.

        Returns
        -------
        res : instance of wrapped GLMGamResults
        """
        # TODO: temporary hack to remove attribute
        # a formula might also be attached by the inherited from_formula,
        # which causes problems in predict
        if hasattr(self, 'formula'):
            self.formula_linear = self.formula
            del self.formula

        # TODO: alpha not allowed yet, but is in `_fit_pirls`
        # alpha = self._check_alpha()

        if method.lower() in ['pirls', 'irls']:
            res = self._fit_pirls(self.alpha, start_params=start_params,
                                  maxiter=maxiter, tol=tol, scale=scale,
                                  cov_type=cov_type, cov_kwds=cov_kwds,
                                  use_t=use_t, **kwargs)
        else:
            if max_start_irls > 0 and (start_params is None):
                res = self._fit_pirls(self.alpha, start_params=start_params,
                                      maxiter=max_start_irls, tol=tol,
                                      scale=scale,
                                      cov_type=cov_type, cov_kwds=cov_kwds,
                                      use_t=use_t, **kwargs)
                start_params = res.params
                del res
            res = super().fit(start_params=start_params,
                              maxiter=maxiter, method=method,
                              tol=tol, scale=scale,
                              cov_type=cov_type, cov_kwds=cov_kwds,
                              use_t=use_t,
                              full_output=full_output, disp=disp,
                              max_start_irls=0,
                              **kwargs)
        return res

    # page 165, section 4.3; page 136, PIRLS
    def _fit_pirls(self, alpha, start_params=None, maxiter=100, tol=1e-8,
                   scale=None, cov_type='nonrobust', cov_kwds=None, use_t=None,
                   weights=None):
        """fit model with penalized reweighted least squares
        """
        # TODO: this currently modifies several attributes
        # self.scale, self.scaletype, self.mu, self.weights
        # self.data_weights,
        # and possibly self._offset_exposure
        # several of those might not be necessary, e.g. mu and weights

        # alpha = alpha * len(y) * self.scale / 100
        # TODO: we need to rescale alpha
        endog = self.endog
        wlsexog = self.exog  # smoother.basis
        spl_s = self.penal.penalty_matrix(alpha=alpha)

        nobs, n_columns = wlsexog.shape

        # TODO what are these values?
        if weights is None:
            self.data_weights = np.array([1.] * nobs)
        else:
            self.data_weights = weights

        if not hasattr(self, '_offset_exposure'):
            self._offset_exposure = 0

        self.scaletype = scale
        # TODO: check default scale types
        # self.scaletype = 'dev'
        # during iteration
        self.scale = 1

        if start_params is None:
            mu = self.family.starting_mu(endog)
            lin_pred = self.family.predict(mu)
        else:
            lin_pred = np.dot(wlsexog, start_params) + self._offset_exposure
            mu = self.family.fitted(lin_pred)
        dev = self.family.deviance(endog, mu)

        history = dict(params=[None, start_params], deviance=[np.inf, dev])
        converged = False
        criterion = history['deviance']
        # This special case is used to get the likelihood for a specific
        # params vector.
        if maxiter == 0:
            mu = self.family.fitted(lin_pred)
            self.scale = self.estimate_scale(mu)
            wls_results = lm.RegressionResults(self, start_params, None)
            iteration = 0

        for iteration in range(maxiter):

            # TODO: is this equivalent to point 1 of page 136:
            # w = 1 / (V(mu) * g'(mu))  ?
            self.weights = self.data_weights * self.family.weights(mu)

            # TODO: is this equivalent to point 1 of page 136:
            # z = g(mu)(y - mu) + X beta  ?
            wlsendog = (lin_pred + self.family.link.deriv(mu) * (endog - mu)
                        - self._offset_exposure)

            # this defines the augmented matrix point 2a on page 136
            wls_results = penalized_wls(wlsendog, wlsexog, spl_s, self.weights)
            lin_pred = np.dot(wlsexog, wls_results.params).ravel()
            lin_pred += self._offset_exposure
            mu = self.family.fitted(lin_pred)

            # We do not need to update scale in GLM/LEF models
            # We might need it in dispersion models.
            # self.scale = self.estimate_scale(mu)
            history = self._update_history(wls_results, mu, history)

            if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
                msg = "Perfect separation detected, results not available"
                raise PerfectSeparationError(msg)

            # TODO need atol, rtol
            # args of _check_convergence: (criterion, iteration, atol, rtol)
            converged = _check_convergence(criterion, iteration, tol, 0)
            if converged:
                break
        self.mu = mu
        self.scale = self.estimate_scale(mu)
        glm_results = GLMGamResults(self, wls_results.params,
                                    wls_results.normalized_cov_params,
                                    self.scale,
                                    cov_type=cov_type, cov_kwds=cov_kwds,
                                    use_t=use_t)

        glm_results.method = "PIRLS"
        history['iteration'] = iteration + 1
        glm_results.fit_history = history
        glm_results.converged = converged

        return GLMGamResultsWrapper(glm_results)

    def select_penweight(self, criterion='aic', start_params=None,
                         start_model_params=None,
                         method='basinhopping', **fit_kwds):
        """find alpha by minimizing results criterion

        The objective for the minimization can be results attributes like
        ``gcv``, ``aic`` or ``bic`` where the latter are based on effective
        degrees of freedom.

        Warning: In many cases the optimization might converge to a local
        optimum or near optimum. Trying different start_params or using a
        global optimizer is recommended; the default is basinhopping.

        Parameters
        ----------
        criterion : str
            Name of the results attribute to be minimized.
            Default is 'aic', other options are 'gcv', 'cv' or 'bic'.
        start_params : None or array
            starting parameters for alpha in the penalization weight
            minimization. The parameters are internally log-transformed and
            the minimization is with respect to ``log(alpha)``.
        start_model_params : None or array
            starting parameter for the ``model._fit_pirls``.
        method : 'basinhopping', 'nm' or 'minimize'
            'basinhopping' and 'nm' directly use the underlying scipy.optimize
            functions `basinhopping` and `fmin`. 'minimize' provides access
            to the high level interface, `scipy.optimize.minimize`.
        fit_kwds : keyword arguments
            additional keyword arguments will be used in the call to the
            scipy optimizer. Which keywords are supported depends on the
            scipy optimization function.

        Returns
        -------
        alpha : ndarray
            penalization parameter found by minimizing the criterion.
            Note that this can be only a local (near) optimum.
        fit_res : tuple
            results returned by the scipy optimization routine. The
            parameters in the optimization problem are `log(alpha)`
        history : dict
            history of calls to pirls; contains alpha, the fit criterion and
            the parameters to which pirls converged for each alpha.

        Notes
        -----
        In the test cases Nelder-Mead and bfgs often converge to local optima,
        see also https://github.com/statsmodels/statsmodels/issues/5381.

        This does not use any analytical derivatives for the criterion
        minimization.

        Status: experimental. It is possible that defaults will change if a
        better way to find a global optimum is found. The API (e.g. the type
        of the return values) might also change.
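
        Examples
        --------
        A usage sketch, continuing the example in the ``GLMGam`` class
        docstring; ``gam``, ``y`` and ``bs`` are the model, response and
        smoother from that example. Each function evaluation refits the
        model, so this can be slow.

        >>> alpha_opt, res_opt, hist = gam.select_penweight(criterion='gcv')
        >>> res = GLMGam(y, smoother=bs, alpha=alpha_opt).fit()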
        """
        # copy attributes that are changed, so we can reset them
        scale_keep = self.scale
        scaletype_keep = self.scaletype
        # TODO: use .copy() method when available for all types
        alpha_keep = copy.copy(self.alpha)

        if start_params is None:
            start_params = np.zeros(self.k_smooths)
        else:
            start_params = np.log(1e-20 + start_params)

        history = {}
        history['alpha'] = []
        history['params'] = [start_model_params]
        history['criterion'] = []

        def fun(p):
            a = np.exp(p)
            res_ = self._fit_pirls(start_params=history['params'][-1],
                                   alpha=a)
            history['alpha'].append(a)
            history['params'].append(np.asarray(res_.params))
            return getattr(res_, criterion)

        if method == 'nm':
            kwds = dict(full_output=True, maxiter=1000, maxfun=2000)
            kwds.update(fit_kwds)
            fit_res = optimize.fmin(fun, start_params, **kwds)
            opt = fit_res[0]
        elif method == 'basinhopping':
            kwds = dict(minimizer_kwargs={'method': 'Nelder-Mead',
                        'options': {'maxiter': 100, 'maxfev': 500}},
                        niter=10)
            kwds.update(fit_kwds)
            fit_res = optimize.basinhopping(fun, start_params, **kwds)
            opt = fit_res.x
        elif method == 'minimize':
            fit_res = optimize.minimize(fun, start_params, **fit_kwds)
            opt = fit_res.x
        else:
            raise ValueError('method not recognized')

        del history['params'][0]  # remove the model start_params

        alpha = np.exp(opt)

        # reset attributes that have or might have changed
        self.scale = scale_keep
        self.scaletype = scaletype_keep
        self.alpha = alpha_keep

        return alpha, fit_res, history

    def select_penweight_kfold(self, alphas=None, cv_iterator=None, cost=None,
                               k_folds=5, k_grid=11):
        """find alphas by k-fold cross-validation

        Warning: This estimates ``k_folds`` models for each point in the
            grid of alphas.

        Parameters
        ----------
        alphas : None or list of arrays
            Grids of penalization weights to search over, one array per
            smooth term. If None, a default grid is used, see Notes.
        cv_iterator : instance
            instance of a cross-validation iterator, by default this is a
            KFold instance
        cost : function
            The cost function used to evaluate the prediction error for the
            left out sample. This should take two arrays as arguments and
            return one float. The default is the Euclidean norm of the
            prediction error divided by the number of observations.
        k_folds : int
            number of folds if the default KFold iterator is used.
            This is ignored if ``cv_iterator`` is not None.
        k_grid : int
            number of points in the default ``alphas`` grid. This is ignored
            if ``alphas`` is not None.

        Returns
        -------
        alpha_cv : list of float
            Best alpha in grid according to cross-validation
        res_cv : instance of MultivariateGAMCVPath
            The instance was used for cross-validation and holds the results

        Notes
        -----
        The default alphas are defined as
        ``alphas = [np.logspace(0, 7, k_grid) for _ in range(k_smooths)]``
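
        Examples
        --------
        A usage sketch, continuing the example in the ``GLMGam`` class
        docstring (``gam``, ``y`` and ``bs`` as defined there):

        >>> alpha_cv, res_cv = gam.select_penweight_kfold(k_folds=5, k_grid=6)
        >>> res = GLMGam(y, smoother=bs, alpha=alpha_cv).fit()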
        """

        if cost is None:
            def cost(x1, x2):
                return np.linalg.norm(x1 - x2) / len(x1)

        if alphas is None:
            alphas = [np.logspace(0, 7, k_grid) for _ in range(self.k_smooths)]

        if cv_iterator is None:
            cv_iterator = KFold(k_folds=k_folds, shuffle=True)

        gam_cv = MultivariateGAMCVPath(smoother=self.smoother, alphas=alphas,
                                       gam=GLMGam, cost=cost, endog=self.endog,
                                       exog=self.exog_linear,
                                       cv_iterator=cv_iterator)
        gam_cv_res = gam_cv.fit()

        return gam_cv_res.alpha_cv, gam_cv_res


class LogitGam(PenalizedMixin, Logit):
    """Generalized Additive model for discrete Logit

    This subclasses discrete_model Logit.

    Warning: Some inherited methods might not correctly take account of the
    penalization.

    This class has not been verified yet.
    """
    def __init__(self, endog, smoother, alpha, *args, **kwargs):
        if not isinstance(alpha, Iterable):
            alpha = np.array([alpha] * len(smoother.smoothers))

        self.smoother = smoother
        self.alpha = alpha
        self.pen_weight = 1  # TODO: pen weight should not be defined here!!
        penal = MultivariateGamPenalty(smoother, alpha=alpha)

        super().__init__(endog, smoother.basis, penal=penal, *args, **kwargs)


def penalized_wls(endog, exog, penalty_matrix, weights):
    """weighted least squares with quadratic penalty

    Parameters
    ----------
    endog : ndarray
        response or endogenous variable
    exog : ndarray
        design matrix, matrix of exogenous or explanatory variables
    penalty_matrix : ndarray, 2-Dim square
        penalty matrix for quadratic penalization. Note, the penalty_matrix
        is multiplied by two to match non-pirls fitting methods.
    weights : ndarray
        weights for WLS

    Returns
    -------
    results : Results instance of WLS
    """
    y, x, s = endog, exog, penalty_matrix
    # TODO: I do not understand why I need 2 * s
    aug_y, aug_x, aug_weights = make_augmented_matrix(y, x, 2 * s, weights)
    wls_results = lm.WLS(aug_y, aug_x, aug_weights).fit()
    # TODO: use MinimalWLS during iterations, less overhead
    # However, MinimalWLS does not return normalized_cov_params
    #   which we need at the end of the iterations
    # call would be
    # wls_results = reg_tools._MinimalWLS(aug_y, aug_x, aug_weights).fit()
    wls_results.params = wls_results.params.ravel()

    return wls_results


def make_augmented_matrix(endog, exog, penalty_matrix, weights):
    """augment endog, exog and weights with stochastic restriction matrix

    Parameters
    ----------
    endog : ndarray
        response or endogenous variable
    exog : ndarray
        design matrix, matrix of exogenous or explanatory variables
    penalty_matrix : ndarray, 2-Dim square
        penalty matrix for quadratic penalization
    weights : ndarray
        weights for WLS

    Returns
    -------
    endog_aug : ndarray
        augmented response variable
    exog_aug : ndarray
        augmented design matrix
    weights_aug : ndarray
        augmented weights for WLS
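
    Notes
    -----
    With ``rs`` a matrix square root of ``penalty_matrix``, i.e.
    ``rs.T @ rs = penalty_matrix``, the augmentation stacks ``rs`` below
    ``exog``, zeros below ``endog`` and unit weights below ``weights``.
    The extra rows add ``params.T @ penalty_matrix @ params`` to the weighted
    residual sum of squares, so an ordinary WLS fit on the augmented data
    minimizes the penalized WLS objective.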
    """
    y, x, s = endog, exog, penalty_matrix
    nobs = x.shape[0]

    # TODO: needs full because of broadcasting with weights
    # check what weights should be doing
    rs = matrix_sqrt(s)
    x1 = np.vstack([x, rs])  # augmented x
    n_samples_x1 = x1.shape[0]

    y1 = np.array([0.] * n_samples_x1)  # augmented y
    y1[:nobs] = y

    id1 = np.array([1.] * rs.shape[0])
    w1 = np.concatenate([weights, id1])

    return y1, x1, w1
