Binary files /tmp/tmp3NAdo6/f1kG0NcwdC/r-cran-glmnet-4.0-2/build/vignette.rds and /tmp/tmp3NAdo6/BBEWWRnMdv/r-cran-glmnet-4.1/build/vignette.rds differ diff -Nru r-cran-glmnet-4.0-2/debian/changelog r-cran-glmnet-4.1/debian/changelog --- r-cran-glmnet-4.0-2/debian/changelog 2020-07-16 07:14:52.000000000 +0000 +++ r-cran-glmnet-4.1/debian/changelog 2021-01-15 22:26:54.000000000 +0000 @@ -1,3 +1,10 @@ +r-cran-glmnet (4.1-1) unstable; urgency=medium + + * New upstream version + * Standards-Version: 4.5.1 (routine-update) + + -- Andreas Tille Fri, 15 Jan 2021 23:26:54 +0100 + r-cran-glmnet (4.0-2-1) unstable; urgency=medium * Team upload. diff -Nru r-cran-glmnet-4.0-2/debian/control r-cran-glmnet-4.1/debian/control --- r-cran-glmnet-4.0-2/debian/control 2020-07-16 07:14:52.000000000 +0000 +++ r-cran-glmnet-4.1/debian/control 2021-01-15 22:26:54.000000000 +0000 @@ -11,7 +11,7 @@ r-cran-foreach, r-cran-shape, r-cran-survival -Standards-Version: 4.5.0 +Standards-Version: 4.5.1 Vcs-Browser: https://salsa.debian.org/r-pkg-team/r-cran-glmnet Vcs-Git: https://salsa.debian.org/r-pkg-team/r-cran-glmnet.git Homepage: https://cran.r-project.org/package=glmnet diff -Nru r-cran-glmnet-4.0-2/DESCRIPTION r-cran-glmnet-4.1/DESCRIPTION --- r-cran-glmnet-4.0-2/DESCRIPTION 2020-06-16 00:00:02.000000000 +0000 +++ r-cran-glmnet-4.1/DESCRIPTION 2021-01-11 08:00:30.000000000 +0000 @@ -1,8 +1,8 @@ Package: glmnet Type: Package Title: Lasso and Elastic-Net Regularized Generalized Linear Models -Version: 4.0-2 -Date: 2020-6-13 +Version: 4.1 +Date: 2021-01-10 Authors@R: c(person("Jerome", "Friedman", role=c("aut")), person("Trevor", "Hastie", role=c("aut", "cre"), email = "hastie@stanford.edu"), person("Rob", "Tibshirani", role=c("aut")), @@ -12,7 +12,7 @@ person("Junyang", "Qian", role=c("ctb"))) Depends: R (>= 3.6.0), Matrix (>= 1.0-6) Imports: methods, utils, foreach, shape, survival -Suggests: knitr, lars, testthat +Suggests: knitr, lars, testthat, xfun, rmarkdown Description: Extremely efficient procedures for fitting the entire lasso or elastic-net regularization path for linear regression, logistic and multinomial regression models, Poisson regression, Cox model, multiple-response Gaussian, and the grouped multinomial regression. There are two new and important additions. The family argument can be a GLM family object, which opens the door to any programmed family. This comes with a modest computational cost, so when the built-in families suffice, they should be used instead. The other novelty is the relax option, which refits each of the active sets in the path unpenalized. The algorithm uses cyclical coordinate descent in a path-wise fashion, as described in the papers listed in the URL below. 
License: GPL-2 VignetteBuilder: knitr @@ -20,9 +20,9 @@ URL: https://glmnet.stanford.edu, https://dx.doi.org/10.18637/jss.v033.i01, https://dx.doi.org/10.18637/jss.v039.i05 -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 NeedsCompilation: yes -Packaged: 2020-06-14 23:21:58 UTC; hastie +Packaged: 2021-01-11 00:03:32 UTC; hastie Author: Jerome Friedman [aut], Trevor Hastie [aut, cre], Rob Tibshirani [aut], @@ -32,4 +32,4 @@ Junyang Qian [ctb] Maintainer: Trevor Hastie Repository: CRAN -Date/Publication: 2020-06-16 00:00:02 UTC +Date/Publication: 2021-01-11 08:00:30 UTC diff -Nru r-cran-glmnet-4.0-2/inst/CITATION r-cran-glmnet-4.1/inst/CITATION --- r-cran-glmnet-4.0-2/inst/CITATION 2019-09-04 20:28:27.000000000 +0000 +++ r-cran-glmnet-4.1/inst/CITATION 2021-01-06 22:06:54.000000000 +0000 @@ -10,13 +10,13 @@ volume = "33", number = "1", pages = "1--22", - url = "http://www.jstatsoft.org/v33/i01/", + url = "https://www.jstatsoft.org/v33/i01/", textVersion = paste("Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010).", "Regularization Paths for Generalized Linear Models via Coordinate Descent.", "Journal of Statistical Software, 33(1), 1-22.", - "URL http://www.jstatsoft.org/v33/i01/.") + "URL https://www.jstatsoft.org/v33/i01/.") ) citEntry(entry = "Article", @@ -30,13 +30,13 @@ volume = "39", number = "5", pages = "1--13", - url = "http://www.jstatsoft.org/v39/i05/", + url = "https://www.jstatsoft.org/v39/i05/", textVersion = paste("Noah Simon, Jerome Friedman, Trevor Hastie, Rob Tibshirani (2011).", "Regularization Paths for Cox's Proportional Hazards Model via Coordinate Descent.", "Journal of Statistical Software, 39(5), 1-13.", - "URL http://www.jstatsoft.org/v39/i05/."), + "URL https://www.jstatsoft.org/v39/i05/."), header = "If coxnet is used, please also cite:" ) Binary files /tmp/tmp3NAdo6/f1kG0NcwdC/r-cran-glmnet-4.0-2/inst/doc/Coxnet.pdf and /tmp/tmp3NAdo6/BBEWWRnMdv/r-cran-glmnet-4.1/inst/doc/Coxnet.pdf differ diff -Nru r-cran-glmnet-4.0-2/inst/doc/Coxnet.R r-cran-glmnet-4.1/inst/doc/Coxnet.R --- r-cran-glmnet-4.0-2/inst/doc/Coxnet.R 2020-06-14 23:21:23.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/Coxnet.R 2021-01-11 00:03:01.000000000 +0000 @@ -1,22 +1,150 @@ -## ------------------------------------------------------------------------ -library("glmnet") -library("survival") -patient.data <- readRDS("assets/coxnet.RDS") - -## ---- warning = TRUE----------------------------------------------------- -cv.fit <- cv.glmnet(patient.data$x, Surv(patient.data$time, patient.data$status), family="cox", maxit = 1000) -fit <- glmnet(patient.data$x, Surv(patient.data$time,patient.data$status), family = "cox", maxit = 1000) +## ----include=FALSE------------------------------------------------------------ +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") -## ------------------------------------------------------------------------ +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) + +## ----------------------------------------------------------------------------- +library(glmnet) +library(survival) +data(CoxExample) +y[1:5, ] + +## 
----------------------------------------------------------------------------- +fit <- glmnet(x, y, family = "cox") + +## ----------------------------------------------------------------------------- +plot(fit) + +## ----out.lines = 10----------------------------------------------------------- +coef(fit, s = 0.05) + +## ----------------------------------------------------------------------------- +set.seed(1) +cvfit <- cv.glmnet(x, y, family = "cox", type.measure = "C") + +## ----------------------------------------------------------------------------- +plot(cvfit) + +## ----------------------------------------------------------------------------- +cvfit$lambda.min +cvfit$lambda.1se + +## ----------------------------------------------------------------------------- +# create x matrix +set.seed(1) +nobs <- 100; nvars <- 15 +x <- matrix(rnorm(nobs * nvars), nrow = nobs) + +# create response +ty <- rep(rexp(nobs / 5), each = 5) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +y <- Surv(ty, tcens) + +# coefficients from these two models will not line up because +# of different tie handling methods +glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0) +coxph_fit <- coxph(y ~ x) +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) + +## ----------------------------------------------------------------------------- +# coefficients from these two models will line up +glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0) +coxph_fit <- coxph(y ~ x, ties = "breslow") +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) + +## ----------------------------------------------------------------------------- +# create x matrix +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +x <- matrix(xvec, nrow = nobs) # non-sparse x +x_sparse <- Matrix::Matrix(xvec, nrow = nobs, sparse = TRUE) # sparse x + +# create start-stop data response +beta <- rnorm(5) +fx <- x_sparse[, 1:5] %*% beta / 3 +ty <- rexp(nobs, drop(exp(fx))) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +starty <- runif(nobs) +yss <- Surv(starty, starty + ty, tcens) + +# fit regularized Cox model with start-stop data +fit <- glmnet(x, yss, family = "cox") + +## ----------------------------------------------------------------------------- +cv.fit <- cv.glmnet(x, yss, family = "cox", nfolds = 5) plot(cv.fit) -cv.fit$lambda.min -## ------------------------------------------------------------------------ -Coefficients <- coef(fit, s = cv.fit$lambda.min) -Active.Index <- which(Coefficients != 0) -Active.Coefficients <- Coefficients[Active.Index] - -## ------------------------------------------------------------------------ -Active.Index -Active.Coefficients +## ----------------------------------------------------------------------------- +glmnet_fit <- glmnet(x, yss, family = "cox", lambda = 0) +coxph_fit <- coxph(yss ~ x) +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) + +## ----------------------------------------------------------------------------- +strata <- rep(1:5, length.out = nobs) +y2 <- stratifySurv(y, strata) +str(y2[1:6]) + +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y2, family = "cox") + +## ----------------------------------------------------------------------------- +cv.fit <- cv.glmnet(x, y2, family = "cox", nfolds = 5) +plot(cv.fit) + +## ----------------------------------------------------------------------------- +y3 <- y +attr(y3, "strata") <- strata +str(y3[1:6]) # note that the 
strata attribute is no longer there + +## ----error=TRUE--------------------------------------------------------------- +fit <- glmnet(x, y3, family = "cox") + +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y, family = "cox") +survival::survfit(fit, s = 0.05, x = x, y = y) + +## ----------------------------------------------------------------------------- +plot(survival::survfit(fit, s = 0.05, x = x, y = y)) + +## ----------------------------------------------------------------------------- +survival::survfit(fit, s = 0.05, x = x, y = y, newx = x[1:3, ]) +plot(survival::survfit(fit, s = 0.05, x = x, y = y, newx = x[1:3, ])) + +## ----------------------------------------------------------------------------- +y2 <- stratifySurv(y, rep(1:2, length.out = nobs)) +fit <- glmnet(x, y2, family = "cox") +survival::survfit(fit, s = 0.01, x = x, y = y2) + +# survival curve plot for first two individuals in dataset +plot(survival::survfit(fit, s = 0.01, x = x, y = y2, + newx = x[1:2, ], newstrata = strata[1:2])) + +## ----------------------------------------------------------------------------- +sf <- survival::survfit(fit, x = x, y = y2) +length(sf) +length(fit$lambda) + +## ----------------------------------------------------------------------------- +cv.fit <- cv.glmnet(x, y2, family = "cox", nfolds = 5) +survival::survfit(cv.fit, x = x, y = y2) +survival::survfit(cv.fit, s = "lambda.min", x = x, y = y2) diff -Nru r-cran-glmnet-4.0-2/inst/doc/Coxnet.Rmd r-cran-glmnet-4.1/inst/doc/Coxnet.Rmd --- r-cran-glmnet-4.0-2/inst/doc/Coxnet.Rmd 2019-10-31 18:19:34.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/Coxnet.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,13 @@ --- -title: "Coxnet: Regularized Cox Regression" -author: "Noah Simon, Jerome Friedman, Trevor Hastie and Rob Tibshirani" -date: '`r Sys.Date()`' +title: "Regularized Cox Regression" +author: + - Kenneth Tay + - Noah Simon + - Jerome Friedman + - Trevor Hastie + - Rob Tibshirani + - Balasubramanian Narasimhan +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -10,88 +16,274 @@ toc: yes toc_depth: 3 vignette: > - %\VignetteIndexEntry{Coxnet: Regularized Cox Regression} + %\VignetteIndexEntry{Regularized Cox Regression} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- +```{r include=FALSE} +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) +``` + ## Introduction -We will give a short tutorial on using coxnet. Coxnet is a function -which fits the Cox Model regularized by an elastic net penalty. It is -used for underdetermined (or nearly underdetermined systems) and chooses -a small number of covariates to include in the model. Because the Cox -Model is rarely used for actual prediction, we will rather focus on -finding and interpretating an appropriate model. We give a simple -example of how to format data and run the Cox Model in glmnet with cross -validation. - -Further details may be found in @coxnet, @strongrules and @block. 
- -## Example - -We first load our data and set up the response. In this case $x$ must be -an $n$ by $p$ matrix of covariate values --- each row corresponds to a -patient and each column a covariate. $y$ is an $n$ length vector of -failure/censoring times, and status is an $n$ length vector with each -entry, a $1$ or a $0$, indicating whether the corresponding entry in $y$ -is indicative of a failure time or right censoring time ($1$ for -failure, $0$ for censoring) - -```{r} -library("glmnet") -library("survival") -patient.data <- readRDS("assets/coxnet.RDS") -``` - -We then call our functions to fit with the lasso penalty ($\alpha=1$), -and cross validate. We set maxit = 1000 (increasing the maximum number -of iterations to $1000$) because our data is relatively high -dimensional, so more iterations are needed for convergence. In practice, -the function will spit out an error if convergence isn't reached by the -maximum number of iterations. - -```{r, warning = TRUE} -cv.fit <- cv.glmnet(patient.data$x, Surv(patient.data$time, patient.data$status), family="cox", maxit = 1000) -fit <- glmnet(patient.data$x, Surv(patient.data$time,patient.data$status), family = "cox", maxit = 1000) -``` - -The Surv function packages the survival data into the form expected by -glmnet. Once fit, we can view the optimal $\lambda$ value and a cross -validated error plot to help evaluate our model. +This vignette describes how one can use the `glmnet` package to fit regularized Cox models. + +The Cox proportional hazards model is commonly used for the study of the relationship between predictor variables and survival time. In the usual survival analysis framework, we have data of the form $(y_1, x_1, \delta_1), \ldots, (y_n, x_n, \delta_n)$ where $y_i$, the observed time, is a time of failure if $\delta_i$ is 1 or a right-censored time if $\delta_i$ is 0. We also let $t_1 < t_2 < \ldots < t_m$ be the increasing list of unique failure times, and let $j(i)$ denote the index of the observation failing at time $t_i$. + +The Cox model assumes a semi-parametric form for the hazard +$$ +h_i(t) = h_0(t) e^{x_i^T \beta}, +$$ +where $h_i(t)$ is the hazard for patient $i$ at time $t$, $h_0(t)$ is a shared baseline hazard, and $\beta$ is a fixed, length $p$ vector. In the classic setting $n \geq p$, inference is made via the partial likelihood +$$ +L(\beta) = \prod_{i=1}^m \frac{e^{x_{j(i)}^T \beta}}{\sum_{j \in R_i} e^{x_j^T \beta}}, +$$ +where $R_i$ is the set of indices $j$ with $y_j \geq t_i$ (those at risk at time $t_i$). + +Note there is no intercept in the Cox model as it is built into the baseline hazard and, like it, would cancel in the partial likelihood. + +In `glmnet`, we penalize the negative log of the partial likelihood with an elastic net penalty. + +(Credits: The original `"coxnet"` algorithm for right-censored data was developed by Noah Simon, Jerome Friedman, Trevor Hastie and Rob Tibshirani: see @coxnet for details. The other features for Cox models, introduced in v4.1, were developed by Kenneth Tay, Trevor Hastie, Balasubramanian Narasimhan and Rob Tibshirani.) + +## Basic usage for right-censored data + +We use a pre-generated set of sample data and response. `x` must be an $n\times p$ matrix of covariate values --- each row corresponds to a patient and each column a covariate. `y` is an $n \times 2$ matrix, with a column `"time"` of failure/censoring times, and `"status"` a 0/1 indicator, with 1 meaning the time is a failure time, and 0 a censoring time.
The `Surv` function in the `survival` package creates such a response matrix, and it is recommended that the user use the output of a call to `Surv` for the response to `glmnet`. (For backward compatibility, `glmnet` can accept a two-column matrix with column names `"time"` and `"status"` for right-censored data.) +```{r} +library(glmnet) +library(survival) +data(CoxExample) +y[1:5, ] +``` + +We apply the `glmnet` function to compute the solution path under default settings: +```{r} +fit <- glmnet(x, y, family = "cox") +``` + +All the standard options such as `alpha`, `weights`, `nlambda` and `standardize` apply, and their usage is similar to the Gaussian case. (See the vignette ["An Introduction to `glmnet`"](https://glmnet.stanford.edu/articles/glmnet.html) for details, or refer to the help file `help(glmnet)`.) + +We can plot the coefficients with the `plot` method: +```{r} +plot(fit) +``` + +As before, we can extract the coefficients at certain values of $\lambda$: +```{r out.lines = 10} +coef(fit, s = 0.05) +``` + +Since the Cox model is not commonly used for prediction, we do not give an illustrative example on prediction. If needed, users can refer to the help file by typing `help(predict.glmnet)`. + +### Cross-validation + +The function `cv.glmnet` can be used to compute $K$-fold cross-validation (CV) for the Cox model. The usage is similar to that for other families except for two main differences. + +First, `type.measure` only supports `"deviance"` (also default) which +gives the partial-likelihood, and `"C"`, which gives the Harrell +*C index*. This is like the area under the curve (AUC) measure of concordance for survival data, but only considers comparable pairs. Pure concordance would record the fraction of pairs for which the order of the death times agrees with the order of the predicted risk. However, with survival data, if an observation is right censored at a time *before* another observation's death time, they are not comparable. + +The code below illustrates how one can perform cross-validation using the Harrell C index. Note that unlike most error measures, a higher C index means better prediction performance. +```{r} +set.seed(1) +cvfit <- cv.glmnet(x, y, family = "cox", type.measure = "C") +``` + +Once fit, we can view the optimal $\lambda$ value and a cross-validated error plot to help evaluate our model. +```{r} +plot(cvfit) +``` + +As with other families, the left vertical line in our plot shows us where the CV-error curve hits its minimum. The right vertical line shows us the most regularized model with CV-error within 1 standard deviation of the minimum. We also extract such optimal $\lambda$'s: +```{r} +cvfit$lambda.min +cvfit$lambda.1se +``` + +Second, the option `grouped = TRUE` (default) obtains the CV partial likelihood for the $K$th fold by subtraction, i.e. by subtracting the log partial likelihood evaluated on the $(K-1)/K$ dataset from that evaluated on the full dataset. This makes more efficient use of risk sets. With `grouped = FALSE` the log partial likelihood is computed only on the $K$th fold, which is only reasonable if each fold has a large number of observations. + +### Handling of ties + +`glmnet` handles ties in survival time with the Breslow approximation. This is different from the `survival` package's `coxph` function, whose default tie-handling method is the Efron approximation.
+```{r} +# create x matrix +set.seed(1) +nobs <- 100; nvars <- 15 +x <- matrix(rnorm(nobs * nvars), nrow = nobs) + +# create response +ty <- rep(rexp(nobs / 5), each = 5) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +y <- Surv(ty, tcens) + +# coefficients from these two models will not line up because +# of different tie handling methods +glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0) +coxph_fit <- coxph(y ~ x) +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) +``` + +`glmnet` is not able to perform the Efron approximation at the moment. `survival`'s `coxph` can perform the Breslow approximation by specifying `ties = "breslow"`: +```{r} +# coefficients from these two models will line up +glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0) +coxph_fit <- coxph(y ~ x, ties = "breslow") +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) +``` + +## Cox models for start-stop data + +Since version 4.1 `glmnet` can fit models where the response is a (start, stop] time interval. As explained in @Therneau2000, the ability to work with start-stop responses opens the door to fitting regularized Cox models with + +* time-dependent covariates, +* time-dependent strata, +* left truncation, +* multiple time scales, +* multiple events per subject, +* independent increment, marginal, and conditional models for correlated data, and +* various forms of case-cohort models. +The code below shows how to create a response of this type (using `survival` package's `Surv` function) and how to fit such a model with `glmnet`. ```{r} +# create x matrix +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +x <- matrix(xvec, nrow = nobs) # non-sparse x +x_sparse <- Matrix::Matrix(xvec, nrow = nobs, sparse = TRUE) # sparse x + +# create start-stop data response +beta <- rnorm(5) +fx <- x_sparse[, 1:5] %*% beta / 3 +ty <- rexp(nobs, drop(exp(fx))) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +starty <- runif(nobs) +yss <- Surv(starty, starty + ty, tcens) + +# fit regularized Cox model with start-stop data +fit <- glmnet(x, yss, family = "cox") +``` + +(Note that the call above would have worked as well if `x` was replaced by `x_sparse`.) `cv.glmnet` works with start-stop data too: +```{r} +cv.fit <- cv.glmnet(x, yss, family = "cox", nfolds = 5) plot(cv.fit) -cv.fit$lambda.min ``` -The left vertical line in our plot shows us where the CV-error curve -hits its minimum. The right vertical line shows us the most -regularized model with CV-error within$1$standard deviation of the -minimum. In this case, we see that the minimum was achieved by a -fairly regularized model, however the right line indicates that the -null model (no coefficients included) is within$1$sd of the -minimum. This might lead us to believe that in actuality the -covariates are not explaining any variability. For the time being we -will concern ourselves with the minimum CV-error model. We can check -which covariates our model chose to be active, and see the -coefficients of those covariates. +As a sanity check, the code below shows that fitting start-stop responses +using `glmnet` with `lambda = 0` matches up with `coxph`'s result: +```{r} +glmnet_fit <- glmnet(x, yss, family = "cox", lambda = 0) +coxph_fit <- coxph(yss ~ x) +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) +``` + +## Stratified Cox models + +One extension of the Cox regression model is to allow for strata that divide the observations into disjoint groups. 
Each group has its own baseline hazard function, but the groups share the same coefficient vector for the covariates provided by the design matrix `x`. + +`glmnet` can fit stratified Cox models with the elastic net +penalty. With `coxph` one can specify strata in the model formula. +Since `glmnet` does not use a model formula, we achieve this by adding +a strata attribute to the `Surv` response object, via +the function `stratifySurv`: +```{r} +strata <- rep(1:5, length.out = nobs) +y2 <- stratifySurv(y, strata) +str(y2[1:6]) +``` +`stratifySurv` returns an object of class `stratifySurv`. We can then pass this `stratifySurv` object as the response to a `glmnet` call. `glmnet` will fit a stratified Cox model if it detects that the response has class `stratifySurv`. ```{r} -Coefficients <- coef(fit, s = cv.fit$lambda.min) -Active.Index <- which(Coefficients != 0) -Active.Coefficients <- Coefficients[Active.Index] +fit <- glmnet(x, y2, family = "cox") ``` -`coef(fit, s = cv.fit\$lambda.min)` returns the $p$ length coefficient -vector of the solution corresponding to $\lambda =$`cv.fit$lambda.min`. +This `stratifySurv` object can also be passed to `cv.glmnet` to fit stratified Cox models with cross-validation: +```{r} +cv.fit <- cv.glmnet(x, y2, family = "cox", nfolds = 5) +plot(cv.fit) +``` +Note that simply giving the response a `"strata"` attribute is not enough! The response needs to be of class `stratifySurv` in order for subsetting to work correctly. To protect against this mistake, an error will be thrown if the response has a `"strata"` attribute but is not of class `stratifySurv`. Add strata via the `stratifySurv` function. ```{r} -Active.Index -Active.Coefficients +y3 <- y +attr(y3, "strata") <- strata +str(y3[1:6]) # note that the strata attribute is no longer there ``` -We see that our optimal model chose 2 active covariates ($X80$ and -$X394$) each with a small positive effect on hazard. +```{r error=TRUE} +fit <- glmnet(x, y3, family = "cox") +``` + +## Plotting survival curves + +Fitting a regularized Cox model using `glmnet` with `family = "cox"` +returns an object of class `"coxnet"`. Class `"coxnet"` objects have a +`survfit` method which allows the user to visualize the survival +curves from the model. In addition to the `"coxnet"` object, the user +must pass the `x` and `y` objects used to fit the model (for +computation of the baseline hazard), as well as the lambda value for which the survival curve is desired: +```{r} +fit <- glmnet(x, y, family = "cox") +survival::survfit(fit, s = 0.05, x = x, y = y) +``` + +We are unable to provide standard errors for these survival curves, so we do not present the confidence bounds for them. To plot the survival curve, pass the result of the `survfit` call to `plot`: +```{r} +plot(survival::survfit(fit, s = 0.05, x = x, y = y)) +``` + +As noted in the documentation for `survival::survfit.coxph`, without new data, a curve is produced for a single "pseudo" subject with covariate values equal to the means of the data set, and the resulting curve(s) almost never make sense. We can get survival curves for individual observations by passing a `newx` argument: +```{r} +survival::survfit(fit, s = 0.05, x = x, y = y, newx = x[1:3, ]) +plot(survival::survfit(fit, s = 0.05, x = x, y = y, newx = x[1:3, ])) +``` + +If the original model was fit with strata, then the `strata` option needs to be specified as well. If `newx` is being passed for such a model, the strata for these new observations need to be passed via `newstrata`.
+```{r} +y2 <- stratifySurv(y, rep(1:2, length.out = nobs)) +fit <- glmnet(x, y2, family = "cox") +survival::survfit(fit, s = 0.01, x = x, y = y2) + +# survival curve plot for first two individuals in dataset +plot(survival::survfit(fit, s = 0.01, x = x, y = y2, + newx = x[1:2, ], newstrata = strata[1:2])) +``` + +To be consistent with other methods in `glmnet`, if the `s` parameter is not specified, survival curves are returned for the entire `lambda` sequence. The survival curves are returned as a list, one element for each `lambda` value. +```{r} +sf <- survival::survfit(fit, x = x, y = y2) +length(sf) +length(fit$lambda) +``` + +The `survfit` method is available for `cv.glmnet` objects as well. By default, the `s` value chosen is the "lambda.1se" value stored in the CV object. The `s` value can also be set to the `"lambda.min"` value stored in the +CV object. +```{r} +cv.fit <- cv.glmnet(x, y2, family = "cox", nfolds = 5) +survival::survfit(cv.fit, x = x, y = y2) +survival::survfit(cv.fit, s = "lambda.min", x = x, y = y2) +``` ## References Binary files /tmp/tmp3NAdo6/f1kG0NcwdC/r-cran-glmnet-4.0-2/inst/doc/glmnetFamily.pdf and /tmp/tmp3NAdo6/BBEWWRnMdv/r-cran-glmnet-4.1/inst/doc/glmnetFamily.pdf differ diff -Nru r-cran-glmnet-4.0-2/inst/doc/glmnetFamily.R r-cran-glmnet-4.1/inst/doc/glmnetFamily.R --- r-cran-glmnet-4.0-2/inst/doc/glmnetFamily.R 2020-06-14 23:21:40.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/glmnetFamily.R 2021-01-11 00:03:21.000000000 +0000 @@ -1,25 +1,25 @@ -## ---- include = FALSE---------------------------------------------------- +## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- fam <- binomial() class(fam) names(fam) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- set.seed(1) x <- matrix(rnorm(500), ncol = 5) y <- rowSums(x[, 1:2]) + rnorm(100) -## ----message = FALSE----------------------------------------------------- +## ----message = FALSE---------------------------------------------------------- library(glmnet) oldfit <- glmnet(x, y, family = "gaussian") newfit <- glmnet(x, y, family = gaussian()) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- thresh <- 1e-18 oldfit <- glmnet(x, y, family="gaussian", thresh = thresh) newfit <- glmnet(x, y, family = gaussian(), thresh = thresh) @@ -31,7 +31,7 @@ expect_equal(oldfit[[key]], newfit[[key]]) } -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- biny <- ifelse(y > 0, 1, 0) # binary data cnty <- ceiling(exp(y)) # count data @@ -43,46 +43,46 @@ oldfit <- glmnet(x, cnty, family = "poisson") newfit <- glmnet(x, cnty, family = poisson()) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- newfit <- glmnet(x, biny, family = binomial(link = "probit")) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- newfit <- glmnet(x, cnty, family = 
quasipoisson()) -## ---- eval=FALSE--------------------------------------------------------- +## ---- eval=FALSE-------------------------------------------------------------- # library(MASS) # newfit <- glmnet(x, cnty, family = negative.binomial(theta = 5)) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- class(newfit) -## ------------------------------------------------------------------------ -fit=glmnet(x,y,family="gaussian") +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y, family = "gaussian") class(fit) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- set.seed(2020) n <- 100 p <- 4 -x <- matrix(runif(n*p, 5, 10), n) +x <- matrix(runif(n * p, 5, 10), n) y <- rpois(n, exp(rowMeans(x))) # glm fit -glmfit <- glm(y ~ x-1, family = poisson) +glmfit <- glm(y ~ x - 1, family = poisson) coef(glmfit) -## ------------------------------------------------------------------------ -oldfit <- glmnet(x, y, family = "poisson", standardize = FALSE, intercept = FALSE, - lambda = 0) +## ----------------------------------------------------------------------------- +oldfit <- glmnet(x, y, family = "poisson", standardize = FALSE, + intercept = FALSE, lambda = 0) coef(oldfit) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- glmnet.control(mxitnr = 50) # increase maximum no. of IRLS iterations allowed -newfit <- glmnet(x, y, family = poisson(), standardize = FALSE, intercept = FALSE, - lambda = 0) +newfit <- glmnet(x, y, family = poisson(), standardize = FALSE, + intercept = FALSE, lambda = 0) coef(newfit) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- thresh <- 1e-15 glmfit <- glm(y ~ x-1, family = poisson, control = list(epsilon = thresh, maxit = 100)) @@ -93,9 +93,3 @@ expect_equal(as.numeric(coef(glmfit)), as.numeric(coef(newfit))[2:5]) -## ------------------------------------------------------------------------ -glmnet.control(mxitnr = 50) - -## ------------------------------------------------------------------------ -glmnet.control(factory = TRUE) - diff -Nru r-cran-glmnet-4.0-2/inst/doc/glmnetFamily.Rmd r-cran-glmnet-4.1/inst/doc/glmnetFamily.Rmd --- r-cran-glmnet-4.0-2/inst/doc/glmnetFamily.Rmd 2020-05-13 00:21:05.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/glmnetFamily.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,9 @@ --- -title: "Glm `family` functions in `glmnet` 4.0" -author: "Trevor Hastie and Kenneth Tay" -date: "April 30, 2020" +title: "The `family` Argument for `glmnet`" +author: + - Trevor Hastie + - Kenneth Tay +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -10,7 +12,7 @@ toc: yes toc_depth: 3 vignette: > - %\VignetteIndexEntry{Glm family} + %\VignetteIndexEntry{The family Argument for glmnet} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -28,106 +30,65 @@ penalized maximum likelihood. 
Concretely, it solves the problem $$ \min_{\beta_0, \beta} \frac{1}{N}\sum_{i=1}^N w_i l_i(y_i, \beta_0 + \beta^T x_i) + \lambda \left[\frac{1 - \alpha}{2}\|\beta\|_2^2 + \alpha \|\beta\|_1 \right] $$ -over a grid of values of $\lambda$ covering the entire range. In the equation above, $l_i(y, \eta)$ is the negative log-likelihood contribution for observation $i$. $\alpha \in [0,1]$ is a tuning parameter which bridges the gap between the lasso ($\alpha = 1$, the default) and ridge regression ($\alpha = 0$), while $\lambda$ controls the overall strength of the penalty. +over a grid of values of $\lambda$ covering the entire range. In the equation above, $l_i(y_i, \eta_i)$ is the negative log-likelihood contribution for observation $i$. $\alpha \in [0,1]$ is a tuning parameter which bridges the gap between the lasso ($\alpha = 1$, the default) and ridge regression ($\alpha = 0$), while $\lambda$ controls the overall strength of the penalty. -For `glmnet` v3.0-2 and below, `glmnet` could only solve the -minimization problem above for a limited number of built-in -(hardwired) families via its `family` argument. In particular, -`glmnet` could fit penalized Gaussian, binomial, and Poisson GLMs -(along with a few other special cases such as the Cox model, -multinomial regression, and multi-response Gaussian). In v4.0 onwards, -`glmnet` allows the user to fit a penalized regression model for *any* -GLM by allowing for any legitimate GLM family object, as used by the -`stats:glm` function. - -### Additional families - -Before v4.0, `glmnet`'s `family` argument had to be one of a limited -set of strings: `c("gaussian", "binomial", "poisson", "multinomial", -"cox", "mgaussian")`. This specified which of the *built-in* families -was to be used. From v4.0 onwards, in addition to these -strings, the `family argument to `glmnet` can also be the result of a call -to a `family` function. (To learn more about family functions in R, -run `?family` in the R console.) - -All the functionality of `glmnet` applies to these new families, and -hence their addition expands the scope of `glmnet` considerably. In -particular, - -* All the methods work as before, such as `plot`, `predict` etc. -* large and sparse `X` matrices as input ; -* upper and lower bound constraints on parameters; -* `cv.glmnet` for selecting the tuning parameters; -* `relax=TRUE` for fitting the unpenalized models to the - active sets; +`glmnet` solves the minimization problem above very efficiently for a limited number of built-in (hardwired) families. To fit these model families, one should specify the `family` argument to `glmnet` as a character string. The families that can be fit efficiently this way are the penalized Gaussian (`"gaussian"`), binomial (`"binomial"`), and Poisson (`"poisson"`) GLMs, along with a few other special cases: the Cox model (`"cox"`), multinomial regression (`"multinomial"`), and multi-response Gaussian (`"mgaussian"`). Details for how to fit these models can be found in the vignette ["An Introduction to `glmnet`"](https://glmnet.stanford.edu/articles/glmnet.html). + +Apart from these built-in families, `glmnet` also allows the user to fit a penalized regression model for *any* GLM by allowing the `family` argument to be any legitimate GLM family object, as used by the `stats:glm` function. + +### Using class "family" objects for the `family` argument + +The `family` argument to `glmnet` can be the result of a call to a `family` function. 
(To learn more about family functions in R, run `?family` in the R console.) All the functionality of `glmnet` applies to these new families, and hence their addition expands the scope of `glmnet` considerably. In particular, + +* All the methods, such as `plot` and `predict`, work as before; +* Large and sparse `x` matrices can be taken as input; +* The user can put upper and lower bound constraints on parameters; +* `cv.glmnet` can be used for selecting the tuning parameters; +* `relax = TRUE` can be specified for fitting unpenalized models to the active sets; * `offsets` can be provided; -* penalty strengths, standardization, and other options. +* Penalty strengths, standardization, and other options to `glmnet` work as before. + +When the `family` argument is a class `"family"` object, `glmnet` fits the model for each value of `lambda` with a proximal Newton algorithm, also known as iteratively reweighted least squares (IRLS). The outer loop of the IRLS algorithm is coded in R, while the inner loop solves the weighted least squares problem with the elastic net penalty, and is implemented in Fortran. The R code exploits warm starts as it iterates down the path, and so is reasonably efficient. ### More on GLM families -A GLM is linear model for a response variable whose conditional -distribution belongs to a one-dimensional exponential family. Apart -from Gaussian, Poisson and binomial, there are other interesting -members of this family. Some examples are Gamma, inverse Gaussian, -negative binomial, to name a few. A GLM consists of 3 parts: +A GLM is a linear model for a response variable whose conditional distribution belongs to a one-dimensional exponential family. Apart from Gaussian, Poisson and binomial families, there are other interesting members of this family, e.g. Gamma, inverse Gaussian, negative binomial, to name a few. A GLM consists of 3 parts: 1. A linear predictor: $\eta_i = \sum_{j=1}^p x_{ij} \beta_j$, 2. A link function: $\eta_i = g(\mu_i)$, and 3. A random component: $y_i \sim f(y \mid \mu_i)$. -The user gets to specify the link function $g$ and the family of -response distributions $f(\cdot \mid \mu)$, and fitting a GLM amounts -to estimating the parameter $\beta$ by maximum likelihood. - -In R, these 3 parts of the GLM are encapsulated in an object of class -`family` (run `?family` in the R console for more details). A `family` -object is a list of GLM components which allows functions such as -`stats:glm` to fit GLMs in R. As an example, the code below shows the -constituent parts for the binomial GLM, which is what is used to fit -linear logistic regression: +The user gets to specify the link function $g$ and the family of response distributions $f(\cdot \mid \mu)$, and fitting a GLM amounts to estimating the parameter $\beta$ by maximum likelihood. + +In R, these 3 parts of the GLM are encapsulated in an object of class `family` (run `?family` in the R console for more details). A `family` object is a list of GLM components which allows functions such as `stats:glm` to fit GLMs in R. As an example, the code below shows the constituent parts for the binomial GLM, which is what is used to fit linear logistic regression: ```{r} fam <- binomial() class(fam) names(fam) ``` -This is a list of functions and expressions that get used in the -*iteratively reweighted least squares* algorithm for fitting the GLM. +This is a list of functions and expressions that get used in the *iteratively reweighted least squares* (IRLS) algorithm for fitting the GLM.
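As a minimal sketch of how IRLS consumes a few of these components (the member names below are standard for any `family` object — see `?family` — and nothing here is specific to `glmnet`):

```{r}
fam <- binomial()
mu  <- 0.25               # a fitted mean, chosen arbitrarily for illustration
eta <- fam$linkfun(mu)    # link function: eta = log(mu / (1 - mu))
fam$linkinv(eta)          # inverse link recovers mu
fam$variance(mu)          # variance function: mu * (1 - mu)
fam$mu.eta(eta)           # derivative d(mu)/d(eta)
```

Roughly speaking, each IRLS iteration uses `mu.eta` and `variance` to form working weights of the form `mu.eta(eta)^2 / variance(mu)` along with a working response, and then solves a weighted least squares problem.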
-From v4.0 onwards, `glmnet` can fit penalized GLMs for any family as -long as the family can be expressed as a `family` object. In fact, -users can make their own families, or customize existing families, -just as they can for regular GLMs. - -Generally this option should be used if the desired family is not -included in the built-in list. The reason is that the entire path -algorithm for the built-in families is implemented in Fortran, and so -will be faster. +`glmnet` can fit penalized GLMs for any family as long as the family can be expressed as a `family` object. In fact, users can make their own families, or customize existing families, just as they can for regular GLMs. + +Generally this option should be used only if the desired family is not included in the built-in list. The reason is that the entire path algorithm for the built-in families is implemented in Fortran, and so will be faster. ## Fitting Gaussian, binomial and Poisson GLMs -First we demonstrate how we can use this new version of `glmnet` to fit ordinary least squares with the elastic net penalty. We set up some fake data: +First, we demonstrate how we can use this new version of `glmnet` to fit ordinary least squares with the elastic net penalty. We set up some fake data: ```{r} set.seed(1) x <- matrix(rnorm(500), ncol = 5) y <- rowSums(x[, 1:2]) + rnorm(100) ``` -The function calls below demonstrate how we would fit the model with -the old and new `family` parameter options. To fit a linear regression -by least squares, we want to use the Gaussian family. There is a -*hard-wired* option for this, specified via `family="gaussian"` (which -is also the defaults for `glmnet`). Now we can also use `family = gaussian()` to fit the same model. +The function calls below demonstrate how we would fit the model with the old and new `family` parameter options. To fit a linear regression by least squares, we want to use the Gaussian family. There is a *hard-wired* option for this, specified via `family="gaussian"` (which is also the default for `glmnet`). Now we can also use `family = gaussian()` to fit the same model. ```{r message = FALSE} library(glmnet) oldfit <- glmnet(x, y, family = "gaussian") newfit <- glmnet(x, y, family = gaussian()) ``` -`glmnet` distinguishes these two cases because the first is a -character string, while the second is a GLM family object. -Of course if we really wanted to fit this model, we would use the -hard-wired version, because it is faster. Here we want to show that -they are equivalent, up to machine precision. +`glmnet` distinguishes these two cases because the first is a character string, while the second is a GLM family object. Of course if we really wanted to fit this model, we would use the hard-wired version, because it is faster. Here we want to show that they are equivalent, up to machine precision. There are slight differences in the algorithms used to compute the solutions, so some of the equality tests run using `testthat::expect_equal` might fail. However, these same tests can be made to pass by decreasing the `thresh` option in both function calls: @@ -144,8 +105,7 @@ } ``` -Next, we demonstrate the function calls for the binomial and Poisson -GLM families. +Next, we demonstrate the function calls for the binomial and Poisson GLM families. ```{r} biny <- ifelse(y > 0, 1, 0) # binary data @@ -164,15 +124,7 @@ In the examples above, the new version is simply replicating existing functionality in `glmnet`. 
For these GLMs, we recommend specifying the -GLM family as a string for computational efficiency. The Figures 1-2 -illustrate that existing code for these GLM families is more -efficient than the new code, especially for the Gaussian case. - - For the new families, the model is fit for each value -of `lambda` by a "proximal Newton" algorithm, with the outer loops -coded in R. The inner loop is fit by a weighted elastic-net algorithm, -which is implemented in Fortran. However, the R code also exploits -warm starts as it iterates down the path, so is reasonably efficient. +GLM family as a character string for computational efficiency. The figures below illustrate that existing code for these GLM families is more efficient than the new code, especially for the Gaussian case. (In the figures, `n` and `p` denote the number of observations and features respectively. Each point is the mean of 5 simulation runs. Note that both the `x` and `y` axes are on a log scale.) @@ -182,91 +134,82 @@ ## Fitting other GLMs -The real power of the new code is in fitting GLMs other than the three -in the previous section, by passing a GLM `"family"` object as the -`family` argument to `glmnet`. - - - - -For example, performing probit regression with the elastic net penalty is as simple as the code below: +The real power of using class `"family"` objects for the `family` argument is in fitting GLMs other than the three in the previous section. For example, performing probit regression with the elastic net penalty is as simple as the code below: ```{r} newfit <- glmnet(x, biny, family = binomial(link = "probit")) ``` -For the *complementary log-log* link we would specify `family = binomial(link = "cloglog")` -We can fit nonlinear least-squares models by using a different link -with the Gaussian family; for example `family=gaussian(link="log")`. +For the *complementary log-log* link we would specify `family = binomial(link = "cloglog")`. + +We can fit nonlinear least-squares models by using a different link with the Gaussian family; for example `family = gaussian(link = "log")`. For count data, we can fit a quasi-Poisson model that allows for overdispersion: ```{r} newfit <- glmnet(x, cnty, family = quasipoisson()) ``` -Performing negative binomial regression (instead of Poisson regression) is also easy: +The negative binomial is often used to model over-dispersed count data +(instead of Poisson regression), and is also easy: ```{r, eval=FALSE} library(MASS) newfit <- glmnet(x, cnty, family = negative.binomial(theta = 5)) ``` -There are many other families, including `quasi` where users can -customize their own families. In addition, there are additional -specialized -families, such as `statmod:tweedie` for overdispersed count data, for example. +There are many other families, including `quasi` where users can customize their own families. There are additional specialized families such as `statmod:tweedie` for overdispersed count data. 
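For instance, a Tweedie model with a log link might be fit as sketched below; the `var.power` value is an arbitrary illustrative choice, and `link.power = 0` selects the log link in `statmod`'s parameterization:

```{r, eval=FALSE}
library(statmod)
# Tweedie family: variance function V(mu) = mu^1.5, log link.
# var.power = 1.5 is an arbitrary choice for illustration.
newfit <- glmnet(x, cnty, family = tweedie(var.power = 1.5, link.power = 0))
```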
-## `glmnetfit` class +## Class `"glmnetfit"` objects -If `glmnet` is called with GLM `family` object as its argument, it -returns an object with class: +If `glmnet` is called with a class `"family"` object as its argument, it returns an object with class ```{r} class(newfit) ``` - -This is similar to the hard-wired classes; for example a -`family="gaussian"` has class + +This is similar to the hard-wired classes; for example a `family = "gaussian"` fit has class ```{r} -fit=glmnet(x,y,family="gaussian") +fit <- glmnet(x, y, family = "gaussian") class(fit) ``` -Importantly, both these inherit from class `"glmnet"`, and so all the S3 -methods such as `plot`, `predict`, `coef`, and `print` will work out -the box. +Importantly, both these inherit from class `"glmnet"`, and so all the S3 methods such as `plot`, `predict`, `coef`, and `print` will work out of the box. ## Step size halving within iteratively reweighted least squares (IRLS) -Before v4.0, `glmnet` solved the optimization problem for non-Gaussian families via iteratively reweighted least squares (IRLS). In each iteration a unit Newton step was taken, and the algorithm terminated when the unit Newton step did not decrease the deviance sufficiently. Because the algorithm was forced to take a unit step, this could result in non-convergence of the algorithm in some cases. +For the built-in families, `glmnet` solves the optimization problem +for non-Gaussian families via iteratively reweighted least squares +(IRLS). In each iteration a unit Newton step is taken, and the +algorithm terminates when the unit Newton step fails to decrease the deviance sufficiently. Because the algorithm is forced to take a unit step, this can result in non-convergence of the algorithm in some cases. Here is an example of the non-convergence for Poisson data. The `stats:glm` function converges and gives us coefficients that are reasonably close to the truth: ```{r} set.seed(2020) n <- 100 p <- 4 -x <- matrix(runif(n*p, 5, 10), n) +x <- matrix(runif(n * p, 5, 10), n) y <- rpois(n, exp(rowMeans(x))) # glm fit -glmfit <- glm(y ~ x-1, family = poisson) +glmfit <- glm(y ~ x - 1, family = poisson) coef(glmfit) ``` -Fitting `glmnet` with `lambda = 0` is equivalent to fitting a GLM. If we use `glmnet` version before v4.0, we encounter an issue with non-convergence: +Fitting `glmnet` with `lambda = 0` is equivalent to fitting an unregularized GLM. If we use `glmnet` with `family = "poisson"`, we encounter an issue with non-convergence: ```{r} -oldfit <- glmnet(x, y, family = "poisson", standardize = FALSE, intercept = FALSE, - lambda = 0) +oldfit <- glmnet(x, y, family = "poisson", standardize = FALSE, + intercept = FALSE, lambda = 0) coef(oldfit) ``` -This divergence happens because the unit Newton step was too large. To address this issue, from v4.0 onwards if a `family` object is passed to the `family` argument of `glmnet`, the IRLS algorithm will perform step size halving. +This divergence happens because the unit Newton step was too large. However, if a `family` object is passed to the `family` argument of `glmnet`, the IRLS algorithm will perform step size halving.
After computing the Newton step, the algorithm checks if the new solution has infinite (or astronomically large) objective function value or if it results in invalid $\eta$ or $\mu$. If so, the algorithm halves the step size repeatedly until these invalid conditions no longer hold. The code below shows that this step size halving avoids the divergence we were experiencing in our running example: ```{r} glmnet.control(mxitnr = 50) # increase maximum no. of IRLS iterations allowed -newfit <- glmnet(x, y, family = poisson(), standardize = FALSE, intercept = FALSE, - lambda = 0) +newfit <- glmnet(x, y, family = poisson(), standardize = FALSE, + intercept = FALSE, lambda = 0) coef(newfit) ``` -In the process, `glmnet` warns the user that an infinite objective function value was encountered and that step size halving was done to address the issue. The coefficients are close to those obtained by `stats:glm`, and can be made to be numerically indistinguishable by tightening the convergence criterion in both function calls. +In the process, `glmnet` warns the user that an infinite objective function value was encountered and that step size halving was done to address the issue. The coefficients are close to those obtained by `stats:glm`, and can be made to be numerically indistinguishable by tightening the convergence criteria in both function calls. ```{r} thresh <- 1e-15 glmfit <- glm(y ~ x-1, family = poisson, @@ -278,23 +221,3 @@ expect_equal(as.numeric(coef(glmfit)), as.numeric(coef(newfit))[2:5]) ``` - - -## Appendix 1: Internal parameters - -With this generalization of `glmnet`, we have added two new internal parameters which control some aspects of the model computation. The factory default settings are expected to work in most cases and users do not need to make changes unless there are special requirements. - -These two parameters are related to the iteratively reweighted least squares (IRLS) loop for solving the optimization problem at each value of $\lambda$: - -- `epsnr`: convergence threshold for the IRLS loop; factory default = 1e-08 -- `mxitnr`: maximum iterations for the IRLS loop for each value of $\lambda$; factory default = 25 - -As with other internal parameters, `epsnr` and `mxitnr` can be changed by calling `glmnet.control`. For example, if we wanted to increase the maximum number of iterations allowed for the IRLS loop for each $\lambda$, we would run -```{r} -glmnet.control(mxitnr = 50) -``` - -Any changes made to these internal parameters will hold for the duration of the R session unless they are changed by the user with a subsequent call to `glmnet.control`. 
To restore the factory defaults, run -```{r} -glmnet.control(factory = TRUE) -``` Binary files /tmp/tmp3NAdo6/f1kG0NcwdC/r-cran-glmnet-4.0-2/inst/doc/glmnet.pdf and /tmp/tmp3NAdo6/BBEWWRnMdv/r-cran-glmnet-4.1/inst/doc/glmnet.pdf differ diff -Nru r-cran-glmnet-4.0-2/inst/doc/glmnet.R r-cran-glmnet-4.1/inst/doc/glmnet.R --- r-cran-glmnet-4.0-2/inst/doc/glmnet.R 2020-06-14 23:21:35.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/glmnet.R 2021-01-11 00:03:17.000000000 +0000 @@ -1,327 +1,373 @@ -## ---- eval=FALSE--------------------------------------------------------- +## ----include=FALSE------------------------------------------------------------ +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) + +## ---- eval=FALSE-------------------------------------------------------------- # install.packages("glmnet", repos = "https://cran.us.r-project.org") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- library(glmnet) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(QuickStartExample) -## ------------------------------------------------------------------------ -fit = glmnet(x, y) +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- plot(fit) -## ----height = 4---------------------------------------------------------- +## ----out.lines = 10----------------------------------------------------------- print(fit) -## ------------------------------------------------------------------------ -coef(fit,s=0.1) +## ----out.lines = 10----------------------------------------------------------- +coef(fit, s = 0.1) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- set.seed(29) -nx = matrix(rnorm(10*20),10,20) -predict(fit,newx=nx,s=c(0.1,0.05)) +nx <- matrix(rnorm(5 * 20), 5, 20) +predict(fit, newx = nx, s = c(0.1, 0.05)) -## ------------------------------------------------------------------------ -cvfit = cv.glmnet(x, y) +## ----------------------------------------------------------------------------- +cvfit <- cv.glmnet(x, y) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- plot(cvfit) -## ------------------------------------------------------------------------ +## ----out.lines = 10----------------------------------------------------------- cvfit$lambda.min - -## ------------------------------------------------------------------------ coef(cvfit, s = "lambda.min") -## ------------------------------------------------------------------------ +## 
----------------------------------------------------------------------------- predict(cvfit, newx = x[1:5,], s = "lambda.min") -## ------------------------------------------------------------------------ -fit = glmnet(x, y, alpha = 0.2, weights = c(rep(1,50),rep(2,50)), nlambda = 20) +## ----------------------------------------------------------------------------- +wts <- c(rep(1,50), rep(2,50)) +fit <- glmnet(x, y, alpha = 0.2, weights = wts, nlambda = 20) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- print(fit) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y) +any(fit$lambda == 0.5) # 0.5 not in original lambda sequence +coef.apprx <- coef(fit, s = 0.5, exact = FALSE) +coef.exact <- coef(fit, s = 0.5, exact = TRUE, x=x, y=y) +cbind2(coef.exact[which(coef.exact != 0)], + coef.apprx[which(coef.apprx != 0)]) + +## ----------------------------------------------------------------------------- +predict(fit, newx = x[1:5,], type = "response", s = 0.05) + +## ----------------------------------------------------------------------------- plot(fit, xvar = "lambda", label = TRUE) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- plot(fit, xvar = "dev", label = TRUE) -## ------------------------------------------------------------------------ -fit = glmnet(x, y) -any(fit$lambda == 0.5) -coef.apprx = coef(fit, s = 0.5, exact = FALSE) -coef.exact = coef(fit, s = 0.5, exact = TRUE, x=x, y=y) -cbind2(coef.exact, coef.apprx) - -## ------------------------------------------------------------------------ -predict(fit, newx = x[1:5,], type = "response", s = 0.05) +## ----------------------------------------------------------------------------- +cvfit <- cv.glmnet(x, y, type.measure = "mse", nfolds = 20) -## ------------------------------------------------------------------------ -cvfit = cv.glmnet(x, y, type.measure = "mse", nfolds = 20) +## ----------------------------------------------------------------------------- +print(cvfit) -## ---- eval=FALSE--------------------------------------------------------- -# require(doMC) -# registerDoMC(cores=2) -# X = matrix(rnorm(1e4 * 200), 1e4, 200) -# Y = rnorm(1e4) +## ---- eval=FALSE-------------------------------------------------------------- +# library(doMC) +# registerDoMC(cores = 2) +# X <- matrix(rnorm(1e4 * 200), 1e4, 200) +# Y <- rnorm(1e4) -## ---- eval=FALSE--------------------------------------------------------- +## ---- eval=FALSE-------------------------------------------------------------- # system.time(cv.glmnet(X, Y)) -## ---- echo=FALSE--------------------------------------------------------- +## ---- echo=FALSE-------------------------------------------------------------- structure(c(2.44, 0.08, 2.518, 0, 0), class = "proc_time", .Names = c("user.self", "sys.self", "elapsed", "user.child", "sys.child")) -## ---- eval=FALSE--------------------------------------------------------- +## ---- eval=FALSE-------------------------------------------------------------- # system.time(cv.glmnet(X, Y, parallel = TRUE)) -## ---- echo=FALSE--------------------------------------------------------- +## ---- echo=FALSE-------------------------------------------------------------- structure(c(0.508999999999999, 0.057, 
1.56699999999999, 1.941, 0.1), class = "proc_time", .Names = c("user.self", "sys.self", "elapsed", "user.child", "sys.child")) -## ------------------------------------------------------------------------ +## ----out.lines = 10----------------------------------------------------------- cvfit$lambda.min -coef(cvfit, s = "lambda.min") predict(cvfit, newx = x[1:5,], s = "lambda.min") +coef(cvfit, s = "lambda.min") -## ------------------------------------------------------------------------ -foldid=sample(1:10,size=length(y),replace=TRUE) -cv1=cv.glmnet(x,y,foldid=foldid,alpha=1) -cv.5=cv.glmnet(x,y,foldid=foldid,alpha=.5) -cv0=cv.glmnet(x,y,foldid=foldid,alpha=0) - -## ------------------------------------------------------------------------ -par(mfrow=c(2,2)) -plot(cv1);plot(cv.5);plot(cv0) -plot(log(cv1$lambda),cv1$cvm,pch=19,col="red",xlab="log(Lambda)",ylab=cv1$name) -points(log(cv.5$lambda),cv.5$cvm,pch=19,col="grey") -points(log(cv0$lambda),cv0$cvm,pch=19,col="blue") -legend("topleft",legend=c("alpha= 1","alpha= .5","alpha 0"),pch=19,col=c("red","grey","blue")) +## ----------------------------------------------------------------------------- +foldid <- sample(1:10, size = length(y), replace = TRUE) +cv1 <- cv.glmnet(x, y, foldid = foldid, alpha = 1) +cv.5 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.5) +cv0 <- cv.glmnet(x, y, foldid = foldid, alpha = 0) + +## ----------------------------------------------------------------------------- +par(mfrow = c(2,2)) +plot(cv1); plot(cv.5); plot(cv0) +plot(log(cv1$lambda) , cv1$cvm , pch = 19, col = "red", + xlab = "log(Lambda)", ylab = cv1$name) +points(log(cv.5$lambda), cv.5$cvm, pch = 19, col = "grey") +points(log(cv0$lambda) , cv0$cvm , pch = 19, col = "blue") +legend("topleft", legend = c("alpha= 1", "alpha= .5", "alpha 0"), + pch = 19, col = c("red","grey","blue")) -## ------------------------------------------------------------------------ -tfit=glmnet(x,y,lower=-.7,upper=.5) +## ----------------------------------------------------------------------------- +tfit <- glmnet(x, y, lower.limits = -0.7, upper.limits = 0.5) plot(tfit) -## ------------------------------------------------------------------------ -p.fac = rep(1, 20) -p.fac[c(5, 10, 15)] = 0 -pfit = glmnet(x, y, penalty.factor = p.fac) +## ----------------------------------------------------------------------------- +p.fac <- rep(1, 20) +p.fac[c(1, 3, 5)] <- 0 +pfit <- glmnet(x, y, penalty.factor = p.fac) plot(pfit, label = TRUE) -## ------------------------------------------------------------------------ -set.seed(101) -x=matrix(rnorm(1000),100,10) -y=rnorm(100) -vn=paste("var",1:10) -fit=glmnet(x,y) -plot(fit) - -## ------------------------------------------------------------------------ -par(mar=c(4.5,4.5,1,4)) -plot(fit) -vnat=coef(fit) -vnat=vnat[-1,ncol(vnat)] # remove the intercept, and get the coefficients at the end of the path -axis(4, at=vnat,line=-.5,label=vn,las=1,tick=FALSE, cex.axis=0.5) - -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(MultiGaussianExample) +mfit <- glmnet(x, y, family = "mgaussian") -## ------------------------------------------------------------------------ -mfit = glmnet(x, y, family = "mgaussian") - -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- plot(mfit, xvar = "lambda", label = TRUE, type.coef = "2norm") -## 
------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- predict(mfit, newx = x[1:5,], s = c(0.1, 0.01)) -## ------------------------------------------------------------------------ -cvmfit = cv.glmnet(x, y, family = "mgaussian") - -## ------------------------------------------------------------------------ -plot(cvmfit) - -## ------------------------------------------------------------------------ -cvmfit$lambda.min -cvmfit$lambda.1se - -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(BinomialExample) -## ------------------------------------------------------------------------ -fit = glmnet(x, y, family = "binomial") - -## ------------------------------------------------------------------------ -plot(fit, xvar = "dev", label = TRUE) +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y, family = "binomial") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01)) -## ------------------------------------------------------------------------ -cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class") +## ----------------------------------------------------------------------------- +cvfit <- cv.glmnet(x, y, family = "binomial", type.measure = "class") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- plot(cvfit) - -## ------------------------------------------------------------------------ cvfit$lambda.min cvfit$lambda.1se -## ------------------------------------------------------------------------ -coef(cvfit, s = "lambda.min") - -## ------------------------------------------------------------------------ -predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class") - -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(MultinomialExample) -## ------------------------------------------------------------------------ -fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped") - -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y, family = "multinomial", type.multinomial = "grouped") plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm") -## ------------------------------------------------------------------------ -cvfit=cv.glmnet(x, y, family="multinomial", type.multinomial = "grouped", parallel = TRUE) +## ----------------------------------------------------------------------------- +cvfit <- cv.glmnet(x, y, family = "multinomial", type.multinomial = "grouped") plot(cvfit) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(PoissonExample) -## 
------------------------------------------------------------------------ -fit = glmnet(x, y, family = "poisson") +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y, family = "poisson") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- plot(fit) -## ------------------------------------------------------------------------ +## ----out.lines = 7------------------------------------------------------------ coef(fit, s = 1) predict(fit, newx = x[1:5,], type = "response", s = c(0.1,1)) -## ------------------------------------------------------------------------ -cvfit = cv.glmnet(x, y, family = "poisson") +## ----------------------------------------------------------------------------- +cvfit <- cv.glmnet(x, y, family = "poisson") -## ------------------------------------------------------------------------ -plot(cvfit) - -## ------------------------------------------------------------------------ -opt.lam = c(cvfit$lambda.min, cvfit$lambda.1se) -coef(cvfit, s = opt.lam) - -## ------------------------------------------------------------------------ -data(CoxExample) -y[1:5,] - -## ------------------------------------------------------------------------ -fit = glmnet(x, y, family = "cox") - -## ------------------------------------------------------------------------ -plot(fit) - -## ------------------------------------------------------------------------ -coef(fit, s = 0.05) - -## ------------------------------------------------------------------------ -cvfit = cv.glmnet(x, y, family = "cox") - -## ------------------------------------------------------------------------ -plot(cvfit) - -## ------------------------------------------------------------------------ -cvfit$lambda.min -cvfit$lambda.1se +## ----------------------------------------------------------------------------- +data(BinomialExample) +itrain <- 1:70 +fit <- glmnet(x[itrain, ], y[itrain], family = "binomial", nlambda = 5) +assess.glmnet(fit, newx = x[-itrain, ], newy = y[-itrain]) + +## ---- eval=FALSE-------------------------------------------------------------- +# pred <- predict(fit, newx = x[-itrain, ]) +# assess.glmnet(pred, newy = y[-itrain], family = "binomial") + +## ----------------------------------------------------------------------------- +glmnet.measures() + +## ----out.lines = 11----------------------------------------------------------- +cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "binomial", nlambda = 30) +assess.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain]) + +## ----out.lines = 11----------------------------------------------------------- +assess.glmnet(cfit, newx = x[-itrain, ],newy = y[-itrain], s = "lambda.min") + +## ----out.lines = 11----------------------------------------------------------- +cfit <- cv.glmnet(x, y, family = "binomial", keep = TRUE, nlambda = 30) +assess.glmnet(cfit$fit.preval, newy = y, family = "binomial") + +## ----------------------------------------------------------------------------- +cfit <- cv.glmnet(x, y, family = "binomial", type.measure = "auc", + keep = TRUE) +rocs <- roc.glmnet(cfit$fit.preval, newy = y) + +## ----------------------------------------------------------------------------- +best <- cvfit$index["min",] +plot(rocs[[best]], type = "l") +invisible(sapply(rocs, lines, col="grey")) +lines(rocs[[best]], lwd = 2,col = "red") -## ------------------------------------------------------------------------ 
-coef.min = coef(cvfit, s = "lambda.min") -active.min = which(coef.min != 0) -index.min = coef.min[active.min] - -## ------------------------------------------------------------------------ -index.min -coef.min +## ----------------------------------------------------------------------------- +data(MultinomialExample) +set.seed(101) +itrain <- sample(1:500, 400, replace = FALSE) +cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "multinomial") +cnf <- confusion.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain]) + +## ----------------------------------------------------------------------------- +print(cnf) + +## ----------------------------------------------------------------------------- +cfit <- cv.glmnet(x, y, family = "multinomial", type = "class", keep = TRUE) +cnf <- confusion.glmnet(cfit$fit.preval, newy = y, family = "multinomial") +best <- cfit$index["min",] +print(cnf[[best]]) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- data(SparseExample) - -## ------------------------------------------------------------------------ class(x) -## ------------------------------------------------------------------------ -fit = glmnet(x, y) +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- cvfit = cv.glmnet(x, y) plot(cvfit) -## ------------------------------------------------------------------------ -i = sample(1:5, size = 25, replace = TRUE) -j = sample(1:20, size = 25, replace = TRUE) -x = rnorm(25) -nx = sparseMatrix(i = i, j = j, x = x, dims = c(5, 20)) +## ----------------------------------------------------------------------------- +i <- sample(1:5, size = 25, replace = TRUE) +j <- sample(1:20, size = 25, replace = TRUE) +x <- rnorm(25) +nx <- sparseMatrix(i = i, j = j, x = x, dims = c(5, 20)) predict(cvfit, newx = nx, s = "lambda.min") -## ------------------------------------------------------------------------ -data(QuickStartExample) -fit = glmnet(x, y) +## ----------------------------------------------------------------------------- +data(BinomialExample) +fit <- bigGlm(x, y, family = "binomial", lower.limits = -1) print(fit) -## ------------------------------------------------------------------------ -glmnet.control(fdev = 0) -fit = glmnet(x, y) -print(fit) +## ----------------------------------------------------------------------------- +set.seed(101) +X <- matrix(rnorm(5), nrow = 5) +X2 <- sample(letters[1:3], 5, replace = TRUE) +X3 <- sample(LETTERS[1:3], 5, replace = TRUE) +df <- data.frame(X, X2, X3) +makeX(df) + +## ----------------------------------------------------------------------------- +makeX(df, sparse = TRUE) + +## ----------------------------------------------------------------------------- +Xn <- X ; Xn[3,1] <- NA +X2n <- X2; X2n[1] <- NA +X3n <- X3; X3n[5] <- NA +dfn <- data.frame(Xn, X2n, X3n) +dfn +makeX(dfn) + +## ----------------------------------------------------------------------------- +makeX(dfn, na.impute = TRUE, sparse = TRUE) + +## ----------------------------------------------------------------------------- +set.seed(102) +X <- matrix(rnorm(5), nrow = 5) +X2 <- sample(letters[1:3], 5, replace = TRUE) +X3 <- sample(LETTERS[1:3], 5, replace = TRUE) +Xn <- X ; Xn[5,1] <- NA +X2n <- X2; X2n[1] <- NA +X3n <- X3; X3n[2] <- NA +dftn <- 
data.frame(Xn, X2n, X3n) +dftn +makeX(dfn, dftn, sparse = TRUE) + +## ---- eval=FALSE-------------------------------------------------------------- +# fit <- glmnet(x, y, trace.it = TRUE) + +## ---- eval=FALSE-------------------------------------------------------------- +# fit <- cv.glmnet(x, y, trace.it = TRUE) -## ------------------------------------------------------------------------ +## ---- eval=FALSE-------------------------------------------------------------- +# glmnet.control(itrace = 1) + +## ----------------------------------------------------------------------------- +data(QuickStartExample) +fit <- glmnet(x, y) +length(fit$lambda) # number of lambda values fit + +## ----------------------------------------------------------------------------- +glmnet.control(fdev = 0.1) +fit <- glmnet(x, y) +length(fit$lambda) # number of lambda values fit + +## ----------------------------------------------------------------------------- glmnet.control(factory = TRUE) -## ------------------------------------------------------------------------ +## ----out.lines = 8------------------------------------------------------------ glmnet.control() -## ---- echo=FALSE--------------------------------------------------------- +## ---- echo=FALSE-------------------------------------------------------------- data(QuickStartExample) -## ----eval=FALSE---------------------------------------------------------- -# fit = glmnet(x, y, intercept = F, standardize = F, lambda = 8/(2*dim(x)[1]), thresh = 1e-20) +## ----------------------------------------------------------------------------- +np <- dim(x); n <- np[1]; p <-np[2] -## ----eval=FALSE---------------------------------------------------------- -# beta_glmnet = as.matrix(predict(fit, type = "coefficients")[-1,]) +fit <- glmnet(x, y, intercept = F, standardize = F, + lambda = 8 / (2 * n), thresh = 1e-20) -## ------------------------------------------------------------------------ -fit = glmnet(x, y, intercept = F, standardize = F, thresh = 1e-20) -beta_glmnet = as.matrix(predict(fit, s = 8/(2*dim(x)[1]), type = "coefficients", - exact = TRUE, x=x, y=y)[-1,]) - -## ---- eval=FALSE--------------------------------------------------------- -# library(CVXfromR) -# setup.dir = "change/this/to/your/cvx/directory" -# n = dim(x)[1]; p = dim(x)[2] -# cvxcode = paste("variables beta(p)", -# "minimize(square_pos(norm(y - x * beta, 2)) + lambda * norm(beta, 1))", -# sep = ";") -# Lasso = CallCVX(cvxcode, const.var = list(p = p, x = x, y = y, lambda = 8), opt.var.names = "beta", setup.dir = setup.dir, matlab.call = "change/this/to/path/to/matlab") -# beta_CVX = Lasso$beta +## ----eval=FALSE--------------------------------------------------------------- +# beta_glmnet <- as.matrix(predict(fit, type = "coefficients")[-1,]) -## ------------------------------------------------------------------------ -data(CVXResults) +## ----------------------------------------------------------------------------- +fit <- glmnet(x, y, intercept = F, standardize = F, thresh = 1e-20) +beta_glmnet <- as.matrix(predict(fit, s = 8 / (2 * n), + type = "coefficients", + exact = TRUE, x = x, y = y)[-1,]) + +## ---- eval=FALSE-------------------------------------------------------------- +# library(CVXR) +# beta <- Variable(p) +# loss <- sum((y-x%*%beta)^2)/(2*n) +# lassoPenalty <- function(beta,lambda)lambda*p_norm(beta,1) +# obj <- loss + lassoPenalty(beta, lambda = 8/(2*n)) +# prob <- Problem(Minimize(obj)) +# result <- solve(prob) +# beta_CVX <- result$getValue(beta) -## 
----message=FALSE------------------------------------------------------- -require(lars) +## ----------------------------------------------------------------------------- +data(CVXResults) -## ------------------------------------------------------------------------ -fit_lars = lars(x, y, type = "lasso", intercept = F, normalize = F) -beta_lars = predict(fit_lars, s = 8/2, type = "coefficients", mode = "lambda")$coefficients - -## ------------------------------------------------------------------------ -cmp = round(cbind(beta_glmnet, beta_lars, beta_CVX), digits = 6) -colnames(cmp) = c("beta_glmnet", "beta_lars", "beta_CVX") +## ---- message=FALSE----------------------------------------------------------- +library(lars) +fit_lars <- lars(x, y, type = "lasso", intercept = F, normalize = F) +beta_lars <- predict(fit_lars, s = 8 / 2, type = "coefficients", + mode = "lambda")$coefficients + +## ----------------------------------------------------------------------------- +cmp <- round(cbind(beta_glmnet, beta_lars, beta_CVX), digits = 6) +colnames(cmp) <- c("beta_glmnet", "beta_lars", "beta_CVX") cmp diff -Nru r-cran-glmnet-4.0-2/inst/doc/glmnet.Rmd r-cran-glmnet-4.1/inst/doc/glmnet.Rmd --- r-cran-glmnet-4.0-2/inst/doc/glmnet.Rmd 2020-05-06 19:54:31.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/glmnet.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,10 @@ --- title: "An Introduction to `glmnet`" -author: "Trevor Hastie and Junyang Qian" -date: "September 13, 2016" +author: + - Trevor Hastie + - Junyang Qian + - Kenneth Tay +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -15,106 +18,86 @@ \usepackage[utf8]{inputenc} --- +```{r include=FALSE} +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) +``` + + ## Introduction -Glmnet is a package that fits a generalized linear model via penalized -maximum likelihood. The regularization path is computed for the lasso -or elasticnet penalty at a grid of values for the regularization -parameter lambda. The algorithm is extremely fast, and can exploit -sparsity in the input matrix `x`. It fits linear, logistic and -multinomial, poisson, and Cox regression models. A variety of -predictions can be made from the fitted models. It can also fit -multi-response linear regression. - -The authors of glmnet are Jerome Friedman, Trevor Hastie, Rob -Tibshirani, Balasubramanian Narasimhan, Kenneth Tay and Noah Simon, with -contribution from Junyang Qian, and the R package is maintained by Trevor -Hastie. The matlab version of glmnet is maintained by Junyang -Qian, and the Python version by B. Balakumar (although both are a few -versions behind). This vignette describes the usage of glmnet in R. -There are additional vignettes that should be useful: - -* one that describes in detail the new `relaxed` features in `glmnet`, along with some new capabilities. -* a vignette devoted to Cox models in `glmnet`. -* the newest one that describes using `glm()` family objects. 
- - -`glmnet` solves the following problem - -$$ -\min_{\beta_0,\beta} \frac{1}{N} \sum_{i=1}^{N} w_i l(y_i,\beta_0+\beta^T x_i) + \lambda\left[(1-\alpha)||\beta||_2^2/2 + \alpha ||\beta||_1\right], -$$ - -over a grid of values of $\lambda$ covering the entire range. Here -$l(y,\eta)$ is the negative log-likelihood contribution for -observation $i$; e.g. for the Gaussian case it is -$\frac{1}{2}(y-\eta)^2$. The _elastic-net_ penalty is controlled by -$\alpha$, and bridges the gap between lasso ($\alpha=1$, the default) -and ridge ($\alpha=0$). The tuning parameter $\lambda$ controls the -overall strength of the penalty. - -It is known that the ridge penalty shrinks the coefficients of -correlated predictors towards each other while the lasso tends to pick -one of them and discard the others. The elastic-net penalty mixes -these two; if predictors are correlated in groups, an $\alpha=0.5$ -tends to select the groups in or out together. This is a higher level -parameter, and users might pick a value upfront, else experiment with -a few different values. One use of $\alpha$ is for numerical -stability; for example, the elastic net with $\alpha = 1 - \epsilon$ -for some small $\epsilon > 0$ performs much like the lasso, but -removes any degeneracies and wild behavior caused by extreme -correlations. - -The `glmnet` algorithms use cyclical coordinate descent, which -successively optimizes the objective function over each parameter with -others fixed, and cycles repeatedly until convergence. The package -also makes use of the strong rules for efficient restriction of the -active set. Due to highly efficient updates and techniques such as -warm starts and active-set convergence, our algorithms can compute the -solution path very fast. - -The code can handle sparse input-matrix formats, as well as range -constraints on coefficients. The core of `glmnet` is a set of fortran -subroutines, which make for very fast execution. +Glmnet is a package that fits generalized linear and similar models +via penalized maximum likelihood. The regularization path is computed +for the lasso or elastic net penalty at a grid of values (on the log +scale) for the +regularization parameter lambda. The algorithm is extremely fast, and +can exploit sparsity in the input matrix `x`. It fits linear, logistic +and multinomial, poisson, and Cox regression models. It can also fit +multi-response linear regression, generalized linear models for custom +families, and relaxed lasso regression models. The package includes +methods for prediction and plotting, and functions for cross-validation. + +The authors of glmnet are Jerome Friedman, Trevor Hastie, Rob Tibshirani, Balasubramanian Narasimhan, Kenneth Tay and Noah Simon, with contribution from Junyang Qian, and the R package is maintained by Trevor Hastie. A MATLAB version of glmnet is maintained by Junyang Qian, and a Python version by B. Balakumar (although both are a few versions behind). + +This vignette describes basic usage of glmnet in R. There are additional vignettes that should be useful: + +* ["Regularized Cox Regression"](https://glmnet.stanford.edu/articles/Coxnet.html) describes how to fit regularized Cox models for survival data with `glmnet`. +* ["GLM `family` functions in `glmnet`"](https://glmnet.stanford.edu/articles/glmnetFamily.html) describes how to fit custom generalized linear models (GLMs) with the elastic net penalty via the `family` argument. 
+* ["The Relaxed Lasso"](https://glmnet.stanford.edu/articles/relax.html) + describes how to fit relaxed lasso regression models using the `relax` argument. + +`glmnet` solves the problem -The package also includes methods for prediction and plotting, and a -function that performs K-fold cross-validation. +$$ +\min_{\beta_0,\beta} \frac{1}{N} \sum_{i=1}^{N} w_i l(y_i,\beta_0+\beta^T x_i) + \lambda\left[(1-\alpha)\|\beta\|_2^2/2 + \alpha \|\beta\|_1\right], +$$ + +over a grid of values of $\lambda$ covering the entire range of possible solutions. Here $l(y_i,\eta_i)$ is the negative log-likelihood contribution for observation $i$; e.g. for the Gaussian case it is $\frac{1}{2}(y_i-\eta_i)^2$. The _elastic net_ penalty is controlled by $\alpha$, and bridges the gap between lasso regression ($\alpha=1$, the default) and ridge regression ($\alpha=0$). The tuning parameter $\lambda$ controls the overall strength of the penalty. + +It is known that the ridge penalty shrinks the coefficients of correlated predictors towards each other while the lasso tends to pick one of them and discard the others. The elastic net penalty mixes these two: if predictors are correlated in groups, an $\alpha=0.5$ tends to either select or leave out the entire group of features. This is a higher level parameter, and users might pick a value upfront or experiment with a few different values. One use of $\alpha$ is for numerical stability; for example, the elastic net with $\alpha = 1 - \epsilon$ for some small $\epsilon > 0$ performs much like the lasso, but removes any degeneracies and wild behavior caused by extreme correlations. -The theory and algorithms in this implementation are described in -@glmnet, @coxnet, @strongrules and @block . +The `glmnet` algorithms use cyclical coordinate descent, which successively optimizes the objective function over each parameter with others fixed, and cycles repeatedly until convergence. The package also makes use of the strong rules for efficient restriction of the active set. Due to highly efficient updates and techniques such as warm starts and active-set convergence, our algorithms can compute the solution path very quickly. +The code can handle sparse input-matrix formats, as well as range constraints on coefficients. The core of `glmnet` is a set of Fortran subroutines, which make for very fast execution. + +The theory and algorithms in this implementation are described in @glmnet, @coxnet, @strongrules and @block. ## Installation -Like many other R packages, the simplest way to obtain `glmnet` is to -install it directly from CRAN. Type the following command in R -console: +Like many other R packages, the simplest way to obtain `glmnet` is to install it directly from CRAN. Type the following command in R console: ```{r, eval=FALSE} install.packages("glmnet", repos = "https://cran.us.r-project.org") ``` -Users may change the `repos` options depending on their locations and -preferences. Other options such as the directories where to install -the packages can be altered in the command. For more details, see -`help(install.packages)`. - -Here the R package has been downloaded and installed to the default -directories. - -Alternatively, users can download the package source from -[CRAN](https://cran.r-project.org/package=glmnet) and type Unix -commands to install it to the desired location. +Users may change the `repos` argument depending on their locations and preferences. Other arguments such as the directories to install the packages at can be altered in the command. 
For more details, see `help(install.packages)`. Alternatively, users can download the package source from [CRAN](https://cran.r-project.org/package=glmnet) and type Unix commands to install it to the desired location. ## Quick Start -The purpose of this section is to give users a general sense of the package, including the components, what they do and some basic usage. We will briefly go over the main functions, see the basic operations and have a look at the outputs. Users may have a better idea after this section what functions are available, which one to choose, or at least where to seek help. More details are given in later sections. +The purpose of this section is to give users a general sense of the package. We will briefly go over the main functions, basic operations and outputs. After this section, users may have a better idea of what functions are available, which ones to use, or at least where to seek help. First, we load the `glmnet` package: ```{r} library(glmnet) ``` + +The default model used in the package is the Gaussian linear model or "least squares", which we will demonstrate in this section. We load a set of data created beforehand for illustration: ```{r} data(QuickStartExample) ``` @@ -122,187 +105,189 @@ We fit the model using the most basic call to `glmnet`. ```{r} -fit = glmnet(x, y) +fit <- glmnet(x, y) ``` -"fit" is an object of class `glmnet` that contains all the relevant information of the fitted model for further use. We do not encourage users to extract the components directly. Instead, various methods are provided for the object such as `plot`, `print`, `coef` and `predict` that enable us to execute those tasks more elegantly. +`fit` is an object of class `glmnet` that contains all the relevant information of the fitted model for further use. We do not encourage users to extract the components directly. Instead, various methods are provided for the object such as `plot`, `print`, `coef` and `predict` that enable us to execute those tasks more elegantly. -We can visualize the coefficients by executing the `plot` function: +We can visualize the coefficients by executing the `plot` method: ```{r} plot(fit) ``` -Each curve corresponds to a variable. It shows the path of its coefficient against the $\ell_1$-norm of the whole coefficient vector at as $\lambda$ varies. The axis above indicates the number of nonzero coefficients at the current $\lambda$, which is the effective degrees of freedom (_df_) for the lasso. Users may also wish to annotate the curves; this can be done by setting `label = TRUE` in the plot command. +Each curve corresponds to a variable. It shows the path of its coefficient against the $\ell_1$-norm of the whole coefficient vector as $\lambda$ varies. The axis above indicates the number of nonzero coefficients at the current $\lambda$, which is the effective degrees of freedom (_df_) for the lasso. Users may also wish to annotate the curves: this can be done by setting `label = TRUE` in the plot command.
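For example, the call below redraws the coefficient paths with each curve labeled by its variable index:
```{r}
plot(fit, label = TRUE)
```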
A summary of the `glmnet` path at each step is displayed if we just enter the object name or use the `print` function: -```{r height = 4} +```{r out.lines = 10} print(fit) ``` -It shows from left to right the number of nonzero coefficients (`Df`), the percent (of null) deviance explained (`%dev`) and the value of $\lambda$ (`Lambda`). Although by default `glmnet` calls for 100 values of `lambda` the program stops early if `%dev% does not change sufficently from one lambda to the next (typically near the end of the path.) - -We can obtain the actual coefficients at one or more $\lambda$'s within the range of the sequence: -```{r} -coef(fit,s=0.1) +It shows from left to right the number of nonzero coefficients (`Df`), +the percent (of null) deviance explained (`%dev`) and the value of +$\lambda$ (`Lambda`). Although `glmnet` fits the model for 100 values +of `lambda` by default, it stops early if `%dev` does not change +sufficiently from one lambda to the next (typically near the end of the +path.) Here we have truncated the printout for brevity. + +We can obtain the model coefficients at one or more $\lambda$'s within the range of the sequence: +```{r out.lines = 10} +coef(fit, s = 0.1) ``` -(why `s` and not `lambda`? In case later we want to allow one to specify the model size in other ways.) -Users can also make predictions at specific $\lambda$'s with new input data: +(Why `s` and not `lambda`? In case we want to allow one to specify the model size in other ways in the future.) Users can also make predictions at specific $\lambda$'s with new input data: ```{r} set.seed(29) -nx = matrix(rnorm(10*20),10,20) -predict(fit,newx=nx,s=c(0.1,0.05)) +nx <- matrix(rnorm(5 * 20), 5, 20) +predict(fit, newx = nx, s = c(0.1, 0.05)) ``` -The function `glmnet` returns a sequence of models for the users to choose from. In many cases, users may prefer the software to select one of them. Cross-validation is perhaps the simplest and most widely used method for that task. - -`cv.glmnet` is the main function to do cross-validation here, along with various supporting methods such as plotting and prediction. We still act on the sample data loaded before. +The function `glmnet` returns a sequence of models for the users to choose from. In many cases, users may prefer the software to select one of them. Cross-validation is perhaps the simplest and most widely used method for that task. `cv.glmnet` is the main function to do cross-validation here, along with various supporting methods such as plotting and prediction. ```{r} -cvfit = cv.glmnet(x, y) +cvfit <- cv.glmnet(x, y) ``` -`cv.glmnet` returns a `cv.glmnet` object, which is "cvfit" here, a list with all the ingredients of the cross-validation fit. As for `glmnet`, we do not encourage users to extract the components directly except for viewing the selected values of $\lambda$. The package provides well-designed functions for potential tasks. -We can plot the object. +`cv.glmnet` returns a `cv.glmnet` object, a list with all the ingredients of the cross-validated fit. As with `glmnet`, we do not encourage users to extract the components directly except for viewing the selected values of $\lambda$. The package provides well-designed functions for potential tasks. For example, we can plot the object: ```{r} plot(cvfit) ``` -It includes the cross-validation curve (red dotted line), and upper and lower standard deviation curves along the $\lambda$ sequence (error bars). Two selected $\lambda$'s are indicated by the vertical dotted lines (see below).
+This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\lambda$ sequence (error bars). Two special values along the $\lambda$ sequence are indicated by the vertical dotted lines. `lambda.min` is the value of $\lambda$ that gives minimum mean cross-validated error, while `lambda.1se` is the value of $\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum. -We can view the selected $\lambda$'s and the corresponding coefficients. For example, -```{r} +We can use the following code to get the value of `lambda.min` and the model coefficients at that value of $\lambda$: +```{r out.lines = 10} cvfit$lambda.min -``` -`lambda.min` is the value of $\lambda$ that gives minimum mean cross-validated error. The other $\lambda$ saved is `lambda.1se`, which gives the most regularized model such that error is within one standard error of the minimum. To use that, we only need to replace `lambda.min` with `lambda.1se` above. -```{r} coef(cvfit, s = "lambda.min") ``` -Note that the coefficients are represented in the sparse matrix format. The reason is that the solutions along the regularization path are often sparse, and hence it is more efficient in time and space to use a sparse format. If you prefer non-sparse format, pipe the output through `as.matrix()`. -Predictions can be made based on the fitted `cv.glmnet` object. Let's see a toy example. +To get the corresponding values at `lambda.1se`, simply replace +`lambda.min` with `lambda.1se` above, or omit the `s` argument, since +`lambda.1se` is the default. + +Note that the coefficients are represented in the sparse matrix format. This is because the solutions along the regularization path are often sparse, and hence it is more efficient in time and space to use a sparse format. If you prefer non-sparse format, pipe the output through `as.matrix()`. + +Predictions can be made based on the fitted `cv.glmnet` object as well. The code below gives predictions for the new input matrix `newx` at `lambda.min`: ```{r} predict(cvfit, newx = x[1:5,], s = "lambda.min") ``` -`newx` is for the new input matrix and `s`, as before, is the value(s) of $\lambda$ at which predictions are made. - -That is the end of `glmnet` 101. With the tools introduced so far, users are able to fit the entire elastic net family, including ridge regression, using squared-error loss. In the package, there are many more options that give users a great deal of flexibility. To learn more, move on to later sections. -## Linear Regression +This concludes `glmnet` 101. With the tools introduced so far, users are able to fit the entire elastic net family, including ridge regression, using squared-error loss. There are many more arguments in the package that give users a great deal of flexibility. To learn more, move on to later sections. -Linear regression here refers to two families of models. One is `gaussian`, the Gaussian family, and the other is `mgaussian`, the multiresponse Gaussian family. We first discuss the ordinary Gaussian and the multiresponse one after that. +## Linear Regression: `family = "gaussian"` (default) -### Gaussian Family - -`gaussian ` is the default family option in the function `glmnet`. Suppose we have observations $x_i \in \mathbb{R}^p$ and the responses $y_i \in \mathbb{R}, i = 1, \ldots, N$. The objective function for the Gaussian family is +`"gaussian"` is the default `family` argument for the function `glmnet`. 
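Since it is the default, the two calls below fit the same model:
```{r, eval=FALSE}
fit1 <- glmnet(x, y)                       # family defaults to "gaussian"
fit2 <- glmnet(x, y, family = "gaussian")  # equivalent explicit call
```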
Suppose we have observations $x_i \in \mathbb{R}^p$ and the responses $y_i \in \mathbb{R}, i = 1, \ldots, N$. The objective function for the Gaussian family is $$ -\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}}\frac{1}{2N} \sum_{i=1}^N (y_i -\beta_0-x_i^T \beta)^2+\lambda \left[ (1-\alpha)||\beta||_2^2/2 + \alpha||\beta||_1\right], +\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}}\frac{1}{2N} \sum_{i=1}^N (y_i -\beta_0-x_i^T \beta)^2+\lambda \left[ (1-\alpha)\|\beta\|_2^2/2 + \alpha\|\beta\|_1\right], $$ -where $\lambda \geq 0$ is a complexity parameter and $0 \leq \alpha \leq 1$ is a compromise between ridge ($\alpha = 0$) and lasso ($\alpha = 1$). +where $\lambda \geq 0$ is a complexity parameter and $0 \leq \alpha \leq 1$ is a compromise between ridge regression ($\alpha = 0$) and lasso regression ($\alpha = 1$). -Coordinate descent is applied to solve the problem. Specifically, suppose we have current estimates $\tilde{\beta_0}$ and $\tilde{\beta}_\ell$ $\forall j\in 1,]\ldots,p$. By computing the gradient at $\beta_j = \tilde{\beta}_j$ and simple calculus, the update is +`glmnet` applies coordinate descent to solve the problem. Specifically, suppose we have current estimates $\tilde{\beta_0}$ and $\tilde{\beta}_\ell$ $\forall \ell\in 1,\ldots,p$. By computing the gradient at $\beta_j = \tilde{\beta}_j$ and simple calculus, the update is $$ \tilde{\beta}_j \leftarrow \frac{S(\frac{1}{N}\sum_{i=1}^N x_{ij}(y_i-\tilde{y}_i^{(j)}),\lambda \alpha)}{1+\lambda(1-\alpha)}, $$ where $\tilde{y}_i^{(j)} = \tilde{\beta}_0 + \sum_{\ell \neq j} x_{i\ell} \tilde{\beta}_\ell$, and $S(z, \gamma)$ is the soft-thresholding operator with value $\text{sign}(z)(|z|-\gamma)_+$. -This formula above applies when the `x` variables are standardized to have unit variance (the default); it is slightly more complicated when they are not. Note that for "family=gaussian", `glmnet` standardizes $y$ to have unit variance before computing its lambda sequence (and then unstandardizes the resulting coefficients); if you wish to reproduce/compare results with other software, best to supply a standardized $y$ first (Using the "1/N" variance formula). +The formula above applies when the `x` variables are standardized to have unit variance (the default); it is slightly more complicated when they are not. Note that for `family = "gaussian"`, `glmnet` standardizes $y$ to have unit variance before computing its `lambda` sequence (and then unstandardizes the resulting coefficients). If you wish to reproduce or compare results with other software, it is best to supply a standardized $y$ first (using the "1/N" variance formula). -`glmnet` provides various options for users to customize the fit. We introduce some commonly used options here and they can be specified in the `glmnet` function. +### Commonly used function arguments -* `alpha` is for the elastic-net mixing parameter $\alpha$, with range $\alpha \in [0,1]$. $\alpha = 1$ is the lasso (default) and $\alpha = 0$ is the ridge. +`glmnet` provides various arguments for users to customize the fit: we introduce some commonly used arguments here. (For more information, type `?glmnet`.) -* `weights` is for the observation weights. Default is 1 for each observation. (Note: `glmnet` rescales the weights to sum to N, the sample size.) +* `alpha` is for the elastic net mixing parameter $\alpha$, with range $\alpha \in [0,1]$. $\alpha = 1$ is lasso regression (default) and $\alpha = 0$ is ridge regression. -* `nlambda` is the number of $\lambda$ values in the sequence. Default is 100.
+* `weights` is for the observation weights, default is 1 for each observation. (Note: `glmnet` rescales the weights internally to sum to N, the sample size.) -* `lambda` can be provided, but is typically not and the program constructs a sequence. When automatically generated, the $\lambda$ sequence is determined by `lambda.max` and `lambda.min.ratio`. The latter is the ratio of smallest value of the generated $\lambda$ sequence (say `lambda.min`) to `lambda.max`. The program then generated `nlambda` values linear on the log scale from `lambda.max` down to `lambda.min`. `lambda.max` is not given, but easily computed from the input $x$ and $y$; it is the smallest value for `lambda` such that all the coefficients are zero. For `alpha=0` (ridge) `lambda.max` would be $\infty$; hence for this case we pick a value corresponding to a small value for `alpha` close to zero.) +* `nlambda` is the number of $\lambda$ values in the sequence (default is 100). -* `standardize` is a logical flag for `x` variable standardization, prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is `standardize=TRUE`. +* `lambda` can be provided if the user wants to specify the lambda sequence, but typical usage is for the program to construct the lambda sequence on its own. When automatically generated, the $\lambda$ sequence is determined by `lambda.max` and `lambda.min.ratio`. The latter is the ratio of the smallest value of the generated $\lambda$ sequence (say `lambda.min`) to `lambda.max`. The program generates `nlambda` values linearly spaced on the log scale from `lambda.max` down to `lambda.min`. `lambda.max` is not user-specified but is computed from the input $x$ and $y$: it is the smallest value for `lambda` such that all the coefficients are zero. For `alpha = 0` (ridge) `lambda.max` would be $\infty$: in this case we pick a value corresponding to a small value of `alpha` close to zero. -For more information, type `help(glmnet)` or simply `?glmnet`. +* `standardize` is a logical flag for `x` variable standardization prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is `standardize = TRUE`. -As an example, we set $\alpha = 0.2$ (more like a ridge regression), and give double weights to the latter half of the observations. To avoid too long a display here, we set `nlambda` to 20. In practice, however, the number of values of $\lambda$ is recommended to be 100 (default) or more. In most cases, it does not come with extra cost because of the warm-starts used in the algorithm, and for nonlinear models leads to better convergence properties. +As an example, we set $\alpha = 0.2$ (more like a ridge regression), and give double weight to the latter half of the observations. We set `nlambda` to 20 so that the model fit is only computed for 20 values of $\lambda$. In practice, we recommend `nlambda` to be 100 (default) or more. In most cases, it does not come with extra cost because of the warm-starts used in the algorithm, and for nonlinear models leads to better convergence properties. ```{r} -fit = glmnet(x, y, alpha = 0.2, weights = c(rep(1,50),rep(2,50)), nlambda = 20) +wts <- c(rep(1,50), rep(2,50)) +fit <- glmnet(x, y, alpha = 0.2, weights = wts, nlambda = 20) ``` -We can then print the `glmnet` object.
+ +We can then print the `glmnet` object: ```{r} print(fit) ``` -This displays the call that produced the object `fit` and a three-column matrix with columns `Df` (the number of nonzero coefficients), `%dev` (the percent deviance explained) and `Lambda` (the corresponding value of $\lambda$). +This displays the call that produced the object `fit` and a three-column matrix with columns `Df` (the number of nonzero coefficients), `%dev` (the percent deviance explained) and `Lambda` (the corresponding value of $\lambda$). (The `digits` argument can be used to specify significant digits in the printout.) -(Note that the `digits` option can used to specify significant digits in the printout.) +Here the actual number of $\lambda$'s is less than that specified in the call. This is because of the algorithm's stopping criteria. According to the default internal settings, the computations stop if either the fractional change in deviance down the path is less than $10^{-5}$ or the fraction of explained deviance reaches $0.999$. From the last few lines of the output, we see the fraction of deviance does not change much and therefore the computation ends before all 20 models are fit. The internal parameters governing the stopping criteria can be changed. For details, see the Appendix section or type `help(glmnet.control)`. -Here the actual number of $\lambda$'s here is less than specified in the call. The reason lies in the stopping criteria of the algorithm. According to the default internal settings, the computations stop if either the fractional change in deviance down the path is less than $10^{-5}$ or the fraction of explained deviance reaches $0.999$. From the last few lines , we see the fraction of deviance does not change much and therefore the computation ends when meeting the stopping criteria. We can change such internal parameters. For details, see the Appendix section or type `help(glmnet.control)`. +### Predicting and plotting with `glmnet` objects -We can plot the fitted object as in the previous section. There are more options in the `plot` function. +We can extract the coefficients and make predictions for a `glmnet` object at certain values of $\lambda$. Two commonly used arguments are: -Users can decide what is on the X-axis. `xvar` allows three measures: "norm" for the $\ell_1$-norm of the coefficients (default), "lambda" for the log-lambda value and "dev" for %deviance explained. +* `s` for specifying the value(s) of $\lambda$ at which to extract coefficients/predictions. -Users can also label the curves with variable sequence numbers simply by setting `label = TRUE`. +* `exact` for indicating whether the exact values of coefficients are desired or not. If `exact = TRUE` and predictions are to be made at values of `s` not included in the original fit, these values of `s` are merged with `object$lambda` and the model is refit before predictions are made. If `exact = FALSE` (default), then the `predict` function uses linear interpolation to make predictions for values of `s` that do not coincide with lambdas used in the fitting algorithm. -Let's plot "fit" against the log-lambda value and with each curve labeled. +Here is a simple example illustrating the use of both these function arguments: ```{r} -plot(fit, xvar = "lambda", label = TRUE) -``` - -Now when we plot against %deviance we get a very different picture. This is percent deviance explained on the training data.
What we see here is that toward the end of the path this value are not changing much, but the coefficients are "blowing up" a bit. This lets us focus attention on the parts of the fit that matter. This will especially be true for other models, such as logistic regression. -```{r} -plot(fit, xvar = "dev", label = TRUE) +fit <- glmnet(x, y) +any(fit$lambda == 0.5) # 0.5 not in original lambda sequence +coef.apprx <- coef(fit, s = 0.5, exact = FALSE) +coef.exact <- coef(fit, s = 0.5, exact = TRUE, x=x, y=y) +cbind2(coef.exact[which(coef.exact != 0)], + coef.apprx[which(coef.apprx != 0)]) ``` +The left and right columns show the coefficients for `exact = TRUE` and `exact = FALSE` respectively. (For brevity we only show the non-zero coefficients.) We see from the above that 0.5 is not in the sequence and that hence there are some small differences in coefficient values. Linear interpolation is usually accurate enough if there are no special requirements. Notice that with `exact = TRUE` we have to supply by named argument any data that was used in creating the original fit, in this case `x` and `y`. +Users can make predictions from the fitted `glmnet` object. In addition to the arguments in `coef`, the primary argument is `newx`, a matrix of new values for `x` at which predictions are desired. The `type` argument allows users to choose the type of prediction returned: -We can extract the coefficients and make predictions at certain values of $\lambda$. Two commonly used options are: +* "link" returns the fitted values (i.e. $\hat\beta_0 + x_i^T\hat\beta$) -* `s` specifies the value(s) of $\lambda$ at which extraction is made. +* "response" gives the same output as "link" for the "gaussian" family. -* `exact` indicates whether the exact values of coefficients are desired or not. That is, if `exact = TRUE`, and predictions are to be made at values of `s` not included in the original fit, these values of `s` are merged with `object$lambda`, and the model is refit before predictions are made. If `exact=FALSE` (default), then the predict function uses linear interpolation to make predictions for values of `s` that do not coincide with lambdas used in the fitting algorithm. +* "coefficients" returns the model coefficients. -A simple example is: +* "nonzero" returns a list of the indices of the nonzero coefficients for each value of `s`. +For example, the following code gives the fitted values for the first 5 observations at $\lambda = 0.05$: ```{r} -fit = glmnet(x, y) -any(fit$lambda == 0.5) -coef.apprx = coef(fit, s = 0.5, exact = FALSE) -coef.exact = coef(fit, s = 0.5, exact = TRUE, x=x, y=y) -cbind2(coef.exact, coef.apprx) +predict(fit, newx = x[1:5,], type = "response", s = 0.05) ``` -The left column is for `exact = TRUE` and the right for `FALSE`. We -see from the above that 0.5 is not in the sequence and that hence -there are some difference, though not much. Linear interpolation is -mostly enough if there are no special requirements. Notice that with -`exact=TRUE` we have to supply by named argument any data that was used in creating the -original fit, in this case `x` and `y`. -Users can make predictions from the fitted object. In addition to the options in `coef`, the primary argument is `newx`, a matrix of new values for `x`. The `type` option allows users to choose the type of prediction: -* "link" gives the fitted values +If multiple values of `s` are supplied, a matrix of predictions is -* "response" the sames as "link" for "gaussian" family. +produced.
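For instance, supplying two values of `s` gives a two-column matrix of predictions, one column per value:
```{r}
predict(fit, newx = x[1:5,], type = "response", s = c(0.1, 0.05))
```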
If no value of `s` is supplied, a matrix of predictions is +returned, with columns corresponding to all the lambdas used in the fit. -* "coefficients" computes the coefficients at values of `s` - -* "nonzero" retuns a list of the indices of the nonzero coefficients for each value of `s`. -For example, +We can plot the fitted object as in the Quick Start section. Here we walk through more arguments for the `plot` function. The `xvar` argument allows users to decide what is plotted on the `x`-axis. `xvar` allows three measures: "norm" for the $\ell_1$-norm of the coefficients (default), "lambda" for the log-lambda value and "dev" for %deviance explained. Users can also label the curves with the variable index numbers simply by setting `label = TRUE`. + +For example, let's plot `fit` against the log-lambda value and with each curve labeled: +```{r} +plot(fit, xvar = "lambda", label = TRUE) +``` + +Now when we plot against %deviance we get a very different +picture. This is percent deviance explained on the training data, and +is a measure of complexity of the model. We see that toward the end of the path, %deviance is not changing much but the coefficients are "blowing up" a bit. This enables us to focus attention on the parts of the fit that matter. This will especially be true for other models, such as logistic regression. +```{r} +plot(fit, xvar = "dev", label = TRUE) +``` -Users can customize K-fold cross-validation. In addition to all the `glmnet` parameters, `cv.glmnet` has its special parameters including `nfolds` (the number of folds), `foldid` (user-supplied folds), `type.measure`(the loss used for cross-validation): -* "deviance" or "mse" uses squared loss -* "mae" uses mean absolute error +### Cross-validation + +K-fold cross-validation can be performed using the `cv.glmnet` function. In addition to all the `glmnet` parameters, `cv.glmnet` has its special parameters including `nfolds` (the number of folds), `foldid` (user-supplied folds), and `type.measure` (the loss used for cross-validation): + +* "deviance" or "mse" for squared loss, and + +* "mae" for mean absolute error. As an example, ```{r} -cvfit = cv.glmnet(x, y, type.measure = "mse", nfolds = 20) +cvfit <- cv.glmnet(x, y, type.measure = "mse", nfolds = 20) ``` +does 20-fold cross-validation based on the mean squared error criterion (the default for "gaussian" family). Printing the resulting object gives some basic information on the cross-validation performed: +```{r} +print(cvfit) +``` -does 20-fold cross-validation, based on mean squared error criterion (default though). -Parallel computing is also supported by `cv.glmnet`. To make it work, users must register parallel beforehand. We give a simple example of comparison here. Unfortunately, the package `doMC` is not available on Windows platforms (it is on others), so we cannot run the code here, but we make it looks as if we have. +`cv.glmnet` also supports parallel computing. To make it work, users must register a parallel backend beforehand. We give a simple example of comparison here. Unfortunately, the package `doMC` is not available on Windows platforms (it is on others), so we cannot run the code here, but we present timing information recorded during one of our test runs.
```{r, eval=FALSE}
-require(doMC)
-registerDoMC(cores=2)
-X = matrix(rnorm(1e4 * 200), 1e4, 200)
-Y = rnorm(1e4)
+library(doMC)
+registerDoMC(cores = 2)
+X <- matrix(rnorm(1e4 * 200), 1e4, 200)
+Y <- rnorm(1e4)
```

```{r, eval=FALSE}
@@ -323,211 +308,176 @@
As suggested from the above, parallel computing can significantly speed up the computation process especially for large-scale problems.

-Functions `coef` and `predict` on cv.glmnet object are similar to those for a `glmnet` object, except that two special strings are also supported by `s` (the values of $\lambda$ requested):
-* "lambda.1se": the largest $\lambda$ at which the MSE is within one standard error of the minimal MSE.
+The `coef` and `predict` methods for `cv.glmnet` objects are similar to those for a `glmnet` object, except that two special strings are also supported by `s` (the values of $\lambda$ requested):

-* "lambda.min": the $\lambda$ at which the minimal MSE is achieved.
+* "lambda.min": the $\lambda$ at which the smallest MSE is achieved.

-```{r}
+* "lambda.1se": the largest $\lambda$ at which the MSE is within one standard error of the smallest MSE (default).
+
+```{r out.lines = 10}
cvfit$lambda.min
-coef(cvfit, s = "lambda.min")
predict(cvfit, newx = x[1:5,], s = "lambda.min")
+coef(cvfit, s = "lambda.min")
```

-Users can control the folds used. Here we use the same folds so we can also select a value for $\alpha$.
-
+Users can explicitly control the fold that each observation is assigned to via the `foldid` argument. This is useful, for example, in using cross-validation to select a value for $\alpha$:
```{r}
-foldid=sample(1:10,size=length(y),replace=TRUE)
-cv1=cv.glmnet(x,y,foldid=foldid,alpha=1)
-cv.5=cv.glmnet(x,y,foldid=foldid,alpha=.5)
-cv0=cv.glmnet(x,y,foldid=foldid,alpha=0)
+foldid <- sample(1:10, size = length(y), replace = TRUE)
+cv1 <- cv.glmnet(x, y, foldid = foldid, alpha = 1)
+cv.5 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.5)
+cv0 <- cv.glmnet(x, y, foldid = foldid, alpha = 0)
```
+
There are no built-in plot functions to put them all on the same plot, so we are on our own here:
```{r}
-par(mfrow=c(2,2))
-plot(cv1);plot(cv.5);plot(cv0)
-plot(log(cv1$lambda),cv1$cvm,pch=19,col="red",xlab="log(Lambda)",ylab=cv1$name)
-points(log(cv.5$lambda),cv.5$cvm,pch=19,col="grey")
-points(log(cv0$lambda),cv0$cvm,pch=19,col="blue")
-legend("topleft",legend=c("alpha= 1","alpha= .5","alpha 0"),pch=19,col=c("red","grey","blue"))
+par(mfrow = c(2,2))
+plot(cv1); plot(cv.5); plot(cv0)
+plot(log(cv1$lambda) , cv1$cvm , pch = 19, col = "red",
+     xlab = "log(Lambda)", ylab = cv1$name)
+points(log(cv.5$lambda), cv.5$cvm, pch = 19, col = "grey")
+points(log(cv0$lambda) , cv0$cvm , pch = 19, col = "blue")
+legend("topleft", legend = c("alpha = 1", "alpha = 0.5", "alpha = 0"),
+       pch = 19, col = c("red","grey","blue"))
```

-We see that lasso (`alpha=1`) does about the best here. We also see that the range of lambdas used differs with alpha.
-
+We see that the lasso (`alpha=1`) does about the best here. We also see that the range of lambdas used differs with `alpha`.

-#### Coefficient upper and lower bounds
+### Other function arguments

-These are recently added features that enhance the scope of the models. Suppose we want to fit our model, but limit the coefficients to be bigger than -0.7 and less than 0.5.
This is easily achieved via the `upper.limits` and `lower.limits` arguments:
+In this section we briefly describe some other useful arguments when
+calling `glmnet`: `upper.limits`, `lower.limits`, `penalty.factor`, `exclude` and `intercept`.
+Suppose we want to fit our model but limit the coefficients to be bigger than -0.7 and less than 0.5. This can be achieved by specifying the `upper.limits` and `lower.limits` arguments:
```{r}
-tfit=glmnet(x,y,lower=-.7,upper=.5)
+tfit <- glmnet(x, y, lower.limits = -0.7, upper.limits = 0.5)
plot(tfit)
```
-These are rather arbitrary limits; often we want the coefficients to be positive, so we can set only `lower.limit` to be 0.
-(Note, the lower limit must be no bigger than zero, and the upper limit no smaller than zero.)
-These bounds can be a vector, with different values for each coefficient. If given as a scalar, the same number gets recycled for all.
-
-#### Penalty factors
+Often we want the coefficients to be positive: to do so, we just need to specify `lower.limits = 0`. (Note, the lower limit must be no bigger than zero, and the upper limit no smaller than zero.) These bounds can be a vector, with different values for each coefficient. If given as a scalar, the same number gets recycled for all.

-This argument allows users to apply separate penalty factors to each coefficient. Its default is 1 for each parameter, but other values can be specified. In particular, any variable with `penalty.factor` equal to zero is not penalized at all! Let $v_j$ denote the penalty factor for $j$ th variable. The penalty term becomes
+The `penalty.factor` argument allows users to apply separate penalty factors to each coefficient. This is very useful when we have prior knowledge or preference over the variables. Specifically, if $v_j$ denotes the penalty factor for the $j$th variable, the penalty term becomes
$$
\lambda \sum_{j=1}^p \boldsymbol{v_j} P_\alpha(\beta_j) = \lambda \sum_{j=1}^p \boldsymbol{v_j} \left[ (1-\alpha)\frac{1}{2} \beta_j^2 + \alpha |\beta_j| \right].
$$
-Note the penalty factors are internally rescaled to sum to nvars.
-This is very useful when people have prior knowledge or preference over the variables. In many cases, some variables may be so important that one wants to keep them all the time, which can be achieved by setting corresponding penalty factors to 0:
-
-```{r}
-p.fac = rep(1, 20)
-p.fac[c(5, 10, 15)] = 0
-pfit = glmnet(x, y, penalty.factor = p.fac)
+The default is 1 for each coefficient, i.e. coefficients are penalized
+equally. Note that any variable with `penalty.factor` equal to zero is
+not penalized at all! This is useful in the case where some variables
+are always to be included unpenalized in the model, such as the demographic
+variables sex and age in medical studies. Note the penalty factors are internally rescaled to sum to `nvars`, the number of variables in the given `x` matrix.
+
+Here is an example where we set the penalty factors for variables 1, 3 and 5 to be zero:
+```{r}
+p.fac <- rep(1, 20)
+p.fac[c(1, 3, 5)] <- 0
+pfit <- glmnet(x, y, penalty.factor = p.fac)
plot(pfit, label = TRUE)
```
-We see from the labels that the three variables with 0 penalty factors always stay in the model, while the others follow typical regularization paths and shrunken to 0 eventually.
+We see from the labels that the three variables with zero penalty factors always stay in the model, while the others follow typical regularization paths and are eventually shrunk to zero.

-Some other useful arguments.
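As a quick sanity check on the penalty factors (a hedged sketch of our own, reusing the `pfit` object above), the three unpenalized variables should have nonzero coefficients even at the largest lambda in the path:
```{r, eval=FALSE}
# coef() puts the intercept in row 1, so variable j sits in row j + 1
coef(pfit, s = max(pfit$lambda))[c(1, 3, 5) + 1, ]
```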
`exclude` allows one to block certain variables from being the model at all. Of course, one could simply subset these out of `x`, but sometimes `exclude` is more useful, since it returns a full vector of coefficients, just with the excluded ones set to zero. There is also an `intercept` argument which defaults to `TRUE`; if `FALSE` the intercept is forced to be zero.
+`exclude` allows one to block certain variables from being in the model at all. Of course, one could simply subset these out of `x`, but sometimes `exclude` is more useful, since it returns a full vector of coefficients, just with the excluded ones set to zero.

-#### Customizing plots
+The `intercept` argument allows the user to decide if an intercept should be included in the model or not (it is never penalized). The default is `intercept = TRUE`. If `intercept = FALSE` the intercept is forced to be zero.

-Sometimes, especially when the number of variables is small, we want to add variable labels to a plot. Since `glmnet` is intended primarily for wide data, this is not supprted in `plot.glmnet`. However, it is easy to do, as the following little toy example shows.
+## Linear Regression: `family = "mgaussian"` (multi-response)

-We first generate some data, with 10 variables, and for lack of imagination and ease we give them simple character names.
-We then fit a glmnet model, and make the standard plot.
-```{r}
-set.seed(101)
-x=matrix(rnorm(1000),100,10)
-y=rnorm(100)
-vn=paste("var",1:10)
-fit=glmnet(x,y)
-plot(fit)
-```
+The multi-response Gaussian family is useful when there are a number of (correlated) responses, also known as the "multi-task learning" problem. Here, a variable is either included in the model for all the responses, or excluded for all the responses. Most of the arguments for this family are the same as those for `family = "gaussian"`, so we focus on the differences with the single response model.

-We wish to label the curves with the variable names. Here s a simple way to do this, using the `axis` command in R (and a little research into how to customize it). We need to have the positions of the coefficients at the end of the path, and we need to make some space using the `par` command, so that our labels will fit in.
-This requires knowing how long your labels are, but here they are all quite short.
-
-```{r}
-par(mar=c(4.5,4.5,1,4))
-plot(fit)
-vnat=coef(fit)
-vnat=vnat[-1,ncol(vnat)] # remove the intercept, and get the coefficients at the end of the path
-axis(4, at=vnat,line=-.5,label=vn,las=1,tick=FALSE, cex.axis=0.5)
-```
-
-We have done nothing here to avoid overwriting of labels, in the event that they are close together. This would be a bit more work, but perhaps best left alone, anyway.
-
-
-### Multiresponse Gaussian Family
+As the name suggests, the response $y$ is not a vector but a matrix of quantitative responses. As a result, the coefficients at each value of lambda are also a matrix.

-The multiresponse Gaussian family is obtained using `family = "mgaussian"` option in `glmnet`. It is very similar to the single-response case above. This is useful when there are a number of (correlated) responses - the so-called "multi-task learning" problem. Here the sharing involves which variables are selected, since when a variable is selected, a coefficient is fit for each response. Most of the options are the same, so we focus here on the differences with the single response model.
-Obviously, as the name suggests, $y$ is not a vector, but a matrix of quantitative responses in this section.
The coefficients at each value of lambda are also a matrix as a result.
-Here we solve the following problem:
+`glmnet` solves the problem
$$
-\min_{(\beta_0, \beta) \in \mathbb{R}^{(p+1)\times K}}\frac{1}{2N} \sum_{i=1}^N ||y_i -\beta_0-\beta^T x_i||^2_F+\lambda \left[ (1-\alpha)||\beta||_F^2/2 + \alpha\sum_{j=1}^p||\beta_j||_2\right].
+\min_{(\beta_0, \beta) \in \mathbb{R}^{(p+1)\times K}}\frac{1}{2N} \sum_{i=1}^N \|y_i -\beta_0-\beta^T x_i\|^2_F+\lambda \left[ (1-\alpha)\|\beta\|_F^2/2 + \alpha\sum_{j=1}^p\|\beta_j\|_2\right].
$$
-Here $\beta_j$ is the jth row of the $p\times K$ coefficient matrix $\beta$, and we replace the absolute penalty on each single coefficient by a group-lasso penalty on each coefficient K-vector $\beta_j$ for a single predictor $x_j$.
+Here $\beta_j$ is the $j$th row of the $p\times K$ coefficient matrix
+$\beta$, and we replace the absolute penalty on each single
+coefficient by a group-lasso penalty on each coefficient $K$-vector
+$\beta_j$ for a single predictor (i.e. column of the `x` matrix). The
+group lasso penalty behaves like the lasso, but on the whole group of
+coefficients for each response: they are either all zero, or else none
+are zero, but are shrunk by an amount depending on $\lambda$.

-We use a set of data generated beforehand for illustration.
+We use a set of data generated beforehand for illustration. We fit a regularized multi-response Gaussian model to the data, with an object `mfit` returned.
```{r}
data(MultiGaussianExample)
+mfit <- glmnet(x, y, family = "mgaussian")
```
-We fit the data, with an object "mfit" returned.
-```{r}
-mfit = glmnet(x, y, family = "mgaussian")
-```
-For multiresponse Gaussian, the options in `glmnet` are almost the same as the single-response case, such as `alpha`, `weights`, `nlambda`, `standardize`. A exception to be noticed is that `standardize.response` is only for `mgaussian` family. The default value is `FALSE`. If `standardize.response = TRUE`, it standardizes the response variables.
+The `standardize.response` argument is only for the `mgaussian` family. If `standardize.response = TRUE`, the response variables are standardized (default is `FALSE`).

-To visualize the coefficients, we use the `plot` function.
+As before, we can use the `plot` method to visualize the coefficients:
```{r}
plot(mfit, xvar = "lambda", label = TRUE, type.coef = "2norm")
```
-Note that we set `type.coef = "2norm"`. Under this setting, a single curve is plotted per variable, with value equal to the $\ell_2$ norm. The default setting is `type.coef = "coef"`, where a coefficient plot is created for each response (multiple figures).
+Note that we set `type.coef = "2norm"`. Under this setting, a single curve is plotted per variable, with value equal to the $\ell_2$ norm of the variable's coefficient vector. The default setting is `type.coef = "coef"`, where a coefficient plot is created for each response (multiple figures). `xvar` and `label` are two other arguments which have the same functionality as in the single-response case.

-`xvar` and `label` are two other options besides ordinary graphical parameters. They are the same as the single-response case.
-
-We can extract the coefficients at requested values of $\lambda$ by using the function `coef` and make predictions by `predict`. The usage is similar and we only provide an example of `predict` here.
+We can extract the coefficients and make predictions at requested
+values of $\lambda$ by using the `coef` and `predict` methods
+respectively, as before.
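Before the `predict` example, a brief sketch of the output format (our own illustration, assuming the `mfit` object above): for multi-response fits, `coef` returns a list with one coefficient matrix per response.
```{r, eval=FALSE}
cf <- coef(mfit, s = 0.1)
names(cf)     # one entry per column of the response matrix y
dim(cf[[1]])  # (nvars + 1) x 1: an intercept plus one coefficient per variable
```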
Here is an example of a `predict` call:
```{r}
predict(mfit, newx = x[1:5,], s = c(0.1, 0.01))
```
-The prediction result is saved in a three-dimensional array with the first two dimensions being the prediction matrix for each response variable and the third indicating the response variables.
+The prediction result is saved in a three-dimensional array with the
+first two dimensions being the prediction matrix for each response
+variable and the third corresponding to the response variables.

-We can also do k-fold cross-validation. The options are almost the same as the ordinary Gaussian family and we do not expand here.
-```{r}
-cvmfit = cv.glmnet(x, y, family = "mgaussian")
-```
-We plot the resulting `cv.glmnet` object "cvmfit".
-```{r}
-plot(cvmfit)
-```
+## Logistic Regression: `family = "binomial"`

-To show explicitly the selected optimal values of $\lambda$, type
-```{r}
-cvmfit$lambda.min
-cvmfit$lambda.1se
-```
-As before, the first one is the value at which the minimal mean squared error is achieved and the second is for the most regularized model whose mean squared error is within one standard error of the minimal.
+Logistic regression is a widely-used model when the response is binary. Suppose the response variable takes values in $\mathcal{G}=\{1,2\}$. Denote $y_i = I(g_i=1)$. We model

-Prediction for `cv.glmnet` object works almost the same as for `glmnet` object. We omit the details here.
-
-## Logistic Regression
-
-Logistic regression is another widely-used model when the response is categorical. If there are two possible outcomes, we use the binomial distribution, else we use the multinomial.
+$$\mbox{Pr}(G=2|X=x)=\frac{e^{\beta_0+\beta^Tx}}{1+e^{\beta_0+\beta^Tx}},$$

-### Binomial Models
+which can be written in the following form:

-For the binomial model, suppose the response variable takes value in $\mathcal{G}=\{1,2\}$. Denote $y_i = I(g_i=1)$. We model
-$$\mbox{Pr}(G=2|X=x)=\frac{e^{\beta_0+\beta^Tx}}{1+e^{\beta_0+\beta^Tx}},$$
-which can be written in the following form
$$\log\frac{\mbox{Pr}(G=2|X=x)}{\mbox{Pr}(G=1|X=x)}=\beta_0+\beta^Tx,$$
the so-called "logistic" or log-odds transformation.

-The objective function for the penalized logistic regression uses the negative binomial log-likelihood, and is
+The objective function for logistic regression is the penalized negative binomial log-likelihood:
$$
-\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}} -\left[\frac{1}{N} \sum_{i=1}^N y_i \cdot (\beta_0 + x_i^T \beta) - \log (1+e^{(\beta_0+x_i^T \beta)})\right] + \lambda \big[ (1-\alpha)||\beta||_2^2/2 + \alpha||\beta||_1\big].
+\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}} -\left[\frac{1}{N} \sum_{i=1}^N y_i \cdot (\beta_0 + x_i^T \beta) - \log (1+e^{(\beta_0+x_i^T \beta)})\right] + \lambda \big[ (1-\alpha)\|\beta\|_2^2/2 + \alpha\|\beta\|_1\big].
$$
-Logistic regression is often plagued with degeneracies when $p > N$ and exhibits wild behavior even when $N$ is close to $p$;
-the elastic-net penalty alleviates these issues, and regularizes and selects variables as well.
-
-Our algorithm uses a quadratic approximation to the log-likelihood, and then coordinate descent on the resulting penalized weighted least-squares problem. These constitute an outer and inner loop.
-
+Logistic regression is often plagued with degeneracies when $p > N$ and exhibits wild behavior even when $N$ is close to $p$; the elastic net penalty alleviates these issues, and regularizes and selects variables as well.
-For illustration purpose, we load pre-generated input matrix `x` and the response vector `y` from the data file.
+We use a "proximal Newton" algorithm for optimizing this
+criterion. This makes repeated use of a quadratic approximation to the
+log-likelihood, and then weighted coordinate descent on the resulting
+penalized weighted least-squares problem. These constitute an outer
+and inner loop, also known as iteratively reweighted penalized least squares.
+
+For illustration purposes, we load the pre-generated input matrix `x`
+and the response vector `y` from the data file. The input matrix $x$
+is the same as for other families. For binomial logistic regression,
+the response variable $y$ should be either a binary vector, a factor
+with two levels, or a two-column matrix of counts or proportions. The
+latter is useful for grouped binomial data, or in applications where
+we have "soft" class membership, such as occurs in the EM algorithm.
```{r}
data(BinomialExample)
```
-The input matrix $x$ is the same as other families. For binomial logistic regression, the response variable $y$ should be either a factor with two levels, or a two-column matrix of counts or proportions.
-Other optional arguments of `glmnet` for binomial regression are almost same as those for Gaussian family. Don't forget to set `family` option to "binomial".
+Other optional arguments of `glmnet` for binomial regression are almost the same as those for the Gaussian family. Don't forget to set the `family` option to "binomial":
```{r}
-fit = glmnet(x, y, family = "binomial")
-```
-Like before, we can print and plot the fitted object, extract the coefficients at specific $\lambda$'s and also make predictions. For plotting, the optional arguments such as `xvar` and `label` are similar to the Gaussian. We plot against the deviance explained and show the labels.
-```{r}
-plot(fit, xvar = "dev", label = TRUE)
+fit <- glmnet(x, y, family = "binomial")
```
-Prediction is a little different for logistic from Gaussian, mainly in the option `type`. "link" and "response" are never equivalent and "class" is only available for logistic regression. In summary,
-* "link" gives the linear predictors
+As before, we can print and plot the fitted object, extract the coefficients at specific $\lambda$'s and also make predictions. For plotting, the optional arguments such as `xvar` and `label` work in the same way as for `family = "gaussian"`. Prediction is a little different for `family = "binomial"`, mainly in the function argument `type`:

-* "response" gives the fitted probabilities
+* "link" gives the linear predictors.

-* "class" produces the class label corresponding to the maximum probability.
+* "response" gives the fitted probabilities.

-* "coefficients" computes the coefficients at values of `s`
+* "class" produces the class label corresponding to the maximum probability.

-* "nonzero" retuns a list of the indices of the nonzero coefficients for each value of `s`.
+As with `family = "gaussian"`, "coefficients" computes the coefficients at values of `s` and "nonzero" returns a list of the indices of the nonzero coefficients for each value of `s`. Note that the results ("link", "response", "coefficients", "nonzero") are returned only for the class corresponding to the second level of the factor response.
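To make the link/response distinction concrete, here is a minimal sketch (assuming the binomial `fit` object just created): the "response" predictions are simply the inverse logit of the "link" predictions.
```{r, eval=FALSE}
eta  <- predict(fit, newx = x[1:5,], s = 0.05, type = "link")
phat <- predict(fit, newx = x[1:5,], s = 0.05, type = "response")
all.equal(as.numeric(phat), as.numeric(plogis(eta)))  # should be TRUE
```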
In the following example, we make predictions of the class labels at $\lambda = 0.05, 0.01$.
```{r}
predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
```
-For logistic regression, `cv.glmnet` has similar arguments and usage as Gaussian. `nfolds`, `weights`, `lambda`, `parallel` are all available to users. There are some differences in `type.measure`: "deviance" and "mse" do not both mean squared loss and "class" is enabled. Hence,
+
+For logistic regression, `cv.glmnet` has similar arguments and usage as in the Gaussian case. `nfolds`, `weights`, `lambda`, `parallel` are all available to users. There are some differences in `type.measure`: "deviance" and "mse" no longer both refer to squared loss. Rather,
+
* "mse" uses squared loss.

* "deviance" uses actual deviance.
@@ -538,256 +488,260 @@
* "auc" (for two-class logistic regression ONLY) gives area under the ROC curve.

-For example,
+For example, the code below uses misclassification error as the criterion for 10-fold cross-validation:
```{r}
-cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")
+cvfit <- cv.glmnet(x, y, family = "binomial", type.measure = "class")
```
-It uses misclassification error as the criterion for 10-fold cross-validation.
-We plot the object and show the optimal values of $\lambda$.
+As before, we can plot the object and show the optimal values of $\lambda$.
```{r}
plot(cvfit)
-```
-```{r}
cvfit$lambda.min
cvfit$lambda.1se
```
-`coef` and `predict` are simliar to the Gaussian case and we omit the details. We review by some examples.
-```{r}
-coef(cvfit, s = "lambda.min")
-```
-As mentioned previously, the results returned here are only for the second level of the factor response.
+`coef` and `predict` for the `cv.glmnet` object for `family = "binomial"` are similar to the Gaussian case and we omit the details.

-```{r}
-predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")
-```
-
-Like other GLMs, glmnet allows for an "offset". This is a fixed vector of N numbers that is added into the linear predictor.
-For example, you may have fitted some other logistic regression using other variables (and data), and now you want to see if the present variables can add anything. So you use the predicted logit from the other model as an offset in.
+Like other generalized linear models, `glmnet` allows for an
+"offset". This is a fixed vector of $N$ numbers that is added into the
+linear predictor. For example, you may have fitted some other logistic
+regression using other variables (and data), and now you want to see
+if the present variables can add further predictive power. To do this,
+you can use the predicted logit from the other model as an offset in
+the `glmnet` call. Offsets are also useful in Poisson models, which we
+discuss later.

-### Multinomial Models
+## Multinomial Regression: `family = "multinomial"`

-For the multinomial model, suppose the response variable has $K$ levels ${\cal G}=\{1,2,\ldots,K\}$. Here we model
+The multinomial model extends the binomial when the number of classes
+is more than two. Suppose the response variable has $K$ levels ${\cal G}=\{1,2,\ldots,K\}$. Here we model
$$\mbox{Pr}(G=k|X=x)=\frac{e^{\beta_{0k}+\beta_k^Tx}}{\sum_{\ell=1}^Ke^{\beta_{0\ell}+\beta_\ell^Tx}}.$$
+There is a linear predictor for each class!

-Let ${Y}$ be the $N \times K$ indicator response matrix, with elements $y_{i\ell} = I(g_i=\ell)$.
Then the elastic-net penalized negative log-likelihood function becomes
+Let ${Y}$ be the $N \times K$ indicator response matrix, with elements $y_{i\ell} = I(g_i=\ell)$. Then the elastic net penalized negative log-likelihood function becomes
$$
-\ell(\{\beta_{0k},\beta_{k}\}_1^K) = -\left[\frac{1}{N} \sum_{i=1}^N \Big(\sum_{k=1}^Ky_{il} (\beta_{0k} + x_i^T \beta_k)- \log \big(\sum_{k=1}^K e^{\beta_{0k}+x_i^T \beta_k}\big)\Big)\right] +\lambda \left[ (1-\alpha)||\beta||_F^2/2 + \alpha\sum_{j=1}^p||\beta_j||_q\right].
+\ell(\{\beta_{0k},\beta_{k}\}_1^K) = -\left[\frac{1}{N} \sum_{i=1}^N \Big(\sum_{k=1}^K y_{ik} (\beta_{0k} + x_i^T \beta_k)- \log \big(\sum_{\ell=1}^K e^{\beta_{0\ell}+x_i^T \beta_\ell}\big)\Big)\right] +\lambda \left[ (1-\alpha)\|\beta\|_F^2/2 + \alpha\sum_{j=1}^p\|\beta_j\|_q\right].
$$
-Here we really abuse notation! $\beta$ is a $p\times K$ matrix of coefficients. $\beta_k$ refers to the kth column (for outcome category k), and $\beta_j$ the jth row (vector of K coefficients for variable j).
-The last penalty term is $||\beta_j||_q$, we have two options for q: $q\in \{1,2\}$.
-When q=1, this is a lasso penalty on each of the parameters. When q=2, this is a grouped-lasso penalty on all the K coefficients for a particular variables, which makes them all be zero or nonzero together.
+Here we really abuse notation! $\beta$ is a $p\times K$ matrix of coefficients. $\beta_k$ refers to the $k$th column (for outcome category $k$), and $\beta_j$ the $j$th row (vector of $K$ coefficients for variable $j$). The last penalty term is $\|\beta_j\|_q$. We support two options for $q$: $q\in \{1,2\}$. When $q=1$, this is a lasso penalty on each of the parameters. When $q=2$, this is a grouped-lasso penalty on all the $K$ coefficients for a particular variable, which makes them all be zero or nonzero together.
+The standard Newton algorithm can be tedious here. Instead, for $q=1$ we use a so-called partial Newton algorithm by making a partial quadratic approximation to the log-likelihood, allowing only $(\beta_{0k}, \beta_k)$ to vary for a single class at a time. For each value of $\lambda$, we first cycle over all classes indexed by $k$, computing each time a partial quadratic approximation about the parameters of the current class. Then, the inner procedure is almost the same as for the binomial case. When $q=2$, we use a different approach that we will not explain here.

-The standard Newton algorithm can be tedious here. Instead, we use a so-called partial Newton algorithm by making a partial quadratic approximation to the log-likelihood, allowing only $(\beta_{0k}, \beta_k)$ to vary for a single class at a time.
-For each value of $\lambda$, we first cycle over all classes indexed by $k$, computing each time a partial quadratic approximation about the parameters of the current class. Then the inner procedure is almost the same as for the binomial case.
-This is the case for lasso (q=1). When q=2, we use a different approach, which we wont dwell on here.
-
-
-For the multinomial case, the usage is similar to logistic regression, and we mainly illustrate by examples and address any differences. We load a set of generated data.
+For the `family = "multinomial"` case, usage is similar to that for `family = "binomial"`. In this section we describe the differences. First, we load a set of generated data:
```{r}
data(MultinomialExample)
```
-The optional arguments in `glmnet` for multinomial logistic regression are mostly similar to binomial regression except for a few cases.
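As a brief sketch of the output format for this family (our own illustration, assuming the data just loaded; `fit0` is a throwaway name of ours): a multinomial fit returns one coefficient matrix per class.
```{r, eval=FALSE}
fit0 <- glmnet(x, y, family = "multinomial")
cf <- coef(fit0, s = 0.01)
length(cf)  # one coefficient matrix per class of y
```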
-
-The response variable can be a `nc >= 2` level factor, or a `nc`-column matrix of counts or proportions.
-Internally glmnet will make the rows of this matrix sum to 1, and absorb the total mass into the weight for that observation.
-`offset` should be a `nobs x nc` matrix if there is one.
+The response variable can be a `nc >= 2` level factor, or an `nc`-column matrix of counts or proportions. Internally glmnet will make the rows of this matrix sum to 1, and absorb the total mass into the weight for that observation. `offset` should be a `nobs x nc` matrix if one is provided.

-A special option for multinomial regression is `type.multinomial`, which allows the usage of a grouped lasso penalty if `type.multinomial = "grouped"`. This will ensure that the multinomial coefficients for a variable are all in or out together, just like for the multi-response Gaussian.
-
-```{r}
-fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
-```
-
-We plot the resulting object "fit".
+A special option for multinomial regression is `type.multinomial`, which allows the usage of a grouped lasso penalty ($q = 2$) if `type.multinomial = "grouped"`. The default is `type.multinomial = "ungrouped"` ($q = 1$).
```{r}
+fit <- glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm")
```
-The options are `xvar`, `label` and `type.coef`, in addition to other ordinary graphical parameters.
-
-`xvar` and `label` are the same as other families while `type.coef` is only for multinomial regression and multiresponse Gaussian model. It can produce a figure of coefficients for each response variable if `type.coef = "coef"` or a figure showing the $\ell_2$-norm in one figure if `type.coef = "2norm"`
+For the `plot` method, the function arguments are `xvar`, `label` and `type.coef`, in addition to other ordinary graphical parameters. `xvar` and `label` are the same as for other families, while `type.coef` is only for multinomial regression and the multi-response Gaussian model. It can produce a figure of coefficients for each response variable if `type.coef = "coef"` or a figure showing the $\ell_2$-norm in one figure if `type.coef = "2norm"`.

-We can also do cross-validation and plot the returned object.
+We can also do cross-validation and plot the returned object. Note
+that although `type.multinomial` is not a named argument in
+`cv.glmnet`, any argument that can be passed to `glmnet` is
+valid in the argument list of `cv.glmnet`. Such arguments are passed
+via the `...` argument directly to the calls to `glmnet` inside the
+`cv.glmnet` function.
```{r}
-cvfit=cv.glmnet(x, y, family="multinomial", type.multinomial = "grouped", parallel = TRUE)
+cvfit <- cv.glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
plot(cvfit)
```
-Note that although `type.multinomial` is not a typical argument in `cv.glmnet`, in fact any argument that can be passed to `glmnet` is valid in the argument list of `cv.glmnet`. We also use parallel computing to accelerate the calculation.
-
Users may wish to predict at the optimally selected $\lambda$:
```{r}
predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")
```

-## Poisson Models
+## Poisson Regression: `family = "poisson"`

-Poisson regression is used to model count data under the assumption of Poisson error, or otherwise non-negative data where the mean and variance are proportional.
Like the Gaussian and binomial model, the Poisson is a member of the exponential family of distributions. We usually model its positive mean on the log scale: $\log \mu(x) = \beta_0+\beta' x$.
-The log-likelihood for observations $\{x_i,y_i\}_1^N$ is given my
+Poisson regression is used to model count data under the assumption of Poisson error, or otherwise non-negative data where the mean and variance are proportional. Like the Gaussian and binomial models, the Poisson distribution is a member of the exponential family of distributions. We usually model its positive mean on the log scale: $\log \mu(x) = \beta_0+\beta' x$.
+
+The log-likelihood for observations $\{x_i,y_i\}_1^N$ is given by
$$
-l(\beta|X, Y) = \sum_{i=1}^N \left(y_i (\beta_0+\beta' x_i) - e^{\beta_0+\beta^Tx_i}\right).
+l(\beta|X, Y) = \sum_{i=1}^N \left(y_i (\beta_0+\beta^T x_i) - e^{\beta_0+\beta^Tx_i}\right).
$$
-As before, we optimize the penalized log-lielihood:
- $$
-\min_{\beta_0,\beta} -\frac1N l(\beta|X, Y) + \lambda \left((1-\alpha) \sum_{i=1}^N \beta_i^2/2) +\alpha \sum_{i=1}^N |\beta_i|\right).
+As before, we optimize the penalized log-likelihood:
+$$
+\min_{\beta_0,\beta} -\frac1N l(\beta|X, Y) + \lambda \left((1-\alpha) \sum_{j=1}^p \beta_j^2/2 +\alpha \sum_{j=1}^p |\beta_j|\right).
$$
-Glmnet uses an outer Newton loop, and an inner weighted least-squares loop (as in logistic regression) to optimize this criterion.
-
+`glmnet` uses an outer Newton loop and an inner weighted least-squares loop (as in logistic regression) to optimize this criterion.

-First, we load a pre-generated set of Poisson data.
+First, we load a pre-generated set of Poisson data:
```{r}
data(PoissonExample)
```
-We apply the function `glmnet` with the `"poisson"` option.
+We apply the function `glmnet` with `family = "poisson"`:
```{r}
-fit = glmnet(x, y, family = "poisson")
+fit <- glmnet(x, y, family = "poisson")
```
-The optional input arguments of `glmnet` for `"poisson"` family are similar to those for others.
+The optional input arguments of `glmnet` for the `"poisson"` family are similar to those for other families.

-`offset` is a useful argument particularly in Poisson models.
-
-When dealing with rate data in Poisson models, the counts collected are often based on different exposures, such as length of time observed, area and years. A poisson rate $\mu(x)$ is relative to a unit exposure time, so if an observation $y_i$ was exposed for $E_i$ units of time, then the expected count would be $E_i\mu(x)$, and the log mean would be $\log(E_i)+\log(\mu(x))$. In a case like this, we would supply an *offset* $\log(E_i)$ for each observation.
-Hence `offset` is a vector of length `nobs` that is included in the linear predictor. Other families can also use options, typically for different reasons.
-
-(Warning: if `offset` is supplied in `glmnet`, offsets must also also be supplied to `predict` to make reasonable predictions.)
+`offset` is a particularly useful argument for Poisson models. When dealing with rate data in Poisson models, the counts collected are often based on different exposures such as length of time observed, area and years. A Poisson rate $\mu(x)$ is relative to a unit exposure time, so if an observation $y_i$ was exposed for $E_i$ units of time, then the expected count would be $E_i\mu(x)$, and the log mean would be $\log(E_i)+\log(\mu(x))$. In a case like this, we would supply an *offset* $\log(E_i)$ for each observation. Hence `offset` is a vector of length $N$ that is included in the linear predictor.
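Here is a hedged sketch of the exposure idea (the exposure vector `E` below is invented purely for illustration); note that predictions then need the offset too, via `newoffset`, as the warning below makes clear:
```{r, eval=FALSE}
E <- runif(length(y), 1, 10)  # hypothetical exposure for each observation
fitE <- glmnet(x, y, family = "poisson", offset = log(E))
predict(fitE, newx = x[1:5,], newoffset = log(E[1:5]),
        s = 0.1, type = "response")
```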
(_Warning_: if `offset` is supplied in `glmnet`, offsets must also be supplied to `predict` via the `newoffset` argument to make reasonable predictions.)

Again, we plot the coefficients to have a first sense of the result.
```{r}
plot(fit)
```
-Like before, we can extract the coefficients and make predictions at certain $\lambda$'s by using `coef` and `predict` respectively. The optional input arguments are similar to those for other families. In function `predict`, the option `type`, which is the type of prediction required, has its own specialties for Poisson family. That is,
-* "link" (default) gives the linear predictors like others
-* "response" gives the fitted mean
-* "coefficients" computes the coefficients at the requested values for `s`, which can also be realized by `coef` function
-* "nonzero" returns a a list of the indices of the nonzero coefficients for each value of `s`.
-
-For example, we can do as follows.
-```{r}
+As before, we can extract the coefficients and make predictions at certain $\lambda$'s using `coef` and `predict` respectively. The optional input arguments are similar to those for other families. For the `predict` method, the argument `type` has the same meaning as that for `family = "binomial"`, except that "response" gives the fitted mean (rather than fitted probabilities in the binomial case). For example, we can do the following:
+```{r out.lines = 7}
coef(fit, s = 1)
predict(fit, newx = x[1:5,], type = "response", s = c(0.1,1))
```

We may also use cross-validation to find the optimal $\lambda$'s and thus make inferences.
```{r}
-cvfit = cv.glmnet(x, y, family = "poisson")
+cvfit <- cv.glmnet(x, y, family = "poisson")
```
-Options are almost the same as the Gaussian family except that for `type.measure`,
-* "deviance" (default) gives the deviance
-* "mse" stands for mean squared error
-* "mae" is for mean absolute error.
+Options are almost the same as for the Gaussian family except that for `type.measure`:

-We can plot the `cv.glmnet` object.
-```{r}
-plot(cvfit)
-```
+* "deviance" (default) gives the deviance.
+* "mse" is for mean squared error.
+* "mae" is for mean absolute error.

-We can also show the optimal $\lambda$'s and the corresponding coefficients.
-```{r}
-opt.lam = c(cvfit$lambda.min, cvfit$lambda.1se)
-coef(cvfit, s = opt.lam)
-```
-The `predict` method is similar and we do not repeat it here.
+## Cox Regression: `family = "cox"`

-## Cox Models
+The Cox proportional hazards model is commonly used for the study of
+the relationship between predictor variables and survival time. We
+have another vignette (["Regularized Cox
+Regression"](https://glmnet.stanford.edu/articles/Coxnet.html))
+dedicated solely to fitting regularized Cox models with the `glmnet`
+package; please consult that vignette for details.

-The Cox proportional hazards model is commonly used for the study of the relationship beteween predictor variables and survival time. In the usual survival analysis framework, we have data of the form $(y_1, x_1, \delta_1), \ldots, (y_n, x_n, \delta_n)$ where $y_i$, the observed time, is a time of failure if $\delta_i$ is 1 or right-censoring if $\delta_i$ is 0. We also let $t_1 < t_2 < \ldots < t_m$ be the increasing list of unique failure times, and $j(i)$ denote the index of the observation failing at time $t_i$.
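For completeness, a minimal sketch of a regularized Cox fit (using the `CoxExample` data shipped with the package; see the Coxnet vignette for a proper treatment):
```{r, eval=FALSE}
data(CoxExample)
coxfit <- glmnet(x, y, family = "cox")  # y has columns "time" and "status"
plot(coxfit)
```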
+## Programmable GLM families: `family = family()`

-The Cox model assumes a semi-parametric form for the hazard
-$$
-h_i(t) = h_0(t) e^{x_i^T \beta},
-$$
-where $h_i(t)$ is the hazard for patient $i$ at time $t$, $h_0(t)$ is a shared baseline hazard, and $\beta$ is a fixed, length $p$ vector. In the classic setting $n \geq p$, inference is made via the partial likelihood
-$$
-L(\beta) = \prod_{i=1}^m \frac{e^{x_{j(i)}^T \beta}}{\sum_{j \in R_i} e^{x_j^T \beta}},
-$$
-where $R_i$ is the set of indices $j$ with $y_j \geq t_i$ (those at risk at time $t_i$).
+Since version 4.0, `glmnet` has the facility to fit any GLM family by
+specifying a `family` object, as used by `stats::glm`. For these more
+general families, the outer Newton loop is performed in R, while the
+inner elastic-net loop is performed in Fortran, for each value of
+lambda. The price for this generality is a small hit in speed.
+For details, see the vignette
+["GLM `family` functions in `glmnet`"](https://glmnet.stanford.edu/articles/glmnetFamily.html).

-Note there is no intercept in the Cox mode (its built into the baseline hazard, and like it, would cancel in the partial likelihood.)
+## Assessing models on test data

-We penalize the negative log of the partial likelihood, just like the other models, with an elastic-net penalty.
+Once we have fit a series of models using `glmnet`, we often assess their performance on a set of evaluation or test data. We usually go through the process of building a prediction matrix, deciding on the performance measure, and computing these measures for a series of values for `lambda` (and `gamma` for relaxed fits). `glmnet` provides three functions (`assess.glmnet`, `roc.glmnet` and `confusion.glmnet`) that make these tasks easier.

-We use a pre-generated set of sample data and response. Users can load their own data and follow a similar procedure. In this case $x$ must be an $n\times p$ matrix of covariate values - each row corresponds to a patient and each column a covariate. $y$ is an $n \times 2$ matrix, with a column "time" of failure/censoring times, and "status" a 0/1 indicator, with 1 meaning the time is a failure time, and zero a censoring time.
+### Performance measures

+The function `assess.glmnet` computes the same performance measures produced by `cv.glmnet`, but on a validation or test dataset.
```{r}
-data(CoxExample)
-y[1:5,]
+data(BinomialExample)
+itrain <- 1:70
+fit <- glmnet(x[itrain, ], y[itrain], family = "binomial", nlambda = 5)
+assess.glmnet(fit, newx = x[-itrain, ], newy = y[-itrain])
```
-The `Surv` function in the package `survival` can create such a matrix. Note, however, that the `coxph` and related linear models can handle interval and other forms of censoring, while glmnet can only handle right censoring in its present form.
-We apply the `glmnet` function to compute the solution path under default settings.
-```{r}
-fit = glmnet(x, y, family = "cox")
+This produces a list with *all* the measures suitable for a binomial model, computed for the entire sequence of lambdas in the fit object. Here the function identifies the model family from the `fit` object.
+
+A second use case builds the prediction matrix before calling `assess.glmnet`:
+```{r, eval=FALSE}
+pred <- predict(fit, newx = x[-itrain, ])
+assess.glmnet(pred, newy = y[-itrain], family = "binomial")
```
-All the standard options are available such as `alpha`, `weights`, `nlambda` and `standardize`. Their usage is similar as in the Gaussian case and we omit the details here.
Users can also refer to the help file `help(glmnet)`.
-We can plot the coefficients.
+Here we have to provide the `family` as an argument; the results (not shown) are the same. Users can see the various measures suitable for each family via
```{r}
-plot(fit)
+glmnet.measures()
```
-As before, we can extract the coefficients at certain values of $\lambda$.
-```{r}
-coef(fit, s = 0.05)
+`assess.glmnet` can also take the result of `cv.glmnet` as input. In this case the predictions are made at the optimal values for the parameter(s).
+```{r out.lines = 11}
+cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "binomial", nlambda = 30)
+assess.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain])
```
-Since the Cox Model is not commonly used for prediction, we do not give an illustrative example on prediction. If needed, users can refer to the help file by typing `help(predict.glmnet)`.
+This uses the default value of `s = "lambda.1se"`, just like `predict` would have done. Users can provide additional arguments that get passed on to `predict`. For example, the code below shows the performance measures for `s = "lambda.min"`:
+```{r out.lines = 11}
+assess.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain], s = "lambda.min")
+```
+
+### Prevalidation

-Also, the function `cv.glmnet` can be used to compute $k$-fold cross-validation for the Cox model. The usage is similar to that for other families except for two main differences.
+One interesting use case for `assess.glmnet` is to get the results of cross-validation using other measures. By specifying `keep = TRUE` in the `cv.glmnet` call, a matrix of prevalidated predictions is stored in the returned output as the `fit.preval` component. We can then use this component in the call to `assess.glmnet`:
+```{r out.lines = 11}
+cfit <- cv.glmnet(x, y, family = "binomial", keep = TRUE, nlambda = 30)
+assess.glmnet(cfit$fit.preval, newy = y, family = "binomial")
+```

-One is that `type.measure` only supports "deviance"(also default), which gives the partial-likelihood.
+Users can verify that the first measure here, `deviance`, is identical to the component `cvm` on the `cfit` object.

-The other is in the option `grouped`. `grouped = TRUE` obtains the CV partial likelihood for the Kth fold by subtraction; by subtracting the log partial likelihood evaluated on the full dataset from that evaluated on the on the (K-1)/K dataset. This makes more efficient use of risk sets. With `grouped=FALSE` the log partial likelihood is computed only on the Kth fold, which is only reasonable if each fold has a large number of observations.
+### ROC curves for binomial data
+
+In the special case of binomial models, users often would like to see the ROC curve for validation or test data. Here the function `roc.glmnet` provides the goodies. Its first argument is as in `assess.glmnet`. Here we illustrate one use case, using the prevalidated CV fit.
```{r}
-cvfit = cv.glmnet(x, y, family = "cox")
+cfit <- cv.glmnet(x, y, family = "binomial", type.measure = "auc",
+                  keep = TRUE)
+rocs <- roc.glmnet(cfit$fit.preval, newy = y)
```
-Once fit, we can view the optimal $\lambda$ value and a cross validated error plot to help evaluate our model.
+
+`roc.glmnet` returns a list of cross-validated ROC data, one for each model along the path. The code below demonstrates how one can plot the output. The first line identifies the `lambda` value giving the best area under the curve (AUC). Then we plot all the ROC curves in grey and the "winner" in red.
```{r}
-plot(cvfit)
+best <- cfit$index["min",]
+plot(rocs[[best]], type = "l")
+invisible(sapply(rocs, lines, col = "grey"))
+lines(rocs[[best]], lwd = 2, col = "red")
```
-As previously, the left vertical line in our plot shows us where the CV-error curve hits its minimum. The right vertical line shows us the most regularized model with CV-error within 1 standard deviation of the minimum. We also extract such optimal $\lambda$'s.
+### Confusion matrices for classification
+
+For binomial and multinomial models, we often wish to examine the classification performance on new data. The function `confusion.glmnet` will do that for us.
```{r}
-cvfit$lambda.min
-cvfit$lambda.1se
+data(MultinomialExample)
+set.seed(101)
+itrain <- sample(1:500, 400, replace = FALSE)
+cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "multinomial")
+cnf <- confusion.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain])
```
-We can check the active covariates in our model and see their coefficients.
+
+`confusion.glmnet` produces a table of class "confusion.table" which inherits from class "table", and we also provide a `print` method for it.
```{r}
-coef.min = coef(cvfit, s = "lambda.min")
-active.min = which(coef.min != 0)
-index.min = coef.min[active.min]
+print(cnf)
```
+
+The first argument to `confusion.glmnet` should be a `glmnet` or `cv.glmnet` object (from which predictions can be made), or a matrix/array of predictions, such as the *kept* `"fit.preval"` component in the output of a `cv.glmnet` call with `keep = TRUE`. When a matrix/array of predictions is provided, we need to specify the `family` option, otherwise *confusion* can exist between "binomial" and "multinomial" prediction matrices.
+
+When predictions for more than one model in the path are provided, `confusion.glmnet` returns a list of confusion tables. For example, the prevalidated predictions from `cv.glmnet` are for the whole `lambda` path, and so we are returned a list of confusion tables. In the code below, we identify and print the one achieving the smallest classification error.
```{r}
-index.min
-coef.min
+cfit <- cv.glmnet(x, y, family = "multinomial", type.measure = "class", keep = TRUE)
+cnf <- confusion.glmnet(cfit$fit.preval, newy = y, family = "multinomial")
+best <- cfit$index["min",]
+print(cnf[[best]])
```

-## Sparse Matrices
+## Other Package Features

- Our package supports sparse input matrices, which allow efficient storage and operations of large matrices but with only a few nonzero entries. It is available for all families except for the `cox` family. The usage of sparse matrices (inherit from class `"sparseMatrix"` as in package `Matrix`) in `glmnet ` is the same as if a regular matrix is provided.
+In this section, we describe other features in the `glmnet` package that might be of interest to users.

-We load a set of sample data created beforehand.
+### Sparse matrix support
+
+Our package supports sparse input matrices, which allow the efficient storage and operation of large matrices having only a few nonzero entries. The usage of sparse matrices (inherits from class `"sparseMatrix"` as in the `Matrix` package) in `glmnet` is the same as if a regular matrix is provided.
+
+We load a set of sample data created beforehand. It loads `x`, a 100 x 20 sparse input matrix and `y`, the response vector.
```{r}
data(SparseExample)
```{r}
class(x)
```
-Users can create a sparse matrix with the function `sparseMatrix` by providing the locations and values of the nonzero entries. Alternatively, `Matrix` function can also be used to contruct a sparse matrix by setting `sparse = TRUE`, but this defeats the purpose somewhat.
+Users can create a sparse matrix with the function `sparseMatrix` by
+providing the locations and values of the nonzero
+entries. Alternatively, the `Matrix` function from the `Matrix`
+package can also be used to construct a sparse matrix by setting
+`sparse = TRUE`, but this defeats the purpose somewhat if the matrix
+is large.

We can fit the model the same way as before.
```{r}
-fit = glmnet(x, y)
+fit <- glmnet(x, y)
```
-We also do the cross-validation and plot the resulting object.
+
+We can also do the cross-validation and plot the resulting object.
```{r}
cvfit = cv.glmnet(x, y)
plot(cvfit)
@@ -797,50 +751,135 @@
Note that sparse matrices can also be used for `newx`, the new input matrix in the `predict` function. For example,
```{r}
-i = sample(1:5, size = 25, replace = TRUE)
-j = sample(1:20, size = 25, replace = TRUE)
-x = rnorm(25)
-nx = sparseMatrix(i = i, j = j, x = x, dims = c(5, 20))
+i <- sample(1:5, size = 25, replace = TRUE)
+j <- sample(1:20, size = 25, replace = TRUE)
+x <- rnorm(25)
+nx <- sparseMatrix(i = i, j = j, x = x, dims = c(5, 20))
predict(cvfit, newx = nx, s = "lambda.min")
```

+### Fitting big and/or sparse unpenalized generalized linear models
+
+The `glmnet` package includes a function `bigGlm` for fitting a single _unpenalized_ generalized linear model (GLM), but allowing all the options of `glmnet`. In other words, the user can set coefficient upper and/or lower bounds, and can provide the `x` matrix in sparse matrix format. This is not too much more than fitting a model with a single value of `lambda = 0` (with some protection from edge cases). `predict` and `print` methods can be called on the output.
+```{r}
+data(BinomialExample)
+fit <- bigGlm(x, y, family = "binomial", lower.limits = -1)
+print(fit)
+```
+
+### Creating `x` from mixed variables and/or missing data
+
+The `glmnet` package includes a function `makeX` that makes it easy to create the model matrix `x` needed as input to `glmnet`. It takes as input a data frame, which can contain vectors, matrices and factors. Some of the features are:
+
+* Factors are *one-hot* encoded to form indicator matrices.
+* Missing values in the resultant matrix can be replaced by the column means.
+* The `sparse` option returns a matrix in column-sparse format. This is useful if the data are large, and factors have many levels.
+* Two data frames can be provided, `train` and `test`. This ensures the factor levels correspond, and also imputes missing data in the test data from means in the training data.
+
+Our first example demonstrates how `makeX` works with factors:
+```{r}
+set.seed(101)
+X <- matrix(rnorm(5), nrow = 5)
+X2 <- sample(letters[1:3], 5, replace = TRUE)
+X3 <- sample(LETTERS[1:3], 5, replace = TRUE)
+df <- data.frame(X, X2, X3)
+makeX(df)
+```
+
+Include the option `sparse = TRUE` if a sparse output is desired:
+```{r}
+makeX(df, sparse = TRUE)
+```
+
+Next, let us add some missing values to our data matrix.
By default, `makeX` leaves `NA`s as is:
```{r}
+Xn <- X ; Xn[3,1] <- NA
+X2n <- X2; X2n[1] <- NA
+X3n <- X3; X3n[5] <- NA
+dfn <- data.frame(Xn, X2n, X3n)
+dfn
+makeX(dfn)
```

+We can impute the missing values with column means by passing the option `na.impute = TRUE`:
+```{r}
+makeX(dfn, na.impute = TRUE, sparse = TRUE)
+```
+
+Finally, if a test set is available as well, both the training and test sets can be passed to `makeX` at the same time so that all the levels of factors present in the training and test sets will be represented correctly in the output matrix. In the example below, the third column of the training set only contains "B" and "C" while that of the test set only contains "A" and "C". By passing both data sets to `makeX` at the same time, this third column is correctly expanded into 3 feature columns for both the training and test sets.
+```{r}
+set.seed(102)
+X <- matrix(rnorm(5), nrow = 5)
+X2 <- sample(letters[1:3], 5, replace = TRUE)
+X3 <- sample(LETTERS[1:3], 5, replace = TRUE)
+Xn <- X ; Xn[5,1] <- NA
+X2n <- X2; X2n[1] <- NA
+X3n <- X3; X3n[2] <- NA
+dftn <- data.frame(Xn, X2n, X3n)
+dftn
+makeX(dfn, dftn, sparse = TRUE)
```
+
+### Progress bar
+
+Ever run a job on a big dataset, and wonder how long it will take? `glmnet` and `cv.glmnet` come equipped with a progress bar, which can be displayed by passing `trace.it = TRUE` to these functions.
+```{r, eval=FALSE}
+fit <- glmnet(x, y, trace.it = TRUE)
+```

``##``

`` |================================== |65%``

This display changes in place as the fit is produced. The progress bar is also very helpful with `cv.glmnet`:
+```{r, eval=FALSE}
+fit <- cv.glmnet(x, y, trace.it = TRUE)
+```
``##``

`` Training``

`` |=============================================| 100%``

`` Fold: 1/10``

`` |=============================================| 100%``

`` Fold: 2/10``

`` |=============================================| 100%``

`` Fold: 3/10``

`` |============================= | 70%``

Tracing of the folds works a little differently when distributed computing is used.

+If the user wants `glmnet` and `cv.glmnet` to always print the progress bar, this can be achieved (for a session) via a call to `glmnet.control` with the `itrace` argument:
+```{r, eval=FALSE}
+glmnet.control(itrace = 1)
+```
+
+To reset it, one makes a similar call and sets `itrace = 0`.
+
## Appendix 0: Convergence Criteria

-Glmnet uses a convergence criterion that focuses not on coefficient
-change but rather the impact of the change on the fitted values, and
-hence the loss part of the objective. The net result is a
-weighted norm of the coefficient change vector.
+Glmnet uses a convergence criterion that focuses not on coefficient change but rather on the impact of the change on the fitted values, and hence the loss part of the objective. The net result is a weighted norm of the coefficient change vector.

-For gaussian models it uses the following. Suppose observation $i$
-has weight $w_i$. Let $v_j$ be the (weighted)
+For Gaussian models it uses the following. Suppose observation $i$ has weight $w_i$. Let $v_j$ be the (weighted)
sum-of-squares for variable $x_j$:
$$v_j=\sum_{i=1}^Nw_ix_{ij}^2.$$
-If there is an intercept in the model, these $x_j$ will be centered by
-the weighted mean, and hence this would be a weighted variance.
-After $\hat\beta_j^o$ has been updated to $\hat\beta_j^n$, we compute
-$\Delta_j=v_j(\hat\beta_j^o-\hat\beta_j^n)^2$.
+If there is an intercept in the model, these $x_j$ will be centered by the weighted mean, and hence this would be a weighted variance. After $\hat\beta_j^o$ has been updated to $\hat\beta_j^n$, we compute $\Delta_j=v_j(\hat\beta_j^o-\hat\beta_j^n)^2$.
After a complete cycle of coordinate descent, we look at
-$\Delta_{max}=\max_j\Delta_j$. Why this measure?
-We can write
+After a complete cycle of coordinate descent, we look at $\Delta_{max}=\max_j\Delta_j$. Why this measure? We can write
$$\Delta_j=\frac1N\sum_{i=1}^N w_i(x_{ij}\hat\beta_j^o-x_{ij}\hat\beta_j^n)^2,$$
-which measures the weighted sum of squares of changes in fitted values
-for this term. This measures the impact of the change in this
-coefficient on the fit. If the largest such change is negligible, we stop.
-
-
-For logistic regression, and other non-Gaussian models it is similar
-for the inner loop. Only now the weights for each observation are more
-complex. For example, for logisitic regression the weights are those
-that arise from the current Newton step, namely $w_i^*=w_i\hat p_i(1-\hat p_i)$. Here $\hat p_i$ are the fitted probabilities as we
-entered the current inner loop. The intuition is the same --- it
-measures the impact of the coefficient change on the current weighted
-least squares loss, or quadratic approximation to the log-likelihood
-loss.
-
-What about outer-loop convergence? We use the same measure, except now
-$\hat\beta^o$ is the coefficient vector before we entered this inner
-loop, and $\hat\beta^n$ the converged solution for this inner
-loop. Hence if this Newton step had no impact, we declare outer-loop convergence.
+which measures the weighted sum of squares of changes in fitted values for this term. This measures the impact of the change in this coefficient on the fit. If the largest such change is negligible, we stop.

+For logistic regression and other non-Gaussian models it is similar for the inner loop, only now the weights for each observation are more complex. For example, for logistic regression the weights are those that arise from the current Newton step, i.e. $w_i^*=w_i\hat p_i(1-\hat p_i)$, where the $\hat p_i$'s are the fitted probabilities as we entered the current inner loop. The intuition is the same: it measures the impact of the coefficient change on the current weighted least squares loss, or quadratic approximation to the log-likelihood loss.
+
+What about outer-loop convergence?
+
+* If the argument `family` was a character string, we use the same measure, except now $\hat\beta^o$ is the coefficient vector before we entered this inner loop, and $\hat\beta^n$ the converged solution for this inner loop. Hence if this Newton step had no impact, we declare outer-loop convergence.
+
+* If the argument `family` was a class "family" object, outer-loop convergence is determined by the change in the objective function value. If the fractional change in the objective function value is less than the `epsnr` control parameter, we declare outer-loop convergence. `epsnr` can be changed via a call to `glmnet.control`.

## Appendix 1: Internal Parameters

There are several parameters that users can change:

-`fdev` - minimum fractional change in deviance for stopping path; factory default = 1.0e-5
+* `fdev` - minimum fractional change in deviance for stopping path; factory default = 1.0e-5.
+
+* `devmax` - maximum fraction of explained deviance for stopping path; factory default = 0.999.
-`devmax` - maximum fraction of explained deviance for stopping path; factory default = 0.999
+* `eps` - minimum value of `lambda.min.ratio` (see `glmnet` documentation); factory default = 1.0e-6.
 
-* `eps` - minimum value of lambda.min.ratio (see glmnet); factory default= 1.0e-6
+* `big` - large floating point number; factory default = 9.9e35. Inf in definition of `upper.limits` is set to `big`.
 
-* `big` - large floating point number; factory default = 9.9e35. Inf in definition of upper.limit is set to big
+* `mnlam` - minimum number of path points (lambda values) allowed; factory default = 5.
 
-* `mnlam` - minimum number of path points (lambda values) allowed; factory default = 5
+* `pmin` - minimum null probability for any class; factory default = 1.0e-5.
 
-* `pmin` - minimum null probability for any class; factory default = 1.0e-5
+* `exmx` - maximum allowed exponent; factory default = 250.0.
 
-* `exmx` - maximum allowed exponent; factory default = 250.0
+* `prec` - convergence threshold for multi-response bounds adjustment solution; factory default = 1.0e-10.
 
-* `prec` - convergence threshold for multi-response bounds adjustment solution; factory default = 1.0e-10
+* `mxit` - maximum iterations for multi-response bounds adjustment solution; factory default = 100.
 
-* `mxit` - maximum iterations for multiresponse bounds adjustment solution; factory default = 100
+* `epsnr` - convergence threshold for the iteratively reweighted least squares loop (see "The `family` Argument for `glmnet`" vignette); factory default = 1e-08.
 
-* `factory` - If `TRUE`, reset all the parameters to the factory default; default is `FALSE`
+* `mxitnr` - maximum iterations for the iteratively reweighted least squares loop for each value of $\lambda$ (see "The `family` Argument for `glmnet`" vignette); factory default = 25.
 
-We illustrate the usage by an example. Note that any changes made hold for the duration of the R session, or unless they are changed by the user with a subsequent call to `glmnet.control`.
+* `factory` - If `TRUE`, reset all the parameters to the factory default; default is `FALSE`.
 
+We illustrate how to change these control parameters through an example. Note that any changes made hold for the duration of the R session, unless they are changed by the user with a subsequent call to `glmnet.control`.
 ```{r}
 data(QuickStartExample)
-fit = glmnet(x, y)
-print(fit)
+fit <- glmnet(x, y)
+length(fit$lambda)  # number of lambda values
 fit
 ```
-
+We can change the minimum fractional change in deviance for stopping path and compare the results. By setting `fdev` to be larger than the default, we see that the computation stopped earlier in the path.
 ```{r}
-glmnet.control(fdev = 0)
-fit = glmnet(x, y)
-print(fit)
+glmnet.control(fdev = 0.1)
+fit <- glmnet(x, y)
+length(fit$lambda)  # number of lambda values
 fit
 ```
-We set `fdev = 0` to continue all along the path, even without much change. The length of the sequence becomes 100, which is the default of `nlambda`.
 
-Users can also reset to the default settings.
+Users can reset to the default settings with the following code:
 ```{r}
 glmnet.control(factory = TRUE)
 ```
-The current settings are obtained as follows.
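If a change should only be temporary, one convenient pattern is to save the settings first and restore them afterwards. This is an editorial sketch rather than vignette code; it assumes that `glmnet.control()` called with no arguments returns the current settings as a named list, so that the list can be played back through `do.call`:

```r
old <- glmnet.control()       # save the current settings (assumed to be a named list)
glmnet.control(fdev = 0.1)    # temporary change: stop the path earlier
fit <- glmnet(x, y)
do.call(glmnet.control, old)  # restore the saved settings
```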
-```{r}
+
+To view current settings, call `glmnet.control` without any arguments:
+```{r out.lines = 8}
 glmnet.control()
 ```
 
 ## Appendix 2: Comparison with Other Packages
 
-Some people may want to use `glmnet` to solve the Lasso or elastic-net problem at a single $\lambda$. We compare here the solution by `glmnet` with other packages (such as CVX), and also as an illustration of parameter settings in this situation.
-__Warning__: Though such problems can be solved by `glmnet`, it is __not recommended__ and is not the spirit of the package. `glmnet` fits the __entire__ solution path for Lasso or elastic-net problems efficiently with various techniques such as warm start. Those advantages will disappear if the $\lambda$ sequence is forced to be only one value.
+Some may want to use `glmnet` to solve the lasso or elastic net problem at a single $\lambda$. We compare here the solution by `glmnet` with other packages (such as CVX), and also as an illustration of parameter settings in this situation.
 
-Nevertheless, we still illustrate with a typical example in linear model in the following for the purpose of comparison. Given $X, Y$ and $\lambda_0 > 0$, we want to find $\beta$ such that
+(__Warning__: Though such problems can be solved by `glmnet`, it is __not recommended__ and is not the spirit of the package. `glmnet` fits the __entire__ solution path for the lasso or elastic net problems efficiently with various techniques such as using warm starts and strong rules. Those advantages will disappear if the $\lambda$ sequence is forced to be only one value.)
+
+We illustrate with a typical example in linear models for the purpose of comparison. Given $X$ and $Y$, we want to find $\beta$ such that
 $$
-\min_{\beta} ||Y - X\beta||_2^2 + \lambda_0 ||\beta||_1,
+\min_{\beta} \|Y - X\beta\|_2^2 + \lambda_0 \|\beta\|_1,
 $$
 where, say, $\lambda_0 = 8$.
 
-We first solve using `glmnet`. Notice that there is no intercept term in the objective function, and the columns of $X$ are not necessarily standardized. Corresponding parameters have to be set to make it work correctly. In addition, there is a $1/(2n)$ factor before the quadratic term by default, we need to adjust $\lambda$ accordingly. For the purpose of comparison, the `thresh` option is specified to be 1e-20. However, this is not necessary in many practical applications.
+We first solve this using `glmnet`. Notice that there is no intercept term in the objective function, and the columns of $X$ are not necessarily standardized. Corresponding parameters have to be set to make it work correctly. In addition, there is a $1/(2n)$ factor before the quadratic term by default, so we need to adjust $\lambda$ accordingly. For the purpose of comparison, we set `thresh = 1e-20`. However, this is not necessary in many practical applications.
 ```{r, echo=FALSE}
 data(QuickStartExample)
 ```
-```{r,eval=FALSE}
-fit = glmnet(x, y, intercept = F, standardize = F, lambda = 8/(2*dim(x)[1]), thresh = 1e-20)
+```{r}
+np <- dim(x); n <- np[1]; p <- np[2]
+
+fit <- glmnet(x, y, intercept = F, standardize = F,
+              lambda = 8 / (2 * n), thresh = 1e-20)
 ```
+
+We then extract the coefficients (with no intercept):
 ```{r,eval=FALSE}
-beta_glmnet = as.matrix(predict(fit, type = "coefficients")[-1,])
+beta_glmnet <- as.matrix(predict(fit, type = "coefficients")[-1,])
 ```
-In linear model as here this approach worked because we were using squared error loss, but with any nonlinear family, it will probably fail.
The reason is we are not using step length optimization, and so rely on very good warm starts to put us in the quadratic region of the loss function. - -Alternatively, a more stable and __strongly recommended__ way to perform this task is to first fit the entire Lasso or elastic-net path without specifying `lambda`, but then provide the requested $\lambda_0$ to `predict` function to extract the corresponding coefficients. In fact, if $\lambda_0$ is not in the $\lambda$ sequence generated by `glmnet`, the path will be refitted along a new $\lambda$ sequence that includes the requested value $\lambda_0$ and the old sequence, and the coefficients will be returned at $\lambda_0$ based on the new fit. Remember to set `exact = TRUE` in `predict` function to get the exact solution. Otherwise, it will be approximated by linear interpolation. +Alternatively, a more stable and __strongly recommended__ way to perform this task is to first fit the entire lasso or elastic net path without specifying `lambda`, but then provide the requested $\lambda_0$ to a `predict` call to extract the corresponding coefficients. (Remember to set `exact = TRUE` in the `predict` call to get the exact solution. Otherwise, it will be approximated by linear interpolation.) ```{r} -fit = glmnet(x, y, intercept = F, standardize = F, thresh = 1e-20) -beta_glmnet = as.matrix(predict(fit, s = 8/(2*dim(x)[1]), type = "coefficients", - exact = TRUE, x=x, y=y)[-1,]) +fit <- glmnet(x, y, intercept = F, standardize = F, thresh = 1e-20) +beta_glmnet <- as.matrix(predict(fit, s = 8 / (2 * n), + type = "coefficients", + exact = TRUE, x = x, y = y)[-1,]) ``` -We also use CVX, a general convex optimization solver, to solve this specific Lasso problem. Users could also call CVX from R using the `CVXfromR` package and solve the problem as follows. +Next, we use CVX, a general convex optimization solver, to solve this +specific lasso problem. CVX is implemented in the CVXR package on +CRAN. + ```{r, eval=FALSE} -library(CVXfromR) -setup.dir = "change/this/to/your/cvx/directory" -n = dim(x)[1]; p = dim(x)[2] -cvxcode = paste("variables beta(p)", - "minimize(square_pos(norm(y - x * beta, 2)) + lambda * norm(beta, 1))", - sep = ";") -Lasso = CallCVX(cvxcode, const.var = list(p = p, x = x, y = y, lambda = 8), opt.var.names = "beta", setup.dir = setup.dir, matlab.call = "change/this/to/path/to/matlab") -beta_CVX = Lasso$beta +library(CVXR) +beta <- Variable(p) +loss <- sum((y-x%*%beta)^2)/(2*n) +lassoPenalty <- function(beta,lambda)lambda*p_norm(beta,1) +obj <- loss + lassoPenalty(beta, lambda = 8/(2*n)) +prob <- Problem(Minimize(obj)) +result <- solve(prob) +beta_CVX <- result$getValue(beta) ``` -For convenience here, the results were saved in `CVXResult.RData`, and we simply load in the results. - +For convenience, the results were saved in `CVXResult.RData`, and we simply load in the results. ```{r} data(CVXResults) ``` -In addition, we use `lars` to solve the same problem. -```{r,message=FALSE} -require(lars) +Finally, we solve the same problem with the `lars` package: +```{r, message=FALSE} +library(lars) +fit_lars <- lars(x, y, type = "lasso", intercept = F, normalize = F) +beta_lars <- predict(fit_lars, s = 8 / 2, type = "coefficients", + mode = "lambda")$coefficients ``` -```{r} -fit_lars = lars(x, y, type = "lasso", intercept = F, normalize = F) -beta_lars = predict(fit_lars, s = 8/2, type = "coefficients", mode = "lambda")$coefficients -``` - -The results are listed below up to 6 decimal digits (due to convergence thresholds). 
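Beyond comparing rounded coefficients, one can also evaluate the target objective at each solution; near-identical values are another indication that the solvers agree. A small sketch, not part of the vignette, assuming the `beta_glmnet`, `beta_lars` and `beta_CVX` objects computed above are in the workspace:

```r
# Evaluate ||y - x b||_2^2 + 8 * ||b||_1 at each solver's solution.
obj <- function(b) sum((y - x %*% b)^2) + 8 * sum(abs(b))
sapply(list(glmnet = beta_glmnet, lars = beta_lars, CVX = beta_CVX), obj)
```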
+The results are listed below up to 6 decimal digits (due to convergence thresholds). We see that all three packages give the same result. ```{r} -cmp = round(cbind(beta_glmnet, beta_lars, beta_CVX), digits = 6) -colnames(cmp) = c("beta_glmnet", "beta_lars", "beta_CVX") +cmp <- round(cbind(beta_glmnet, beta_lars, beta_CVX), digits = 6) +colnames(cmp) <- c("beta_glmnet", "beta_lars", "beta_CVX") cmp ``` Binary files /tmp/tmp3NAdo6/f1kG0NcwdC/r-cran-glmnet-4.0-2/inst/doc/relax.pdf and /tmp/tmp3NAdo6/BBEWWRnMdv/r-cran-glmnet-4.1/inst/doc/relax.pdf differ diff -Nru r-cran-glmnet-4.0-2/inst/doc/relax.R r-cran-glmnet-4.1/inst/doc/relax.R --- r-cran-glmnet-4.0-2/inst/doc/relax.R 2020-06-14 23:21:51.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/relax.R 2021-01-11 00:03:26.000000000 +0000 @@ -1,138 +1,57 @@ -## ------------------------------------------------------------------------ +## ----include=FALSE------------------------------------------------------------ +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) + +## ----out.lines = 15----------------------------------------------------------- library(glmnet) data(QuickStartExample) -fit=glmnet(x,y, relax=TRUE) +fit <- glmnet(x, y, relax = TRUE) print(fit) -## ------------------------------------------------------------------------ -par(mfrow=c(1,3)) -plot(fit) -plot(fit,gamma=0.5) -plot(fit,gamma=0) - -## ------------------------------------------------------------------------ -cfit=cv.glmnet(x,y,relax=TRUE) +## ----------------------------------------------------------------------------- +par(mfrow = c(1, 3), mar=c(4,4,5.5,1)) +plot(fit, main = "gamma = 1") +plot(fit, gamma = 0.5, main = "gamma = 0.5") +plot(fit, gamma = 0, main = "gamma = 0") + +## ----------------------------------------------------------------------------- +set.seed(1) +cfit <- cv.glmnet(x, y, relax = TRUE) plot(cfit) -## ---- eval=FALSE--------------------------------------------------------- -# predict(cvfit,newx) +## ----------------------------------------------------------------------------- +plot(cfit, se.bands = FALSE) + +## ----------------------------------------------------------------------------- +predict(cfit, newx = x[1:5, ], s = "lambda.min", gamma = "gamma.min") -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- print(cfit) -## ----`relaxed`----------------------------------------------------------- -fit=glmnet(x,y) -fitr=relax.glmnet(fit,x=x,y=y) +## ----`relaxed`---------------------------------------------------------------- +fit <- glmnet(x,y) +fitr <- relax.glmnet(fit, x = x, y = y) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- print(cfit) print.cv.glmnet(cfit) -## ------------------------------------------------------------------------ -fitr=cv.glmnet(x,y,gamma=0,relax=TRUE) +## ----------------------------------------------------------------------------- +fitr <- cv.glmnet(x, 
y, gamma = 0, relax = TRUE) plot(fitr) -## ---- eval=FALSE--------------------------------------------------------- -# fit=glmnet(x,y,trace=TRUE) - -## ---- eval=FALSE--------------------------------------------------------- -# fit=cv.glmnet(x,y,trace=TRUE) - -## ---- eval=FALSE--------------------------------------------------------- -# glmnet.control(itrace=1) - -## ------------------------------------------------------------------------ - data(CoxExample) - -## ------------------------------------------------------------------------ - cvfit=cv.glmnet(x,y,family="cox",type.measure="C") - plot(cvfit) - -## ------------------------------------------------------------------------ -data(BinomialExample) -itrain=1:70 -fit=glmnet(x[itrain,],y[itrain],family="binomial",nlambda=20) -assess.glmnet(fit,newx=x[-itrain,],newy=y[-itrain]) - -## ---- eval=FALSE--------------------------------------------------------- -# pred=predict(fit,newx=x[-itrain,]) -# assess.glmnet(pred,newy=y[-itrain],family="binomial") - -## ------------------------------------------------------------------------ -glmnet.measures() - -## ------------------------------------------------------------------------ -cfit=cv.glmnet(x[itrain,],y[itrain],family="binomial", nlambda = 30) -assess.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain]) - -## ------------------------------------------------------------------------ -assess.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain], s="lambda.min") - -## ------------------------------------------------------------------------ -cfit=cv.glmnet(x,y,family="binomial",keep=TRUE, nlambda = 30) -assess.glmnet(cfit$fit.preval,newy=y,family="binomial") - -## ------------------------------------------------------------------------ -cfit=cv.glmnet(x,y,family="binomial", type.measure="auc", keep=TRUE) -rocs=roc.glmnet(cfit$fit.preval,newy=y) -which=match(cfit$lambda.min,cfit$lambda) -plot(rocs[[which]],type="l") -nopr=sapply(rocs,lines,col="grey") -lines(rocs[[which]],lwd=2,col="red") - -## ------------------------------------------------------------------------ -data(MultinomialExample) -set.seed(101) -itrain=sample(1:500,400,replace=FALSE) -cfit=cv.glmnet(x[itrain,],y[itrain],family="multinomial") -cnf=confusion.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain]) -print(cnf) - -## ------------------------------------------------------------------------ -cfit=cv.glmnet(x,y,family="multinomial",type="class",keep=TRUE) -cnf=confusion.glmnet(cfit$fit.preval,newy=y,family="multinomial") -which=match(cfit$lambda.min,cfit$lambda) -print(cnf[[which]]) - -## ------------------------------------------------------------------------ -data(BinomialExample) -fit=bigGlm(x,y,family="binomial",lower.limits=-1) -print(fit) - -## ------------------------------------------------------------------------ -set.seed(101) -X = matrix(rnorm(20),10,2) -X3=sample(letters[1:3],10,replace=TRUE) -X4=sample(LETTERS[1:3],10,replace=TRUE) -df=data.frame(X,X3,X4) -makeX(df) - -## ------------------------------------------------------------------------ -makeX(df,sparse=TRUE) - -## ------------------------------------------------------------------------ -Xn=X -Xn[3,1]=NA;Xn[5,2]=NA -X3n=X3; -X3n[6]=NA -X4n=X4 -X4n[9]=NA -dfn=data.frame(Xn,X3n,X4n) -makeX(dfn) - -## ------------------------------------------------------------------------ -makeX(dfn,na.impute=TRUE,sparse=TRUE) - -## ------------------------------------------------------------------------ -X = matrix(rnorm(10),5,2) -X3=sample(letters[1:3],5,replace=TRUE) 
-X4=sample(LETTERS[1:3],5,replace=TRUE) -Xn=X -Xn[3,1]=NA;Xn[5,2]=NA -X3n=X3; -X3n[1]=NA -X4n=X4 -X4n[2]=NA -dftn=data.frame(Xn,X3n,X4n) -makeX(dfn,dftn,na.impute=TRUE, sparse=TRUE) - diff -Nru r-cran-glmnet-4.0-2/inst/doc/relax.Rmd r-cran-glmnet-4.1/inst/doc/relax.Rmd --- r-cran-glmnet-4.0-2/inst/doc/relax.Rmd 2019-11-07 00:14:41.000000000 +0000 +++ r-cran-glmnet-4.1/inst/doc/relax.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,10 @@ --- -title: "Relaxed fits and other additions in `glmnet` 3.0" -author: "Trevor Hastie, Balasubramanian Narasimhan and Rob Tibshirani" -date: "October 15, 2019" +title: "The Relaxed Lasso" +author: + - Trevor Hastie + - Balasubramanian Narasimhan + - Rob Tibshirani +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -10,470 +13,137 @@ toc: yes toc_depth: 3 vignette: > - %\VignetteIndexEntry{Relaxed fits} + %\VignetteIndexEntry{The Relaxed Lasso} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- +```{r include=FALSE} +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) +``` + ## Introduction -In our vignette "glmnet" we give details for fitting lasso and -elastic-net regularized models, for -CV and various aspects of glmnet modeling. In this vignette, we -highlight some of the new tools and features in the major revision glmnet 3.0. - -The main edition is the introduction of the *relaxed lasso*. The idea -is to take a glmnet fitted object, and then for each lambda, refit the -variables in the active set without any penalization. This gives the -`relaxed` fit (note, there have been other definitions of a relaxed -fit, but this is the one we prefer). -This could of course be done for elastic net fits as well as lasso. -However, if the number of variables gets too close to the sample size -N, the relaxed path will be truncated. -Furthermore, for binomial and other nonlinear GLMs convergence can be -an issue with our current implementation if the number of variables is -too large, and perversely if the relaxed fit is too strong. - -Suppose the `glmnet` fitted linear predictor at $\lambda$ is -$\hat\eta_\lambda(x)$ and the relaxed version is $\tilde -\eta_\lambda(x)$. We also allow for shrinkage between the two: +In this vignette, we describe how the `glmnet` package can be used to fit the *relaxed lasso*. + +The idea of the relaxed lasso is to take a `glmnet` fitted object, and then for each lambda, refit the variables in the active set without any penalization. This gives the "relaxed" fit. (We note that there have been other definitions of a relaxed fit, but this is the one we prefer.) This could of course be done for elastic net fits as well as lasso. However, if the number of variables gets too close to the sample size $N$, the relaxed path will be truncated. Furthermore, for binomial and other nonlinear generalized linear models (GLMs) convergence can be an issue with our current implementation if the number of variables is too large, and perversely if the relaxed fit is too strong. 
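The refitting idea described above is easy to emulate by hand, which may help intuition. A toy sketch, not the package implementation: it simply refits the active set at one value of `lambda` by ordinary least squares.

```r
library(glmnet)
data(QuickStartExample)
fit <- glmnet(x, y)
s0  <- fit$lambda[20]                     # one lambda value on the path
b   <- as.numeric(coef(fit, s = s0))[-1]  # coefficients, intercept dropped
active <- which(b != 0)                   # active set at this lambda
refit  <- lm(y ~ x[, active])             # unpenalized refit of the active set
```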
+
+Suppose the `glmnet` fitted linear predictor at $\lambda$ is $\hat\eta_\lambda(x)$ and the relaxed version is $\tilde\eta_\lambda(x)$. We also allow for shrinkage between the two:
 $$\tilde \eta_{\lambda,\gamma}=(1-\gamma)\tilde \eta_\lambda(x)+\gamma\hat\eta_\lambda(x).$$
 
-$\gamma\in[0,1]$ is an additional tuning parameter which can be
-selected by cross validation.
-The debiasing will potentially improve prediction performance, and CV
-will typically select a model with a smaller number of variables.
-This procedure is very competitive with forward-stepwise and
-best-subset regression, and has a considerable speed advantage when
-the number of variables is large. This is especially true for
-best-subset, but even so for forward stepwise. The latter has to plod
-through the variables one-at-a-time, while glmnet will just plunge in
-and find a good active set.
+$\gamma\in[0,1]$ is an additional tuning parameter which can be selected by cross-validation (CV). The debiasing will potentially improve prediction performance, and CV will typically select a model with a smaller number of variables.
 
-Further details may be found in @glmnet, @coxnet, @strongrules, @block
-and @best_subset.
+This procedure is very competitive with forward-stepwise and best-subset regression, and has a considerable speed advantage when the number of variables is large. This is especially true for best-subset, but even so for forward stepwise. The latter has to plod through the variables one-at-a-time, while `glmnet` will just plunge in
+and find a good active set.
 
-## Simple relaxed fit
+Further details on this form of relaxed fitting can be found in
+@best_subset; more information on glmnet and elastic-net models in
+general is given in @glmnet,
+@coxnet, @strongrules, and @block.
 
-To get things going, we show the most basic use.
-We use the same data used in the `glmnet` vignette.
+## Simple relaxed fitting
 
-```{r}
+We demonstrate the most basic relaxed lasso fit as a first example. We load some pre-generated data and fit the relaxed lasso on it by calling `glmnet` with `relax = TRUE`:
+```{r out.lines = 15}
 library(glmnet)
 data(QuickStartExample)
-fit=glmnet(x,y, relax=TRUE)
+fit <- glmnet(x, y, relax = TRUE)
 print(fit)
 ```
 
-There is an extra column `%Dev R` where the `R` stands for "relaxed",
-which is the percent deviance explained by the relaxed fit. This is
-always higher than its neighboring column, which is the same for the
-penalized fit (on the training data).
+In addition to the three columns usually printed for `glmnet` objects
+(`Df`, `%Dev` and `Lambda`), there is an extra column `%Dev R` (`R`
+stands for "relaxed") which is the percent deviance explained by the
+relaxed fit. This is always higher than its neighboring column, which
+is the percent deviance explained for the penalized fit (on the
+training data). Notice that when the `Df` stays the same, the `%Dev R`
+does not change, since this typically means the active set is the
+same. (The code is also smart enough to only fit such models once, so
+in the truncated display shown, 9 lasso models are fit, but only 4
+relaxed fits are computed).
 
-The fit object is class `relaxed`, which inherits from class `glmnet`.
-One can plot it, with additional flexibility.
+The fit object is of class `"relaxed"`, which inherits from class `"glmnet"`. Hence, the usual `plot` method for `"glmnet"` objects can be used. The code below demonstrates some additional flexibility that `"relaxed"` objects have for plotting.
```{r}
-par(mfrow=c(1,3))
-plot(fit)
-plot(fit,gamma=0.5)
-plot(fit,gamma=0)
+par(mfrow = c(1, 3), mar=c(4,4,5.5,1))
+plot(fit, main = "gamma = 1")
+plot(fit, gamma = 0.5, main = "gamma = 0.5")
+plot(fit, gamma = 0, main = "gamma = 0")
 ```
 
-So again, `gamma=1` is the traditional `glmnet` fit, while `gamma=0`
-is the unpenalized fit, and `gamma=0.5` is a mixture of the two (at
-the coefficient level, and hence also the linear predictors).
-
-We can also select `gamma` using `cv.glmnet`, which by default uses
-the 5 values `c(0, 0.25, 0.5, 0.75, 1)`.
+`gamma = 1` is the traditional `glmnet` fit (also `relax = FALSE`, the default), `gamma = 0` is the unpenalized fit, and `gamma = 0.5` is a mixture of the two (at the coefficient level, and hence also the linear predictors).
 
+We can also select `gamma` using `cv.glmnet`, which by default uses the 5 values `c(0, 0.25, 0.5, 0.75, 1)`. This returns an object of class `"cv.relaxed"`.
 ```{r}
-cfit=cv.glmnet(x,y,relax=TRUE)
+set.seed(1)
+cfit <- cv.glmnet(x, y, relax = TRUE)
 plot(cfit)
 ```
 
-The plot command has an `se.bands` option if you don't like the
-default shading of these bands.
-
-Just like before, you can make predictions from a CV object, and it
-uses the selected values for `lambda` and `gamma`.
-
-```{r, eval=FALSE}
-predict(cvfit,newx)
+To remove the shading of the standard error bands, pass `se.bands = FALSE`:
+```{r}
+plot(cfit, se.bands = FALSE)
 ```
 
-A new feature in `glmnet` is a print method for `cv.glmnet` and a
-`cv.relaxed` object.
+As with regular `"cv.glmnet"` objects, you can make predictions from a relaxed CV object. Just as the `s` option (for `lambda`) admits two special strings `"lambda.1se"` and `"lambda.min"` for special values of `lambda`, the `gamma` option admits two special strings `"gamma.1se"` and `"gamma.min"` for special values of `gamma`. For example, the code below makes predictions for `newx` at the `lambda` and `gamma` values that have the smallest CV error:
+```{r}
+predict(cfit, newx = x[1:5, ], s = "lambda.min", gamma = "gamma.min")
+```
+Printing class `"cv.relaxed"` objects gives some basic information on the cross-validation:
 ```{r}
 print(cfit)
 ```
 
 ## More details on relaxed fitting
 
-Although `glmnet` has a `relax` option, you can created a relaxed
-version by post-processing a `glmnet` object.
+While we only demonstrate relaxed fits for the default Gaussian family, *any* of the families fit by `glmnet` can also be fit with the `relaxed` option.
 
+Although `glmnet` has a `relax` option, you can also fit relaxed lasso models by post-processing a `glmnet` object with the `relax.glmnet` function.
 ```{r `relaxed`}
-fit=glmnet(x,y)
-fitr=relax.glmnet(fit,x=x,y=y)
+fit <- glmnet(x,y)
+fitr <- relax.glmnet(fit, x = x, y = y)
 ```
 
-This will rarely need to be done; one use case is if the original fit
-took a long time, and the user wanted to avoid refitting it.
-Note that in the call the arguments are named, since they are
-passed in via the `...` argument to `relax.glmnet`.
-
-Needless to say, *any* of the families fit by `glmnet` can also be fit
-with the `relaxed` option.
-
-As mentioned, a `relaxed` object is also a `glmnet` object. Apart from
-the class modification, it has an additional componet named `relaxed`
-which is itself a `glmnet` object, but with the relaxed coefficients.
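The mixing relation can be checked numerically at the coefficient level. A sketch for illustration only; it assumes the relaxed `fit` object from the chunks above, and that the `gamma` argument is passed through `coef` to the `"relaxed"` predict method:

```r
s0   <- fit$lambda[10]                  # one lambda value on the path
b1   <- coef(fit, s = s0, gamma = 1)    # penalized (ordinary lasso) coefficients
b0   <- coef(fit, s = s0, gamma = 0)    # relaxed (unpenalized) coefficients
bmix <- coef(fit, s = s0, gamma = 0.5)  # blended fit
all.equal(as.numeric(bmix), as.numeric(0.5 * b0 + 0.5 * b1))
```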
-The default behavior of extractor functions like `predict` and `coef`, -as well as `plot` will be to present results from the `glmnet` fit, -unless a value of `gamma` is given different from the default value -`gamma=1` (see the plots above). The `print` method gives additional -info on the relaxed fit. - -Likewise, a `cv.relaxed` object inherits from class `cv.glmnet`. -Here the `predict` method by default uses the optimal relaxed fit; if -predictions from the CV-optimal *original* `glmnet` fit are desired, one -can directly use `predict.cv.glmnet`. Similarly for the `print` -command, which we illustrate here. +This will rarely need to be done; one use case is if the original fit took a long time, and the user wants to avoid refitting it. Note that the arguments are named in the call in order for them to be passed correctly via the `...` argument in `relax.glmnet`. + +As mentioned, a `"relaxed"` object inherits from class `"glmnet"`. Apart from the class modification, it has an additional component named `relaxed` which is itself a `glmnet` object, but with the relaxed coefficients. The default behavior of extractor functions like `predict` and `coef`, as well as `plot` will be to present results from the `glmnet` fit, unless a value of `gamma` is given different from the default value `gamma = 1` (see the plots above). The `print` method gives additional info on the relaxed fit. +Likewise, a `cv.relaxed` object inherits from class `cv.glmnet`. Here the `predict` method by default uses the optimal relaxed fit; if predictions from the CV-optimal *original* `glmnet` fit are desired, one can directly use `predict.cv.glmnet`. Similarly, use `print` to print information for cross-validation on the relaxed fit, and `print.cv.glmnet` for information on the cross-validation for the original `glmnet` fit. ```{r} print(cfit) print.cv.glmnet(cfit) ``` -## Relaxed fits and glms - -`glmnet` itself is used to fit the relaxed fits, by using a single -value of zero -for `lambda`. However, for nonlinear models such as binomial, -multinomial and poisson, there can be convergence issues. This is -because `glmnet` does not do stepsize optimization, rather relying on -the pathwise fit to stay in the "quadratic" zone of the log -likelihood. We have an optional `path=TRUE` option for `relax.glmnet`, which actually -fits a regurized path toward the `lambda=0` solution, and thus avoids -the issue. The default is `path=FALSE` since this option adds to the -computing time. - -### Forward stepwise and relaxed fit - -One use case for a relaxed fit is as a faster version of forward -stepwise regression. With a large number `p` of variables, forward-stepwise regression can be tedious. Lasso on the other hand, because -of its convexity, can plunge in and identify good candidate sets of -variables over 100 values of `lambda`, even though `p` could be in the -10s of thousands. In a case like this, one can have `cv.glmnet` do the -selection. - -```{r} -fitr=cv.glmnet(x,y,gamma=0,relax=TRUE) -plot(fitr) -``` - -Notice that we only allow `gamma=0`, so in this case we are not considering the blended fits. - - - -## Progress bar - -We finally have a progress bar for `glmnet` and `cv.glmnet`. Ever run a -job on a big dataset, and wonder how long it will take? Now you can -use the `trace.it = TRUE` argument to these functions. - -```{r, eval=FALSE} -fit=glmnet(x,y,trace=TRUE) -``` - -``##`` - -`` |================================== |65%`` - -Here we abbreviated the argument to `trace`. 
This display changes in
-place as the fit is produced.
-Also very helpful with `cv.glmnet`
-
-```{r, eval=FALSE}
-fit=cv.glmnet(x,y,trace=TRUE)
-```
-``##``
-
-`` Training``
-
-`` |=============================================| 100%``
-
-`` Fold: 1/10``
-
-`` |=============================================| 100%``
-
-`` Fold: 2/10``
-
-`` |=============================================| 100%``
-
-`` Fold: 3/10``
+### Possible convergence issues for relaxed fits
 
-`` |=============================================| 100%``
+`glmnet` itself is used to fit the relaxed fits by using a single value of zero for `lambda`. However, for nonlinear models such as `family = "binomial"`, `family = "multinomial"` and `family="poisson"`, there can be convergence issues. This is because `glmnet` does not do step size optimization, rather relying on
+the pathwise fit to stay in the "quadratic" zone of the log-likelihood. We have an optional `path = TRUE` option for `relax.glmnet`, which actually fits a regularized path toward the `lambda = 0` solution, and thus avoids
+the issue. The default is `path = FALSE` since this option adds to the computing time.
 
-`` Fold: 4/10``
-
-`` |=============================================| 100%``
-
-`` Fold: 5/10``
-
-`` |=============================================| 100%``
-
-`` Fold: 6/10``
-
-`` |============================= | 70%``
-
-
-Tracing of the folds works a little differently when distributed
-computing is used.
-
-Here the `trace` argument should be used in each call to `glmnet` or
-`cv.glmnet`. One can set this option session wide via a call to
-`glmnet.control` with its new `itrace` argument:
-
-```{r, eval=FALSE}
-glmnet.control(itrace=1)
-```
-
-To reset it, one makes a similar call and sets `itrace=0`.
-
-## C index for Cox models
-
- We have a new performance measure for the Cox model: the Harrel *C index*.
- This is like the AUC measure of concordance for survival
- data, but only considers comparable pairs. Pure concordance would
- record the fraction of pairs for which the order of the death times
- agree with the order of the predicted risk. But with survival data,
- if an observation is right censored at a time *before* another
- observation's death time, they are not comparable.
-
-```{r}
- data(CoxExample)
-```
-
-```{r}
- cvfit=cv.glmnet(x,y,family="cox",type.measure="C")
- plot(cvfit)
-```
-
-## Assessing models on test data
-
-Once we have fit a series of models using `glmnet`, we often assess
-their performance on a set of evaluation or test data. We usually go
-through the process of building a prediction matrix, and then deciding
-on the measure, and computing the values for a series of values for
-`lambda` and now `gamma`. Here we provide three functions for making
-these tasks easier.
-
-### Performance measures
-
-The function `assess.glmnet` computes the same performance measures produced by
-`cv.glmnet`, but on a validation or test dataset.
+## Application to forward stepwise regression
+One use case for a relaxed fit is as a faster version of forward stepwise regression. With a large number `p` of variables, forward stepwise regression can be tedious. On the other hand, because the lasso solves a convex problem, it can plunge in and identify good candidate sets of variables over 100 values of `lambda`, even though `p` could be in the tens of thousands. In a case like this, one can have `cv.glmnet` do the selection of variables.
```{r} -data(BinomialExample) -itrain=1:70 -fit=glmnet(x[itrain,],y[itrain],family="binomial",nlambda=20) -assess.glmnet(fit,newx=x[-itrain,],newy=y[-itrain]) -``` - -This produces a list with *all* the measures suitable for a binomial -model, computed for the entire sequence of lambdas in the fit object. -Here the function identifies the model family from the fit object. - -A second use case builds the prediction matrix first - -```{r, eval=FALSE} -pred=predict(fit,newx=x[-itrain,]) -assess.glmnet(pred,newy=y[-itrain],family="binomial") -``` - -Here we have to provide the `family` as an argument; the results (not -shown) are the same. Users can see the various measures suitable for -each family via - -```{r} -glmnet.measures() -``` - -The assess function can also take the result of `cv.glmnet` as input. -In this case the predictions are made at the optimal values for the -parameter(s). - -```{r} -cfit=cv.glmnet(x[itrain,],y[itrain],family="binomial", nlambda = 30) -assess.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain]) -``` - -This used the default value of `s=lambda.1se`, just like `predict` -would have done. -Users can provide additional arguments that get passed on to predict: - -```{r} -assess.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain], s="lambda.min") -``` - - - -One interesting use case is to get the results of CV using other -measures, via the `keep` argument. In this case the `fit.preval` -object is a matrix of prevalidated predictions made using the folds `foldid` - -```{r} -cfit=cv.glmnet(x,y,family="binomial",keep=TRUE, nlambda = 30) -assess.glmnet(cfit$fit.preval,newy=y,family="binomial") -``` - -Users can verify that the first measure here `deviance` is identical -to the component `cvm` on the `cfit` object. - -### ROC curves for binomial data - -In the special case of binomial models, users often would like to see -the ROC curve for validation or test data. Here the function -`roc.glmnet` provides the goodies. Its first argument is as in -`assess.glmnet`. Here we illustrate one use case, using the -prevlidated CV fit as before. - - -```{r} -cfit=cv.glmnet(x,y,family="binomial", type.measure="auc", keep=TRUE) -rocs=roc.glmnet(cfit$fit.preval,newy=y) -which=match(cfit$lambda.min,cfit$lambda) -plot(rocs[[which]],type="l") -nopr=sapply(rocs,lines,col="grey") -lines(rocs[[which]],lwd=2,col="red") -``` - -In this case `roc.glmnet` returns a list of cross-validated ROC data, one for each -model along the path. In the third line we identify the CV -winner. Then we plot all the curves in grey, and the winner in red. - -### Confusion matrices for classification - -For binomial and multinomial models, we often which to examine the -classification performance on new data. The function -`confusion.glmnet` will do that. - -```{r} -data(MultinomialExample) -set.seed(101) -itrain=sample(1:500,400,replace=FALSE) -cfit=cv.glmnet(x[itrain,],y[itrain],family="multinomial") -cnf=confusion.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain]) -print(cnf) -``` - -It produces a table of class `confusion.table` which inherits from -calss `table`, and we also provide a print method. - -The first argument to `confusion.glmnet` should be either a `glmnet` object, or a -`cv.glmnet` object, from which predictions can be made, or a -matrix/array of predictions, such as the *kept* `fit.predval` object -from `cv.glmnet`. - -In the second case we need to specify the `family`, -otherwise *confusion* can exist between `binomial` and `multinomial` -prediction matrices. 
-Here we show a multinomial example - -```{r} -cfit=cv.glmnet(x,y,family="multinomial",type="class",keep=TRUE) -cnf=confusion.glmnet(cfit$fit.preval,newy=y,family="multinomial") -which=match(cfit$lambda.min,cfit$lambda) -print(cnf[[which]]) -``` -Since the `fit.preval` object has predictions for the whole path, the -result of `confusion.glmnet` here is a list of confusion tables. -We identify and print the one corresponding to the minimum -classification error. - - -## Fitting big and/or sparse GLMs - -We include a function `bigGlm` for fitting a single GLM model -(unpenalized), but allowing all the options of `glmnet`. -In other words, coefficient upper and/or lower bounds and sparse `x` -matrices. This is not too much more than fitting a model with a single -value of `lambda=0` (with some protection from edge cases). -There is also a `predict` and `print` method. - -```{r} -data(BinomialExample) -fit=bigGlm(x,y,family="binomial",lower.limits=-1) -print(fit) -``` - -## Producing x from mixed variables, and missing data - -We have created a function `makeX` that makes it easy to create the -model matrix `x` needed as input to `glmnet`. It takes as input a data -frame, which can contain vectors, matrices and factors. Some of the features are - -* Factors are *one-hot* encoded to form indicator matrices -* Missing values in the resultant matrix can be replaced by the column - means -* The `sparse` option returns a matrix in column-sparse format. This - is useful if the data are large, and factors have many levels. -* Two dataframes can be provided, `train` and `test`. This ensures the - factor levels correspond, and also imputes missing data in the test - data from means in the training data. - - We start with a simple case with some factors. - -```{r} -set.seed(101) -X = matrix(rnorm(20),10,2) -X3=sample(letters[1:3],10,replace=TRUE) -X4=sample(LETTERS[1:3],10,replace=TRUE) -df=data.frame(X,X3,X4) -makeX(df) -``` - -Or if a sparse output was desired: -```{r} -makeX(df,sparse=TRUE) -``` - -And now some missing values - -```{r} -Xn=X -Xn[3,1]=NA;Xn[5,2]=NA -X3n=X3; -X3n[6]=NA -X4n=X4 -X4n[9]=NA -dfn=data.frame(Xn,X3n,X4n) -makeX(dfn) -``` -which we can replace with column-mean imputations (and make sparse, if -we like) - -```{r} -makeX(dfn,na.impute=TRUE,sparse=TRUE) +fitr <- cv.glmnet(x, y, gamma = 0, relax = TRUE) +plot(fitr) ``` -Finally if a test set is available as well - -```{r} -X = matrix(rnorm(10),5,2) -X3=sample(letters[1:3],5,replace=TRUE) -X4=sample(LETTERS[1:3],5,replace=TRUE) -Xn=X -Xn[3,1]=NA;Xn[5,2]=NA -X3n=X3; -X3n[1]=NA -X4n=X4 -X4n[2]=NA -dftn=data.frame(Xn,X3n,X4n) -makeX(dfn,dftn,na.impute=TRUE, sparse=TRUE) -``` - +Notice that we only allow `gamma = 0`, so in this case we are not considering the blended fits. 
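To use the result the way one would use the output of a stepwise procedure, the chosen unpenalized model can be extracted from the CV object. A sketch, assuming the `fitr` object from the chunk above and that `coef` accepts the same special strings as `predict` here; since `gamma = 0` was the only value allowed, `"gamma.min"` corresponds to the unpenalized refit:

```r
b <- as.numeric(coef(fitr, s = "lambda.min", gamma = "gamma.min"))
selected <- which(b[-1] != 0)  # indices of the selected variables (intercept dropped)
selected
```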
## References diff -Nru r-cran-glmnet-4.0-2/inst/mortran/wls.m r-cran-glmnet-4.1/inst/mortran/wls.m --- r-cran-glmnet-4.0-2/inst/mortran/wls.m 2020-06-11 15:32:05.000000000 +0000 +++ r-cran-glmnet-4.1/inst/mortran/wls.m 2021-01-06 22:06:54.000000000 +0000 @@ -336,7 +336,7 @@ subroutine get_int_parms2(epsnr,mxitnr); implicit double precision(a-h,o-z); data epsnr0,mxitnr0 - /1.0d-8,25/; + /1.0d-6,25/; epsnr=epsnr0; mxitnr=mxitnr0; return; entry chg_epsnr(arg); epsnr0=arg; return; diff -Nru r-cran-glmnet-4.0-2/man/assess.glmnet.Rd r-cran-glmnet-4.1/man/assess.glmnet.Rd --- r-cran-glmnet-4.0-2/man/assess.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/assess.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -7,12 +7,22 @@ \alias{roc.glmnet} \title{assess performance of a 'glmnet' object using test data.} \usage{ -assess.glmnet(object, newx = NULL, newy, weights = NULL, - family = c("gaussian", "binomial", "poisson", "multinomial", "cox", - "mgaussian"), ...) +assess.glmnet( + object, + newx = NULL, + newy, + weights = NULL, + family = c("gaussian", "binomial", "poisson", "multinomial", "cox", "mgaussian"), + ... +) -confusion.glmnet(object, newx = NULL, newy, family = c("binomial", - "multinomial"), ...) +confusion.glmnet( + object, + newx = NULL, + newy, + family = c("binomial", "multinomial"), + ... +) roc.glmnet(object, newx = NULL, newy, ...) } diff -Nru r-cran-glmnet-4.0-2/man/beta_CVX.Rd r-cran-glmnet-4.1/man/beta_CVX.Rd --- r-cran-glmnet-4.0-2/man/beta_CVX.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/beta_CVX.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -5,7 +5,9 @@ \alias{x} \alias{y} \title{Simulated data for the glmnet vignette} -\format{Data objects used to demonstrate features in the glmnet vignette} +\format{ +Data objects used to demonstrate features in the glmnet vignette +} \description{ Simple simulated data, used to demonstrate the features of glmnet } diff -Nru r-cran-glmnet-4.0-2/man/cox.fit.Rd r-cran-glmnet-4.1/man/cox.fit.Rd --- r-cran-glmnet-4.0-2/man/cox.fit.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/cox.fit.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/coxpath.R +\name{cox.fit} +\alias{cox.fit} +\title{Fit a Cox regression model with elastic net regularization for a single +value of lambda} +\usage{ +cox.fit( + x, + y, + weights, + lambda, + alpha = 1, + offset = rep(0, nobs), + thresh = 1e-10, + maxit = 1e+05, + penalty.factor = rep(1, nvars), + exclude = c(), + lower.limits = -Inf, + upper.limits = Inf, + warm = NULL, + from.cox.path = FALSE, + save.fit = FALSE, + trace.it = 0 +) +} +\arguments{ +\item{x}{Input matrix, of dimension \code{nobs x nvars}; each row is an +observation vector. If it is a sparse matrix, it is assumed to be unstandardized. +It should have attributes \code{xm} and \code{xs}, where \code{xm(j)} and +\code{xs(j)} are the centering and scaling factors for variable j respsectively. +If it is not a sparse matrix, it is assumed that any standardization needed +has already been done.} + +\item{y}{Survival response variable, must be a Surv or stratifySurv object.} + +\item{weights}{Observation weights. \code{cox.fit} does NOT standardize +these weights.} + +\item{lambda}{A single value for the \code{lambda} hyperparameter.} + +\item{alpha}{See glmnet help file} + +\item{offset}{See glmnet help file} + +\item{thresh}{Convergence threshold for coordinate descent. 
Each inner
+coordinate-descent loop continues until the maximum change in the objective
+after any coefficient update is less than thresh times the null deviance.
+Default value is \code{1e-10}.}
+
+\item{maxit}{Maximum number of passes over the data; default is \code{10^5}.
+(If a warm start object is provided, the number of passes the warm start object
+performed is included.)}
+
+\item{penalty.factor}{See glmnet help file}
+
+\item{exclude}{See glmnet help file}
+
+\item{lower.limits}{See glmnet help file}
+
+\item{upper.limits}{See glmnet help file}
+
+\item{warm}{Either a \code{glmnetfit} object or a list (with name \code{beta}
+containing coefficients) which can be used as a warm start. Default is
+\code{NULL}, indicating no warm start. For internal use only.}
+
+\item{from.cox.path}{Was \code{cox.fit()} called from \code{cox.path()}?
+Default is FALSE. This has implications for computation of the penalty factors.}
+
+\item{save.fit}{Return the warm start object? Default is FALSE.}
+
+\item{trace.it}{Controls how much information is printed to screen. If
+\code{trace.it=2}, some information about the fitting procedure is printed to
+the console as the model is being fitted. Default is \code{trace.it=0}
+(no information printed). (\code{trace.it=1} not used for compatibility with
+\code{glmnet.path}.)}
+}
+\value{
+An object with class "coxnet", "glmnetfit" and "glmnet". The list
+returned contains more keys than that of a "glmnet" object.
+\item{a0}{Intercept value, \code{NULL} for "cox" family.}
+\item{beta}{A \code{nvars x 1} matrix of coefficients, stored in sparse matrix
+format.}
+\item{df}{The number of nonzero coefficients.}
+\item{dim}{Dimension of coefficient matrix.}
+\item{lambda}{Lambda value used.}
+\item{dev.ratio}{The fraction of (null) deviance explained. The deviance
+calculations incorporate weights if present in the model. The deviance is
+defined to be 2*(loglike_sat - loglike), where loglike_sat is the log-likelihood
+for the saturated model (a model with a free parameter per observation).
+Hence dev.ratio=1-dev/nulldev.}
+\item{nulldev}{Null deviance (per observation). This is defined to be
+2*(loglike_sat -loglike(Null)). The null model refers to the 0 model.}
+\item{npasses}{Total passes over the data.}
+\item{jerr}{Error flag, for warnings and errors (largely for internal
+debugging).}
+\item{offset}{A logical variable indicating whether an offset was included
+in the model.}
+\item{call}{The call that produced this object.}
+\item{nobs}{Number of observations.}
+\item{warm_fit}{If \code{save.fit=TRUE}, output of FORTRAN routine, used for
+warm starts. For internal use only.}
+\item{family}{Family used for the model, always "cox".}
+\item{converged}{A logical variable: was the algorithm judged to have
+converged?}
+\item{boundary}{A logical variable: is the fitted value on the boundary of
+the attainable values?}
+\item{obj_function}{Objective function value at the solution.}
+}
+\description{
+Fit a Cox regression model via penalized maximum likelihood for a single
+value of lambda. Can deal with (start, stop] data and strata, as well as
+sparse design matrices.
+}
+\details{
+WARNING: Users should not call \code{cox.fit} directly. Higher-level
+functions in this package call \code{cox.fit} as a subroutine. If a
+warm start object is provided, some of the other arguments in the function
+may be overridden.
+
+\code{cox.fit} solves the elastic net problem for a single, user-specified
+value of lambda.
\code{cox.fit} works for Cox regression models, including +(start, stop] data and strata. It solves the problem using iteratively +reweighted least squares (IRLS). For each IRLS iteration, \code{cox.fit} +makes a quadratic (Newton) approximation of the log-likelihood, then calls +\code{elnet.fit} to minimize the resulting approximation. + +In terms of standardization: \code{cox.fit} does not standardize \code{x} +and \code{weights}. \code{penalty.factor} is standardized so that they sum +up to \code{nvars}. +} diff -Nru r-cran-glmnet-4.0-2/man/coxgrad.Rd r-cran-glmnet-4.1/man/coxgrad.Rd --- r-cran-glmnet-4.0-2/man/coxgrad.Rd 2019-10-22 23:04:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/coxgrad.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -2,37 +2,72 @@ % Please edit documentation in R/coxgrad.R \name{coxgrad} \alias{coxgrad} -\title{compute gradient for cox model} +\alias{coxgrad2} +\alias{coxgrad3} +\title{Compute gradient for Cox model} \usage{ -coxgrad(f, time, d, w, eps = 1e-05) +coxgrad(eta, y, w, std.weights = TRUE, diag.hessian = FALSE) + +coxgrad2(eta, y, w, std.weights = TRUE, diag.hessian = FALSE) + +coxgrad3(eta, y, w, std.weights = TRUE, diag.hessian = FALSE) } \arguments{ -\item{f}{fit vector} +\item{eta}{Fit vector (usually from glmnet at a particular lambda).} -\item{time}{time vector (can have ties)} +\item{y}{Survival response variable, must be a \code{Surv} or +\code{stratifySurv} object.} -\item{d}{death/censoring indicator 1/0} +\item{w}{Observation weights (default is all equal to 1).} -\item{w}{observation weights (default equal)} +\item{std.weights}{If TRUE (default), observation weights are standardized +to sum to 1.} -\item{eps}{(default 0.00001) Breaks ties between death and censoring by making death times \code{eps} earlier} +\item{diag.hessian}{If \code{TRUE}, compute the diagonal of the Hessian +of the log partial likelihood as well. Default is \code{FALSE}.} } \value{ -a single gradient vector the same length as \code{f} +A single gradient vector the same length as \code{eta}. If +\code{diag.hessian=TRUE}, the diagonal of the Hessian is +included as an attribute "diag_hessian". } \description{ -Compute the gradient of the partial likelihood at a particular fit +Compute the gradient of the log partial likelihood at a particular fit for Cox +model. } \details{ Compute a gradient vector at the fitted vector for the log partial likelihood. -This is like a residual vector, and useful for manual screening of predictors for \code{glmnet} -in applications where \code{p} is very large (as in GWAS). Uses the Breslow approach to ties +This is like a residual vector, and useful for manual screening of +predictors for \code{glmnet} in applications where \code{p} is very large +(as in GWAS). Uses the Breslow approach to ties. + +This function is essentially a wrapper: it checks whether the response +provided is right-censored or (start, stop] survival data, and calls the +appropriate internal routine. For right-censored data it calls +\code{coxgrad2()}. For (start, stop] data, it calls \code{coxgrad3()}. 
+} +\examples{ +set.seed(1) +eta <- rnorm(10) +time <- runif(10, min = 1, max = 10) +d <- ifelse(rnorm(10) > 0, 1, 0) +y <- survival::Surv(time, d) +coxgrad(eta, y) + +# return diagonal of Hessian as well +coxgrad(eta, y, diag.hessian = TRUE) + +# example with (start, stop] data +y2 <- survival::Surv(time, time + runif(10), d) +coxgrad(eta, y2) + +# example with strata +y2 <- stratifySurv(y, rep(1:2, length.out = 10)) +coxgrad(eta, y2) + } \seealso{ \code{coxnet.deviance} } -\author{ -Trevor Hastie\cr Maintainer: Trevor Hastie \href{mailto:hastie@stanford.edu}{hastie@stanford.edu} -} \keyword{Cox} \keyword{model} diff -Nru r-cran-glmnet-4.0-2/man/coxnet.deviance.Rd r-cran-glmnet-4.1/man/coxnet.deviance.Rd --- r-cran-glmnet-4.0-2/man/coxnet.deviance.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/coxnet.deviance.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -2,39 +2,123 @@ % Please edit documentation in R/coxnet.deviance.R \name{coxnet.deviance} \alias{coxnet.deviance} -\title{compute deviance for cox model output} +\alias{coxnet.deviance0} +\alias{coxnet.deviance2} +\alias{coxnet.deviance3} +\title{Compute deviance for Cox model} \usage{ -coxnet.deviance(pred = NULL, y, x = 0, offset = NULL, - weights = NULL, beta = NULL) +coxnet.deviance( + pred = NULL, + y, + x = NULL, + offset = NULL, + weights = NULL, + std.weights = TRUE, + beta = NULL +) + +coxnet.deviance0( + pred = NULL, + y, + x = NULL, + offset = NULL, + weights = NULL, + std.weights = TRUE, + beta = NULL +) + +coxnet.deviance2( + pred = NULL, + y, + x = NULL, + offset = NULL, + weights = NULL, + std.weights = TRUE, + beta = NULL +) + +coxnet.deviance3( + pred = NULL, + y, + x = NULL, + offset = NULL, + weights = NULL, + std.weights = TRUE, + beta = NULL +) } \arguments{ -\item{pred}{matrix of predictions} +\item{pred}{Fit vector or matrix (usually from glmnet at a particular +lambda or a sequence of lambdas).} + +\item{y}{Survival response variable, must be a \code{Surv} or +\code{stratifySurv} object.} -\item{y}{a survival response matrix, as produced by \code{Surv}} +\item{x}{Optional \code{x} matrix, to be supplied if \code{pred = NULL}.} -\item{x}{optional \code{x} matrix, if \code{pred} is \code{NULL}} +\item{offset}{Optional offset vector.} -\item{offset}{optional offset} +\item{weights}{Observation weights (default is all equal to 1).} -\item{weights}{optional observation weights} +\item{std.weights}{If TRUE (default), observation weights are standardized +to sum to 1.} -\item{beta}{optional coefficient vector/matrix, supplied if \code{pred=NULL}} +\item{beta}{Optional coefficient vector/matrix, to be supplied if +\code{pred = NULL}.} } \value{ -a single or vector of deviances +A vector of deviances, one for each column of predictions. } \description{ -Given a fit or coefficients, compute the deciance (-2 log partial likelihood) for -right-censored survival data +Compute the deviance (-2 log partial likelihood) for Cox model. } \details{ -\code{coxnet.deviance} computes the deviance for a single prediction, or a matrix of predictions +Computes the deviance for a single set of predictions, or for a matrix +of predictions. The user can either supply the predictions +directly through the \code{pred} option, or by supplying the \code{x} matrix +and \code{beta} coefficients. Uses the Breslow approach to ties. + +The function first checks if \code{pred} is passed: if so, it is used as +the predictions. 
If \code{pred} is not passed but \code{x} and \code{beta} +are passed, then these values are used to compute the predictions. If +neither \code{x} nor \code{beta} are passed, then the predictions are all +taken to be 0. + +\code{coxnet.deviance()} is a wrapper: it calls \code{coxnet.deviance0()} +if the response is right-censored data, and calls \code{coxnet.deviance3()} +if the response is (start, stop] survival data. + +\code{coxnet.deviance2()} gives the same output as \code{coxnet.deviance0()} +but is written completely in R. It is not called by +\code{coxnet.deviance()}, and is kept in the package for completeness. +} +\examples{ +set.seed(1) +eta <- rnorm(10) +time <- runif(10, min = 1, max = 10) +d <- ifelse(rnorm(10) > 0, 1, 0) +y <- survival::Surv(time, d) +coxnet.deviance(pred = eta, y = y) + +# if pred not provided, it is set to zero vector +coxnet.deviance(y = y) + +# example with x and beta +x <- matrix(rnorm(10 * 3), nrow = 10) +beta <- matrix(1:3, ncol = 1) +coxnet.deviance(y = y, x = x, beta = beta) + +# example with (start, stop] data +y2 <- survival::Surv(time, time + runif(10), d) +coxnet.deviance(pred = eta, y = y2) + +# example with strata +y2 <- stratifySurv(y, rep(1:2, length.out = 10)) +coxnet.deviance(pred = eta, y = y2) + } \seealso{ \code{coxgrad} } -\author{ -Trevor Hastie\cr Maintainer: Trevor Hastie \href{mailto:hastie@stanford.edu}{hastie@stanford.edu} -} \keyword{Cox} \keyword{model} diff -Nru r-cran-glmnet-4.0-2/man/cox_obj_function.Rd r-cran-glmnet-4.1/man/cox_obj_function.Rd --- r-cran-glmnet-4.0-2/man/cox_obj_function.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/cox_obj_function.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/coxpath.R +\name{cox_obj_function} +\alias{cox_obj_function} +\title{Elastic net objective function value for Cox regression model} +\usage{ +cox_obj_function(y, pred, weights, lambda, alpha, coefficients, vp) +} +\arguments{ +\item{y}{Survival response variable, must be a \code{Surv} or +\code{stratifySurv} object.} + +\item{pred}{Model's predictions for \code{y}.} + +\item{weights}{Observation weights.} + +\item{lambda}{A single value for the \code{lambda} hyperparameter.} + +\item{alpha}{The elasticnet mixing parameter, with \eqn{0 \le \alpha \le 1}.} + +\item{coefficients}{The model's coefficients.} + +\item{vp}{Penalty factors for each of the coefficients.} +} +\description{ +Returns the elastic net objective function value for Cox regression model. +} diff -Nru r-cran-glmnet-4.0-2/man/cox.path.Rd r-cran-glmnet-4.1/man/cox.path.Rd --- r-cran-glmnet-4.0-2/man/cox.path.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/cox.path.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,134 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/coxpath.R +\name{cox.path} +\alias{cox.path} +\title{Fit a Cox regression model with elastic net regularization for a path of +lambda values} +\usage{ +cox.path( + x, + y, + weights = NULL, + offset = NULL, + alpha = 1, + nlambda = 100, + lambda.min.ratio = ifelse(nobs < nvars, 0.01, 1e-04), + lambda = NULL, + standardize = TRUE, + thresh = 1e-10, + exclude = NULL, + penalty.factor = rep(1, nvars), + lower.limits = -Inf, + upper.limits = Inf, + maxit = 1e+05, + trace.it = 0, + ... 
+) +} +\arguments{ +\item{x}{See glmnet help file} + +\item{y}{Survival response variable, must be a \code{Surv} or +\code{stratifySurv} object.} + +\item{weights}{See glmnet help file} + +\item{offset}{See glmnet help file} + +\item{alpha}{See glmnet help file} + +\item{nlambda}{See glmnet help file} + +\item{lambda.min.ratio}{See glmnet help file} + +\item{lambda}{See glmnet help file} + +\item{standardize}{See glmnet help file} + +\item{thresh}{Convergence threshold for coordinate descent. Each inner +coordinate-descent loop continues until the maximum change in the objective +after any coefficient update is less than thresh times the null deviance. +Default value is \code{1e-10}.} + +\item{exclude}{See glmnet help file} + +\item{penalty.factor}{See glmnet help file} + +\item{lower.limits}{See glmnet help file} + +\item{upper.limits}{See glmnet help file} + +\item{maxit}{See glmnet help file} + +\item{trace.it}{Controls how much information is printed to screen. Default is +\code{trace.it=0} (no information printed). If \code{trace.it=1}, a progress +bar is displayed. If \code{trace.it=2}, some information about the fitting +procedure is printed to the console as the model is being fitted.} + +\item{...}{Other arguments passed from glmnet (not used right now).} +} +\value{ +An object of class "coxnet" and "glmnet". +\item{a0}{Intercept value, \code{NULL} for "cox" family.} +\item{beta}{A \code{nvars x length(lambda)} matrix of coefficients, stored in +sparse matrix format.} +\item{df}{The number of nonzero coefficients for each value of lambda.} +\item{dim}{Dimension of coefficient matrix.} +\item{lambda}{The actual sequence of lambda values used. When alpha=0, the +largest lambda reported does not quite give the zero coefficients reported +(lambda=inf would in principle). Instead, the largest lambda for alpha=0.001 +is used, and the sequence of lambda values is derived from this.} +\item{dev.ratio}{The fraction of (null) deviance explained. The deviance +calculations incorporate weights if present in the model. The deviance is +defined to be 2*(loglike_sat - loglike), where loglike_sat is the log-likelihood +for the saturated model (a model with a free parameter per observation). +Hence dev.ratio=1-dev/nulldev.} +\item{nulldev}{Null deviance (per observation). This is defined to be +2*(loglike_sat -loglike(Null)). The null model refers to the 0 model.} +\item{npasses}{Total passes over the data summed over all lambda values.} +\item{jerr}{Error flag, for warnings and errors (largely for internal +debugging).} +\item{offset}{A logical variable indicating whether an offset was included +in the model.} +\item{call}{The call that produced this object.} +\item{nobs}{Number of observations.} +} +\description{ +Fit a Cox regression model via penalized maximum likelihood for a path of +lambda values. Can deal with (start, stop] data and strata, as well as +sparse design matrices. +} +\details{ +Sometimes the sequence is truncated before \code{nlambda} values of lambda +have been used. This happens when \code{cox.path} detects that the +decrease in deviance is marginal (i.e. we are near a saturated fit). 
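The early truncation described above is not controlled by a cox.path argument; it appears to be governed by glmnet's session-level fdev threshold (see glmnet.control). A minimal sketch, not part of this patch, of how one could request the full-length lambda sequence (assumes only that glmnet and survival are installed):

library(glmnet)
set.seed(1)
x <- matrix(rnorm(100 * 5), 100, 5)
y <- survival::Surv(rexp(100), rbinom(100, 1, 0.7))
glmnet.control(fdev = 0)             # never stop early for marginal deviance gains
fit <- glmnet(x, y, family = "cox")  # path should now use all nlambda values
length(fit$lambda)
glmnet.control(factory = TRUE)       # restore factory defaults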
+} +\examples{ +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +x <- matrix(xvec, nrow = nobs) +beta <- rnorm(nvars / 3) +fx <- x[, seq(nvars / 3)] \%*\% beta / 3 +ty <- rexp(nobs, exp(fx)) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +jsurv <- survival::Surv(ty, tcens) +fit1 <- glmnet:::cox.path(x, jsurv) + +# works with sparse x matrix +x_sparse <- Matrix::Matrix(x, sparse = TRUE) +fit2 <- glmnet:::cox.path(x_sparse, jsurv) + +# example with (start, stop] data +set.seed(2) +start_time <- runif(100, min = 0, max = 5) +stop_time <- start_time + runif(100, min = 0.1, max = 3) +status <- rbinom(n = nobs, prob = 0.3, size = 1) +jsurv_ss <- survival::Surv(start_time, stop_time, status) +fit3 <- glmnet:::cox.path(x, jsurv_ss) + +# example with strata +jsurv_ss2 <- stratifySurv(jsurv_ss, rep(1:2, each = 50)) +fit4 <- glmnet:::cox.path(x, jsurv_ss2) +} diff -Nru r-cran-glmnet-4.0-2/man/cv.glmnet.Rd r-cran-glmnet-4.1/man/cv.glmnet.Rd --- r-cran-glmnet-4.0-2/man/cv.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/cv.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -4,11 +4,24 @@ \alias{cv.glmnet} \title{Cross-validation for glmnet} \usage{ -cv.glmnet(x, y, weights = NULL, offset = NULL, lambda = NULL, - type.measure = c("default", "mse", "deviance", "class", "auc", "mae", - "C"), nfolds = 10, foldid = NULL, alignment = c("lambda", - "fraction"), grouped = TRUE, keep = FALSE, parallel = FALSE, - gamma = c(0, 0.25, 0.5, 0.75, 1), relax = FALSE, trace.it = 0, ...) +cv.glmnet( + x, + y, + weights = NULL, + offset = NULL, + lambda = NULL, + type.measure = c("default", "mse", "deviance", "class", "auc", "mae", "C"), + nfolds = 10, + foldid = NULL, + alignment = c("lambda", "fraction"), + grouped = TRUE, + keep = FALSE, + parallel = FALSE, + gamma = c(0, 0.25, 0.5, 0.75, 1), + relax = FALSE, + trace.it = 0, + ... +) } \arguments{ \item{x}{\code{x} matrix as in \code{glmnet}.} @@ -112,9 +125,10 @@ \code{keep=TRUE}, this is the array of prevalidated fits. Some entries can be \code{NA}, if that and subsequent values of \code{lambda} are not reached for that fold} \item{foldid}{if \code{keep=TRUE}, the fold assignments used} +\item{index}{a one column matrix with the indices of \code{lambda.min} and \code{lambda.1se} in the sequence of coefficients, fits etc.} \item{relaxed}{if \code{relax=TRUE}, this additional item has the CV info for each of the mixed fits. In particular it also selects \code{lambda, -gamma} pairs corresponding to the 1SE rule, as well as the minimum error.} +gamma} pairs corresponding to the 1se rule, as well as the minimum error. It also has a component \code{index}, a two-column matrix which contains the \code{lambda} and \code{gamma} indices corresponding to the "min" and "1se" solutions.} } \description{ Does k-fold cross-validation for glmnet, produces a plot, and returns a @@ -137,7 +151,7 @@ If \code{relax=TRUE} then the values of \code{gamma} are used to mix the fits. If \eqn{\eta} is the fit for lasso/elastic net, and \eqn{\eta_R} is the relaxed fit (with unpenalized coefficients), then a relaxed fit mixed by -\eqn{\gamma} is \deqn{\eta(\gamma)=(1-\gamma)\eta_R+\gamma\eta}. There is +\eqn{\gamma} is \deqn{\eta(\gamma)=(1-\gamma)\eta_R+\gamma\eta.} There is practically no extra cost for having a lot of values for \code{gamma}. However, 5 seems sufficient for most purposes. CV then selects both \code{gamma} and \code{lambda}. 
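The mixing formula above can be verified directly from a relaxed fit. A short sketch (not part of this patch) that blends the gamma=1 (penalized) and gamma=0 (relaxed) predictions by hand and compares against predict() at the same gamma:

library(glmnet)
set.seed(1)
x <- matrix(rnorm(100 * 10), 100, 10)
y <- rnorm(100)
fit <- glmnet(x, y, relax = TRUE)
s <- fit$lambda[10]
gamma <- 0.5
eta   <- predict(fit, newx = x, s = s, gamma = 1)  # penalized fit
eta_R <- predict(fit, newx = x, s = s, gamma = 0)  # unpenalized (relaxed) refit
mixed <- (1 - gamma) * eta_R + gamma * eta
all.equal(mixed, predict(fit, newx = x, s = s, gamma = gamma))  # TRUE, up to numerical error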
diff -Nru r-cran-glmnet-4.0-2/man/elnet.fit.Rd r-cran-glmnet-4.1/man/elnet.fit.Rd
--- r-cran-glmnet-4.0-2/man/elnet.fit.Rd 2020-06-14 21:35:39.000000000 +0000
+++ r-cran-glmnet-4.1/man/elnet.fit.Rd 2021-01-06 22:06:54.000000000 +0000
@@ -4,10 +4,23 @@ \alias{elnet.fit} \title{Solve weighted least squares (WLS) problem for a single lambda value} \usage{
-elnet.fit(x, y, weights, lambda, alpha = 1, intercept = TRUE,
- thresh = 1e-07, maxit = 1e+05, penalty.factor = rep(1, nvars),
- exclude = c(), lower.limits = -Inf, upper.limits = Inf,
- warm = NULL, from.glmnet.fit = FALSE, save.fit = FALSE)
+elnet.fit( + x, + y, + weights, + lambda, + alpha = 1, + intercept = TRUE, + thresh = 1e-07, + maxit = 1e+05, + penalty.factor = rep(1, nvars), + exclude = c(), + lower.limits = -Inf, + upper.limits = Inf, + warm = NULL, + from.glmnet.fit = FALSE, + save.fit = FALSE +)
} \arguments{ \item{x}{Input matrix, of dimension \code{nobs x nvars}; each row is an
diff -Nru r-cran-glmnet-4.0-2/man/fid.Rd r-cran-glmnet-4.1/man/fid.Rd
--- r-cran-glmnet-4.0-2/man/fid.Rd 1970-01-01 00:00:00.000000000 +0000
+++ r-cran-glmnet-4.1/man/fid.Rd 2021-01-06 22:06:54.000000000 +0000
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/coxgrad.R
+\name{fid} +\alias{fid} +\title{Helper function for Cox deviance and gradient} +\usage{ +fid(x, index) +} +\arguments{
+\item{x}{Sorted vector of death times.}
+
+\item{index}{Vector of indices for the death times.}
+} +\value{
+A list with two arguments.
+\item{index_first}{A vector of indices for the first observation at each
+death time as they appear in the sorted list.}
+\item{index_ties}{If there are no ties at all, this is NULL. If not, this is
+a list with length equal to the number of unique times with ties. For each
+time with ties, index_ties gives the indices of the observations with a
+death at that time.}
+} +\description{ +Helps to find ties in death times of data. +} +\examples{
+# Example with no ties
+glmnet:::fid(c(1, 4, 5, 6), 1:4)
+
+# Example with ties
+glmnet:::fid(c(1, 1, 1, 2, 3, 3, 4, 4, 4), 1:9)
+}
diff -Nru r-cran-glmnet-4.0-2/man/get_cox_lambda_max.Rd r-cran-glmnet-4.1/man/get_cox_lambda_max.Rd
--- r-cran-glmnet-4.0-2/man/get_cox_lambda_max.Rd 1970-01-01 00:00:00.000000000 +0000
+++ r-cran-glmnet-4.1/man/get_cox_lambda_max.Rd 2021-01-06 22:06:54.000000000 +0000
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/coxpath.R
+\name{get_cox_lambda_max} +\alias{get_cox_lambda_max} +\title{Get lambda max for Cox regression model} +\usage{
+get_cox_lambda_max( + x, + y, + alpha, + weights = rep(1, nrow(x)), + offset = rep(0, nrow(x)), + exclude = c(), + vp = rep(1, ncol(x)) +)
+} +\arguments{
+\item{x}{Input matrix, of dimension \code{nobs x nvars}; each row is an
+observation vector. If it is a sparse matrix, it is assumed to be unstandardized.
+It should have attributes \code{xm} and \code{xs}, where \code{xm(j)} and
+\code{xs(j)} are the centering and scaling factors for variable j respectively.
+If it is not a sparse matrix, it is assumed to be standardized.}
+
+\item{y}{Survival response variable, must be a \code{Surv} or
+\code{stratifySurv} object.}
+
+\item{alpha}{The elasticnet mixing parameter, with \eqn{0 \le \alpha \le 1}.}
+
+\item{weights}{Observation weights.}
+
+\item{offset}{Offset for the model.
Default is a zero vector of length
+\code{nrow(y)}.}
+
+\item{exclude}{Indices of variables to be excluded from the model.}
+
+\item{vp}{Separate penalty factors can be applied to each coefficient.}
+} +\description{
+Return the lambda max value for Cox regression model, used for computing
+initial lambda values. For internal use only.
+} +\details{
+This function is called by \code{cox.path} for the value of lambda max.
+
+When \code{x} is not sparse, it is expected to already be centered and scaled.
+When \code{x} is sparse, the function will get its attributes \code{xm} and
+\code{xs} for its centering and scaling factors. The value of
+\code{lambda_max} changes depending on whether \code{x} is centered and
+scaled or not, so we need \code{xm} and \code{xs} to get the correct value.
+}
diff -Nru r-cran-glmnet-4.0-2/man/get_start.Rd r-cran-glmnet-4.1/man/get_start.Rd
--- r-cran-glmnet-4.0-2/man/get_start.Rd 2020-06-14 21:35:39.000000000 +0000
+++ r-cran-glmnet-4.1/man/get_start.Rd 2021-01-06 22:06:54.000000000 +0000
@@ -4,8 +4,18 @@ \alias{get_start} \title{Get null deviance, starting mu and lambda max} \usage{
-get_start(x, y, weights, family, intercept, is.offset, offset, exclude, vp,
- alpha)
+get_start( + x, + y, + weights, + family, + intercept, + is.offset, + offset, + exclude, + vp, + alpha +)
} \arguments{ \item{x}{Input matrix, of dimension \code{nobs x nvars}; each row is an
diff -Nru r-cran-glmnet-4.0-2/man/glmnet.control.Rd r-cran-glmnet-4.1/man/glmnet.control.Rd
--- r-cran-glmnet-4.0-2/man/glmnet.control.Rd 2020-06-14 21:35:39.000000000 +0000
+++ r-cran-glmnet-4.1/man/glmnet.control.Rd 2021-01-06 22:06:54.000000000 +0000
@@ -4,10 +4,21 @@ \alias{glmnet.control} \title{internal glmnet parameters} \usage{
-glmnet.control(fdev = 1e-05, devmax = 0.999, eps = 1e-06,
- big = 9.9e+35, mnlam = 5, pmin = 1e-09, exmx = 250,
- prec = 1e-10, mxit = 100, itrace = 0, epsnr = 1e-08,
- mxitnr = 25, factory = FALSE)
+glmnet.control( + fdev = 1e-05, + devmax = 0.999, + eps = 1e-06, + big = 9.9e+35, + mnlam = 5, + pmin = 1e-09, + exmx = 250, + prec = 1e-10, + mxit = 100, + itrace = 0, + epsnr = 1e-06, + mxitnr = 25, + factory = FALSE +)
} \arguments{ \item{fdev}{minimum fractional change in deviance for stopping path; factory
@@ -40,7 +51,7 @@ and \code{cv.glmnet}. factory default = 0}
\item{epsnr}{convergence threshold for \code{glmnet.fit}. factory default =
-1.0e-8}
+1.0e-6}
\item{mxitnr}{maximum iterations for the IRLS loop in \code{glmnet.fit}.
factory default = 25} diff -Nru r-cran-glmnet-4.0-2/man/glmnet.fit.Rd r-cran-glmnet-4.1/man/glmnet.fit.Rd --- r-cran-glmnet-4.0-2/man/glmnet.fit.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/glmnet.fit.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -4,11 +4,26 @@ \alias{glmnet.fit} \title{Fit a GLM with elastic net regularization for a single value of lambda} \usage{ -glmnet.fit(x, y, weights, lambda, alpha = 1, offset = rep(0, nobs), - family = gaussian(), intercept = TRUE, thresh = 1e-10, - maxit = 1e+05, penalty.factor = rep(1, nvars), exclude = c(), - lower.limits = -Inf, upper.limits = Inf, warm = NULL, - from.glmnet.path = FALSE, save.fit = FALSE, trace.it = 0) +glmnet.fit( + x, + y, + weights, + lambda, + alpha = 1, + offset = rep(0, nobs), + family = gaussian(), + intercept = TRUE, + thresh = 1e-10, + maxit = 1e+05, + penalty.factor = rep(1, nvars), + exclude = c(), + lower.limits = -Inf, + upper.limits = Inf, + warm = NULL, + from.glmnet.path = FALSE, + save.fit = FALSE, + trace.it = 0 +) } \arguments{ \item{x}{Input matrix, of dimension \code{nobs x nvars}; each row is an diff -Nru r-cran-glmnet-4.0-2/man/glmnet-internal.Rd r-cran-glmnet-4.1/man/glmnet-internal.Rd --- r-cran-glmnet-4.0-2/man/glmnet-internal.Rd 2019-10-24 18:26:56.000000000 +0000 +++ r-cran-glmnet-4.1/man/glmnet-internal.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -10,7 +10,6 @@ \alias{cvcompute} \alias{getcoef} \alias{getcoef.multinomial} -\alias{response.coxnet} \alias{fix.lam} \alias{error.bars} \alias{getmin} diff -Nru r-cran-glmnet-4.0-2/man/glmnet.measures.Rd r-cran-glmnet-4.1/man/glmnet.measures.Rd --- r-cran-glmnet-4.0-2/man/glmnet.measures.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/glmnet.measures.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -4,8 +4,10 @@ \alias{glmnet.measures} \title{Display the names of the measures used in CV for different "glmnet" families} \usage{ -glmnet.measures(family = c("all", "gaussian", "binomial", "poisson", - "multinomial", "cox", "mgaussian", "GLM")) +glmnet.measures( + family = c("all", "gaussian", "binomial", "poisson", "multinomial", "cox", + "mgaussian", "GLM") +) } \arguments{ \item{family}{If a "glmnet" family is supplied, a list of the names of diff -Nru r-cran-glmnet-4.0-2/man/glmnet.path.Rd r-cran-glmnet-4.1/man/glmnet.path.Rd --- r-cran-glmnet-4.0-2/man/glmnet.path.Rd 2020-06-14 23:14:20.000000000 +0000 +++ r-cran-glmnet-4.1/man/glmnet.path.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -4,12 +4,26 @@ \alias{glmnet.path} \title{Fit a GLM with elastic net regularization for a path of lambda values} \usage{ -glmnet.path(x, y, weights = NULL, lambda = NULL, nlambda = 100, - lambda.min.ratio = ifelse(nobs < nvars, 0.01, 1e-04), alpha = 1, - offset = NULL, family = gaussian(), standardize = TRUE, - intercept = TRUE, thresh = 1e-10, maxit = 1e+05, - penalty.factor = rep(1, nvars), exclude = integer(0), - lower.limits = -Inf, upper.limits = Inf, trace.it = 0) +glmnet.path( + x, + y, + weights = NULL, + lambda = NULL, + nlambda = 100, + lambda.min.ratio = ifelse(nobs < nvars, 0.01, 1e-04), + alpha = 1, + offset = NULL, + family = gaussian(), + standardize = TRUE, + intercept = TRUE, + thresh = 1e-10, + maxit = 1e+05, + penalty.factor = rep(1, nvars), + exclude = integer(0), + lower.limits = -Inf, + upper.limits = Inf, + trace.it = 0 +) } \arguments{ \item{x}{Input matrix, of dimension \code{nobs x nvars}; each row is an diff -Nru r-cran-glmnet-4.0-2/man/glmnet.Rd r-cran-glmnet-4.1/man/glmnet.Rd --- r-cran-glmnet-4.0-2/man/glmnet.Rd 2020-06-14 
21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -5,26 +5,41 @@ \alias{relax.glmnet} \title{fit a GLM with lasso or elasticnet regularization} \usage{ -glmnet(x, y, family = c("gaussian", "binomial", "poisson", "multinomial", - "cox", "mgaussian"), weights = NULL, offset = NULL, alpha = 1, - nlambda = 100, lambda.min.ratio = ifelse(nobs < nvars, 0.01, 1e-04), - lambda = NULL, standardize = TRUE, intercept = TRUE, - thresh = 1e-07, dfmax = nvars + 1, pmax = min(dfmax * 2 + 20, - nvars), exclude = NULL, penalty.factor = rep(1, nvars), - lower.limits = -Inf, upper.limits = Inf, maxit = 1e+05, +glmnet( + x, + y, + family = c("gaussian", "binomial", "poisson", "multinomial", "cox", "mgaussian"), + weights = NULL, + offset = NULL, + alpha = 1, + nlambda = 100, + lambda.min.ratio = ifelse(nobs < nvars, 0.01, 1e-04), + lambda = NULL, + standardize = TRUE, + intercept = TRUE, + thresh = 1e-07, + dfmax = nvars + 1, + pmax = min(dfmax * 2 + 20, nvars), + exclude = NULL, + penalty.factor = rep(1, nvars), + lower.limits = -Inf, + upper.limits = Inf, + maxit = 1e+05, type.gaussian = ifelse(nvars < 500, "covariance", "naive"), type.logistic = c("Newton", "modified.Newton"), - standardize.response = FALSE, type.multinomial = c("ungrouped", - "grouped"), relax = FALSE, trace.it = 0, ...) + standardize.response = FALSE, + type.multinomial = c("ungrouped", "grouped"), + relax = FALSE, + trace.it = 0, + ... +) -relax.glmnet(fit, x, ..., maxp = n - 3, path = FALSE, - check.args = TRUE) +relax.glmnet(fit, x, ..., maxp = n - 3, path = FALSE, check.args = TRUE) } \arguments{ \item{x}{input matrix, of dimension nobs x nvars; each row is an observation vector. Can be in sparse matrix format (inherit from class -\code{"sparseMatrix"} as in package \code{Matrix}; not yet available for -\code{family="cox"})} +\code{"sparseMatrix"} as in package \code{Matrix})} \item{y}{response variable. Quantitative for \code{family="gaussian"}, or \code{family="poisson"} (non-negative counts). For \code{family="binomial"} @@ -32,17 +47,17 @@ or proportions (the second column is treated as the target class; for a factor, the last level in alphabetical order is the target class). For \code{family="multinomial"}, can be a \code{nc>=2} level factor, or a matrix -with \code{nc} columns of counts or proportions. For either +with \code{nc} columns of counts or proportions. For either \code{"binomial"} or \code{"multinomial"}, if \code{y} is presented as a -vector, it will be coerced into a factor. For \code{family="cox"}, \code{y} -should be a two-column matrix with columns named 'time' and 'status'. The -latter is a binary variable, with '1' indicating death, and '0' indicating -right censored. The function \code{Surv()} in package \pkg{survival} -produces such a matrix. For \code{family="mgaussian"}, \code{y} is a matrix +vector, it will be coerced into a factor. For \code{family="cox"}, preferably +a \code{Surv} object from the survival package: see Details section for +more information. For \code{family="mgaussian"}, \code{y} is a matrix of quantitative responses.} -\item{family}{Response type (see above). Either a character string representing -one of the built-in families, or else a \code{glm()} family object.} +\item{family}{Either a character string representing +one of the built-in families, or else a \code{glm()} family object. For more +information, see Details section below or the documentation for response +type (above).} \item{weights}{observation weights. 
Can be total counts if responses are proportion matrices. Default is 1 for each observation}
@@ -218,8 +233,22 @@ descent. For \code{family="gaussian"} this is the lasso sequence if \code{alpha=1}, else it is the elasticnet sequence.
+The objective function for \code{"gaussian"} is \deqn{1/2 RSS/nobs +
+\lambda*penalty,} and for the other models it is \deqn{-loglik/nobs +
+\lambda*penalty.} Note also that for \code{"gaussian"}, \code{glmnet}
+standardizes y to have unit variance (using 1/n rather than 1/(n-1) formula)
+before computing its lambda sequence (and then unstandardizes the resulting
+coefficients); if you wish to reproduce/compare results with other software,
+best to supply a standardized y. The coefficients for any predictor
+variables with zero variance are set to zero for all values of lambda.
+\subsection{Details on \code{family} option}{
+ From version 4.0 onwards, glmnet supports both the original built-in families, as well as \emph{any} family object as used by \code{stats::glm()}.
+This opens the door to a wide variety of additional models. For example
+\code{family=binomial(link=cloglog)} or \code{family=negative.binomial(theta=1.5)} (from the MASS library).
+Note that the code runs faster for the built-in families.
+ The built-in families are specified via a character string. For all families, the object produced is a lasso or elasticnet regularization path for fitting the generalized linear regression paths, by maximizing the appropriate penalized
@@ -233,16 +262,8 @@ penalties take care of redundancies. A two-class \code{"multinomial"} model will produce the same fit as the corresponding \code{"binomial"} model, except the pair of coefficient matrices will be equal in magnitude and
-opposite in sign, and half the \code{"binomial"} values. Note that the
-objective function for \code{"gaussian"} is \deqn{1/2 RSS/nobs +
-\lambda*penalty,} and for the other models it is \deqn{-loglik/nobs +
-\lambda*penalty.} Note also that for \code{"gaussian"}, \code{glmnet}
-standardizes y to have unit variance (using 1/n rather than 1/(n-1) formula)
-before computing its lambda sequence (and then unstandardizes the resulting
-coefficients); if you wish to reproduce/compare results with other software,
-best to supply a standardized y. The coefficients for any predictor
-variables with zero variance are set to zero for all values of lambda.
-Two useful additional families are the \code{family="mgaussian"} family and
+opposite in sign, and half the \code{"binomial"} values.
+Two useful additional families are the \code{family="mgaussian"} family and
the \code{type.multinomial="grouped"} option for multinomial fitting. The former allows a multi-response gaussian model to be fit, using a "group -lasso" penalty on the coefficients for each variable. Tying the responses
@@ -257,12 +278,23 @@ \emph{before} the death times in computing the Breslow approximation; if users prefer the usual convention of \emph{after}, they can add a small number to all censoring times to achieve this effect.
+}
-Version 4.0 and later allows for the family argument to be a S3 class \code{"family"} object
-(a list of functions and expressions).
-This opens the door to a wide variety of additional models. For example
-\code{family=binomial(link=cloglog)} or \code{family=negative.binomial(theta=1.5)} (from the MASS library).
-Note that the code runs faster for the built-in families.
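A concrete illustration of the family option described above (a sketch, not part of this patch; the cloglog link is the example the text itself names):

library(glmnet)
set.seed(1)
x <- matrix(rnorm(200 * 5), 200, 5)
y <- rbinom(200, 1, 0.5)
fit_fast <- glmnet(x, y, family = "binomial")                  # built-in family, fast path
fit_flex <- glmnet(x, y, family = binomial(link = "cloglog"))  # glm family object, slower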
+\subsection{Details on response for \code{family="cox"}}{ + +For Cox models, the response should preferably be a \code{Surv} object, +created by the \code{Surv()} function in \pkg{survival} package. For +right-censored data, this object should have type "right", and for +(start, stop] data, it should have type "counting". To fit stratified Cox +models, strata should be added to the response via the \code{stratifySurv()} +function before passing the response to \code{glmnet()}. (For backward +compatibility, right-censored data can also be passed as a +two-column matrix with columns named 'time' and 'status'. The +latter is a binary variable, with '1' indicating death, and '0' indicating +right censored.) +} + +\subsection{Details on \code{relax} option}{ If \code{relax=TRUE} a duplicate sequence of models is produced, where each active set in the @@ -274,6 +306,7 @@ supply the fit, and all the original arguments used to create that fit. They can limit the length of the relaxed path via 'maxp'. } +} \examples{ # Gaussian @@ -332,6 +365,22 @@ fit = glmnet(x, y, family = "cox") plot(fit) +# Cox example with (start, stop] data +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +x <- matrix(xvec, nrow = nobs) +start_time <- runif(100, min = 0, max = 5) +stop_time <- start_time + runif(100, min = 0.1, max = 3) +status <- rbinom(n = nobs, prob = 0.3, size = 1) +jsurv_ss <- survival::Surv(start_time, stop_time, status) +fit <- glmnet(x, jsurv_ss, family = "cox") + +# Cox example with strata +jsurv_ss2 <- stratifySurv(jsurv_ss, rep(1:2, each = 50)) +fit <- glmnet(x, jsurv_ss2, family = "cox") + # Sparse n = 10000 p = 200 @@ -355,22 +404,21 @@ \references{ Friedman, J., Hastie, T. and Tibshirani, R. (2008) \emph{Regularization Paths for Generalized Linear Models via Coordinate -Descent}, \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}\cr -\emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr -\url{https://www.jstatsoft.org/v33/i01/}\cr Simon, N., Friedman, J., Hastie, -T., Tibshirani, R. (2011) \emph{Regularization Paths for Cox's Proportional +Descent (2010), Journal of Statistical Software, Vol. 33(1), 1-22}, +\url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}.\cr +Simon, N., Friedman, J., Hastie, T. and Tibshirani, R. (2011) +\emph{Regularization Paths for Cox's Proportional Hazards Model via Coordinate Descent, Journal of Statistical Software, Vol. -39(5) 1-13}\cr \url{https://www.jstatsoft.org/v39/i05/}\cr Tibshirani, +39(5), 1-13}, \url{https://www.jstatsoft.org/v39/i05/}.\cr Tibshirani, Robert, Bien, J., Friedman, J., Hastie, T.,Simon, N.,Taylor, J. and Tibshirani, Ryan. (2012) \emph{Strong Rules for Discarding Predictors in -Lasso-type Problems, JRSSB vol 74},\cr -\url{https://statweb.stanford.edu/~tibs/ftp/strong.pdf}\cr \emph{Stanford -Statistics Technical Report}\cr \url{https://arxiv.org/abs/1707.08692}\cr -Hastie, T., Tibshirani, Robert, Tibshirani, Ryan (2019) \emph{Extended +Lasso-type Problems, JRSSB, Vol. 74(2), 245-266}, +\url{https://statweb.stanford.edu/~tibs/ftp/strong.pdf}.\cr +Hastie, T., Tibshirani, Robert and Tibshirani, Ryan. 
\emph{Extended Comparisons of Best Subset Selection, Forward Stepwise Selection, and the -Lasso}\cr -\emph{Glmnet webpage with four vignettes} -\url{https://glmnet.stanford.edu} +Lasso (2017), Stanford Statistics Technical Report}, +\url{https://arxiv.org/abs/1707.08692}.\cr +Glmnet webpage with four vignettes, \url{https://glmnet.stanford.edu}. } \seealso{ \code{print}, \code{predict}, \code{coef} and \code{plot} methods, @@ -378,7 +426,7 @@ } \author{ Jerome Friedman, Trevor Hastie, Balasubramanian Narasimhan, Noah -Simon and Rob Tibshirani\cr Maintainer: Trevor Hastie +Simon, Kenneth Tay and Rob Tibshirani\cr Maintainer: Trevor Hastie \email{hastie@stanford.edu} } \keyword{models} diff -Nru r-cran-glmnet-4.0-2/man/mycoxph.Rd r-cran-glmnet-4.1/man/mycoxph.Rd --- r-cran-glmnet-4.0-2/man/mycoxph.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/mycoxph.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/survfit.coxnet.R +\name{mycoxph} +\alias{mycoxph} +\title{Helper function to fit coxph model for survfit.coxnet} +\usage{ +mycoxph(object, s, ...) +} +\arguments{ +\item{object}{A class \code{coxnet} object.} + +\item{s}{The value of the penalty parameter lambda at which the survival +curve is required.} + +\item{...}{The same ... that was passed to survfit.coxnet.} +} +\description{ +This function constructs the coxph call needed to run the "hack" of +coxph with 0 iterations. It's a separate function as we have to deal with +function options like strata, offset and observation weights. +} diff -Nru r-cran-glmnet-4.0-2/man/mycoxpred.Rd r-cran-glmnet-4.1/man/mycoxpred.Rd --- r-cran-glmnet-4.0-2/man/mycoxpred.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/mycoxpred.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/survfit.coxnet.R +\name{mycoxpred} +\alias{mycoxpred} +\title{Helper function to amend ... for new data in survfit.coxnet} +\usage{ +mycoxpred(object, s, ...) +} +\arguments{ +\item{object}{A class \code{coxnet} object.} + +\item{s}{The response for the fitted model.} + +\item{...}{The same ... that was passed to survfit.coxnet.} +} +\description{ +This function amends the function arguments passed to survfit.coxnet +via ... if new data was passed to survfit.coxnet. It's a separate +function as we have to deal with function options like newstrata +and newoffset. +} diff -Nru r-cran-glmnet-4.0-2/man/plot.glmnet.Rd r-cran-glmnet-4.1/man/plot.glmnet.Rd --- r-cran-glmnet-4.0-2/man/plot.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/plot.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -8,17 +8,25 @@ \alias{plot.relaxed} \title{plot coefficients from a "glmnet" object} \usage{ -\method{plot}{glmnet}(x, xvar = c("norm", "lambda", "dev"), - label = FALSE, ...) +\method{plot}{glmnet}(x, xvar = c("norm", "lambda", "dev"), label = FALSE, ...) -\method{plot}{mrelnet}(x, xvar = c("norm", "lambda", "dev"), - label = FALSE, type.coef = c("coef", "2norm"), ...) +\method{plot}{mrelnet}( + x, + xvar = c("norm", "lambda", "dev"), + label = FALSE, + type.coef = c("coef", "2norm"), + ... +) -\method{plot}{multnet}(x, xvar = c("norm", "lambda", "dev"), - label = FALSE, type.coef = c("coef", "2norm"), ...) +\method{plot}{multnet}( + x, + xvar = c("norm", "lambda", "dev"), + label = FALSE, + type.coef = c("coef", "2norm"), + ... 
+) -\method{plot}{relaxed}(x, xvar = c("lambda", "dev"), label = FALSE, - gamma = 1, ...) +\method{plot}{relaxed}(x, xvar = c("lambda", "dev"), label = FALSE, gamma = 1, ...) } \arguments{ \item{x}{fitted \code{"glmnet"} model} diff -Nru r-cran-glmnet-4.0-2/man/predict.cv.glmnet.Rd r-cran-glmnet-4.1/man/predict.cv.glmnet.Rd --- r-cran-glmnet-4.0-2/man/predict.cv.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/predict.cv.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -7,11 +7,15 @@ \alias{predict.cv.relaxed} \title{make predictions from a "cv.glmnet" object.} \usage{ -\method{predict}{cv.glmnet}(object, newx, s = c("lambda.1se", - "lambda.min"), ...) +\method{predict}{cv.glmnet}(object, newx, s = c("lambda.1se", "lambda.min"), ...) -\method{predict}{cv.relaxed}(object, newx, s = c("lambda.1se", - "lambda.min"), gamma = c("gamma.1se", "gamma.min"), ...) +\method{predict}{cv.relaxed}( + object, + newx, + s = c("lambda.1se", "lambda.min"), + gamma = c("gamma.1se", "gamma.min"), + ... +) } \arguments{ \item{object}{Fitted \code{"cv.glmnet"} or \code{"cv.relaxed"} object.} diff -Nru r-cran-glmnet-4.0-2/man/predict.glmnetfit.Rd r-cran-glmnet-4.1/man/predict.glmnetfit.Rd --- r-cran-glmnet-4.0-2/man/predict.glmnetfit.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/predict.glmnetfit.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -4,8 +4,15 @@ \alias{predict.glmnetfit} \title{Get predictions from a \code{glmnetfit} fit object} \usage{ -\method{predict}{glmnetfit}(object, newx, s = NULL, type = c("link", - "response", "coefficients", "nonzero"), exact = FALSE, newoffset, ...) +\method{predict}{glmnetfit}( + object, + newx, + s = NULL, + type = c("link", "response", "coefficients", "nonzero"), + exact = FALSE, + newoffset, + ... +) } \arguments{ \item{object}{Fitted "glmnetfit" object.} diff -Nru r-cran-glmnet-4.0-2/man/predict.glmnet.Rd r-cran-glmnet-4.1/man/predict.glmnet.Rd --- r-cran-glmnet-4.0-2/man/predict.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/predict.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -16,13 +16,26 @@ \usage{ \method{coef}{glmnet}(object, s = NULL, exact = FALSE, ...) -\method{predict}{glmnet}(object, newx, s = NULL, type = c("link", - "response", "coefficients", "nonzero", "class"), exact = FALSE, - newoffset, ...) +\method{predict}{glmnet}( + object, + newx, + s = NULL, + type = c("link", "response", "coefficients", "nonzero", "class"), + exact = FALSE, + newoffset, + ... +) -\method{predict}{relaxed}(object, newx, s = NULL, gamma = 1, +\method{predict}{relaxed}( + object, + newx, + s = NULL, + gamma = 1, type = c("link", "response", "coefficients", "nonzero", "class"), - exact = FALSE, newoffset, ...) + exact = FALSE, + newoffset, + ... +) } \arguments{ \item{object}{Fitted \code{"glmnet"} model object or a \code{"relaxed"} diff -Nru r-cran-glmnet-4.0-2/man/print.cv.glmnet.Rd r-cran-glmnet-4.1/man/print.cv.glmnet.Rd --- r-cran-glmnet-4.0-2/man/print.cv.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/print.cv.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -5,8 +5,7 @@ \alias{print.cv.relaxed} \title{print a cross-validated glmnet object} \usage{ -\method{print}{cv.glmnet}(x, digits = max(3, getOption("digits") - 3), - ...) +\method{print}{cv.glmnet}(x, digits = max(3, getOption("digits") - 3), ...) 
} \arguments{ \item{x}{fitted 'cv.glmnet' object} diff -Nru r-cran-glmnet-4.0-2/man/print.glmnet.Rd r-cran-glmnet-4.1/man/print.glmnet.Rd --- r-cran-glmnet-4.0-2/man/print.glmnet.Rd 2020-06-14 21:35:39.000000000 +0000 +++ r-cran-glmnet-4.1/man/print.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -23,11 +23,11 @@ } \details{ The call that produced the object \code{x} is printed, followed by a -three-column matrix with columns \code{Df}, \code{\%Dev} and \code{Lambda}. +three-column matrix with columns \code{Df}, \verb{\%Dev} and \code{Lambda}. The \code{Df} column is the number of nonzero coefficients (Df is a -reasonable name only for lasso fits). \code{\%Dev} is the percent deviance +reasonable name only for lasso fits). \verb{\%Dev} is the percent deviance explained (relative to the null deviance). In the case of a 'relaxed' fit, -an additional column is inserted, \code{\%Dev R} which gives the percent +an additional column is inserted, \verb{\%Dev R} which gives the percent deviance explained by the relaxed model. For a "bigGlm" model, a simpler summary is printed. } diff -Nru r-cran-glmnet-4.0-2/man/response.coxnet.Rd r-cran-glmnet-4.1/man/response.coxnet.Rd --- r-cran-glmnet-4.0-2/man/response.coxnet.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/response.coxnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/response.coxnet.R +\name{response.coxnet} +\alias{response.coxnet} +\title{Make response for coxnet} +\usage{ +response.coxnet(y) +} +\arguments{ +\item{y}{Response variable. Either a class "Surv" object or a two-column +matrix with columns named 'time' and 'status'.} +} +\value{ +A class "Surv" object. +} +\description{ +Internal function to make the response y passed to glmnet suitable +for coxnet (i.e. glmnet with family = "cox"). Sanity checks are performed +here too. +} +\details{ +If y is a class "Surv" object, this function returns y with no changes. If +y is a two-column matrix with columns named 'time' and 'status', it is +converted into a "Surv" object. +} diff -Nru r-cran-glmnet-4.0-2/man/stratifySurv.Rd r-cran-glmnet-4.1/man/stratifySurv.Rd --- r-cran-glmnet-4.0-2/man/stratifySurv.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/stratifySurv.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stratifySurv.R +\name{stratifySurv} +\alias{stratifySurv} +\title{Add strata to a Surv object} +\usage{ +stratifySurv(y, strata) +} +\arguments{ +\item{y}{A Surv object.} + +\item{strata}{A vector of length equal to the number of observations in +y, indicating strata membership.} +} +\value{ +An object of class \code{stratifySurv} (in addition to all the +classes \code{y} belonged to). +} +\description{ +Helper function to add strata as an attribute to a Surv object. The +output of this function can be used as the response in \code{glmnet()} +for fitting stratified Cox models. +} +\details{ +When fitting a stratified Cox model with \code{glmnet()}, strata should +be added to a \code{Surv} response with this helper function. Note that +it is not sufficient to add strata as an attribute to the \code{Surv} +response manually: if the result does not have class \code{stratifySurv}, +subsetting of the response will not work properly. 
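The point above about manual strata attributes can be made concrete. A short sketch (not part of this patch) contrasting the two approaches:

library(survival)
library(glmnet)
y <- Surv(1:10, rep(0:1, length.out = 10))
strata <- rep(1:3, length.out = 10)
y_good <- stratifySurv(y, strata)
class(y_good)   # includes "stratifySurv", so y_good[1:5] keeps its strata
y_bad <- y
attr(y_bad, "strata") <- strata   # plain Surv object with a bolted-on attribute
# subsetting y_bad (e.g. y_bad[1:5]) does not subset the strata attribute,
# which is exactly the failure mode the details above warn about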
+} +\examples{ +y <- survival::Surv(1:10, rep(0:1, length.out = 10)) +strata <- rep(1:3, length.out = 10) +y2 <- stratifySurv(y, strata) # returns stratifySurv object + +} diff -Nru r-cran-glmnet-4.0-2/man/survfit.coxnet.Rd r-cran-glmnet-4.1/man/survfit.coxnet.Rd --- r-cran-glmnet-4.0-2/man/survfit.coxnet.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/survfit.coxnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/survfit.coxnet.R +\name{survfit.coxnet} +\alias{survfit.coxnet} +\title{Compute a survival curve from a coxnet object} +\usage{ +\method{survfit}{coxnet}(formula, s = NULL, ...) +} +\arguments{ +\item{formula}{A class \code{coxnet} object.} + +\item{s}{Value(s) of the penalty parameter lambda at which the survival +curve is required. Default is the entire sequence used to create the model. +However, it is recommended that \code{survfit.coxnet} is called for +a single penalty parameter.} + +\item{...}{This is the mechanism for passing additional arguments like +(i) x= and y= for the x and y used to fit the model, +(ii) weights= and offset= when the model was fit with these options, +(iii) arguments for new data (newx, newoffset, newstrata), and +(iv) arguments to be passed to survfit.coxph().} +} +\value{ +If \code{s} is a single value, an object of class "survfitcox" +and "survfit" containing one or more survival curves. Otherwise, a list +of such objects, one element for each value in \code{s}. +Methods defined for survfit objects are print, summary and plot. +} +\description{ +Computes the predicted survivor function for a Cox proportional hazards +model with elastic net penalty. +} +\details{ +To be consistent with other functions in \code{glmnet}, if \code{s} +is not specified, survival curves are returned for the entire lambda +sequence. This is not recommended usage: it is best to call +\code{survfit.coxnet} with a single value of the penalty parameter +for the \code{s} option. +} +\examples{ +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +x <- matrix(xvec, nrow = nobs) +beta <- rnorm(nvars / 3) +fx <- x[, seq(nvars / 3)] \%*\% beta / 3 +ty <- rexp(nobs, exp(fx)) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +y <- survival::Surv(ty, tcens) +fit1 <- glmnet(x, y, family = "cox") + +# survfit object for Cox model where lambda = 0.1 +sf1 <- survival::survfit(fit1, s = 0.1, x = x, y = y) +plot(sf1) + +# example with new data +sf2 <- survival::survfit(fit1, s = 0.1, x = x, y = y, newx = x[1:3, ]) +plot(sf2) + +# example with strata +y2 <- stratifySurv(y, rep(1:2, length.out = nobs)) +fit2 <- glmnet(x, y2, family = "cox") +sf3 <- survival::survfit(fit2, s = 0.1, x = x, y = y2) +sf4 <- survival::survfit(fit2, s = 0.1, x = x, y = y2, + newx = x[1:3, ], newstrata = c(1, 1, 1)) + +} diff -Nru r-cran-glmnet-4.0-2/man/survfit.cv.glmnet.Rd r-cran-glmnet-4.1/man/survfit.cv.glmnet.Rd --- r-cran-glmnet-4.0-2/man/survfit.cv.glmnet.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/survfit.cv.glmnet.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/survfit.cv.glmnet.R +\name{survfit.cv.glmnet} +\alias{survfit.cv.glmnet} +\title{Compute a survival curve from a cv.glmnet object} +\usage{ +\method{survfit}{cv.glmnet}(formula, s = c("lambda.1se", "lambda.min"), ...) 
+} +\arguments{ +\item{formula}{A class \code{cv.glmnet} object. The object should have +been fit with \code{family = "cox"}.} + +\item{s}{Value(s) of the penalty parameter lambda at which predictions +are required. Default is the value s="lambda.1se" stored on the CV object. +Alternatively s="lambda.min" can be used. If s is numeric, it is taken +as the value(s) of lambda to be used.} + +\item{...}{Other arguments to be passed to \code{survfit.coxnet}.} +} +\value{ +If \code{s} is a single value, an object of class "survfitcox" +and "survfit" containing one or more survival curves. Otherwise, a list +of such objects, one element for each value in \code{s}. +Methods defined for survfit objects are print, summary and plot. +} +\description{ +Computes the predicted survivor function for a Cox proportional hazards +model with elastic net penalty from a cross-validated glmnet model. +} +\details{ +This function makes it easier to use the results of cross-validation +to compute a survival curve. +} +\examples{ +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +x <- matrix(xvec, nrow = nobs) +beta <- rnorm(nvars / 3) +fx <- x[, seq(nvars / 3)] \%*\% beta / 3 +ty <- rexp(nobs, exp(fx)) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +y <- survival::Surv(ty, tcens) +cvfit <- cv.glmnet(x, y, family = "cox") +# default: s = "lambda.1se" +survival::survfit(cvfit, x = x, y = y) + +# s = "lambda.min" +survival::survfit(cvfit, s = "lambda.min", x = x, y = y) +} diff -Nru r-cran-glmnet-4.0-2/man/use.cox.path.Rd r-cran-glmnet-4.1/man/use.cox.path.Rd --- r-cran-glmnet-4.0-2/man/use.cox.path.Rd 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/man/use.cox.path.Rd 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/coxpath.R +\name{use.cox.path} +\alias{use.cox.path} +\title{Check if glmnet should call cox.path} +\usage{ +use.cox.path(x, y) +} +\arguments{ +\item{x}{Design matrix.} + +\item{y}{Response variable.} +} +\value{ +TRUE if cox.path() should be called, FALSE otherwise. +} +\description{ +Helper function to check if glmnet() should call cox.path(). +} +\details{ +For \code{family="cox"}, we only call the original coxnet() function if +(i) x is not sparse, (ii) y is right-censored data, and (iii) we are +not fitting a stratified Cox model. This function also throws an error +if y has a "strata" attribute but is not of type "stratifySurv". 
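To make the guard described above concrete, a small sketch (not part of this patch) of the error case:

library(survival)
library(glmnet)
set.seed(1)
x <- matrix(rnorm(30 * 3), 30, 3)
y <- Surv(rexp(30), rbinom(30, 1, 0.7))
attr(y, "strata") <- rep(1:2, length.out = 30)  # strata added by hand, not via stratifySurv()
# per the details above, this should error rather than silently ignore the strata:
try(glmnet(x, y, family = "cox"))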
+} diff -Nru r-cran-glmnet-4.0-2/MD5 r-cran-glmnet-4.1/MD5 --- r-cran-glmnet-4.0-2/MD5 2020-06-16 00:00:02.000000000 +0000 +++ r-cran-glmnet-4.1/MD5 2021-01-11 08:00:31.000000000 +0000 @@ -1,6 +1,6 @@ -a14f32af87c60ca34ca215ca5f04d1d4 *DESCRIPTION -fe47f4aaa5f3fe9f56175460efff4a30 *NAMESPACE -bdef0b1d25fcff7759cedda71be2a3c1 *NEWS.md +7dbcbaa91d4c0e101ad971de57e2afd5 *DESCRIPTION +a4e9841f3d7c3df90e1e5e013d53e5ec *NAMESPACE +77ec04aaf832ea5bd4db63e241d7999b *NEWS.md b7e403cbac1975dfabbd5b1e6ec68cfc *R/Cindex.R 4bd3eb09b5b5e4d5dd4a6d885c734b0d *R/assess.coxnet.R 2f177df723d0565cbae683046d5ad550 *R/assess.glmnet.R @@ -9,24 +9,25 @@ 8b2f65e1c21c123294a132887081c627 *R/bigGlm.R d8808544bfbafb423ce60e359ade4b64 *R/blend.relaxed.R 04a38d922c4c78910aee28eb063fd474 *R/buildPredmat.array.R -bb2d1c1bb9d300028adab59cbb78a5f3 *R/buildPredmat.coxnetlist.R +70162e7910431ac9e89fcb5526469b84 *R/buildPredmat.coxnetlist.R 019453d01483ecda68866eaa73e1df3c *R/buildPredmat.default.R -23bbaa9bd006d8a2d3b01a42f50c150a *R/check.dots.R +885a478709d197d973cdc621cd3e5225 *R/check.dots.R 05d1834f8f3dc30e0428eb6d1dcb1ba1 *R/coef.cv.glmnet.R da1e68b0c4ae43a6ffcaa2ea3d7c287a *R/coef.cv.relaxed.R f288cf974491d93b56e5c4d6d204d658 *R/coef.glmnet.R a42ad691868ee94b3eae743299af81bc *R/coef.relaxed.R ab05dba77ad585b2ac3a465c26fc2b00 *R/coefnorm.R ee4cc296f6a922b28ba89fe64fa5ce56 *R/confusion.glmnet.R -042e71558b81ea66fb5005cbb5df58b9 *R/coxgrad.R +9b3af97c1b2d9e96d3a3b180fa3263cd *R/coxgrad.R b2ee26c95947789e26860e88068ac7b1 *R/coxnet.R -e77cce0cd2ae0dca23e4b88f8d6d432f *R/coxnet.deviance.R +89f6488ee17ae4e8007199ea07e00d63 *R/coxnet.deviance.R +5ba2e4f8c3010b240bbc8de5929d5bd1 *R/coxpath.R 9c81bd7c22c3ab10099606ad4aaabdf6 *R/cv.coxnet.R b3593fdbb51ba45fa9581d246850de4f *R/cv.elnet.R b2f4b2ab04489c3378cc4532551720ba *R/cv.fishnet.R -d3880c21371f02a177f605a97ceef7fa *R/cv.glmnet.R +827babb2bb71ff37abe9a6304736d28b *R/cv.glmnet.R 987e0f369d7652826d113042b0d1a8da *R/cv.glmnet.raw.R -71e519ac69c8a621de89140fa29df71d *R/cv.glmnetfit.R +1500e23337c4de6276f9745adfbcefcf *R/cv.glmnetfit.R 86747e59ed9762208d848cf84d815095 *R/cv.lognet.R 5f5234bbe3dc2fbc77015609ad7094d7 *R/cv.mrelnet.R ece778c00ca6b58e088f6a700d23e15b *R/cv.multnet.R @@ -40,15 +41,15 @@ 9cf716424fdcf017725b7941fed3727c *R/family.glmnet.R a4660a7abf8a28e207417120e5b10957 *R/fishnet.R ebce52c8995ee16c058c2aa93bdcf582 *R/fix.lam.R -0a75012638b7a5b9faf7885d93563974 *R/getOptcv.glmnet.R -cc7e7100464c5b9a60356ab46923d3f5 *R/getOptcv.relaxed.R +c65547cba8e3d5ad16b28a84881e7fca *R/getOptcv.glmnet.R +b0325f5e584fbe646aedb73d02f6b86d *R/getOptcv.relaxed.R dd14d7acb21ae8f1bb87a54ea763950b *R/getcoef.R c1a8bceef71c915555f8bffc7298137e *R/getcoef.multinomial.R -54d249eed2b1f4d0bdf7313f835d0060 *R/glmnet-package.R -8714d9347a943e78b197f5e62c71c191 *R/glmnet.R -669df7b2810c382bf28767379ae616be *R/glmnet.control.R +cc5c36c94de80e5399ef2db9355d4b09 *R/glmnet-package.R +e5232c597fd8090e37245eef421bebac *R/glmnet.R +e0da0e93d1a87cc5d5342050a15e6354 *R/glmnet.control.R 741a587449cd2f3613c2e52f25bc0280 *R/glmnet.measures.R -ee40f476ac147d89ad5caa10e5931fc8 *R/glmnetFlex.R +a180d89b66ae7aca6631781b8a7d29ed *R/glmnetFlex.R a6224dfa92ac162a19e455b90ec64108 *R/glmnet_softmax.R 5bb185a77e6a3cdae7401bd4ffcaf8f7 *R/jerr.R 65abc981c18160e0a229da5225aeb812 *R/jerr.coxnet.R @@ -58,7 +59,7 @@ 955ddc9c91f5cb11e10bce2feb17b4aa *R/jerr.mrelnet.R 2448cfc1a8d2d38e7e3289b2af7b5600 *R/lambda.interp.R de7722522b536bc2b8d48e8f1892f76f *R/lognet.R -d9b47605e933d04950d0923395f35c4c *R/makeX.R 
+dca4e022c62f90ac14272ddf8d4a1214 *R/makeX.R de3fcbceced4117b989ef8083113da6c *R/mrelnet.R 17d20a46471df38f1d1c6bb7cb16421a *R/na.mean.R 874f190461e3637bb4fe16c7da03ea6c *R/nonzeroCoef.R @@ -83,16 +84,19 @@ ed58f90aa0b7d7a5ead21c0838d5b330 *R/predict.relaxed.R 70cd43e9955641d2041c50030491a921 *R/print.bigGlm.R db093ef96748c43424e48993e2c23b0c *R/print.confusion.table.R -0f55831fddf378db1bfd45a9dd0a30e7 *R/print.cv.glmnet.R -e6bac43231cbbdf5279f929a4b706dbc *R/print.cv.relaxed.R +33d0f53e0210a76bcc6d6372b7a7577b *R/print.cv.glmnet.R +f57e0561181a529df64e8b15133a0f2b *R/print.cv.relaxed.R 85eea47cfd8403a66297ea317bad0be3 *R/print.glmnet.R 39b5e1b6697c05e7cb02eaa86a5c3d88 *R/relax.glmnet.R -ce06f0932252d48ac76dc6d0eab19b73 *R/response.coxnet.R +0f7e686cd02800174b7dc2165163d843 *R/response.coxnet.R b012181c241a4f6699d73f63bec2bd1d *R/rmult.R 533da5bd46d7fed918af5a04b1539d6c *R/roc.glmnet.R +b871a0324f13aacb70a1193bbbef0a2a *R/stratifySurv.R +e44a2d10d5135f6931a13542d8996d56 *R/survfit.coxnet.R +e2f22c34e46e1ec2964d1968e7116342 *R/survfit.cv.glmnet.R 0d040824da1ffcd1edb57b252f6b1f6e *R/zeromat.R 8920f300fb3d337d7048ef8af005d41e *README.md -9636489d0de7b7e82db0b2e29d380bcf *build/vignette.rds +9305e0262835e4bf9ec618fb06fe020b *build/vignette.rds 058ca4ea844ab733e99395fa1488546b *data/BinomialExample.RData 7a404bc6a255ac95a6cc06cf4fe6ddbf *data/CVXResults.RData 64d84fd86c91dfde6cbda5f13da82327 *data/CoxExample.RData @@ -101,64 +105,76 @@ c4f36770e60c6421603e9933930fec06 *data/PoissonExample.RData beaffe6c0ae597f8803216e3bbd4a2d1 *data/QuickStartExample.RData 7056d5f98a624c060d0522045f1b68f8 *data/SparseExample.RData -c22d19c217bc163a2c38b84add608c31 *inst/CITATION -f22779e1c8486853c9301836419f8220 *inst/doc/Coxnet.R -72bfbc6589c6442d8f33e6e56cf414f3 *inst/doc/Coxnet.Rmd -8ba72535efd55a4864b0571e02b7fbc4 *inst/doc/Coxnet.pdf -0626cfff426d60250a1f974475aafa9b *inst/doc/glmnet.R -2f2bc73a1c43b0220e1ada17362947bb *inst/doc/glmnet.Rmd -1e86d0da3ae1bfbecc33a7491a6e80fd *inst/doc/glmnet.pdf -71bb4625df1f8b879287c42ad69f3707 *inst/doc/glmnetFamily.R -d48173fcfb4a94406d39c8105447da3f *inst/doc/glmnetFamily.Rmd -b9009d1ccb1411d024fdde5fd9f6c13c *inst/doc/glmnetFamily.pdf -89bd88dbeae63e44c7a3a83a40cae553 *inst/doc/relax.R -9a02753803d77927adb06d655a99ecb0 *inst/doc/relax.Rmd -097dd33a85772f8519a3a32a62ca58ad *inst/doc/relax.pdf +edcd84fb25d51203916ae3a39b1fc655 *inst/CITATION +c2a99f2de8eaafe08dcdfdb754e1772e *inst/doc/Coxnet.R +fae7a9d02341e6576b9560e26701ad9b *inst/doc/Coxnet.Rmd +feba7e66967d9c0d4c37cef37a68faef *inst/doc/Coxnet.pdf +2d93b78f53f037a964659ffaabe3aaac *inst/doc/glmnet.R +bdb8b818d178d74a2af22608038932a1 *inst/doc/glmnet.Rmd +3b4fa408510b8907124855f46368d750 *inst/doc/glmnet.pdf +85579e41e7c36b006e98a47bb0ca7f6f *inst/doc/glmnetFamily.R +cab72afa072399ca5ae1906987ef89f4 *inst/doc/glmnetFamily.Rmd +345ea04d844463c07594f3f1ed3b963d *inst/doc/glmnetFamily.pdf +4a3dfcb5e091bb6803bf35f9c689c16f *inst/doc/relax.R +690ad421e350cde18389898140f41325 *inst/doc/relax.Rmd +679e67b99c42eda40d6e6fc4e822407f *inst/doc/relax.pdf 6b106667c04d867e9c1e279eedeff266 *inst/mortran/README.Rmd eca43bff4d1443c399d8305003777a9b *inst/mortran/glmnet5dpclean.m -ddc972102d9c8b0c1f350a1fc706b5ee *inst/mortran/wls.m +d8516f118db49ea4fbde5d5b2f6e67a6 *inst/mortran/wls.m 52e95162e7dc0e62ec329602b05cd001 *inst/testscripts/save_results.R 5a4389a6279bf8f6db41b6c7c5862bf9 *man/Cindex.Rd -d6b5a73f40776df16736644c85237a54 *man/assess.glmnet.Rd -bf0550fb152ec21cd4b2fc4a07ffd71d *man/beta_CVX.Rd +dbc0b807ae35fcb67ab8f8f849daa1d2 
*man/assess.glmnet.Rd +d0a7f0da9827e41030fd7644b421bb58 *man/beta_CVX.Rd 413f29637810f0c84718ca2f5bb00ab1 *man/bigGlm.Rd -c5fba5e66633aa4b7a312bdda1c4e875 *man/coxgrad.Rd -2ab1a023435e409ab1913670d447778c *man/coxnet.deviance.Rd -700ea086fd818b0316709322ef91574e *man/cv.glmnet.Rd +9b08d40ce7e666b059b2679f2c878d9f *man/cox.fit.Rd +e6388e65131a54875a8f44c9e536e17e *man/cox.path.Rd +02d8b57b5fe0bb24c5a4577fdbeb8649 *man/cox_obj_function.Rd +6ea3ab538b40b0cc85c5a26d301f3065 *man/coxgrad.Rd +a064e66c50cd2a6653cc27036b22a611 *man/coxnet.deviance.Rd +e62e0d6b138bdec998598f8b502b2f05 *man/cv.glmnet.Rd 18d78ba828a20778f74374075a832d81 *man/dev_function.Rd 9ade2a133f3c6e4049a766a561a05145 *man/deviance.glmnet.Rd -9dc0304a5757f117452d861cac3389d0 *man/elnet.fit.Rd +d465886739abcf9ba23dd6928504e073 *man/elnet.fit.Rd +2604c132d7a0c60ba0bb2842687b9aaf *man/fid.Rd 0cb75faa99bbe1ccf8b63d7618f1d6a6 *man/figures/logo.png +3c5ba6e37b05e27f1f70179ebc5176c4 *man/get_cox_lambda_max.Rd 282c6f7db8fc51844dc28629a8b1e261 *man/get_eta.Rd -e01130d3bc1b340a687ac7d9539784e4 *man/get_start.Rd -5f20347d9a40d8d09134a01ae58a6d75 *man/glmnet-internal.Rd +1e9b7d5537b29ef0a4c097e26e0fb2a7 *man/get_start.Rd +ec561c83a75b43232a6a10335e911ac3 *man/glmnet-internal.Rd d6e356e0500e6f9f2678cb03f2ec828d *man/glmnet-package.Rd -50e9e7857ad32d54aaa7078bae3b3022 *man/glmnet.Rd -7fa75634a9f80619c998a4081a68cc3e *man/glmnet.control.Rd -f3677a570fd766d8dc00212b8a35c1d6 *man/glmnet.fit.Rd -35ed40d73458cabb03ff4d7fd630f22f *man/glmnet.measures.Rd -8e75acfee349a1db99c6ca034aa71e09 *man/glmnet.path.Rd +322ad6277d1da769781a9d18404ddf25 *man/glmnet.Rd +5cc46f5cf2cc5389a89709d6ae129d0c *man/glmnet.control.Rd +6d468edc0d0317c2098d27c64e17955e *man/glmnet.fit.Rd +fed187cbcdf90f65defb1e5de08b3539 *man/glmnet.measures.Rd +93b09ae713424336b982280c0e54d44f *man/glmnet.path.Rd a58741a0f8c88395ad8a12d3afc84cae *man/makeX.Rd +450ff2e96f274536ed5c1c7ae2b15478 *man/mycoxph.Rd +751f3230c6220d30f9e428ff985dc9c9 *man/mycoxpred.Rd 8e5d2c806cb13cc2b06207f68a31eba9 *man/na.replace.Rd 8ac2ff867e5e3c5fed650e9ba7fae90f *man/obj_function.Rd 64f275140d08ca77bde537001a99564d *man/pen_function.Rd 0a65347ee5ffbb8ac5344a3d9a0c4cc5 *man/plot.cv.glmnet.Rd -befd16466d52bc8dda535c909a0d5c0b *man/plot.glmnet.Rd -be490d1b8599f34e4d47dd3eaf6ea198 *man/predict.cv.glmnet.Rd -f5b94f979a0b49a6f013189767799576 *man/predict.glmnet.Rd -c4e0e6fa0fc370b1d3c8bd72d2b45421 *man/predict.glmnetfit.Rd -98578eda412f15fa7808567a181d1fcc *man/print.cv.glmnet.Rd -8b98865e9e84911c6e8d4d4e015e7a29 *man/print.glmnet.Rd +1f418aa4c5e1911ca6cc000e4f22bf5f *man/plot.glmnet.Rd +9fd729f8acf90d79120a3c58295a7a85 *man/predict.cv.glmnet.Rd +5c1735040debd1431eb6e0b194e7938f *man/predict.glmnet.Rd +c94db387bc9bcc16d607294a3a3f50b6 *man/predict.glmnetfit.Rd +8e4be7a110c2b05437a16f7282d01de0 *man/print.cv.glmnet.Rd +3f6f703e63011a228505e5db088aa645 *man/print.glmnet.Rd +e4c08beafea3fe0967fe62702d6df665 *man/response.coxnet.Rd 3adbd56c4685f732227a9845b470aeb8 *man/rmult.Rd +f098c09b7e61325c0215231e000ae780 *man/stratifySurv.Rd +25bf7abec92edfb4312fa50c0a1ebc66 *man/survfit.coxnet.Rd +f2447246ed1bc0682526b694b2ff1daa *man/survfit.cv.glmnet.Rd +24b1289dbc6e8d8bcc0a90fe57286599 *man/use.cox.path.Rd 496f9de703e1b3d9ce71d4039b0ecaf8 *src/glmnet5dpclean.f 32b239d462d1f062fe7389da6d2b2a27 *src/glmnet_init.c 3965fc0eb8205870ea724035d96d8945 *src/pb.c -7b2f0ed6ae767497a2bdfe416097db9b *src/wls.f -72bfbc6589c6442d8f33e6e56cf414f3 *vignettes/Coxnet.Rmd +f2fefcc9ad61b762fa701181df5ab618 *src/wls.f 
+fae7a9d02341e6576b9560e26701ad9b *vignettes/Coxnet.Rmd
789121c8d4e2c1681d05f19d7a69a054 *vignettes/assets/coxnet.RDS
-ef63dce9f91dc617a84540178b92a89f *vignettes/assets/glmnet_refs.bib
+3969a7d30dda77ddac9a8dcea40f8f58 *vignettes/assets/glmnet_refs.bib
6b485f932628ec1755201811c3e05f49 *vignettes/assets/vignette_binomial.png
3f2e5f9cf200b832dd68492f66f63f9e *vignettes/assets/vignette_gaussian.png
-2f2bc73a1c43b0220e1ada17362947bb *vignettes/glmnet.Rmd
-d48173fcfb4a94406d39c8105447da3f *vignettes/glmnetFamily.Rmd
-9a02753803d77927adb06d655a99ecb0 *vignettes/relax.Rmd
+bdb8b818d178d74a2af22608038932a1 *vignettes/glmnet.Rmd
+cab72afa072399ca5ae1906987ef89f4 *vignettes/glmnetFamily.Rmd
+690ad421e350cde18389898140f41325 *vignettes/relax.Rmd
diff -Nru r-cran-glmnet-4.0-2/NAMESPACE r-cran-glmnet-4.1/NAMESPACE
--- r-cran-glmnet-4.0-2/NAMESPACE 2020-06-14 23:20:08.000000000 +0000
+++ r-cran-glmnet-4.1/NAMESPACE 2021-01-06 22:06:54.000000000 +0000
@@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand
+S3method("[",stratifySurv)
S3method(buildPredmat,array) S3method(buildPredmat,coxnetlist) S3method(buildPredmat,default)
@@ -39,6 +40,8 @@ S3method(print,cv.relaxed) S3method(print,glmnet) S3method(print,relaxed)
+S3method(survfit,coxnet)
+S3method(survfit,cv.glmnet)
export(Cindex) export(assess.glmnet) export(bigGlm)
@@ -62,6 +65,7 @@ export(relax.glmnet) export(rmult) export(roc.glmnet)
+export(stratifySurv)
import(Matrix) import(foreach) import(methods)
@@ -77,6 +81,7 @@ importFrom(graphics,text) importFrom(shape,colorlegend) importFrom(stats,approx)
+importFrom(stats,as.formula)
importFrom(stats,binomial) importFrom(stats,coef) importFrom(stats,contrasts)
@@ -94,6 +99,9 @@ importFrom(stats,weighted.mean) importFrom(survival,Surv) importFrom(survival,concordance)
+importFrom(survival,coxph)
importFrom(survival,is.Surv)
+importFrom(survival,strata)
+importFrom(survival,survfit)
importFrom(utils,packageDescription) useDynLib(glmnet)
diff -Nru r-cran-glmnet-4.0-2/NEWS.md r-cran-glmnet-4.1/NEWS.md
--- r-cran-glmnet-4.0-2/NEWS.md 2020-06-14 21:34:04.000000000 +0000
+++ r-cran-glmnet-4.1/NEWS.md 2021-01-06 22:06:54.000000000 +0000
@@ -1,9 +1,22 @@
+# glmnet 4.1
+
+Expanded scope for the Cox model.
+* We now allow (start, stop] data in
+  addition to the original right-censored (all starting at zero) option.
+* Allow for strata as in `survival::coxph`
+* Allow for sparse X matrix with Cox models (was not available before)
+* Provide method for `survival::survfit`
+
+Vignettes are revised and reorganized.
+Additional index information stored on `cv.glmnet` objects, and
+included when printed.
+
# glmnet 4.0-2
* Biggest change. Cindex and auc calculations now use the `concordance` function from package `survival`
* Minor changes. Allow coefficient warm starts for glmnet.fit. The print
- method for glmnet now really prints %dDev rather than the fraction.
+ method for glmnet now really prints %Dev rather than the fraction.
# glmnet 4.0 diff -Nru r-cran-glmnet-4.0-2/R/buildPredmat.coxnetlist.R r-cran-glmnet-4.1/R/buildPredmat.coxnetlist.R --- r-cran-glmnet-4.0-2/R/buildPredmat.coxnetlist.R 2019-10-19 20:53:16.000000000 +0000 +++ r-cran-glmnet-4.1/R/buildPredmat.coxnetlist.R 2021-01-06 22:06:54.000000000 +0000 @@ -25,16 +25,16 @@ if (grouped) { plfull = coxnet.deviance(x = x, y = y, offset = offset, weights = weights, beta = coefmat) - plminusk = coxnet.deviance(x = x[!which, ], - y = y[!which, ], offset = offset[!which], - weights = weights[!which],beta = coefmat) + plminusk = coxnet.deviance(x = x[!which, ], y = y[!which, ], + offset = offset[!which], + weights = weights[!which], + beta = coefmat) cvraw[i, seq(nlami)] = (plfull - plminusk)[seq(nlami)] } else { - plk = coxnet.deviance(x = x[which, ], - y = y[which, - ], offset = offset[which], weights = weights[which], - beta = coefmat) + plk = coxnet.deviance(x = x[which, ], y = y[which, ], + offset = offset[which], + weights = weights[which], beta = coefmat) cvraw[i, seq(nlami)] = plk[seq(nlami)] } } diff -Nru r-cran-glmnet-4.0-2/R/check.dots.R r-cran-glmnet-4.1/R/check.dots.R --- r-cran-glmnet-4.0-2/R/check.dots.R 2019-09-04 20:59:46.000000000 +0000 +++ r-cran-glmnet-4.1/R/check.dots.R 2021-01-06 22:06:54.000000000 +0000 @@ -1,17 +1,29 @@ -check_dots<- - function(object,...,need=c("x","y","weights","offset","penalty.factor","lower.limits","upper.limits"),error="used coef.glmnet() or predict.glmnet() with `exact=TRUE`"){ - if(is.null(need))return(invisible()) - thiscall=object$call - ncall=names(thiscall)[-1] - w=match(ncall,need,0) - need=need[w] - nargs=names(list(...)) - w=match(need,nargs,0)>0 - if(!all(w)){ - margs=need[!w] - stop(paste(error,"so must in addition supply original argument(s) ",paste(margs,collapse=" and "), " in order to safely rerun glmnet"),call.=FALSE) - } - invisible() +check_dots <- function(object, ..., + need = c("x", "y", "weights", "offset", "penalty.factor", + "lower.limits", "upper.limits"), + error_start = "used coef.glmnet() or predict.glmnet() with `exact=TRUE`", + error_end = " in order to safely rerun glmnet", + prefix = NULL) { + if (is.null(need)) return(invisible()) + + # extract the function options we need from the object's call + thiscall = object$call + ncall = names(thiscall)[-1] + w = match(ncall, need, 0) + need = need[w] + if (length(need) == 0) return(invisible()) + + # check that ... indeed has those function options + if (!is.null(prefix)) need <- paste0(prefix, need) + nargs = names(list(...)) + w = match(need, nargs, 0) > 0 + if(!all(w)) { + margs = need[!w] + stop(paste(error_start, + "so must in addition supply original argument(s) ", + paste(margs,collapse=" and "), + error_end), call.=FALSE) } - + invisible() +} diff -Nru r-cran-glmnet-4.0-2/R/coxgrad.R r-cran-glmnet-4.1/R/coxgrad.R --- r-cran-glmnet-4.0-2/R/coxgrad.R 2019-10-22 22:52:30.000000000 +0000 +++ r-cran-glmnet-4.1/R/coxgrad.R 2021-01-06 22:06:54.000000000 +0000 @@ -1,78 +1,349 @@ -#' compute gradient for cox model +#' Compute gradient for Cox model #' -#' Compute the gradient of the partial likelihood at a particular fit +#' Compute the gradient of the log partial likelihood at a particular fit for Cox +#' model. #' #' Compute a gradient vector at the fitted vector for the log partial likelihood. -#' This is like a residual vector, and useful for manual screening of predictors for \code{glmnet} -#' in applications where \code{p} is very large (as in GWAS). 
Uses the Breslow approach to ties +#' This is like a residual vector, and useful for manual screening of +#' predictors for \code{glmnet} in applications where \code{p} is very large +#' (as in GWAS). Uses the Breslow approach to ties. +#' +#' This function is essentially a wrapper: it checks whether the response +#' provided is right-censored or (start, stop] survival data, and calls the +#' appropriate internal routine. For right-censored data it calls +#' \code{coxgrad2()}. For (start, stop] data, it calls \code{coxgrad3()}. #' #' @aliases coxgrad -#' @param f fit vector -#' @param time time vector (can have ties) -#' @param d death/censoring indicator 1/0 -#' @param w observation weights (default equal) -#' @param eps (default 0.00001) Breaks ties between death and censoring by making death times \code{eps} earlier -#' @return a single gradient vector the same length as \code{f} -#' @author Trevor Hastie\cr Maintainer: Trevor Hastie +#' @param eta Fit vector (usually from glmnet at a particular lambda). +#' @param y Survival response variable, must be a \code{Surv} or +#' \code{stratifySurv} object. +#' @param w Observation weights (default is all equal to 1). +#' @param std.weights If TRUE (default), observation weights are standardized +#' to sum to 1. +#' @param diag.hessian If \code{TRUE}, compute the diagonal of the Hessian +#' of the log partial likelihood as well. Default is \code{FALSE}. +#' +#' @return A single gradient vector the same length as \code{eta}. If +#' \code{diag.hessian=TRUE}, the diagonal of the Hessian is +#' included as an attribute "diag_hessian". +#' +#' @examples +#' set.seed(1) +#' eta <- rnorm(10) +#' time <- runif(10, min = 1, max = 10) +#' d <- ifelse(rnorm(10) > 0, 1, 0) +#' y <- survival::Surv(time, d) +#' coxgrad(eta, y) +#' +#' # return diagonal of Hessian as well +#' coxgrad(eta, y, diag.hessian = TRUE) +#' +#' # example with (start, stop] data +#' y2 <- survival::Surv(time, time + runif(10), d) +#' coxgrad(eta, y2) +#' +#' # example with strata +#' y2 <- stratifySurv(y, rep(1:2, length.out = 10)) +#' coxgrad(eta, y2) +#' #' @seealso \code{coxnet.deviance} #' @keywords Cox model #' -#' @export coxgrad +#' @export +coxgrad <- function(eta, y, w, std.weights = TRUE, diag.hessian = FALSE) { + # if y has 2 columns, it is right-censored data + # if y has 3 columns, it is (start, stop] data + # otherwise, throw errors + if (ncol(y) == 2) { + return(coxgrad2(eta, y, w, std.weights, diag.hessian)) + } else if (ncol(y) == 3) { + return(coxgrad3(eta, y, w, std.weights, diag.hessian)) + } else { + stop("Response y should have 2 or 3 columns") + } +} + +#' @rdname coxgrad +coxgrad2 <- function(eta, y, w, std.weights = TRUE, diag.hessian = FALSE) { + if (missing(w)) w=rep(1,length(eta)) + if (std.weights) w=w/sum(w) + nobs <- nrow(y) + + # extract strata (if any) + if ("strata" %in% names(attributes(y))) { + strata <- attr(y, "strata") + } else { + strata <- rep(1, nobs) + } + if (length(strata) != nobs) stop("length of strata != nobs") + + # if all in same strata, do the computations + # if not, do strata-level computations and concatenate + if (length(unique(strata)) == 1) { + time <- y[, "time"] + d <- y[, "status"] + eta <- scale(eta, TRUE, FALSE) # center eta so exponents are not too large + + # order exp(eta), time, d and w in ascending time order + # for tied times, all deaths come before censored observations + if ("stop_time" %in% names(attributes(y))) { + o <- attr(y, "stop_time") + } else { + o <- order(time, d, decreasing = c(FALSE, TRUE)) + } + 
exp_eta <- exp(eta)[o] + time <- time[o] + d <- d[o] + w <- w[o] + rskden <- rev(cumsum(rev(exp_eta*w))) ##reverse order inside;last guy is in all the risk sets + + ### See if there are dups in death times + dups <- fid(time[d == 1],seq(length(d))[d == 1]) + dd <- d + ww <- w + + ### next code replaces each sequence of tied death indicators by a new + ### sequence where only the first is a 1 and the rest are zero. This + ### makes the accounting in the following step work properly we also + ### sums the weights in each of the tied death sets, and assign that + ### weight to the first + if(!is.null(ties<-dups$index_ties)){ + dd[unlist(ties)]=0 + dd[dups$index_first]=1 + wsum=sapply(ties,function(i,w)sum(w[i]),ww) + tie1=sapply(ties,function(i)i[1]) + ww[tie1]=wsum + } + + ### Get counts over risk sets at each death time + rskcount=cumsum(dd)#this says how many of the risk sets each observation is in; 0 is none + ### We now form partial sums of the 1/den just at the risk sets + rskdeninv=cumsum((ww/rskden)[dd==1]) + ### pad with a zero, so we can index it + rskdeninv=c(0,rskdeninv) + + ### compute gradient for each obs + grad <- w * (d - exp_eta * rskdeninv[rskcount+1]) + grad[o] <- grad + + # if diag.hessian = TRUE, return the diagonal of the hessian too + if (diag.hessian) { + rskdeninv2 <- cumsum((ww/(rskden^2))[dd==1]) + rskdeninv2 <- c(0, rskdeninv2) + w_exp_eta <- w * exp_eta + diag_hessian <- w_exp_eta^2 * rskdeninv2[rskcount+1] - w_exp_eta * rskdeninv[rskcount+1] + diag_hessian[o] <- diag_hessian + attr(grad, "diag_hessian") <- diag_hessian + } + return(grad) + } else { + # more than one strata provided: compute strata-level values and + # concatenate + overall_grad <- rep(NA, nobs) + if (diag.hessian) overall_diag_hessian <- rep(NA, nobs) + for (i in unique(strata)) { + ii <- which(strata == i) + strata_res <- coxgrad2(eta[ii], y[ii, , drop = FALSE], w[ii], + std.weights = FALSE, diag.hessian = diag.hessian) + overall_grad[ii] <- strata_res + if (diag.hessian) { + overall_diag_hessian[ii] <- attr(strata_res, "diag_hessian") + } + } + if (diag.hessian) { + attr(overall_grad, "diag_hessian") <- overall_diag_hessian + } + return(overall_grad) + } +} + +#' @rdname coxgrad +coxgrad3 <- function(eta, y, w, std.weights = TRUE, diag.hessian = FALSE) { + if (missing(w)) w=rep(1,length(eta)) + if (std.weights) w=w/sum(w) + nobs <- nrow(y) + + # extract strata (if any) + if ("strata" %in% names(attributes(y))) { + strata <- attr(y, "strata") + } else { + strata <- rep(1, nobs) + } + if (length(strata) != nobs) stop("length of strata != nobs") -coxgrad=function(f,time,d,w,eps=0.00001){ -### f is fitted function from glmnet at a particular lambda -### time is death or censoring time -### d is death indicator; d=0 means censored, d=1 means death -### w is a weight vector of non-negative weights, which will be normalized to sum to 1 - if(missing(w))w=rep(1,length(f)) - w=w/sum(w) - f=scale(f,TRUE,FALSE)#center f so exponents are not too large - time=time-d*eps#break ties between death times and non death times, leaving tied death times tied - o=order(time) - ef=exp(f)[o] - time=time[o] - d=d[o] - w=w[o] - rskden=rev(cumsum(rev(ef*w))) ##reverse order inside;last guy is in all the risk sets -### See if there are dups in death times - dups=fid(time[d==1],seq(length(d))[d==1]) - dd=d - ww=w -### next code replaces each sequence of tied death indicators by a new -### sequence where only the first is a 1 and the rest are zero. 
This -### makes the accounting in the following step work properly we also -### sums the weights in each of the tied death sets, and assign that -### weight to the first - if(!is.null(ties<-dups$index_ties)){ - dd[unlist(ties)]=0 - dd[dups$index_first]=1 - wsum=sapply(ties,function(i,w)sum(w[i]),ww) - tie1=sapply(ties,function(i)i[1]) - ww[tie1]=wsum - } -### Get counts over risk sets at each death time - rskcount=cumsum(dd)#this says how many of the risk sets each observation is in; 0 is none -### We now form partial sums of the 1/den just at the risk sets - rskdeninv=cumsum((ww/rskden)[dd==1]) -### pad with a zero, so we can index it - rskdeninv=c(0,rskdeninv) -### compute gradient for each obs - grad=(d-rskdeninv[rskcount+1]*ef)*w - grad[o]=grad - grad - } - -fid=function(x,index){ -### Input: -### x is a sorted vector of death times -### index is vector of indices of this set -### Output: -### index of first member of every death set as they appear in sorted list -### list of ties for each element of index, in the case of two or more ties; - ## if no ties, this list is NULL + # if all in same strata, do the computations + # if not, do strata-level computations and concatenate + if (length(unique(strata)) == 1) { + start_time <- y[, "start"] + stop_time <- y[, "stop"] + d <- y[, "status"] + eta <- scale(eta, TRUE, FALSE) # center eta so exponents are not too large + + # get ordering for stop time (ascending, deaths before censored), + # start time (ascending), and match info if cached + if ("stop_time" %in% names(attributes(y))) { + stop_o <- attr(y, "stop_time") + } else { + stop_o <- order(stop_time, d, decreasing = c(FALSE, TRUE)) + } + if ("start_time" %in% names(attributes(y))) { + start_o <- attr(y, "start_time") + } else { + start_o <- order(start_time, decreasing = c(FALSE)) + } + if ("ss_match" %in% names(attributes(y))) { + ss_match <- attr(y, "ss_match") + } else { + ss_match <- match(start_o, stop_o) + } + + # keep a set of values which are ordered by start time + w_exp_eta_start <- (w * exp(eta))[start_o] + start_time_start <- start_time[start_o] + + # reorder everything by stop time + exp_eta <- exp(eta)[stop_o] + start_time <- start_time[stop_o] + stop_time <- stop_time[stop_o] + d <- d[stop_o] + w <- w[stop_o] + + ### See if there are dups in death times + dups <- fid(stop_time[d == 1],seq(length(d))[d == 1]) + dd <- d + ww <- w + + ### next code replaces each sequence of tied death indicators by a new + ### sequence where only the first is a 1 and the rest are zero. This + ### makes the accounting in the following step work properly we also + ### sums the weights in each of the tied death sets, and assign that + ### weight to the first + if(!is.null(ties<-dups$index_ties)){ + dd[unlist(ties)]=0 + dd[dups$index_first]=1 + wsum=sapply(ties,function(i,w)sum(w[i]),ww) + tie1=sapply(ties,function(i)i[1]) + ww[tie1]=wsum + } + + # compute risk set sums rskden[i] = \sum_{j in R_i} w_j exp(eta_j) + # where i indexes the observations. (In the end, we will only care + # about the indices i which have actual death times.) 
+ rskden <- rev(cumsum(rev(exp_eta*w))) + current_sum <- 0 + death_time <- stop_time[dups$index_first] + ndeaths <- length(death_time) + death_idx <- ndeaths; start_idx <- nobs + while (death_idx > 0 && start_idx > 0) { + if (start_time_start[start_idx] < death_time[death_idx]) { + # current start time belongs in risk set ending in stop time, + # so we should remove the current cumulative sum and consider + # the next risk set + stop_idx <- dups$index_first[death_idx] + rskden[stop_idx] <- rskden[stop_idx] - current_sum + death_idx <- death_idx - 1 + } else { + # current start time does not belong in risk set ending in stop + # time, so we should add it to current_sum and check if the + # start time before it should also be added + current_sum <- current_sum + w_exp_eta_start[start_idx] + start_idx <- start_idx - 1 + } + } + + # compute the terms rskterm[k] = \sum_{i in C_k} d[i] / rskden[i] and + # rskterm2[k] = \sum_{i in C_k} d[i] / rskden[i]^2. + # Here, k indexes the observations, index i runs over the unique death + # times. + rskfactor <- (ww / rskden)[dd == 1] + rskfactor2 <- (ww / rskden^2)[dd == 1] + rskdeninv <- c(0, cumsum(rskfactor)) # pad with 0 so that we can index + rskdeninv2 <- c(0, cumsum(rskfactor2)) + + # this says how many of the risk sets each observation is in; 0 is none + # (however, if start time is not zero, then we could be including an + # observation in too many risk sets: we will remove that later.) + rskcount <- cumsum(dd) + rskterm <- rskdeninv[rskcount+1] + rskterm2 <- rskdeninv2[rskcount+1] + current_sum <- 0; current_sum2 <- 0 + death_idx <- 1; start_idx <- 1 + while (death_idx <= ndeaths && start_idx <= nobs) { + if (start_time_start[start_idx] < death_time[death_idx]) { + # current observation belongs in risk set ending in death time, + # so we should remove the current cumulative sum and consider + # the next observation + stop_idx <- ss_match[start_idx] # match(start_o[start_idx], stop_o) + rskterm[stop_idx] <- rskterm[stop_idx] - current_sum + rskterm2[stop_idx] <- rskterm2[stop_idx] - current_sum2 + start_idx <- start_idx + 1 + } else { + # current observation doesn't belong in risk set ending in death + # time, so we should add the rskfactor associated with this + # death time to current_sum and check if the term assoc. with + # the death time after it should also be added + current_sum <- current_sum + rskfactor[death_idx] + current_sum2 <- current_sum2 + rskfactor2[death_idx] + death_idx <- death_idx + 1 + } + } + grad <- w * (d - exp_eta * rskterm) + grad[stop_o] <- grad + + # if diag.hessian = TRUE, return the diagonal of the hessian too + if (diag.hessian) { + w_exp_eta <- w * exp_eta + diag_hessian <- w_exp_eta^2 * rskterm2 - w_exp_eta * rskterm + diag_hessian[stop_o] <- diag_hessian + attr(grad, "diag_hessian") <- diag_hessian + } + return(grad) + } else { + # more than one strata provided: compute strata-level values and + # concatenate + overall_grad <- rep(NA, nobs) + if (diag.hessian) overall_diag_hessian <- rep(NA, nobs) + for (i in unique(strata)) { + ii <- which(strata == i) + strata_res <- coxgrad3(eta[ii], y[ii, , drop = FALSE], w[ii], + std.weights = FALSE, diag.hessian = diag.hessian) + overall_grad[ii] <- strata_res + if (diag.hessian) { + overall_diag_hessian[ii] <- attr(strata_res, "diag_hessian") + } + } + if (diag.hessian) { + attr(overall_grad, "diag_hessian") <- overall_diag_hessian + } + return(overall_grad) + } +} + +#' Helper function for Cox deviance and gradient +#' +#' Helps to find ties in death times of data. 
+#'
+#' @param x Sorted vector of death times.
+#' @param index Vector of indices for the death times.
+#'
+#' @return A list with two components.
+#' \item{index_first}{A vector of indices for the first observation at each
+#' death time as they appear in the sorted list.}
+#' \item{index_ties}{If there are no ties at all, this is NULL. If not, this is
+#' a list with length equal to the number of unique times with ties. For each
+#' time with ties, index_ties gives the indices of the observations with a
+#' death at that time.}
+#'
+#' @examples
+#' # Example with no ties
+#' glmnet:::fid(c(1, 4, 5, 6), 1:4)
+#'
+#' # Example with ties
+#' glmnet:::fid(c(1, 1, 1, 2, 3, 3, 4, 4, 4), 1:9)
+fid <- function(x,index) {
   idup=duplicated(x)
   if(!any(idup)) list(index_first=index,index_ties=NULL)
-  else{
+  else {
     ndup=!idup
     xu=x[ndup]# first death times
     index_first=index[ndup]
@@ -80,6 +351,5 @@
     index_ties=split(index,ities)
     nties=sapply(index_ties,length)
     list(index_first=index_first,index_ties=index_ties[nties>1])
-    }
 }
-
+}
diff -Nru r-cran-glmnet-4.0-2/R/coxnet.deviance.R r-cran-glmnet-4.1/R/coxnet.deviance.R
--- r-cran-glmnet-4.0-2/R/coxnet.deviance.R	2019-10-22 22:39:39.000000000 +0000
+++ r-cran-glmnet-4.1/R/coxnet.deviance.R	2021-01-06 22:06:54.000000000 +0000
@@ -1,80 +1,470 @@
-#' compute deviance for cox model output
-#'
-#' Given a fit or coefficients, compute the deciance (-2 log partial likelihood) for
-#' right-censored survival data
-#'
-#' \code{coxnet.deviance} computes the deviance for a single prediction, or a matrix of predictions
+#' Compute deviance for Cox model
+#'
+#' Compute the deviance (-2 log partial likelihood) for Cox model.
 #'
+#' Computes the deviance for a single set of predictions, or for a matrix
+#' of predictions. The user can either supply the predictions
+#' directly through the \code{pred} option, or supply the \code{x} matrix
+#' and \code{beta} coefficients. Uses the Breslow approach to ties.
+#'
+#' The function first checks if \code{pred} is passed: if so, it is used as
+#' the predictions. If \code{pred} is not passed but \code{x} and \code{beta}
+#' are passed, then these values are used to compute the predictions. If
+#' neither \code{x} nor \code{beta} are passed, then the predictions are all
+#' taken to be 0.
+#'
+#' \code{coxnet.deviance()} is a wrapper: it calls \code{coxnet.deviance0()}
+#' if the response is right-censored data, and calls \code{coxnet.deviance3()}
+#' if the response is (start, stop] survival data.
+#'
+#' \code{coxnet.deviance2()} gives the same output as \code{coxnet.deviance0()}
+#' but is written completely in R. It is not called by
+#' \code{coxnet.deviance()}, and is kept in the package for completeness.
+#'
 #' @aliases coxnet.deviance
-#' @param pred matrix of predictions
-#' @param y a survival response matrix, as produced by \code{Surv}
-#' @param x optional \code{x} matrix, if \code{pred} is \code{NULL}
-#' @param offset optional offset
-#' @param weights optional observation weights
-#' @param beta optional coefficient vector/matrix, supplied if \code{pred=NULL}
-#' @return a single or vector of deviances
-#' @author Trevor Hastie\cr Maintainer: Trevor Hastie
+#' @param pred Fit vector or matrix (usually from glmnet at a particular
+#' lambda or a sequence of lambdas).
+#' @param y Survival response variable, must be a \code{Surv} or
+#' \code{stratifySurv} object.
+#' @param x Optional \code{x} matrix, to be supplied if \code{pred = NULL}.
+#' @param offset Optional offset vector.
+#' @param weights Observation weights (default is all equal to 1). +#' @param std.weights If TRUE (default), observation weights are standardized +#' to sum to 1. +#' @param beta Optional coefficient vector/matrix, to be supplied if +#' \code{pred = NULL}. +#' +#' @return A vector of deviances, one for each column of predictions. +#' +#' @examples +#' set.seed(1) +#' eta <- rnorm(10) +#' time <- runif(10, min = 1, max = 10) +#' d <- ifelse(rnorm(10) > 0, 1, 0) +#' y <- survival::Surv(time, d) +#' coxnet.deviance(pred = eta, y = y) +#' +#' # if pred not provided, it is set to zero vector +#' coxnet.deviance(y = y) +#' +#' # example with x and beta +#' x <- matrix(rnorm(10 * 3), nrow = 10) +#' beta <- matrix(1:3, ncol = 1) +#' coxnet.deviance(y = y, x = x, beta = beta) +#' +#' # example with (start, stop] data +#' y2 <- survival::Surv(time, time + runif(10), d) +#' coxnet.deviance(pred = eta, y = y2) +#' +#' # example with strata +#' y2 <- stratifySurv(y, rep(1:2, length.out = 10)) +#' coxnet.deviance(pred = eta, y = y2) +#' #' @seealso \code{coxgrad} #' @keywords Cox model #' -#' @export coxnet.deviance -coxnet.deviance <- - function (pred = NULL, y, x = 0, offset = NULL, weights = NULL, - beta = NULL) -{ - storage.mode(x) = "double" - y = response.coxnet(y) - ty = y$time - tevent = y$event - ty = ty + (1 - tevent) * 100 * .Machine$double.eps - nobs = as.integer(length(ty)) - nvars = as.integer(ncol(x)) - nvec=1 +#' @export +coxnet.deviance <- function(pred = NULL, y, x = NULL, offset = NULL, + weights = NULL, std.weights = TRUE, beta = NULL) { + y <- response.coxnet(y) + + # if y has 2 columns, it is right-censored data + # if y has 3 columns, it is (start, stop] data + # otherwise, throw error + if (ncol(y) == 2) { + return(coxnet.deviance0(pred = pred, y = y, x = x, offset = offset, + weights = weights, std.weights = std.weights, + beta = beta)) + } else if (ncol(y) == 3) { + return(coxnet.deviance3(pred = pred, y = y, x = x, offset = offset, + weights = weights, std.weights = std.weights, + beta = beta)) + } else { + stop("Response y should have 2 or 3 columns") + } +} + +#' @rdname coxnet.deviance +coxnet.deviance0 <- function(pred = NULL, y, x = NULL, offset = NULL, + weights = NULL, std.weights = TRUE, beta = NULL) { + ty <- y[, "time"] + tevent <- y[, "status"] + ty <- ty + (1 - tevent) * 100 * .Machine$double.eps + nobs <- as.integer(length(ty)) + + # hack for the case where user passes in x as sparse matrix + if (!is.null(x) && inherits(x, "sparseMatrix")) { + if (is.null(beta)) + stop("if x is passed, beta must also be passed") + pred <- as.matrix(x %*% beta) + return(coxnet.deviance0(pred = pred, y = y, offset = offset, + weights = weights, std.weights = std.weights)) + } + + # Sort out the pred, x and beta options. + # If user provided `pred`, we let x = pred and beta = identity matrix. + # This allows us to use the loglike Fortran routine to compute the + # partial log likelihood. + # In the end, only x and beta are passed to the Fortran routine. 
+ if (!is.null(pred)) { + x <- as.matrix(pred) + nvec <- ncol(x) + beta <- diag(nvec) + nvars <- as.integer(nvec) + } else if (is.null(x) && is.null(beta)) { + x <- matrix(0, nrow = nobs, ncol = 1) + beta <- double(0) + nvec <- 1 + nvars <- as.integer(0) + } else if (!is.null(x) && !is.null(beta)) { + x <- as.matrix(x) + beta <- as.matrix(beta) + nvec <- ncol(beta) + nvars <- nrow(beta) + } else { + stop("user must pass either `pred`, or both `x` and `beta`") + } + storage.mode(x) <- "double" + storage.mode(beta) <- "double" + nvec <- as.integer(nvec) + nvars <- as.integer(nvars) + + # normalize weights to sum to nobs + if (is.null(weights)) + weights <- rep(1, nobs) + else { + if (std.weights) weights <- nobs * weights / sum(weights) + weights <- as.double(weights) + } + + if (is.null(offset)) + offset <- rep(0, nobs) + else offset <- as.double(offset) + + # extract strata (if any) + if ("strata" %in% names(attributes(y))) { + strata <- attr(y, "strata") + } else { + strata <- rep(1, nobs) + } + if (length(strata) != nobs) stop("length of strata != nobs") + + # if all in same strata, do the deviance computation + # if not, take the sum of the strata-level deviances + if (length(unique(strata)) == 1) { + ### Compute saturated loglikelihood + wd <- weights[tevent == 1] + tyd <- ty[tevent == 1] + if (any(duplicated(tyd))) { + wd <- tapply(wd, tyd, sum) + } + wd <- wd[wd > 0] + lsat <- -sum(wd * log(wd)) + #### + + fit <- .Fortran("loglike", nobs, nvars, x, ty, tevent, offset, + weights, nvec, beta, flog = double(nvec), jerr = integer(1), + PACKAGE = "glmnet") + if (fit$jerr != 0) { + errmsg <- jerr(fit$jerr, maxit = 0, pmax = 0, family = "cox") + if (errmsg$fatal) + stop(errmsg$msg, call. = FALSE) + else warning(errmsg$msg, call. = FALSE) + } + return(2 * (lsat - fit$flog)) + } else { + # more than one strata provided: return the sum of strata-level deviances + tot_dev <- 0 + for (i in unique(strata)) { + ii <- which(strata == i) + tot_dev <- tot_dev + + coxnet.deviance0(y = y[ii, , drop = FALSE], x = x[ii, , drop = FALSE], + beta = beta, offset = offset[ii], + weights = weights[ii], std.weights = FALSE) + } + return(tot_dev) + } +} + +#' @rdname coxnet.deviance +coxnet.deviance2 <- function(pred = NULL, y, x = NULL, offset = NULL, + weights = NULL, std.weights = TRUE, beta = NULL) { + if (!is.Surv(y)) stop("y must be a Surv object") + nobs <- nrow(y) + + # if pred is NULL, use beta and x to compute pred + # if beta is NULL too, set pred to all zeros + if (is.null(pred)) { + if ((!is.null(x) && is.null(beta)) || (is.null(x) && !is.null(beta))) + stop("user must pass either `pred`, or both `x` and `beta`") + if (is.null(beta)) { + pred <- rep(0, times = nobs) + } else { + pred <- x %*% beta + } + } + + # if more than one column of predictions is passed, run coxnet.deviance2() + # for each column + if (!is.null(ncol(pred)) && ncol(pred) > 1) { + return(sapply(seq(ncol(pred)), + function(j) coxnet.deviance2( + pred = pred[, j], y = y, offset = offset, + weights = weights, std.weights = std.weights))) + } else { + # check that pred is of the right length + if(length(pred) != nobs) stop("pred and y must have the same length") + + # normalize weights to sum to nobs if (is.null(weights)) - weights = rep(1, nobs) + w <- rep(1, nobs) else { - weights=nobs*weights/sum(weights) - weights = as.double(weights) + if (length(weights) != nobs) stop("weights and y must have the same length") + if (std.weights) { + w <- nobs * weights / sum(weights) + } else { + w <- weights + } + } + + # if there's an 
offset, add it to the pred vector + if (is.null(offset)) { + offset <- rep(0, nobs) + } else { + if (length(offset) != nobs) stop("offset and y must have the same length") + pred <- pred + offset + } + + # extract strata (if any) + if ("strata" %in% names(attributes(y))) { + strata <- attr(y, "strata") + } else { + strata <- rep(1, nobs) } -### Compute saturated loglikelihood - wd=weights[tevent==1] - tyd=ty[tevent==1] - if(any(duplicated(tyd))){ - wd=tapply(wd,tyd,sum) - } - wd=wd[wd>0] - lsat=-sum(wd*log(wd)) -#### - if (is.null(offset)) - offset = rep(0, nobs) - else offset=as.double(offset) + if (length(strata) != nobs) stop("length of strata != nobs") + + # if all in same strata, do the deviance computation + # if not, take the sum of the strata-level deviances + if (length(unique(strata)) == 1) { + time <- y[, "time"] + d <- y[, "status"] + + ### Compute saturated loglikelihood + wd <- w[d == 1] + tyd <- time[d == 1] + if (any(duplicated(tyd))) { + wd <- tapply(wd, tyd, sum) + } + wd <- wd[wd > 0] + lsat <- -sum(wd * log(wd)) + #### + + # order time, d, pred and w in ascending time order + # for tied times, all deaths come before censored observations + if ("stop_time" %in% names(attributes(y))) { + o <- attr(y, "stop_time") + } else { + o <- order(time, d, decreasing = c(FALSE, TRUE)) + } + time <- time[o] + d <- d[o] + pred <- pred[o] + w <- w[o] + + ### See if there are dups in death times + dups <- fid(time[d==1],seq(length(d))[d==1]) + dd <- d + ww <- w + + ### next code replaces each sequence of tied death indicators by a new + ### sequence where only the first is a 1 and the rest are zero. This + ### makes the accounting in the following step work properly we also + ### sums the weights in each of the tied death sets, and assign that + ### weight to the first + if(!is.null(ties<-dups$index_ties)){ + dd[unlist(ties)]=0 + dd[dups$index_first]=1 + wsum=sapply(ties,function(i,w)sum(w[i]),ww) + tie1=sapply(ties,function(i)i[1]) + ww[tie1]=wsum + } + + # compute the sum inside the log term of the partial likelihood + w_exp_pred <- w * exp(pred) + rsk <- rev(cumsum(rev(w_exp_pred))) + + # take just the terms related to actual death times + log_terms <- (ww * log(rsk))[dd > 0] + loglik <- sum((w * pred)[d > 0]) - sum(log_terms) + + return(2 * (lsat -loglik)) + } else { + # more than one strata provided: return the sum of strata-level + # deviances + tot_dev <- 0 + for (i in unique(strata)) { + ii <- which(strata == i) + tot_dev <- tot_dev + + coxnet.deviance2(pred = pred[ii], y = y[ii, , drop = FALSE], + offset = NULL, weights = w[ii], std.weights = FALSE) + } + return(tot_dev) + } + } +} + +#' @rdname coxnet.deviance +coxnet.deviance3 <- function(pred = NULL, y, x = NULL, offset = NULL, + weights = NULL, std.weights = TRUE, beta = NULL) { + if (!is.Surv(y)) stop("y must be a Surv object") + nobs <- nrow(y) + + # if pred is NULL, use beta and x to compute pred + # if beta is NULL too, set pred to all zeros + if (is.null(pred)) { + if ((!is.null(x) && is.null(beta)) || (is.null(x) && !is.null(beta))) + stop("user must pass either `pred`, or both `x` and `beta`") if (is.null(beta)) { - beta = double(0) - nvars = as.integer(0) + pred <- rep(0, times = nobs) + } else { + pred <- x %*% beta } + } + + # if more than one column of predictions is passed, run coxnet.deviance3() + # for each column + if (!is.null(ncol(pred)) && ncol(pred) > 1) { + return(sapply(seq(ncol(pred)), + function(j) coxnet.deviance3( + pred = pred[, j], y = y, offset = offset, + weights = weights, std.weights = 
std.weights))) + } else { + # check that pred is of the right length + if(length(pred) != nobs) stop("pred and y must have the same length") + + # normalize weights to sum to nobs + if (is.null(weights)) + w <- rep(1, nobs) else { - beta = as.matrix(beta) - nvec = ncol(beta) + if (length(weights) != nobs) stop("weights and y must have the same length") + if (std.weights) { + w <- nobs * weights / sum(weights) + } else { + w <- weights + } } - if(!is.null(pred)){ - # trick to get a set of deviances based on predictions" - x=as.matrix(pred) - nvec=ncol(x) - storage.mode(x)="double" - beta=diag(nvec) - nvars=as.integer(nvec) - storage.mode(beta)="double" + + # if there's an offset, add it to the pred vector + if (is.null(offset)) { + offset <- rep(0, nobs) + } else { + if (length(offset) != nobs) stop("offset and y must have the same length") + pred <- pred + offset } - nvec=as.integer(nvec) - - fit = .Fortran("loglike", nobs, nvars, x, ty, tevent, offset, - weights, nvec, beta, flog = double(nvec), jerr = integer(1), - PACKAGE = "glmnet") - if (fit$jerr != 0) { - errmsg = jerr(fit$jerr, maxit = 0, pmax = 0, family = "cox") - if (errmsg$fatal) - stop(errmsg$msg, call. = FALSE) - else warning(errmsg$msg, call. = FALSE) + + # extract strata (if any) + if ("strata" %in% names(attributes(y))) { + strata <- attr(y, "strata") + } else { + strata <- rep(1, nobs) } - 2 *(lsat-fit$flog) -} + if (length(strata) != nobs) stop("length of strata != nobs") + + # if all in same strata, do the deviance computation + # if not, take the sum of the strata-level deviances + if (length(unique(strata)) == 1) { + start_time <- y[, "start"] + stop_time <- y[, "stop"] + d <- y[, "status"] + + ### Compute saturated loglikelihood + wd <- w[d == 1] + tyd <- stop_time[d == 1] + if (any(duplicated(tyd))) { + wd <- tapply(wd, tyd, sum) + } + wd <- wd[wd > 0] + lsat <- -sum(wd * log(wd)) + #### + + # get ordering for stop time (ascending, deaths before censored), and + # start time (ascending) + if ("stop_time" %in% names(attributes(y))) { + stop_o <- attr(y, "stop_time") + } else { + stop_o <- order(stop_time, d, decreasing = c(FALSE, TRUE)) + } + if ("start_time" %in% names(attributes(y))) { + start_o <- attr(y, "start_time") + } else { + start_o <- order(start_time, decreasing = c(FALSE)) + } + + # keep a set of values which are ordered by start time + w_exp_pred_start <- (w * exp(pred))[start_o] + start_time_start <- start_time[start_o] + + # reorder everything by stop time + start_time <- start_time[stop_o] + stop_time <- stop_time[stop_o] + d <- d[stop_o] + pred <- pred[stop_o] + w <- w[stop_o] + + ### See if there are dups in death times + dups <- fid(stop_time[d == 1], seq(length(d))[d == 1]) + dd <- d + ww <- w + + ### next code replaces each sequence of tied death indicators by a new + ### sequence where only the first is a 1 and the rest are zero. This + ### makes the accounting in the following step work properly we also + ### sums the weights in each of the tied death sets, and assign that + ### weight to the first + if(!is.null(ties<-dups$index_ties)){ + dd[unlist(ties)]=0 + dd[dups$index_first]=1 + wsum=sapply(ties,function(i,w)sum(w[i]),ww) + tie1=sapply(ties,function(i)i[1]) + ww[tie1]=wsum + } + + # compute risk set sums rsk[i] = \sum_{j in R_i} w_j exp(eta_j) + # where i indexes the observations. (In the end, we will only care + # about the indices i which have actual death times.) 
+ rsk <- rev(cumsum(rev(w * exp(pred)))) + current_sum <- 0 + stop_idx <- nobs; start_idx <- nobs + while (stop_idx > 0 && start_idx > 0) { + if (start_time_start[start_idx] < stop_time[stop_idx]) { + # current start time belongs in risk set ending in stop time, + # so we should remove the current cumulative sum and consider + # the next risk set + rsk[stop_idx] <- rsk[stop_idx] - current_sum + stop_idx <- stop_idx - 1 + } else { + # current start time does not belong in risk set ending in stop + # time, so we should add it to current_sum and check if the + # start time before it should also be added + current_sum <- current_sum + w_exp_pred_start[start_idx] + start_idx <- start_idx - 1 + } + } + + log_terms <- ww[dups$index_first] * (log(rsk[dd == 1])) + loglik <- sum((w * pred)[d > 0]) - sum(log_terms) + return(2 * (lsat -loglik)) + } else { + # more than one strata provided: return the sum of strata-level + # deviances + tot_dev <- 0 + for (i in unique(strata)) { + ii <- which(strata == i) + tot_dev <- tot_dev + + coxnet.deviance3(pred = pred[ii], y = y[ii, , drop = FALSE], + offset = NULL, weights = w[ii], std.weights = FALSE) + } + return(tot_dev) + } + } +} \ No newline at end of file diff -Nru r-cran-glmnet-4.0-2/R/coxpath.R r-cran-glmnet-4.1/R/coxpath.R --- r-cran-glmnet-4.0-2/R/coxpath.R 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/R/coxpath.R 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,750 @@ +#' Check if glmnet should call cox.path +#' +#' Helper function to check if glmnet() should call cox.path(). +#' +#' For \code{family="cox"}, we only call the original coxnet() function if +#' (i) x is not sparse, (ii) y is right-censored data, and (iii) we are +#' not fitting a stratified Cox model. This function also throws an error +#' if y has a "strata" attribute but is not of type "stratifySurv". +#' +#' @param x Design matrix. +#' @param y Response variable. +#' +#' @return TRUE if cox.path() should be called, FALSE otherwise. +use.cox.path <- function(x, y) { + y <- response.coxnet(y) + use_cox_path <- TRUE + # We only return FALSE if: + # x is not sparse AND y is right-censored data AND no strata + # (strata variable being all equal counts as no strata) + if (!inherits(x, "sparseMatrix") && ncol(y) == 2) { + if (!("strata" %in% names(attributes(y))) || + length(unique(attr(y, "strata"))) == 1) + use_cox_path <- FALSE + } + + # if strata provided in y but y not of class stratifySurv, throw error + if ("strata" %in% names(attributes(y)) && !inherits(y, "stratifySurv")) + stop(paste0("For fitting stratified Cox models, y must be of class ", + "stratifySurv, see ?stratifySurv for more details")) + + return(use_cox_path) +} + +#' Fit a Cox regression model with elastic net regularization for a path of +#' lambda values +#' +#' Fit a Cox regression model via penalized maximum likelihood for a path of +#' lambda values. Can deal with (start, stop] data and strata, as well as +#' sparse design matrices. +#' +#' Sometimes the sequence is truncated before \code{nlambda} values of lambda +#' have been used. This happens when \code{cox.path} detects that the +#' decrease in deviance is marginal (i.e. we are near a saturated fit). +#' +#' @param x See glmnet help file +#' @param y Survival response variable, must be a \code{Surv} or +#' \code{stratifySurv} object. 
+#' @param weights See glmnet help file +#' @param offset See glmnet help file +#' @param alpha See glmnet help file +#' @param nlambda See glmnet help file +#' @param lambda.min.ratio See glmnet help file +#' @param lambda See glmnet help file +#' @param standardize See glmnet help file +#' @param thresh Convergence threshold for coordinate descent. Each inner +#' coordinate-descent loop continues until the maximum change in the objective +#' after any coefficient update is less than thresh times the null deviance. +#' Default value is \code{1e-10}. +#' @param exclude See glmnet help file +#' @param penalty.factor See glmnet help file +#' @param lower.limits See glmnet help file +#' @param upper.limits See glmnet help file +#' @param maxit See glmnet help file +#' @param trace.it Controls how much information is printed to screen. Default is +#' \code{trace.it=0} (no information printed). If \code{trace.it=1}, a progress +#' bar is displayed. If \code{trace.it=2}, some information about the fitting +#' procedure is printed to the console as the model is being fitted. +#' @param ... Other arguments passed from glmnet (not used right now). +#' +#' @return An object of class "coxnet" and "glmnet". +#' \item{a0}{Intercept value, \code{NULL} for "cox" family.} +#' \item{beta}{A \code{nvars x length(lambda)} matrix of coefficients, stored in +#' sparse matrix format.} +#' \item{df}{The number of nonzero coefficients for each value of lambda.} +#' \item{dim}{Dimension of coefficient matrix.} +#' \item{lambda}{The actual sequence of lambda values used. When alpha=0, the +#' largest lambda reported does not quite give the zero coefficients reported +#' (lambda=inf would in principle). Instead, the largest lambda for alpha=0.001 +#' is used, and the sequence of lambda values is derived from this.} +#' \item{dev.ratio}{The fraction of (null) deviance explained. The deviance +#' calculations incorporate weights if present in the model. The deviance is +#' defined to be 2*(loglike_sat - loglike), where loglike_sat is the log-likelihood +#' for the saturated model (a model with a free parameter per observation). +#' Hence dev.ratio=1-dev/nulldev.} +#' \item{nulldev}{Null deviance (per observation). This is defined to be +#' 2*(loglike_sat -loglike(Null)). 
The null model refers to the 0 model.}
+#' \item{npasses}{Total passes over the data summed over all lambda values.}
+#' \item{jerr}{Error flag, for warnings and errors (largely for internal
+#' debugging).}
+#' \item{offset}{A logical variable indicating whether an offset was included
+#' in the model.}
+#' \item{call}{The call that produced this object.}
+#' \item{nobs}{Number of observations.}
+#'
+#' @examples
+#' set.seed(2)
+#' nobs <- 100; nvars <- 15
+#' xvec <- rnorm(nobs * nvars)
+#' xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0
+#' x <- matrix(xvec, nrow = nobs)
+#' beta <- rnorm(nvars / 3)
+#' fx <- x[, seq(nvars / 3)] %*% beta / 3
+#' ty <- rexp(nobs, exp(fx))
+#' tcens <- rbinom(n = nobs, prob = 0.3, size = 1)
+#' jsurv <- survival::Surv(ty, tcens)
+#' fit1 <- glmnet:::cox.path(x, jsurv)
+#'
+#' # works with sparse x matrix
+#' x_sparse <- Matrix::Matrix(x, sparse = TRUE)
+#' fit2 <- glmnet:::cox.path(x_sparse, jsurv)
+#'
+#' # example with (start, stop] data
+#' set.seed(2)
+#' start_time <- runif(100, min = 0, max = 5)
+#' stop_time <- start_time + runif(100, min = 0.1, max = 3)
+#' status <- rbinom(n = nobs, prob = 0.3, size = 1)
+#' jsurv_ss <- survival::Surv(start_time, stop_time, status)
+#' fit3 <- glmnet:::cox.path(x, jsurv_ss)
+#'
+#' # example with strata
+#' jsurv_ss2 <- stratifySurv(jsurv_ss, rep(1:2, each = 50))
+#' fit4 <- glmnet:::cox.path(x, jsurv_ss2)
+cox.path <- function(x, y, weights=NULL, offset=NULL,
+                     alpha=1.0, nlambda=100,
+                     lambda.min.ratio=ifelse(nobs<nvars,1e-2,1e-4),
+                     lambda=NULL, standardize=TRUE, thresh=1e-10,
+                     exclude=NULL, penalty.factor=rep(1,nvars),
+                     lower.limits=-Inf, upper.limits=Inf, maxit=100000,
+                     trace.it=0, ...) {
+  if (alpha > 1) {
+    warning("alpha > 1; set to 1")
+    alpha = 1
+  } else if (alpha < 0) {
+    warning("alpha < 0; set to 0")
+    alpha = 0
+  }
+  alpha = as.double(alpha)
+
+  this.call <- match.call()
+
+  np = dim(x)
+  if (is.null(np) || (np[2] <= 1)) stop("x should be a matrix with 2 or more columns")
+  nobs = as.integer(np[1]); nvars = as.integer(np[2])
+
+  # get feature variable names
+  vnames <- colnames(x)
+  if(is.null(vnames)) vnames <- paste("V",seq(nvars),sep="")
+
+  # check weights
+  if(is.null(weights)) weights = rep(1,nobs)
+  else if (length(weights) != nobs)
+    stop(paste("Number of elements in weights (",length(weights),
+               ") not equal to the number of rows of x (",nobs,")",sep=""))
+  weights <- as.double(weights)
+
+  # check that response y is a Surv object of the correct length
+  y <- response.coxnet(y)
+  if (nrow(y) != nobs) stop(paste0("number of observations in y (" , nrow(y),
+                                   ") not equal to the number of rows of x (",
+                                   nobs, ")"))
+
+  # check offset option
+  is.offset <- !(is.null(offset))
+  if (is.offset == FALSE) {
+    offset <- rep(0, times = nrow(y))
+  }
+
+  # check and standardize penalty factors (to sum to nvars)
+  if(any(penalty.factor == Inf)) {
+    exclude = c(exclude, seq(nvars)[penalty.factor == Inf])
+    exclude = sort(unique(exclude))
+  }
+  if(length(exclude) > 0) {
+    jd <- match(exclude, seq(nvars), 0)
+    if(!all(jd > 0)) stop ("Some excluded variables out of range")
+    penalty.factor[jd] = 1 # ow can change lambda sequence
+  } else {
+    jd <- as.integer(0)
+  }
+  vp = pmax(0, penalty.factor)
+  if (max(vp) <= 0) stop("All penalty factors are <= 0")
+  vp = as.double(vp * nvars / sum(vp))
+
+  # if all the non-excluded variables have zero variance, throw error
+  isconst <- function(x) 1 - (max(x) == min(x)) * 1
+  if (inherits(x, "sparseMatrix")) {
+    xt <- as(t(x), "dgCMatrix")
+    lx <- split(xt@x, xt@i)
+    const_vars <- sapply(lx, isconst)
+  }
+  else const_vars <- apply(x, 2, isconst)
+  exclude <- c(exclude, which(const_vars == 0))
+  exclude <- sort(unique(exclude))
+  if
(length(exclude) == nvars) stop("All used predictors have zero variance") + + ### check on limits + control <- glmnet.control() + if (thresh >= control$epsnr) + warning("thresh should be smaller than glmnet.control()$epsnr", + call. = FALSE) + + if(any(lower.limits > 0)){ stop("Lower limits should be non-positive") } + if(any(upper.limits < 0)){ stop("Upper limits should be non-negative") } + lower.limits[lower.limits == -Inf] = -control$big + upper.limits[upper.limits == Inf] = control$big + if (length(lower.limits) < nvars) { + if(length(lower.limits) == 1) lower.limits = rep(lower.limits, nvars) else + stop("Require length 1 or nvars lower.limits") + } else lower.limits = lower.limits[seq(nvars)] + if (length(upper.limits) < nvars) { + if(length(upper.limits) == 1) upper.limits = rep(upper.limits, nvars) else + stop("Require length 1 or nvars upper.limits") + } else upper.limits = upper.limits[seq(nvars)] + + if (any(lower.limits == 0) || any(upper.limits == 0)) { + ###Bounds of zero can mess with the lambda sequence and fdev; + ###ie nothing happens and if fdev is not zero, the path can stop + fdev <- glmnet.control()$fdev + if(fdev!= 0) { + glmnet.control(fdev = 0) + on.exit(glmnet.control(fdev = fdev)) + } + } + ### end check on limits + ### end preparation of generic arguments + + # standardize x if necessary + xm <- rep(0.0, times = nvars) + if (standardize) { + xs <- apply(x, 2, function(r) sqrt(weighted.mean(r^2, weights) - + weighted.mean(r, weights)^2)) + xs[exclude] <- 1.0 + } else { + xs <- rep(1.0, times = nvars) + } + if (!inherits(x, "sparseMatrix")) { + x <- t((t(x) - xm) / xs) + } else { + attr(x, "xm") <- xm + attr(x, "xs") <- xs + } + lower.limits <- lower.limits * xs + upper.limits <- upper.limits * xs + + if (!("strata" %in% names(attributes(y)))) + y <- stratifySurv(y, rep(1, nobs)) + + # Pre-compute and cache some important information: ordering by stop time + # (ascending, deaths before censored), and for (start, stop] data: ordering + # by start time and some match information. + # Information is computed at the strata level + if (ncol(y) == 2) { + stop_o <- numeric(nobs) + for (i in unique(attr(y, "strata"))) { + ii <- which(attr(y, "strata") == i) + stop_o[ii] <- order(y[ii, "time"], y[ii, "status"], + decreasing = c(FALSE, TRUE)) + } + attr(y, "stop_time") <- stop_o + } else { + stop_o <- numeric(nobs) + start_o <- numeric(nobs) + ss_match <- numeric(nobs) + for (i in unique(attr(y, "strata"))) { + ii <- which(attr(y, "strata") == i) + stop_o[ii] <- order(y[ii, "stop"], y[ii, "status"], + decreasing = c(FALSE, TRUE)) + start_o[ii] <- order(y[ii, "start"], decreasing = c(FALSE)) + ss_match[ii] <- match(start_o[ii], stop_o[ii]) + } + attr(y, "stop_time") <- stop_o + attr(y, "start_time") <- start_o + attr(y, "ss_match") <- ss_match + } + + # compute null deviance + # currently using std.weights = FALSE in order to match glmnet output + nulldev <- coxnet.deviance(y = y, offset = offset, weights = weights, + std.weights = FALSE) + + # compute lambda_max and lambda values + nlam = as.integer(nlambda) + user_lambda = FALSE # did user provide their own lambda values? 
+  if (is.null(lambda)) {
+    if (lambda.min.ratio >= 1) stop("lambda.min.ratio should be less than 1")
+    lambda_max <- get_cox_lambda_max(x, y, alpha, weights, offset, exclude, vp)
+    ulam <- exp(seq(log(lambda_max), log(lambda_max * lambda.min.ratio),
+                    length.out = nlam))
+  } else { # user provided lambda values
+    user_lambda = TRUE
+    if (any(lambda < 0)) stop("lambdas should be non-negative")
+    ulam <- as.double(rev(sort(lambda)))
+    nlam <- as.integer(length(lambda))
+  }
+
+  # start progress bar
+  if (trace.it == 1) pb <- createPB(min = 0, max = nlam, style = 3)
+
+  beta <- matrix(0, nrow = nvars, ncol = nlam)
+  dev.ratio <- rep(NA, length = nlam)
+  fit <- NULL
+  mnl <- min(nlam, control$mnlam)
+  for (k in 1:nlam) {
+    # get the correct lambda value to fit
+    if (k > 1) {
+      cur_lambda <- ulam[k]
+    } else {
+      cur_lambda <- ifelse(user_lambda, ulam[k], control$big)
+    }
+
+    if (trace.it == 2) cat("Fitting lambda index", k, ":", ulam[k], fill = TRUE)
+    fit <- cox.fit(x, y, weights / sum(weights),
+                   lambda = cur_lambda, alpha = alpha,
+                   offset = offset, thresh = thresh, maxit = maxit,
+                   penalty.factor = vp, exclude = exclude,
+                   lower.limits = lower.limits, upper.limits = upper.limits,
+                   warm = fit, from.cox.path = TRUE, save.fit = TRUE,
+                   trace.it = trace.it)
+    if (trace.it == 1) utils::setTxtProgressBar(pb, k)
+    # if error code non-zero, a non-fatal error must have occurred
+    # print warning, ignore this lambda value and return result
+    # for all previous lambda values
+    if (fit$jerr != 0) {
+      errmsg <- jerr.glmnetfit(fit$jerr, maxit, k)
+      warning(errmsg$msg, call. = FALSE)
+      k <- k - 1
+      break
+    }
+    beta[, k] <- as.vector(fit$beta)
+    dev.ratio[k] <- fit$dev.ratio
+
+    # early stopping if dev.ratio almost 1 or no improvement
+    if (k >= mnl && user_lambda == FALSE) {
+      if (dev.ratio[k] > control$devmax * 0.99 / 0.999) break
+      if (k > 1 && dev.ratio[k] - dev.ratio[k - mnl + 1] <
+          control$fdev * 100 * dev.ratio[k]) break
+    }
+  }
+  if (trace.it == 1) {
+    utils::setTxtProgressBar(pb, nlam)
+    cat("", fill = TRUE)
+  }
+
+  # truncate beta, dev.ratio, lambda if necessary
+  if (k < nlam) {
+    beta <- beta[, 1:k, drop = FALSE]
+    dev.ratio <- dev.ratio[1:k]
+    ulam <- ulam[1:k]
+  }
+
+  # return coefficients to original scale (because of x standardization)
+  beta <- beta / xs
+
+  # output
+  stepnames <- paste0("s", 0:(length(ulam) - 1))
+  out <- list(a0 = NULL)
+  out$beta <- Matrix::Matrix(beta, sparse = TRUE,
+                             dimnames = list(vnames, stepnames))
+  out$df <- as.vector(colSums(abs(beta) > 0))  # as.vector to remove names
+  out$dim <- dim(beta)
+  out$lambda <- ulam
+  out$dev.ratio <- dev.ratio
+  out$nulldev <- nulldev
+  out$npasses <- fit$npasses
+  out$jerr <- fit$jerr
+  out$offset <- is.offset
+  out$call <- this.call
+  out$nobs <- nobs
+  class(out) <- c("coxnet", "glmnet")
+  out
+}
+
+#' Fit a Cox regression model with elastic net regularization for a single
+#' value of lambda
+#'
+#' Fit a Cox regression model via penalized maximum likelihood for a single
+#' value of lambda. Can deal with (start, stop] data and strata, as well as
+#' sparse design matrices.
+#'
+#' WARNING: Users should not call \code{cox.fit} directly. Higher-level
+#' functions in this package call \code{cox.fit} as a subroutine. If a
+#' warm start object is provided, some of the other arguments in the function
+#' may be overridden.
+#'
+#' \code{cox.fit} solves the elastic net problem for a single, user-specified
+#' value of lambda. \code{cox.fit} works for Cox regression models, including
+#' (start, stop] data and strata. It solves the problem using iteratively
+#' reweighted least squares (IRLS). For each IRLS iteration, \code{cox.fit}
+#' makes a quadratic (Newton) approximation of the log-likelihood, then calls
+#' \code{elnet.fit} to minimize the resulting approximation.
+#'
+#' In terms of standardization: \code{cox.fit} does not standardize \code{x}
+#' and \code{weights}. \code{penalty.factor} is standardized so that the
+#' factors sum up to \code{nvars}.
+#'
+#' @param x Input matrix, of dimension \code{nobs x nvars}; each row is an
+#' observation vector. If it is a sparse matrix, it is assumed to be unstandardized.
+#' It should have attributes \code{xm} and \code{xs}, where \code{xm(j)} and
+#' \code{xs(j)} are the centering and scaling factors for variable j respectively.
+#' If it is not a sparse matrix, it is assumed that any standardization needed
+#' has already been done.
+#' @param y Survival response variable, must be a Surv or stratifySurv object.
+#' @param weights Observation weights. \code{cox.fit} does NOT standardize
+#' these weights.
+#' @param lambda A single value for the \code{lambda} hyperparameter.
+#' @param alpha See glmnet help file
+#' @param offset See glmnet help file
+#' @param thresh Convergence threshold for coordinate descent. Each inner
+#' coordinate-descent loop continues until the maximum change in the objective
+#' after any coefficient update is less than thresh times the null deviance.
+#' Default value is \code{1e-10}.
+#' @param maxit Maximum number of passes over the data; default is \code{10^5}.
+#' (If a warm start object is provided, the number of passes the warm start object
+#' performed is included.)
+#' @param penalty.factor See glmnet help file
+#' @param exclude See glmnet help file
+#' @param lower.limits See glmnet help file
+#' @param upper.limits See glmnet help file
+#' @param warm Either a \code{glmnetfit} object or a list (with name \code{beta}
+#' containing coefficients) which can be used as a warm start. Default is
+#' \code{NULL}, indicating no warm start. For internal use only.
+#' @param from.cox.path Was \code{cox.fit()} called from \code{cox.path()}?
+#' Default is FALSE. This has implications for computation of the penalty factors.
+#' @param save.fit Return the warm start object? Default is FALSE.
+#' @param trace.it Controls how much information is printed to screen. If
+#' \code{trace.it=2}, some information about the fitting procedure is printed to
+#' the console as the model is being fitted. Default is \code{trace.it=0}
+#' (no information printed). (\code{trace.it=1} not used for compatibility with
+#' \code{glmnet.path}.)
+#'
+#' @return An object with class "coxnet", "glmnetfit" and "glmnet". The list
+#' returned contains more keys than that of a "glmnet" object.
+#' \item{a0}{Intercept value, \code{NULL} for "cox" family.}
+#' \item{beta}{A \code{nvars x 1} matrix of coefficients, stored in sparse matrix
+#' format.}
+#' \item{df}{The number of nonzero coefficients.}
+#' \item{dim}{Dimension of coefficient matrix.}
+#' \item{lambda}{Lambda value used.}
+#' \item{dev.ratio}{The fraction of (null) deviance explained. The deviance
+#' calculations incorporate weights if present in the model. The deviance is
+#' defined to be 2*(loglike_sat - loglike), where loglike_sat is the log-likelihood
+#' for the saturated model (a model with a free parameter per observation).
+#' Hence dev.ratio=1-dev/nulldev.}
+#' \item{nulldev}{Null deviance (per observation). This is defined to be
+#' 2*(loglike_sat -loglike(Null)).
The null model refers to the 0 model.} +#' \item{npasses}{Total passes over the data.} +#' \item{jerr}{Error flag, for warnings and errors (largely for internal +#' debugging).} +#' \item{offset}{A logical variable indicating whether an offset was included +#' in the model.} +#' \item{call}{The call that produced this object.} +#' \item{nobs}{Number of observations.} +#' \item{warm_fit}{If \code{save.fit=TRUE}, output of FORTRAN routine, used for +#' warm starts. For internal use only.} +#' \item{family}{Family used for the model, always "cox".} +#' \item{converged}{A logical variable: was the algorithm judged to have +#' converged?} +#' \item{boundary}{A logical variable: is the fitted value on the boundary of +#' the attainable values?} +#' \item{obj_function}{Objective function value at the solution.} +cox.fit <- function(x, y, weights, lambda, alpha = 1.0, offset = rep(0, nobs), + thresh = 1e-10, maxit = 100000, + penalty.factor = rep(1.0, nvars), exclude = c(), + lower.limits = -Inf, upper.limits = Inf, warm = NULL, + from.cox.path = FALSE, save.fit = FALSE, trace.it = 0) { + this.call <- match.call() + control <- glmnet.control() + + ### Prepare all generic arguments + nobs <- nrow(x) + nvars <- ncol(x) + is.offset <- !(missing(offset)) + if (is.offset == FALSE) { + offset <- as.double(rep(0, nobs)) + } + # add xm and xs attributes if they are missing for sparse x + # glmnet.fit assumes that x is already standardized. Any standardization + # the user wants should be done beforehand. + if (inherits(x, "sparseMatrix")) { + if ("xm" %in% names(attributes(x)) == FALSE) + attr(x, "xm") <- rep(0.0, times = nvars) + if ("xs" %in% names(attributes(x)) == FALSE) + attr(x, "xs") <- rep(1.0, times = nvars) + } + + # if calling from cox.path(), we do not need to check on exclude + # and penalty.factor arguments as they have been prepared by cox.path() + if (!from.cox.path) { + # check and standardize penalty factors (to sum to nvars) + if(any(penalty.factor == Inf)) { + exclude = c(exclude, seq(nvars)[penalty.factor == Inf]) + exclude = sort(unique(exclude)) + } + if(length(exclude) > 0) { + jd = match(exclude, seq(nvars), 0) + if(!all(jd > 0)) stop ("Some excluded variables out of range") + penalty.factor[jd] = 1 # ow can change lambda sequence + } + vp = pmax(0, penalty.factor) + vp = as.double(vp * nvars / sum(vp)) + } else { + vp <- as.double(penalty.factor) + } + + ### check on limits + lower.limits[lower.limits == -Inf] = -control$big + upper.limits[upper.limits == Inf] = control$big + if (length(lower.limits) < nvars) + lower.limits = rep(lower.limits, nvars) else + lower.limits = lower.limits[seq(nvars)] + if (length(upper.limits) < nvars) + upper.limits = rep(upper.limits, nvars) else + upper.limits = upper.limits[seq(nvars)] + ### end check on limits + ### end preparation of generic arguments + + # compute null deviance + if (is.null(warm)) { + nulldev <- coxnet.deviance(y = y, offset = offset, weights = weights, + std.weights = FALSE) + fit <- NULL + coefold <- rep(0, nvars) # initial coefs = 0 + eta <- offset + } else { + if ("glmnetfit" %in% class(warm)) { + if (class(warm$warm_fit) != "warmfit") stop("Invalid warm start object") + fit <- warm + nulldev <- fit$nulldev + coefold <- fit$warm_fit$a # prev value for coefficients + eta <- get_eta(x, coefold, 0) + offset + } else if ("list" %in% class(warm) && "beta" %in% names(warm)) { + fit <- warm + nulldev <- coxnet.deviance(y = y, offset = offset, weights = weights, + std.weights = FALSE) + coefold <- fit$beta # prev value for 
coefficients + eta <- get_eta(x, coefold, 0) + offset + fit$a0 <- 0 # needed for compatibility with elnet.fit() + } else { + stop("Invalid warm start object") + } + } + + start <- NULL # current value for coefficients + obj_val_old <- cox_obj_function(y, eta, weights, lambda, alpha, coefold, vp) + if (trace.it == 2) { + cat("Warm Start Objective:", obj_val_old, fill = TRUE) + } + conv <- FALSE # converged? + + # IRLS loop + for (iter in 1L:control$mxitnr) { + # compute working response and weights + coxgrad_results <- coxgrad(eta, y, weights, std.weights = FALSE, + diag.hessian = TRUE) + w <- -attributes(coxgrad_results)$diag_hessian + z <- (eta - offset) - ifelse(w != 0, -coxgrad_results / w, 0) + + # have to update the weighted residual in our fit object + # (in theory g and iy should be updated too, but we actually recompute g + # and iy anyway in wls.f) + if (!is.null(fit)) { + fit$warm_fit$r <- w * (z - eta + offset) + } + + # do WLS with warmstart from previous iteration + fit <- elnet.fit(x, z, w, lambda, alpha, intercept = FALSE, + thresh = thresh, maxit = maxit, penalty.factor = vp, + exclude = exclude, lower.limits = lower.limits, + upper.limits = upper.limits, warm = fit, + from.glmnet.fit = TRUE, save.fit = TRUE) + if (fit$jerr != 0) return(list(jerr = fit$jerr)) + + # update coefficients, eta, mu and obj_val + start <- fit$warm_fit$a + eta <- get_eta(x, start, 0) + offset + obj_val <- cox_obj_function(y, eta, weights, lambda, alpha, start, vp) + if (trace.it == 2) cat("Iteration", iter, "Objective:", obj_val, fill = TRUE) + + boundary <- FALSE + halved <- FALSE # did we have to halve the step size? + # if objective function is not finite, keep halving the stepsize until it is finite + # for the halving step, we probably have to adjust fit$g as well? + if (!is.finite(obj_val) || obj_val > control$big) { + warning("Infinite objective function!", call. = FALSE) + if (is.null(coefold)) + stop("no valid set of coefficients has been found: please supply starting values", + call. = FALSE) + warning("step size truncated due to divergence", call. = FALSE) + ii <- 1 + while (!is.finite(obj_val) || obj_val > control$big) { + if (ii > control$mxitnr) + stop("inner loop 1; cannot correct step size", call. = FALSE) + ii <- ii + 1 + start <- (start + coefold)/2 + eta <- get_eta(x, start, 0) + offset + obj_val <- cox_obj_function(y, eta, weights, lambda, alpha, start, vp) + if (trace.it == 2) cat("Iteration", iter, " Halved step 1, Objective:", + obj_val, fill = TRUE) + } + boundary <- TRUE + halved <- TRUE + } + + # if we did any halving, we have to update the coefficients, intercept + # and weighted residual in the warm_fit object + if (halved) { + fit$warm_fit$a <- start + fit$warm_fit$r <- w * (z - eta) + } + + # test for convergence + if (abs(obj_val - obj_val_old)/(0.1 + abs(obj_val)) < control$epsnr) { + conv <- TRUE + break + } + else { + coefold <- start + obj_val_old <- obj_val + } + } + # end of IRLS loop + + # checks on convergence and fitted values + if (!conv) + warning("cox.fit: algorithm did not converge", call. 
= FALSE)
+
+  # prepare output object
+  if (save.fit == FALSE) {
+    fit$warm_fit <- NULL
+  }
+  # overwrite values from elnet.fit object
+  fit$a0 <- list(NULL)
+  fit$call <- this.call
+  fit$offset <- is.offset
+  fit$nulldev <- nulldev
+  fit$dev.ratio <- 1 - coxnet.deviance(y = y, pred = eta, weights = weights,
+                                       std.weights = FALSE) / nulldev
+
+  # add new key-value pairs to list
+  fit$family <- "cox"
+  fit$converged <- conv
+  fit$boundary <- boundary
+  fit$obj_function <- obj_val
+
+  class(fit) <- c("coxnet", "glmnetfit", "glmnet")
+  fit
+}
+
+#' Elastic net objective function value for Cox regression model
+#'
+#' Returns the elastic net objective function value for Cox regression model.
+#'
+#' @param y Survival response variable, must be a \code{Surv} or
+#' \code{stratifySurv} object.
+#' @param pred Model's predictions for \code{y}.
+#' @param weights Observation weights.
+#' @param lambda A single value for the \code{lambda} hyperparameter.
+#' @param alpha The elasticnet mixing parameter, with \eqn{0 \le \alpha \le 1}.
+#' @param coefficients The model's coefficients.
+#' @param vp Penalty factors for each of the coefficients.
+cox_obj_function <- function(y, pred, weights, lambda, alpha,
+                             coefficients, vp) {
+  coxnet.deviance(y = y, pred = pred, weights = weights, std.weights = FALSE) +
+    lambda * pen_function(coefficients, alpha, vp)
+}
+
+#' Get lambda max for Cox regression model
+#'
+#' Return the lambda max value for Cox regression model, used for computing
+#' initial lambda values. For internal use only.
+#'
+#' This function is called by \code{cox.path} for the value of lambda max.
+#'
+#' When \code{x} is not sparse, it is expected to already be centered and scaled.
+#' When \code{x} is sparse, the function will get its attributes \code{xm} and
+#' \code{xs} for its centering and scaling factors. The value of
+#' \code{lambda_max} changes depending on whether \code{x} is centered and
+#' scaled or not, so we need \code{xm} and \code{xs} to get the correct value.
+#'
+#' @param x Input matrix, of dimension \code{nobs x nvars}; each row is an
+#' observation vector. If it is a sparse matrix, it is assumed to be unstandardized.
+#' It should have attributes \code{xm} and \code{xs}, where \code{xm(j)} and
+#' \code{xs(j)} are the centering and scaling factors for variable j respectively.
+#' If it is not a sparse matrix, it is assumed to be standardized.
+#' @param y Survival response variable, must be a \code{Surv} or
+#' \code{stratifySurv} object.
+#' @param alpha The elasticnet mixing parameter, with \eqn{0 \le \alpha \le 1}.
+#' @param weights Observation weights.
+#' @param offset Offset for the model. Default is a zero vector of length
+#' \code{nrow(y)}.
+#' @param exclude Indices of variables to be excluded from the model.
+#' @param vp Separate penalty factors can be applied to each coefficient.
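For a dense, pre-standardized x with unit penalty factors, no offset, and no excluded variables, the lambda max documented here reduces to the largest absolute inner product between a column of x and the gradient of the log partial likelihood at the null fit, divided by max(alpha, 1e-3). A rough sketch of that reduction using the exported coxgrad() (simulated data; this mirrors, rather than calls, the internal routine):

library(glmnet)
library(survival)
set.seed(1)
x <- scale(matrix(rnorm(100 * 5), 100, 5))  # pre-standardized dense x
y <- Surv(rexp(100), rbinom(100, 1, 0.7))   # simulated right-censored response
g <- coxgrad(rep(0, 100), y)                # gradient at the null fit (eta = 0)
alpha <- 0.5
lambda_max <- max(abs(crossprod(x, g))) / max(alpha, 1e-3)

At this value of lambda every penalized coefficient is zero, which is why cox.path() starts its sequence there and decreases it geometrically down to lambda.min.ratio * lambda_max.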
+get_cox_lambda_max <- function(x, y, alpha, weights = rep(1, nrow(x)), + offset = rep(0, nrow(x)), exclude = c(), + vp = rep(1, ncol(x))) { + nobs <- nrow(x); nvars <- ncol(x) + + # extract strata (if any) + if ("strata" %in% names(attributes(y))) { + strata <- attr(y, "strata") + } else { + strata <- rep(1, nobs) + } + if (length(strata) != nobs) stop("length of strata != nobs") + + # if some penalty factors are zero, we need to compute eta + vp_zero <- setdiff(which(vp == 0), exclude) + if (length(vp_zero) > 0) { + tempx <- x[, vp_zero, drop = FALSE] + if (length(unique(strata)) == 1) { + fit <- survival::coxph(y ~ offset(offset) + tempx, weights = weights) + } else { + fit <- survival::coxph(y ~ offset(offset) + tempx + strata(strata), + weights = weights) + } + + eta <- predict(fit) + } else { + eta <- offset + } + + isconst <- function(x) 1 - (max(x) == min(x)) * 1 + if (inherits(x, "sparseMatrix")) { + xt <- as(t(x), "dgCMatrix") + lx <- split(xt@x, xt@i) + ju <- sapply(lx, isconst) + } + else ju <- apply(x, 2, isconst) + ju[exclude] <- 0 + + # get cox gradient at "null" point + # note that coxgrad already includes weights, so no need to include them + # in subsequent computations + null_grad <- coxgrad(eta, y, weights) + + if (inherits(x, "sparseMatrix")) { + xm <- attr(x, "xm") + xs <- attr(x, "xs") + g <- unlist(lapply(1:nvars, function(j) + abs(sum(null_grad * (x[, j] - xm[j]) / xs[j])))) + } else { + g <- unlist(lapply(1:nvars, function(j) + abs(sum(null_grad * x[, j])))) + } + g <- g / ifelse(vp > 0, vp, 1) + g[ju == 0] <- 0 + lambda_max <- max(g) / max(alpha, 1e-3) + return(lambda_max) +} diff -Nru r-cran-glmnet-4.0-2/R/cv.glmnetfit.R r-cran-glmnet-4.1/R/cv.glmnetfit.R --- r-cran-glmnet-4.0-2/R/cv.glmnetfit.R 2020-05-06 19:54:31.000000000 +0000 +++ r-cran-glmnet-4.1/R/cv.glmnetfit.R 2021-01-06 22:06:44.000000000 +0000 @@ -1,7 +1,7 @@ cv.glmnetfit <-function(predmat,y,type.measure,weights,foldid,grouped){ family=attr(predmat,"family") mumat=family$linkinv(predmat) - nobs=nrow(mumat) + nobs=nrow(predmat)# was nrow(mumat), which failed for tweedie instance ## initialize from family function. Makes y a vector in case of binomial, and possibly changes weights ## Expects nobs to be defined, and creates n and mustart (neither used here) ## Some cases expect to see things, so we set it up just to make it work diff -Nru r-cran-glmnet-4.0-2/R/cv.glmnet.R r-cran-glmnet-4.1/R/cv.glmnet.R --- r-cran-glmnet-4.0-2/R/cv.glmnet.R 2019-10-28 18:47:24.000000000 +0000 +++ r-cran-glmnet-4.1/R/cv.glmnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -19,7 +19,7 @@ #' If \code{relax=TRUE} then the values of \code{gamma} are used to mix the #' fits. If \eqn{\eta} is the fit for lasso/elastic net, and \eqn{\eta_R} is #' the relaxed fit (with unpenalized coefficients), then a relaxed fit mixed by -#' \eqn{\gamma} is \deqn{\eta(\gamma)=(1-\gamma)\eta_R+\gamma\eta}. There is +#' \eqn{\gamma} is \deqn{\eta(\gamma)=(1-\gamma)\eta_R+\gamma\eta.} There is #' practically no extra cost for having a lot of values for \code{gamma}. #' However, 5 seems sufficient for most purposes. CV then selects both #' \code{gamma} and \code{lambda}. @@ -74,7 +74,7 @@ #' @param keep If \code{keep=TRUE}, a \emph{prevalidated} array is returned #' containing fitted values for each observation and each value of #' \code{lambda}. This means these fits are computed with this observation and -#' the rest of its fold omitted. The \code{folid} vector is also returned. +#' the rest of its fold omitted. 
The \code{foldid} vector is also returned. #' Default is keep=FALSE. If \code{relax=TRUE}, then a list of such arrays is #' returned, one for each value of 'gamma'. Note: if the value 'gamma=1' is #' omitted, this case is included in the list since it corresponds to the @@ -108,9 +108,10 @@ #' \code{keep=TRUE}, this is the array of prevalidated fits. Some entries can #' be \code{NA}, if that and subsequent values of \code{lambda} are not reached #' for that fold} \item{foldid}{if \code{keep=TRUE}, the fold assignments used} +#' \item{index}{a one column matrix with the indices of \code{lambda.min} and \code{lambda.1se} in the sequence of coefficients, fits etc.} #' \item{relaxed}{if \code{relax=TRUE}, this additional item has the CV info #' for each of the mixed fits. In particular it also selects \code{lambda, -#' gamma} pairs corresponding to the 1SE rule, as well as the minimum error.} +#' gamma} pairs corresponding to the 1se rule, as well as the minimum error. It also has a component \code{index}, a two-column matrix which contains the \code{lambda} and \code{gamma} indices corresponding to the "min" and "1se" solutions.} #' @author Jerome Friedman, Trevor Hastie and Rob Tibshirani\cr Noah Simon #' helped develop the 'coxnet' function.\cr Jeffrey Wong and B. Narasimhan #' helped with the parallel option\cr Maintainer: Trevor Hastie diff -Nru r-cran-glmnet-4.0-2/R/getOptcv.glmnet.R r-cran-glmnet-4.1/R/getOptcv.glmnet.R --- r-cran-glmnet-4.0-2/R/getOptcv.glmnet.R 2019-09-04 20:59:46.000000000 +0000 +++ r-cran-glmnet-4.1/R/getOptcv.glmnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -7,7 +7,9 @@ lambda.min = max(lambda[idmin], na.rm = TRUE) idmin = match(lambda.min, lambda) semin = (cvm + cvsd)[idmin] - idmin = cvm <= semin - lambda.1se = max(lambda[idmin], na.rm = TRUE) - list(lambda.min = lambda.min, lambda.1se = lambda.1se) + id1se = cvm <= semin + lambda.1se = max(lambda[id1se], na.rm = TRUE) + id1se = match(lambda.1se, lambda) + index=matrix(c(idmin,id1se),2,1,dimnames=list(c("min","1se"),"Lambda")) + list(lambda.min = lambda.min, lambda.1se = lambda.1se, index = index) } diff -Nru r-cran-glmnet-4.0-2/R/getOptcv.relaxed.R r-cran-glmnet-4.1/R/getOptcv.relaxed.R --- r-cran-glmnet-4.0-2/R/getOptcv.relaxed.R 2019-09-04 20:59:46.000000000 +0000 +++ r-cran-glmnet-4.1/R/getOptcv.relaxed.R 2021-01-06 22:06:54.000000000 +0000 @@ -1,13 +1,16 @@ getOptcv.relaxed <- function (statlist,cvname,gamma) { + index=matrix(NA,2,2,dimnames=list(c("min","1se"),c("Lambda","Gamma"))) cvm=lapply(statlist,"[[","cvm") nlams=sapply(cvm,length) + index.lambda=unlist(lapply(nlams,seq)) cvm=unlist(cvm) lambdas=unlist(lapply(statlist,"[[","lambda")) cvsd=unlist(lapply(statlist,"[[","cvsd")) nzero=unlist(lapply(statlist,"[[","nzero")) gammas=rep(gamma,nlams) + index.gamma = rep(seq(along=gamma),nlams) names(lambdas)=NULL names(gammas)=NULL if(match(cvname,c("AUC","C-index"),0))cvm=-cvm @@ -17,6 +20,8 @@ which=order(lambdas[idmin],gammas[idmin],decreasing=TRUE)[1] lambda.min = lambdas[idmin][which] gamma.min= gammas[idmin][which] + index["min","Lambda"]=index.lambda[idmin][which] + index["min","Gamma"]=index.gamma[idmin][which] nzero.min=nzero[idmin][which] idmin=seq(along=lambdas)[idmin][which] semin = (cvm + cvsd)[idmin] @@ -24,6 +29,8 @@ which=order(lambdas[idmin],gammas[idmin],decreasing=TRUE)[1] lambda.1se = lambdas[idmin][which] gamma.1se= gammas[idmin][which] + index["1se","Lambda"]=index.lambda[idmin][which] + index["1se","Gamma"]=index.gamma[idmin][which] nzero.1se=nzero[idmin][which] - list(lambda.min = 
lambda.min, lambda.1se = lambda.1se, gamma.min=gamma.min, gamma.1se=gamma.1se,nzero.min=nzero.min,nzero.1se=nzero.1se) + list(lambda.min = lambda.min, lambda.1se = lambda.1se, gamma.min=gamma.min, gamma.1se=gamma.1se,nzero.min=nzero.min,nzero.1se=nzero.1se, index=index) } diff -Nru r-cran-glmnet-4.0-2/R/glmnet.control.R r-cran-glmnet-4.1/R/glmnet.control.R --- r-cran-glmnet-4.0-2/R/glmnet.control.R 2020-05-06 19:54:31.000000000 +0000 +++ r-cran-glmnet-4.1/R/glmnet.control.R 2021-01-06 22:06:54.000000000 +0000 @@ -27,7 +27,7 @@ #' @param itrace If 1 then progress bar is displayed when running \code{glmnet} #' and \code{cv.glmnet}. factory default = 0 #' @param epsnr convergence threshold for \code{glmnet.fit}. factory default = -#' 1.0e-8 +#' 1.0e-6 #' @param mxitnr maximum iterations for the IRLS loop in \code{glmnet.fit}. factory #' default = 25 #' @param factory If \code{TRUE}, reset all the parameters to the factory @@ -47,14 +47,14 @@ glmnet.control <- function (fdev = 1e-05, devmax = 0.999, eps = 1e-06, big = 9.9e+35, mnlam = 5, pmin = 1e-09, exmx = 250, prec = 1e-10, mxit = 100, - itrace = 0, epsnr = 1e-08, mxitnr = 25, factory = FALSE) + itrace = 0, epsnr = 1e-06, mxitnr = 25, factory = FALSE) { inquiry=!nargs() if (factory) invisible(glmnet.control(fdev = 1e-05, devmax = 0.999, eps = 1e-06, big = 9.9e+35, mnlam = 5, pmin = 1e-09, exmx = 250, prec = 1e-10, mxit = 100, itrace = 0, - epsnr = 1e-08, mxitnr = 25)) + epsnr = 1e-06, mxitnr = 25)) else { if (!missing(fdev)) .Fortran("chg_fract_dev", as.double(fdev), PACKAGE = "glmnet") diff -Nru r-cran-glmnet-4.0-2/R/glmnetFlex.R r-cran-glmnet-4.1/R/glmnetFlex.R --- r-cran-glmnet-4.0-2/R/glmnetFlex.R 2020-06-14 23:13:53.000000000 +0000 +++ r-cran-glmnet-4.1/R/glmnetFlex.R 2021-01-06 22:06:54.000000000 +0000 @@ -99,7 +99,6 @@ #' #' # binomial with probit link #' fit1 <- glmnet:::glmnet.path(x, y, family = binomial(link = "probit")) - glmnet.path <- function(x, y, weights=NULL, lambda = NULL, nlambda = 100, lambda.min.ratio = ifelse(nobs 0, vp, 1) + g <- abs((drop(t(rv) %*% x) - sum(rv) * xm) / xs) } else { - g <- unlist(lapply(1:nvars, function(j) - abs(sum(r / v * m.e * x[, j] * weights)))) * ju / ifelse(vp > 0, vp, 1) + g <- abs(drop(t(rv) %*% x)) } - + g <- g * ju / ifelse(vp > 0, vp, 1) lambda_max <- max(g) / max(alpha, 1e-3) list(nulldev = nulldev, mu = mu, lambda_max = lambda_max) diff -Nru r-cran-glmnet-4.0-2/R/glmnet-package.R r-cran-glmnet-4.1/R/glmnet-package.R --- r-cran-glmnet-4.0-2/R/glmnet-package.R 2020-06-14 21:47:22.000000000 +0000 +++ r-cran-glmnet-4.1/R/glmnet-package.R 2021-01-06 22:06:54.000000000 +0000 @@ -43,7 +43,7 @@ #' #' @name glmnet-internal #' @aliases auc assess.coxnet auc.mat cvtype cvstats -#' cvcompute getcoef getcoef.multinomial response.coxnet fix.lam error.bars +#' cvcompute getcoef getcoef.multinomial fix.lam error.bars #' getmin elnet mrelnet lognet fishnet coefnorm coxnet cv.lognet cv.elnet #' cv.multnet cv.mrelnet cv.coxnet cv.fishnet cv.glmnet.raw cv.relaxed.raw #' blend.relaxed checkgamma.relax buildPredmat buildPredmat.mrelnetlist diff -Nru r-cran-glmnet-4.0-2/R/glmnet.R r-cran-glmnet-4.1/R/glmnet.R --- r-cran-glmnet-4.0-2/R/glmnet.R 2020-06-14 23:19:14.000000000 +0000 +++ r-cran-glmnet-4.1/R/glmnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -10,8 +10,23 @@ #' descent. For \code{family="gaussian"} this is the lasso sequence if #' \code{alpha=1}, else it is the elasticnet sequence. 
#' +#' The objective function for \code{"gaussian"} is \deqn{1/2 RSS/nobs + +#' \lambda*penalty,} and for the other models it is \deqn{-loglik/nobs + +#' \lambda*penalty.} Note also that for \code{"gaussian"}, \code{glmnet} +#' standardizes y to have unit variance (using 1/n rather than 1/(n-1) formula) +#' before computing its lambda sequence (and then unstandardizes the resulting +#' coefficients); if you wish to reproduce/compare results with other software, +#' best to supply a standardized y. The coefficients for any predictor +#' variables with zero variance are set to zero for all values of lambda. +#' +#' ## Details on `family` option +#' +#' From version 4.0 onwards, glmnet supports both the original built-in families, #' as well as \emph{any} family object as used by `stats::glm()`. +#' This opens the door to a wide variety of additional models. For example +#' `family=binomial(link=cloglog)` or `family=negative.binomial(theta=1.5)` (from the MASS library). +#' Note that the code runs faster for the built-in families. +#' #' The built-in families are specified via a character string. For all families, #' the object produced is a lasso or elasticnet regularization path for fitting the #' generalized linear regression paths, by maximizing the appropriate penalized @@ -25,16 +40,8 @@ #' penalties take care of redundancies. A two-class \code{"multinomial"} model #' will produce the same fit as the corresponding \code{"binomial"} model, #' except the pair of coefficient matrices will be equal in magnitude and -#' opposite in sign, and half the \code{"binomial"} values. Note that the -#' objective function for \code{"gaussian"} is \deqn{1/2 RSS/nobs + -#' \lambda*penalty,} and for the other models it is \deqn{-loglik/nobs + -#' \lambda*penalty.} Note also that for \code{"gaussian"}, \code{glmnet} -#' standardizes y to have unit variance (using 1/n rather than 1/(n-1) formula) -#' before computing its lambda sequence (and then unstandardizes the resulting -#' coefficients); if you wish to reproduce/compare results with other software, -#' best to supply a standardized y. The coefficients for any predictor -#' variables with zero variance are set to zero for all values of lambda. -#' Two useful additional families are the \code{family="mgaussian"} family and +#' opposite in sign, and half the \code{"binomial"} values. +#' Two useful additional families are the \code{family="mgaussian"} family and #' the \code{type.multinomial="grouped"} option for multinomial fitting. The #' former allows a multi-response gaussian model to be fit, using a "group #' -lasso" penalty on the coefficients for each variable. Tying the responses @@ -50,11 +57,20 @@ #' users prefer the usual convention of \emph{after}, they can add a small #' number to all censoring times to achieve this effect. #' -#' Version 4.0 and later allows for the family argument to be a S3 class `"family"` object -#' (a list of functions and expressions). -#' This opens the door to a wide variety of additional models. For example -#' `family=binomial(link=cloglog)` or `family=negative.binomial(theta=1.5)` (from the MASS library). -#' Note that the code runs faster for the built-in families. +#' ## Details on response for `family="cox"` +#' +#' For Cox models, the response should preferably be a \code{Surv} object, +#' created by the \code{Surv()} function in the \pkg{survival} package. For +#' right-censored data, this object should have type "right", and for +#' (start, stop] data, it should have type "counting". 
To fit stratified Cox +#' models, strata should be added to the response via the \code{stratifySurv()} +#' function before passing the response to \code{glmnet()}. (For backward +#' compatibility, right-censored data can also be passed as a +#' two-column matrix with columns named 'time' and 'status'. The +#' latter is a binary variable, with '1' indicating death, and '0' indicating +#' right censored.) +#' +#' ## Details on `relax` option #' #' If \code{relax=TRUE} #' a duplicate sequence of models is produced, where each active set in the @@ -68,24 +84,23 @@ #' #' @param x input matrix, of dimension nobs x nvars; each row is an observation #' vector. Can be in sparse matrix format (inherit from class -#' \code{"sparseMatrix"} as in package \code{Matrix}; not yet available for -#' \code{family="cox"}) +#' \code{"sparseMatrix"} as in package \code{Matrix}) #' @param y response variable. Quantitative for \code{family="gaussian"}, or #' \code{family="poisson"} (non-negative counts). For \code{family="binomial"} #' should be either a factor with two levels, or a two-column matrix of counts #' or proportions (the second column is treated as the target class; for a #' factor, the last level in alphabetical order is the target class). For #' \code{family="multinomial"}, can be a \code{nc>=2} level factor, or a matrix -#' with \code{nc} columns of counts or proportions. For either +#' with \code{nc} columns of counts or proportions. For either #' \code{"binomial"} or \code{"multinomial"}, if \code{y} is presented as a -#' vector, it will be coerced into a factor. For \code{family="cox"}, \code{y} -#' should be a two-column matrix with columns named 'time' and 'status'. The -#' latter is a binary variable, with '1' indicating death, and '0' indicating -#' right censored. The function \code{Surv()} in package \pkg{survival} -#' produces such a matrix. For \code{family="mgaussian"}, \code{y} is a matrix +#' vector, it will be coerced into a factor. For \code{family="cox"}, preferably +#' a \code{Surv} object from the survival package: see Details section for +#' more information. For \code{family="mgaussian"}, \code{y} is a matrix #' of quantitative responses. -#' @param family Response type (see above). Either a character string representing -#' one of the built-in families, or else a `glm()` family object. +#' @param family Either a character string representing +#' one of the built-in families, or else a `glm()` family object. For more +#' information, see Details section below or the documentation for response +#' type (above). #' @param weights observation weights. Can be total counts if responses are #' proportion matrices. Default is 1 for each observation #' @param offset A vector of length \code{nobs} that is included in the linear @@ -206,28 +221,27 @@ #' additional item is another glmnet object with different values for #' \code{beta} and \code{dev.ratio}} #' @author Jerome Friedman, Trevor Hastie, Balasubramanian Narasimhan, Noah -#' Simon and Rob Tibshirani\cr Maintainer: Trevor Hastie +#' Simon, Kenneth Tay and Rob Tibshirani\cr Maintainer: Trevor Hastie #' \email{hastie@@stanford.edu} #' @seealso \code{print}, \code{predict}, \code{coef} and \code{plot} methods, #' and the \code{cv.glmnet} function. #' @references Friedman, J., Hastie, T. and Tibshirani, R. (2008) #' \emph{Regularization Paths for Generalized Linear Models via Coordinate -#' Descent}, \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}\cr -#' \emph{Journal of Statistical Software, Vol. 
33(1), 1-22 Feb 2010}\cr -#' \url{https://www.jstatsoft.org/v33/i01/}\cr Simon, N., Friedman, J., Hastie, -#' T., Tibshirani, R. (2011) \emph{Regularization Paths for Cox's Proportional +#' Descent (2010), Journal of Statistical Software, Vol. 33(1), 1-22}, +#' \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}.\cr +#' Simon, N., Friedman, J., Hastie, T. and Tibshirani, R. (2011) +#' \emph{Regularization Paths for Cox's Proportional #' Hazards Model via Coordinate Descent, Journal of Statistical Software, Vol. -#' 39(5) 1-13}\cr \url{https://www.jstatsoft.org/v39/i05/}\cr Tibshirani, +#' 39(5), 1-13}, \url{https://www.jstatsoft.org/v39/i05/}.\cr Tibshirani, #' Robert, Bien, J., Friedman, J., Hastie, T.,Simon, N.,Taylor, J. and #' Tibshirani, Ryan. (2012) \emph{Strong Rules for Discarding Predictors in -#' Lasso-type Problems, JRSSB vol 74},\cr -#' \url{https://statweb.stanford.edu/~tibs/ftp/strong.pdf}\cr \emph{Stanford -#' Statistics Technical Report}\cr \url{https://arxiv.org/abs/1707.08692}\cr -#' Hastie, T., Tibshirani, Robert, Tibshirani, Ryan (2019) \emph{Extended +#' Lasso-type Problems, JRSSB, Vol. 74(2), 245-266}, +#' \url{https://statweb.stanford.edu/~tibs/ftp/strong.pdf}.\cr +#' Hastie, T., Tibshirani, Robert and Tibshirani, Ryan. \emph{Extended #' Comparisons of Best Subset Selection, Forward Stepwise Selection, and the -#' Lasso}\cr -#' \emph{Glmnet webpage with four vignettes} -#' \url{https://glmnet.stanford.edu} +#' Lasso (2017), Stanford Statistics Technical Report}, +#' \url{https://arxiv.org/abs/1707.08692}.\cr +#' Glmnet webpage with four vignettes, \url{https://glmnet.stanford.edu}. #' @keywords models regression #' @examples #' @@ -287,6 +301,22 @@ #' fit = glmnet(x, y, family = "cox") #' plot(fit) #' +#' # Cox example with (start, stop] data +#' set.seed(2) +#' nobs <- 100; nvars <- 15 +#' xvec <- rnorm(nobs * nvars) +#' xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +#' x <- matrix(xvec, nrow = nobs) +#' start_time <- runif(100, min = 0, max = 5) +#' stop_time <- start_time + runif(100, min = 0.1, max = 3) +#' status <- rbinom(n = nobs, prob = 0.3, size = 1) +#' jsurv_ss <- survival::Surv(start_time, stop_time, status) +#' fit <- glmnet(x, jsurv_ss, family = "cox") +#' +#' # Cox example with strata +#' jsurv_ss2 <- stratifySurv(jsurv_ss, rep(1:2, each = 50)) +#' fit <- glmnet(x, jsurv_ss2, family = "cox") +#' #' # Sparse #' n = 10000 #' p = 200 @@ -322,145 +352,152 @@ ## new.call=this.call ## new.call[[1]]=as.name("glmnet.path") ## fit=eval(new.call, parent.frame()) - if(missing(thresh))thresh=1e-10 fit=glmnet.path(x,y,weights,lambda,nlambda,lambda.min.ratio,alpha,offset,family, standardize,intercept,thresh=thresh,maxit,penalty.factor,exclude,lower.limits, upper.limits,trace.it=trace.it) fit$call=this.call - } - else - { -### Must have been a call to old glmnet -### Prepare all the generic arguments, then hand off to family functions - family=match.arg(family) - if(alpha>1){ - warning("alpha >1; set to 1") - alpha=1 - } - if(alpha<0){ - warning("alpha<0; set to 0") - alpha=0 - } - alpha=as.double(alpha) - nlam=as.integer(nlambda) - y=drop(y) # we dont like matrix responses unless we need them - if(is.null(weights))weights=rep(1,nobs) - else if(length(weights)!=nobs)stop(paste("number of elements in weights (",length(weights),") not equal to the number of rows of x (",nobs,")",sep="")) - dimy=dim(y) - nrowy=ifelse(is.null(dimy),length(y),dimy[1]) - if(nrowy!=nobs)stop(paste("number of observations in y (",nrowy,") not equal to the number of rows 
of x (",nobs,")",sep="")) - vnames=colnames(x) - if(is.null(vnames))vnames=paste("V",seq(nvars),sep="") - ne=as.integer(dfmax) - nx=as.integer(pmax) - if(is.null(exclude))exclude=integer(0) - if(any(penalty.factor==Inf)){ + } else { + family=match.arg(family) + if (family == "cox" && use.cox.path(x, y)) { + # we should call the new cox.path() + fit <- cox.path(x,y,weights,offset,alpha,nlambda,lambda.min.ratio, + lambda,standardize,thresh,exclude,penalty.factor, + lower.limits,upper.limits,maxit,trace.it,...) + fit$call <- this.call + } else { + ### Must have been a call to old glmnet + ### Prepare all the generic arguments, then hand off to family functions + if(alpha>1){ + warning("alpha >1; set to 1") + alpha=1 + } + if(alpha<0){ + warning("alpha<0; set to 0") + alpha=0 + } + alpha=as.double(alpha) + nlam=as.integer(nlambda) + y=drop(y) # we dont like matrix responses unless we need them + if(is.null(weights))weights=rep(1,nobs) + else if(length(weights)!=nobs)stop(paste("number of elements in weights (",length(weights),") not equal to the number of rows of x (",nobs,")",sep="")) + dimy=dim(y) + nrowy=ifelse(is.null(dimy),length(y),dimy[1]) + if(nrowy!=nobs)stop(paste("number of observations in y (",nrowy,") not equal to the number of rows of x (",nobs,")",sep="")) + vnames=colnames(x) + if(is.null(vnames))vnames=paste("V",seq(nvars),sep="") + ne=as.integer(dfmax) + nx=as.integer(pmax) + if(is.null(exclude))exclude=integer(0) + if(any(penalty.factor==Inf)){ exclude=c(exclude,seq(nvars)[penalty.factor==Inf]) exclude=sort(unique(exclude)) - } - if(length(exclude)>0){ - jd=match(exclude,seq(nvars),0) - if(!all(jd>0))stop("Some excluded variables out of range") - penalty.factor[jd]=1 #ow can change lambda sequence - jd=as.integer(c(length(jd),jd)) - }else jd=as.integer(0) - vp=as.double(penalty.factor) - internal.parms=glmnet.control() - if(internal.parms$itrace)trace.it=1 - else{ + } + if(length(exclude)>0){ + jd=match(exclude,seq(nvars),0) + if(!all(jd>0))stop("Some excluded variables out of range") + penalty.factor[jd]=1 #ow can change lambda sequence + jd=as.integer(c(length(jd),jd)) + }else jd=as.integer(0) + vp=as.double(penalty.factor) + internal.parms=glmnet.control() + if(internal.parms$itrace)trace.it=1 + else{ if(trace.it){ - glmnet.control(itrace=1) - on.exit(glmnet.control(itrace=0)) - } + glmnet.control(itrace=1) + on.exit(glmnet.control(itrace=0)) } - ###check on limits - if(any(lower.limits>0)){stop("Lower limits should be non-positive")} - if(any(upper.limits<0)){stop("Upper limits should be non-negative")} - lower.limits[lower.limits==-Inf]=-internal.parms$big - upper.limits[upper.limits==Inf]=internal.parms$big - if(length(lower.limits)=1)stop("lambda.min.ratio should be less than 1") - flmin=as.double(lambda.min.ratio) - ulam=double(1) - } - else{ - flmin=as.double(1) - if(any(lambda<0))stop("lambdas should be non-negative") - ulam=as.double(rev(sort(lambda))) - nlam=as.integer(length(lambda)) - } - is.sparse=FALSE - ix=jx=NULL - if(inherits(x,"sparseMatrix")){##Sparse case - is.sparse=TRUE - x=as(x,"CsparseMatrix") - x=as(x,"dgCMatrix") - ix=as.integer(x@p+1) - jx=as.integer(x@i+1) - x=as.double(x@x) - } + } + ###check on limits + if(any(lower.limits>0)){stop("Lower limits should be non-positive")} + if(any(upper.limits<0)){stop("Upper limits should be non-negative")} + lower.limits[lower.limits==-Inf]=-internal.parms$big + upper.limits[upper.limits==Inf]=internal.parms$big + if(length(lower.limits)=1)stop("lambda.min.ratio should be less than 1") + 
flmin=as.double(lambda.min.ratio) + ulam=double(1) + } + else{ + flmin=as.double(1) + if(any(lambda<0))stop("lambdas should be non-negative") + ulam=as.double(rev(sort(lambda))) + nlam=as.integer(length(lambda)) + } + is.sparse=FALSE + ix=jx=NULL + if(inherits(x,"sparseMatrix")){##Sparse case + is.sparse=TRUE + x=as(x,"CsparseMatrix") + x=as(x,"dgCMatrix") + ix=as.integer(x@p+1) + jx=as.integer(x@i+1) + xd=x@x + } + else xd <- x + storage.mode(xd) <- "double" + if (trace.it) { if (relax) cat("Training Fit\n") pb <- createPB(min = 0, max = nlam, initial = 0, style = 3) - } - kopt=switch(match.arg(type.logistic), - "Newton"=0,#This means to use the exact Hessian - "modified.Newton"=1 # Use the upper bound - ) - if(family=="multinomial"){ - type.multinomial=match.arg(type.multinomial) - if(type.multinomial=="grouped")kopt=2 #overrules previous kopt - } - kopt=as.integer(kopt) + } + kopt=switch(match.arg(type.logistic), + "Newton"=0,#This means to use the exact Hessian + "modified.Newton"=1 # Use the upper bound + ) + if(family=="multinomial"){ + type.multinomial=match.arg(type.multinomial) + if(type.multinomial=="grouped")kopt=2 #overrules previous kopt + } + kopt=as.integer(kopt) - fit=switch(family, - "gaussian"=elnet(x,is.sparse,ix,jx,y,weights,offset,type.gaussian,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit), - "poisson"=fishnet(x,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit), - "binomial"=lognet(x,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit,kopt,family), - "multinomial"=lognet(x,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit,kopt,family), - "cox"=coxnet(x,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,vnames,maxit), - "mgaussian"=mrelnet(x,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,jsd,intr,vnames,maxit) - ) - if (trace.it) { + fit=switch(family, + "gaussian"=elnet(xd,is.sparse,ix,jx,y,weights,offset,type.gaussian,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit), + "poisson"=fishnet(xd,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit), + "binomial"=lognet(xd,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit,kopt,family), + "multinomial"=lognet(xd,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,intr,vnames,maxit,kopt,family), + "cox"=coxnet(xd,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,vnames,maxit), + "mgaussian"=mrelnet(xd,is.sparse,ix,jx,y,weights,offset,alpha,nobs,nvars,jd,vp,cl,ne,nx,nlam,flmin,ulam,thresh,isd,jsd,intr,vnames,maxit) + ) + if (trace.it) { utils::setTxtProgressBar(pb, nlam) close(pb) + } + if(is.null(lambda))fit$lambda=fix.lam(fit$lambda)##first lambda is infinity; changed to entry point + fit$call=this.call + fit$nobs=nobs + class(fit)=c(class(fit),"glmnet") } - if(is.null(lambda))fit$lambda=fix.lam(fit$lambda)##first lambda is infinity; changed to entry point -fit$call=this.call - fit$nobs=nobs - class(fit)=c(class(fit),"glmnet") } - if(relax) - relax.glmnet(fit, x=x,y=y,weights=weights,offset=offset, - lower.limits=lower.limits,upper.limits=upper.limits,penalty.factor=penalty.factor, - check.args=FALSE,...) 
- else - fit + + if(relax) + relax.glmnet(fit, x=x,y=y,weights=weights,offset=offset, + lower.limits=lower.limits,upper.limits=upper.limits,penalty.factor=penalty.factor, + check.args=FALSE,...) + else + fit } diff -Nru r-cran-glmnet-4.0-2/R/makeX.R r-cran-glmnet-4.1/R/makeX.R --- r-cran-glmnet-4.0-2/R/makeX.R 2020-05-10 23:56:36.000000000 +0000 +++ r-cran-glmnet-4.1/R/makeX.R 2021-01-06 22:06:54.000000000 +0000 @@ -92,6 +92,12 @@ nte=nrow(test) df=rbind(df,test) } + ### bug fix because of change of default behavior of data.frame + ## check if any character columns + classes = sapply(df,class) + if (any(classes == "character")) + df <- as.data.frame(unclass(df), stringsAsFactors = TRUE) + ### x=prepareX(df,sparse=sparse) if(na.impute){ xbar=colMeans(x[seq(ntr),],na.rm=TRUE) @@ -112,6 +118,26 @@ function(df,sparse=FALSE,...){ if(!inherits(df,"data.frame"))stop("first argument must be of class `data.frame`") whichfac=sapply(df,inherits,"factor") +### bug fix if factor has one level only + ## Cannot get contrasts to do the job here, so do it manually + df_level <- lapply(df[,whichfac,drop = FALSE], levels) + nlevels <- sapply(df_level,length) + which <- nlevels == 1 + if(any(which)){ + whichfac1 = whichfac + whichfac1[whichfac] = which + dn <- names(df) + cx <- as.matrix(df[,whichfac1],drop=FALSE) + unimat <- array(1,dim=dim(cx)) + cnames <- paste0(dn[whichfac1],unlist(df_level[which])) + unimat[is.na(cx)] <- NA + df[,whichfac1] <- data.frame(unimat) + dn[whichfac1] <- cnames + names(df) <- dn + whichfac[whichfac] = !which + warning(call. = FALSE,paste("Column(s) ",paste(cnames,collapse=", "), "are all 1, due to factors with a single level")) + } +### oldna=options()$na.action cna=as.character(substitute(na.action)) options(na.action=na.pass) diff -Nru r-cran-glmnet-4.0-2/R/print.cv.glmnet.R r-cran-glmnet-4.1/R/print.cv.glmnet.R --- r-cran-glmnet-4.0-2/R/print.cv.glmnet.R 2020-06-14 21:23:02.000000000 +0000 +++ r-cran-glmnet-4.1/R/print.cv.glmnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -39,8 +39,9 @@ optlams=c(x$lambda.min,x$lambda.1se) which=match(optlams,x$lambda) - mat=with(x,cbind(optlams,cvm[which],cvsd[which],nzero[which])) - dimnames(mat)=list(c("min","1se"),c("Lambda","Measure","SE","Nonzero")) + mat = with(x, cbind(optlams, which, cvm[which], cvsd[which], nzero[which])) + dimnames(mat) = list(c("min", "1se"), c("Lambda", "Index","Measure", + "SE", "Nonzero")) cat("Measure:", x$name,"\n\n") mat=data.frame(mat,check.names=FALSE) diff -Nru r-cran-glmnet-4.0-2/R/print.cv.relaxed.R r-cran-glmnet-4.1/R/print.cv.relaxed.R --- r-cran-glmnet-4.0-2/R/print.cv.relaxed.R 2019-10-19 20:53:16.000000000 +0000 +++ r-cran-glmnet-4.1/R/print.cv.relaxed.R 2021-01-06 22:06:54.000000000 +0000 @@ -3,21 +3,21 @@ print.cv.relaxed <- function(x, digits = max(3, getOption("digits") - 3), ...) 
{ cat("\nCall: ", deparse(x$call), "\n\n") - cat("Measure:", x$name,"\n\n") - x=x$relaxed - optlams=c(x$lambda.min,x$lambda.1se) - wg1=match(x$gamma.min,x$gamma) - wl1=match(x$lambda.min,x$statlist[[wg1]]$lambda) - s1=with(x$statlist[[wg1]],c(x$gamma.min,x$lambda.min,cvm[wl1],cvsd[wl1],x$nzero.min)) - - wg2=match(x$gamma.1se,x$gamma) - wl2=match(x$lambda.1se,x$statlist[[wg2]]$lambda) - s2=with(x$statlist[[wg2]],c(x$gamma.1se,x$lambda.1se,cvm[wl2],cvsd[wl2],x$nzero.1se)) - - mat=rbind(s1,s2) - dimnames(mat)=list(c("min","1se"),c("Gamma","Lambda","Measure","SE","Nonzero")) - - mat=data.frame(mat,check.names=FALSE) - class(mat)=c("anova",class(mat)) - print(mat,digits=digits) + cat("Measure:", x$name, "\n\n") + x = x$relaxed + optlams = c(x$lambda.min, x$lambda.1se) + wg1 = match(x$gamma.min, x$gamma) + wl1 = match(x$lambda.min, x$statlist[[wg1]]$lambda) + s1 = with(x$statlist[[wg1]], c(x$gamma.min,wg1, x$lambda.min,wl1, + cvm[wl1], cvsd[wl1], x$nzero.min)) + wg2 = match(x$gamma.1se, x$gamma) + wl2 = match(x$lambda.1se, x$statlist[[wg2]]$lambda) + s2 = with(x$statlist[[wg2]], c(x$gamma.1se,wg2, x$lambda.1se,wl2, + cvm[wl2], cvsd[wl2], x$nzero.1se)) + mat = rbind(s1, s2) + dimnames(mat) = list(c("min", "1se"), c("Gamma","Index", "Lambda","Index", + "Measure", "SE", "Nonzero")) + mat = data.frame(mat, check.names = FALSE) + class(mat) = c("anova", class(mat)) + print(mat, digits = digits) } diff -Nru r-cran-glmnet-4.0-2/R/response.coxnet.R r-cran-glmnet-4.1/R/response.coxnet.R --- r-cran-glmnet-4.0-2/R/response.coxnet.R 2019-09-04 20:59:46.000000000 +0000 +++ r-cran-glmnet-4.1/R/response.coxnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -1,7 +1,49 @@ -response.coxnet=function(y){ - if(!is.matrix(y)||!all(match(c("time","status"),dimnames(y)[[2]],0)))stop("Cox model requires a matrix with columns 'time' (>0) and 'status' (binary) as a response; a 'Surv' object suffices",call.=FALSE) - ty=as.double(y[,"time"]) - tevent=as.double(y[,"status"]) - if(any(ty<=0))stop("negative event times encountered; not permitted for Cox family") -list(time=ty,event=tevent) +#' Make response for coxnet +#' +#' Internal function to make the response y passed to glmnet suitable +#' for coxnet (i.e. glmnet with family = "cox"). Sanity checks are performed +#' here too. +#' +#' If y is a class "Surv" object, this function returns y with no changes. If +#' y is a two-column matrix with columns named 'time' and 'status', it is +#' converted into a "Surv" object. +#' +#' @param y Response variable. Either a class "Surv" object or a two-column +#' matrix with columns named 'time' and 'status'. +#' +#' @return A class "Surv" object. 
+#' +#' @importFrom survival Surv +response.coxnet <- function(y) { + if (any(is.na(y))) stop(paste0("NAs encountered in response, not allowed")) + + # if Surv object, check that it is of correct type and perform sanity checks + # if all good, return with no changes + if (is.Surv(y)) { + if (attr(y, "type") == "right") { + if (any(y[, "time"] <= 0)) + stop("Non-positive event times encountered; not permitted for Cox family") + return(y) + } else if (attr(y, "type") == "counting") { + if (any(y[, "start"] < 0) || any(y[, "stop"] <= 0)) + stop(paste("Negative start/non-positive stop times encountered;", + "not permitted for Cox family")) + if (any(y[, "start"] >= y[, "stop"])) + stop("Some rows have start time >= stop time; not permitted") + return(y) + } else { + stop("cox.path() only supports 'Surv' objects of type 'right' or 'counting'") + } } + + # if two-column matrix passed, make it into a Surv object + if (!is.matrix(y) || !all(match(c("time","status"),dimnames(y)[[2]],0))) + stop(paste0("Cox model requires a matrix with columns 'time' (>0) and ", + "'status' (binary) as a response; a 'Surv' object suffices"), + call. = FALSE) + ty <- as.double(y[,"time"]) + tevent <- as.double(y[,"status"]) + if (any(ty <= 0)) + stop("negative event times encountered; not permitted for Cox family") + return(Surv(ty, tevent)) +} diff -Nru r-cran-glmnet-4.0-2/R/stratifySurv.R r-cran-glmnet-4.1/R/stratifySurv.R --- r-cran-glmnet-4.0-2/R/stratifySurv.R 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/R/stratifySurv.R 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,56 @@ +#' @export +`[.stratifySurv` <- function(x, i, j, drop = FALSE) { + strata <- attr(x, "strata") + stop_time <- NULL; start_time <- NULL; ss_match <- NULL + if ("stop_time" %in% names(attributes(x))) stop_time <- attr(x, "stop_time") + if ("start_time" %in% names(attributes(x))) start_time <- attr(x, "start_time") + if ("ss_match" %in% names(attributes(x))) ss_match <- attr(x, "ss_match") + obj <- NextMethod(`[`) + # !missing(i) && missing(j)? + if (!missing(i) && is.matrix(obj)) { + attr(obj, "strata") <- strata[i] + if (!is.null(stop_time)) attr(obj, "stop_time") <- stop_time[i] + if (!is.null(start_time)) attr(obj, "start_time") <- start_time[i] + if (!is.null(ss_match)) attr(obj, "ss_match") <- ss_match[i] + class(obj) <- class(x) + } + return(obj) +} + +#' Add strata to a Surv object +#' +#' Helper function to add strata as an attribute to a Surv object. The +#' output of this function can be used as the response in \code{glmnet()} +#' for fitting stratified Cox models. +#' +#' When fitting a stratified Cox model with \code{glmnet()}, strata should +#' be added to a \code{Surv} response with this helper function. Note that +#' it is not sufficient to add strata as an attribute to the \code{Surv} +#' response manually: if the result does not have class \code{stratifySurv}, +#' subsetting of the response will not work properly. +#' +#' @param y A Surv object. +#' @param strata A vector of length equal to the number of observations in +#' y, indicating strata membership. +#' +#' @return An object of class \code{stratifySurv} (in addition to all the +#' classes \code{y} belonged to). 
+#' +#' @examples +#' y <- survival::Surv(1:10, rep(0:1, length.out = 10)) +#' strata <- rep(1:3, length.out = 10) +#' y2 <- stratifySurv(y, strata) # returns stratifySurv object +#' +#' @importFrom survival is.Surv +#' @export +stratifySurv <- function(y, strata) { + y <- response.coxnet(y) + + if (length(y) != length(strata)) + stop("y and strata must have the same length (=nobs)") + + attr(y, "strata") <- strata + y_class <- class(y) + if (!("stratifySurv" %in% y_class)) class(y) <- c("stratifySurv", y_class) + return(y) +} \ No newline at end of file diff -Nru r-cran-glmnet-4.0-2/R/survfit.coxnet.R r-cran-glmnet-4.1/R/survfit.coxnet.R --- r-cran-glmnet-4.0-2/R/survfit.coxnet.R 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/R/survfit.coxnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,203 @@ +#' Compute a survival curve from a coxnet object +#' +#' Computes the predicted survivor function for a Cox proportional hazards +#' model with elastic net penalty. +#' +#' To be consistent with other functions in \code{glmnet}, if \code{s} +#' is not specified, survival curves are returned for the entire lambda +#' sequence. This is not recommended usage: it is best to call +#' \code{survfit.coxnet} with a single value of the penalty parameter +#' for the \code{s} option. +#' +#' @param formula A class \code{coxnet} object. +#' @param s Value(s) of the penalty parameter lambda at which the survival +#' curve is required. Default is the entire sequence used to create the model. +#' However, it is recommended that \code{survfit.coxnet} is called for +#' a single penalty parameter. +#' @param ... This is the mechanism for passing additional arguments like +#' (i) x= and y= for the x and y used to fit the model, +#' (ii) weights= and offset= when the model was fit with these options, +#' (iii) arguments for new data (newx, newoffset, newstrata), and +#' (iv) arguments to be passed to survfit.coxph(). +#' +#' @return If \code{s} is a single value, an object of class "survfitcox" +#' and "survfit" containing one or more survival curves. Otherwise, a list +#' of such objects, one element for each value in \code{s}. +#' Methods defined for survfit objects are print, summary and plot. +#' +#' @examples +#' set.seed(2) +#' nobs <- 100; nvars <- 15 +#' xvec <- rnorm(nobs * nvars) +#' xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +#' x <- matrix(xvec, nrow = nobs) +#' beta <- rnorm(nvars / 3) +#' fx <- x[, seq(nvars / 3)] %*% beta / 3 +#' ty <- rexp(nobs, exp(fx)) +#' tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +#' y <- survival::Surv(ty, tcens) +#' fit1 <- glmnet(x, y, family = "cox") +#' +#' # survfit object for Cox model where lambda = 0.1 +#' sf1 <- survival::survfit(fit1, s = 0.1, x = x, y = y) +#' plot(sf1) +#' +#' # example with new data +#' sf2 <- survival::survfit(fit1, s = 0.1, x = x, y = y, newx = x[1:3, ]) +#' plot(sf2) +#' +#' # example with strata +#' y2 <- stratifySurv(y, rep(1:2, length.out = nobs)) +#' fit2 <- glmnet(x, y2, family = "cox") +#' sf3 <- survival::survfit(fit2, s = 0.1, x = x, y = y2) +#' sf4 <- survival::survfit(fit2, s = 0.1, x = x, y = y2, +#' newx = x[1:3, ], newstrata = c(1, 1, 1)) +#' +#' @importFrom survival coxph survfit +#' @method survfit coxnet +#' @export +survfit.coxnet <- function(formula, s = NULL, ...) { + this.call <- match.call() + object <- formula + args <- list(...) 
+ + if (!("x" %in% names(args)) || !("y" %in% names(args))) + stop(paste0("the design matrix x and response y used to fit the model ", + "need to be passed")) + y <- args$y + + # if s is NULL, get the whole lambda sequence + if (is.null(s)) s <- object$lambda + + # check that the required arguments are passed for running coxph + check_dots(object, ..., + need = c("offset", "weights"), + error_start = "used survfit.coxnet()", + error_end = "in order to run survfit.coxnet") + # if user wants to run survfit on new data, make sure the arguments required + # for making new predictions are provided + if ("newx" %in% names(args)) { + check_dots(object, ..., + need = c("offset"), + error_start = "used survfit.coxnet() with newx argument", + prefix = "new", + error_end = "in order to predict on new data") + + if ("strata" %in% names(attributes(y)) && + !("newstrata" %in% names(args))) + stop(paste0("used survfit.coxnet() with newx argument and ", + "stratified Cox model was fit, need newstrata argument ", + "in order to predict on new data")) + } + + res <- list() + for (i in seq_along(s)) { + # "hack": run coxph with 0 iterations + coxphmod <- mycoxph(object, s = s[i], ...) + + # If newx is provided, we need to compute the predictions at these + # observations and pass it as an argument to the eventual survfit call + # as `newdata`. In addition, we have to handle additional options that + # could have been passed: offset + if ("newx" %in% names(args)) { + current_args <- mycoxpred(object, s = s[i], ...) + } else { + current_args <- args + } + + # make the call to survfit.coxph + current_args$formula <- coxphmod + current_args$se.fit <- FALSE # doesn't make sense to compute SEs + sf <- do.call(survfit, current_args) + sf$call <- this.call + res[[i]] <- sf + } + if (length(s) > 1) { + return(res) + } else { + return(res[[1]]) + } +} + +#' Helper function to fit coxph model for survfit.coxnet +#' +#' This function constructs the coxph call needed to run the "hack" of +#' coxph with 0 iterations. It's a separate function as we have to deal with +#' function options like strata, offset and observation weights. +#' +#' @param object A class \code{coxnet} object. +#' @param s The value of the penalty parameter lambda at which the survival +#' curve is required. +#' @param ... The same ... that was passed to survfit.coxnet. +#' +#' @importFrom survival coxph strata +#' @importFrom stats as.formula +mycoxph <- function(object, s, ...) { + args <- list(...) 
+ x <- args$x + y <- args$y + glmnet_call_names <- names(object$call)[-1] + + # predict from coxnet model for original data frame x with s, gives fitted + # linear predictor + args$object <- object + args$newx <- x + args$s <- s + if ("offset" %in% glmnet_call_names) { + args$newoffset <- rep(0, length.out = nrow(x)) + } + eta <- do.call(predict, args) + + # construct list of arguments for coxph() call based on which special + # arguments were used in the original glmnet() call + coxphargs <- list(formula = "y ~ X1", data = data.frame(y, eta), + init = 1, iter = 0) + + if ("strata" %in% names(attributes(y))) { + coxphargs$data$strata <- attr(y, "strata") + coxphargs$formula <- paste(coxphargs$formula, "+ strata(strata)") + } + if ("weights" %in% glmnet_call_names) { + coxphargs$weights <- args$weights + } + if ("offset" %in% glmnet_call_names) { + coxphargs$data$offset <- args$offset + coxphargs$formula <- paste(coxphargs$formula, "+ offset(offset)") + } + coxphargs$formula <- as.formula(coxphargs$formula) + coxphmod <- do.call(coxph, coxphargs) + return(coxphmod) +} + +#' Helper function to amend ... for new data in survfit.coxnet +#' +#' This function amends the function arguments passed to survfit.coxnet +#' via ... if new data was passed to survfit.coxnet. It's a separate +#' function as we have to deal with function options like newstrata +#' and newoffset. +#' +#' @param object A class \code{coxnet} object. +#' @param s The value of the penalty parameter lambda at which the survival +#' curve is required. +#' @param ... The same ... that was passed to survfit.coxnet. +#' +#' @importFrom stats predict +mycoxpred <- function(object, s, ...) { + args <- list(...) + + if ("newoffset" %in% names(args)) { + new_eta <- predict(object, newx = args$newx, s = s, + newoffset = rep(0, nrow(args$newx))) + } else { + new_eta <- predict(object, newx = args$newx, s = s) + } + new_df <- data.frame(new_eta) + if ("newoffset" %in% names(args)) new_df$offset <- args$newoffset + + if ("newstrata" %in% names(args)) { + new_df$strata <- args$newstrata + args$newstrata <- NULL + } + + args$newdata <- new_df + return(args) +} diff -Nru r-cran-glmnet-4.0-2/R/survfit.cv.glmnet.R r-cran-glmnet-4.1/R/survfit.cv.glmnet.R --- r-cran-glmnet-4.0-2/R/survfit.cv.glmnet.R 1970-01-01 00:00:00.000000000 +0000 +++ r-cran-glmnet-4.1/R/survfit.cv.glmnet.R 2021-01-06 22:06:54.000000000 +0000 @@ -0,0 +1,60 @@ +#' Compute a survival curve from a cv.glmnet object +#' +#' Computes the predicted survivor function for a Cox proportional hazards +#' model with elastic net penalty from a cross-validated glmnet model. +#' +#' This function makes it easier to use the results of cross-validation +#' to compute a survival curve. +#' +#' @param formula A class \code{cv.glmnet} object. The object should have +#' been fit with \code{family = "cox"}. +#' @param s Value(s) of the penalty parameter lambda at which predictions +#' are required. Default is the value s="lambda.1se" stored on the CV object. +#' Alternatively s="lambda.min" can be used. If s is numeric, it is taken +#' as the value(s) of lambda to be used. +#' @param ... Other arguments to be passed to \code{survfit.coxnet}. +#' +#' @return If \code{s} is a single value, an object of class "survfitcox" +#' and "survfit" containing one or more survival curves. Otherwise, a list +#' of such objects, one element for each value in \code{s}. +#' Methods defined for survfit objects are print, summary and plot. 
+#' +#' @examples +#' set.seed(2) +#' nobs <- 100; nvars <- 15 +#' xvec <- rnorm(nobs * nvars) +#' x <- matrix(xvec, nrow = nobs) +#' beta <- rnorm(nvars / 3) +#' fx <- x[, seq(nvars / 3)] %*% beta / 3 +#' ty <- rexp(nobs, exp(fx)) +#' tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +#' y <- survival::Surv(ty, tcens) +#' cvfit <- cv.glmnet(x, y, family = "cox") +#' # default: s = "lambda.1se" +#' survival::survfit(cvfit, x = x, y = y) +#' +#' # s = "lambda.min" +#' survival::survfit(cvfit, s = "lambda.min", x = x, y = y) +#' @importFrom survival survfit +#' @method survfit cv.glmnet +#' @export +survfit.cv.glmnet <- function(formula, s = c("lambda.1se", "lambda.min"), ...) { + this.call <- match.call() + object <- formula + + # check that a coxnet model was fit + if (!("coxnet" %in% class(object$glmnet.fit))) + stop("survfit only available for Cox models") + + # if user didn't specify s, pick it out from its lambda sequence + if (is.numeric(s)) lambda <- s + else + if (is.character(s)) { + s <- match.arg(s) + lambda <- object[[s]] + } + else stop("Invalid form for s") + sf <- survfit.coxnet(object$glmnet.fit, s = lambda, ...) + sf$call <- this.call + return(sf) +} diff -Nru r-cran-glmnet-4.0-2/src/wls.f r-cran-glmnet-4.1/src/wls.f --- r-cran-glmnet-4.0-2/src/wls.f 2020-06-11 15:32:05.000000000 +0000 +++ r-cran-glmnet-4.1/src/wls.f 2021-01-06 22:06:54.000000000 +0000 @@ -314,7 +314,7 @@ end subroutine get_int_parms2(epsnr,mxitnr) implicit double precision(a-h,o-z) - data epsnr0,mxitnr0 /1.0d-8,25/ + data epsnr0,mxitnr0 /1.0d-6,25/ epsnr=epsnr0 mxitnr=mxitnr0 return diff -Nru r-cran-glmnet-4.0-2/vignettes/assets/glmnet_refs.bib r-cran-glmnet-4.1/vignettes/assets/glmnet_refs.bib --- r-cran-glmnet-4.0-2/vignettes/assets/glmnet_refs.bib 2019-10-28 18:47:24.000000000 +0000 +++ r-cran-glmnet-4.1/vignettes/assets/glmnet_refs.bib 2021-01-06 22:06:55.000000000 +0000 @@ -55,4 +55,14 @@ Title = {Extended Comparisons of Best Subset Selection, Forward Stepwise Selection, and the Lasso}, Year = {2017}, Eprint = {arXiv:1707.08692}, +} + +@book{Therneau2000, +author = {Therneau, Terry M. 
and Grambsch, Patricia M.}, +file = {:Users/kjytay/Dropbox/Macbook/Documents/Mendeley/Therneau and Grambsch{\_}Modeling Survival Data- Extending the Cox Model.pdf:pdf}, +isbn = {9781441931610}, +pages = {1--346}, +publisher = {Springer}, +title = {{Modeling survival data: extending the Cox model}}, +year = {2000} } \ No newline at end of file diff -Nru r-cran-glmnet-4.0-2/vignettes/Coxnet.Rmd r-cran-glmnet-4.1/vignettes/Coxnet.Rmd --- r-cran-glmnet-4.0-2/vignettes/Coxnet.Rmd 2019-10-31 18:19:34.000000000 +0000 +++ r-cran-glmnet-4.1/vignettes/Coxnet.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,13 @@ --- -title: "Coxnet: Regularized Cox Regression" -author: "Noah Simon, Jerome Friedman, Trevor Hastie and Rob Tibshirani" -date: '`r Sys.Date()`' +title: "Regularized Cox Regression" +author: + - Kenneth Tay + - Noah Simon + - Jerome Friedman + - Trevor Hastie + - Rob Tibshirani + - Balasubramanian Narasimhan +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -10,88 +16,274 @@ toc: yes toc_depth: 3 vignette: > - %\VignetteIndexEntry{Coxnet: Regularized Cox Regression} + %\VignetteIndexEntry{Regularized Cox Regression} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- +```{r include=FALSE} +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) +``` + ## Introduction -We will give a short tutorial on using coxnet. Coxnet is a function -which fits the Cox Model regularized by an elastic net penalty. It is -used for underdetermined (or nearly underdetermined systems) and chooses -a small number of covariates to include in the model. Because the Cox -Model is rarely used for actual prediction, we will rather focus on -finding and interpretating an appropriate model. We give a simple -example of how to format data and run the Cox Model in glmnet with cross -validation. - -Further details may be found in @coxnet, @strongrules and @block. - -## Example - -We first load our data and set up the response. In this case $x$ must be -an $n$ by $p$ matrix of covariate values --- each row corresponds to a -patient and each column a covariate. $y$ is an $n$ length vector of -failure/censoring times, and status is an $n$ length vector with each -entry, a $1$ or a $0$, indicating whether the corresponding entry in $y$ -is indicative of a failure time or right censoring time ($1$ for -failure, $0$ for censoring) - -```{r} -library("glmnet") -library("survival") -patient.data <- readRDS("assets/coxnet.RDS") -``` - -We then call our functions to fit with the lasso penalty ($\alpha=1$), -and cross validate. We set maxit = 1000 (increasing the maximum number -of iterations to $1000$) because our data is relatively high -dimensional, so more iterations are needed for convergence. In practice, -the function will spit out an error if convergence isn't reached by the -maximum number of iterations. 
- -```{r, warning = TRUE} -cv.fit <- cv.glmnet(patient.data$x, Surv(patient.data$time, patient.data$status), family="cox", maxit = 1000) -fit <- glmnet(patient.data$x, Surv(patient.data$time,patient.data$status), family = "cox", maxit = 1000) -``` - -The Surv function packages the survival data into the form expected by -glmnet. Once fit, we can view the optimal $\lambda$ value and a cross validated error plot to help evaluate our model. +This vignette describes how one can use the `glmnet` package to fit regularized Cox models. + +The Cox proportional hazards model is commonly used for the study of the relationship between predictor variables and survival time. In the usual survival analysis framework, we have data of the form $(y_1, x_1, \delta_1), \ldots, (y_n, x_n, \delta_n)$ where $y_i$, the observed time, is a time of failure if $\delta_i$ is 1 or a right-censored time if $\delta_i$ is 0. We also let $t_1 < t_2 < \ldots < t_m$ be the increasing list of unique failure times, and let $j(i)$ denote the index of the observation failing at time $t_i$. + +The Cox model assumes a semi-parametric form for the hazard +$$ +h_i(t) = h_0(t) e^{x_i^T \beta}, +$$ +where $h_i(t)$ is the hazard for patient $i$ at time $t$, $h_0(t)$ is a shared baseline hazard, and $\beta$ is a fixed, length $p$ vector. In the classic setting $n \geq p$, inference is made via the partial likelihood +$$ +L(\beta) = \prod_{i=1}^m \frac{e^{x_{j(i)}^T \beta}}{\sum_{j \in R_i} e^{x_j^T \beta}}, +$$ +where $R_i$ is the set of indices $j$ with $y_j \geq t_i$ (those at risk at time $t_i$). + +Note there is no intercept in the Cox model as it is built into the baseline hazard, and like it, would cancel in the partial likelihood. + +In `glmnet`, we penalize the negative log of the partial likelihood with an elastic net penalty. + +(Credits: The original `"coxnet"` algorithm for right-censored data was developed by Noah Simon, Jerome Friedman, Trevor Hastie and Rob Tibshirani: see @coxnet for details. The other features for Cox models, introduced in v4.1, were developed by Kenneth Tay, Trevor Hastie, Balasubramanian Narasimhan and Rob Tibshirani.) + +## Basic usage for right-censored data + +We use a pre-generated set of sample data and response. `x` must be an $n\times p$ matrix of covariate values --- each row corresponds to a patient and each column a covariate. `y` is an $n \times 2$ matrix, with a column `"time"` of failure/censoring times, and `"status"` a 0/1 indicator, with 1 meaning the time is a failure time, and 0 a censoring time. The `Surv` function in the `survival` package creates such a response matrix, and it is recommended that the user uses the output of a call to `Surv` for the response to `glmnet`. (For backward compatibility, `glmnet` can accept a two-column matrix with column names `"time"` and `"status"` for right-censored data.) +```{r} +library(glmnet) +library(survival) +data(CoxExample) +y[1:5, ] +``` + +We apply the `glmnet` function to compute the solution path under default settings: +```{r} +fit <- glmnet(x, y, family = "cox") +``` + +All the standard options such as `alpha`, `weights`, `nlambda` and `standardize` apply, and their usage is similar to that in the Gaussian case. (See the vignette ["An Introduction to `glmnet`"](https://glmnet.stanford.edu/articles/glmnet.html) for details, or refer to the help file `help(glmnet)`.) 
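As a minimal sketch of the point above (not part of the vignette diff), the standard arguments carry over to `family = "cox"` unchanged; the weight vector here is made up purely for illustration:

```r
library(glmnet)
library(survival)
data(CoxExample)  # loads x and y, as used earlier in this vignette

# the usual glmnet arguments behave exactly as for other families
wts <- rep(1:2, length.out = nrow(x))   # illustrative observation weights
fit <- glmnet(x, y, family = "cox",
              alpha = 0.5,          # elastic-net mixing parameter
              nlambda = 30,         # shorter lambda path
              weights = wts,
              standardize = TRUE)   # the default
```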
+ +We can plot the coefficients with the `plot` method: +```{r} +plot(fit) +``` + +As before, we can extract the coefficients at certain values of $\lambda$: +```{r out.lines = 10} +coef(fit, s = 0.05) +``` + +Since the Cox Model is not commonly used for prediction, we do not give an illustrative example on prediction. If needed, users can refer to the help file by typing `help(predict.glmnet)`. + +### Cross-validation + +The function `cv.glmnet` can be used to compute $K$-fold cross-validation (CV) for the Cox model. The usage is similar to that for other families except for two main differences. + +First, `type.measure` only supports `"deviance"` (also default) which gives the partial-likelihood, and `"C"`, which gives the Harrell *C index*. This is like the area under the curve (AUC) measure of concordance for survival data, but only considers comparable pairs. Pure concordance would record the fraction of pairs for which the order of the death times agree with the order of the predicted risk. However, with survival data, if an observation is right censored at a time *before* another observation's death time, they are not comparable. + +The code below illustrates how one can perform cross-validation using the Harrell C index. Note that unlike most error measures, a higher C index means better prediction performance. +```{r} +set.seed(1) +cvfit <- cv.glmnet(x, y, family = "cox", type.measure = "C") +``` + +Once fit, we can view the optimal $\lambda$ value and a cross validated error plot to help evaluate our model. +```{r} +plot(cvfit) +``` + +As with other families, the left vertical line in our plot shows us where the CV-error curve hits its minimum. The right vertical line shows us the most regularized model with CV-error within 1 standard deviation of the minimum. We also extract such optimal $\lambda$'s: +```{r} +cvfit$lambda.min +cvfit$lambda.1se +``` + +Second, the option `grouped = TRUE` (default) obtains the CV partial likelihood for the $K$th fold by subtraction, i.e. by subtracting the log partial likelihood evaluated on the full dataset from that evaluated on the $(K-1)/K$ dataset. This makes more efficient use of risk sets. With `grouped = FALSE` the log partial likelihood is computed only on the $K$th fold, which is only reasonable if each fold has a large number of observations. + +### Handling of ties + +`glmnet` handles ties in survival time with the Breslow approximation. This is different from the `survival` package's `coxph` function, whose default tie-handling method is the Efron approximation. +```{r} +# create x matrix +set.seed(1) +nobs <- 100; nvars <- 15 +x <- matrix(rnorm(nobs * nvars), nrow = nobs) + +# create response +ty <- rep(rexp(nobs / 5), each = 5) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +y <- Surv(ty, tcens) + +# coefficients from these two models will not line up because +# of different tie handling methods +glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0) +coxph_fit <- coxph(y ~ x) +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) +``` + +`glmnet` is not able to perform the Efron approximation at the moment. 
`survival`'s `coxph` can perform the Breslow approximation by specifying `ties = "breslow"`: +```{r} +# coefficients from these two models will line up +glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0) +coxph_fit <- coxph(y ~ x, ties = "breslow") +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) +``` + +## Cox models for start-stop data + +Since version 4.1 `glmnet` can fit models where the response is a (start, stop] time interval. As explained in @Therneau2000, the ability to work with start-stop responses opens the door to fitting regularized Cox models with + +* time-dependent covariates, +* time-dependent strata, +* left truncation, +* multiple time scales, +* multiple events per subject, +* independent increment, marginal, and conditional models for correlated data, and +* various forms of case-cohort models. +The code below shows how to create a response of this type (using `survival` package's `Surv` function) and how to fit such a model with `glmnet`. ```{r} +# create x matrix +set.seed(2) +nobs <- 100; nvars <- 15 +xvec <- rnorm(nobs * nvars) +xvec[sample.int(nobs * nvars, size = 0.4 * nobs * nvars)] <- 0 +x <- matrix(xvec, nrow = nobs) # non-sparse x +x_sparse <- Matrix::Matrix(xvec, nrow = nobs, sparse = TRUE) # sparse x + +# create start-stop data response +beta <- rnorm(5) +fx <- x_sparse[, 1:5] %*% beta / 3 +ty <- rexp(nobs, drop(exp(fx))) +tcens <- rbinom(n = nobs, prob = 0.3, size = 1) +starty <- runif(nobs) +yss <- Surv(starty, starty + ty, tcens) + +# fit regularized Cox model with start-stop data +fit <- glmnet(x, yss, family = "cox") +``` + +(Note that the call above would have worked as well if `x` was replaced by `x_sparse`.) `cv.glmnet` works with start-stop data too: +```{r} +cv.fit <- cv.glmnet(x, yss, family = "cox", nfolds = 5) plot(cv.fit) -cv.fit$lambda.min ``` -The left vertical line in our plot shows us where the CV-error curve -hits its minimum. The right vertical line shows us the most -regularized model with CV-error within$1$standard deviation of the -minimum. In this case, we see that the minimum was achieved by a -fairly regularized model, however the right line indicates that the -null model (no coefficients included) is within$1$sd of the -minimum. This might lead us to believe that in actuality the -covariates are not explaining any variability. For the time being we -will concern ourselves with the minimum CV-error model. We can check -which covariates our model chose to be active, and see the -coefficients of those covariates. +As a sanity check, the code below shows that fitting start-stop responses +using `glmnet` with `lambda = 0` matches up with `coxph`'s result: +```{r} +glmnet_fit <- glmnet(x, yss, family = "cox", lambda = 0) +coxph_fit <- coxph(yss ~ x) +plot(coef(glmnet_fit), coef(coxph_fit)) +abline(0, 1) +``` + +## Stratified Cox models + +One extension of the Cox regression model is to allow for strata that divide the observations into disjoint groups. Each group has its own baseline hazard function, but the groups share the same coefficient vector for the covariates provided by the design matrix `x`. + +`glmnet` can fit stratified Cox models with the elastic net +penalty. With `coxph` one can specify strata in the model formula. +Since `glmnet` does not use a model formula, we achieve this by adding +a strata attribute to the `Surv` response object. 
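For comparison, here is a sketch of how the same stratification would be written in `coxph`'s formula interface (`grp` is a hypothetical grouping vector introduced just for this illustration; `strata` is the `survival` package's formula helper):
```{r}
# in coxph, strata are declared inside the model formula
grp <- rep(1:5, length.out = nobs)
coxph_strat <- coxph(y ~ x + strata(grp))
```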
We achieve this via
+the function `stratifySurv`:
+```{r}
+strata <- rep(1:5, length.out = nobs)
+y2 <- stratifySurv(y, strata)
+str(y2[1:6])
+```
+
+`stratifySurv` returns an object of class `stratifySurv`. We can then pass this `stratifySurv` object as the response to a `glmnet` call. `glmnet` will fit a stratified Cox model if it detects that the response has class `stratifySurv`.
```{r}
-Coefficients <- coef(fit, s = cv.fit$lambda.min)
-Active.Index <- which(Coefficients != 0)
-Active.Coefficients <- Coefficients[Active.Index]
+fit <- glmnet(x, y2, family = "cox")
```

-`coef(fit, s = cv.fit\$lambda.min)` returns the $p$ length coefficient
-vector of the solution corresponding to $\lambda =$`cv.fit$lambda.min`.
+This `stratifySurv` object can also be passed to `cv.glmnet` to fit stratified Cox models with cross-validation:
+```{r}
+cv.fit <- cv.glmnet(x, y2, family = "cox", nfolds = 5)
+plot(cv.fit)
+```
+
+Note that simply giving the response a `"strata"` attribute is not enough! The response needs to be of class `stratifySurv` in order for subsetting to work correctly. To protect against this, an error will be thrown if the response has a `"strata"` attribute but is not of class `stratifySurv`. Add strata via the `stratifySurv` function.
```{r}
-Active.Index
-Active.Coefficients
+y3 <- y
+attr(y3, "strata") <- strata
+str(y3[1:6]) # note that the strata attribute is no longer there
```

-We see that our optimal model chose 2 active covariates ($X80$ and
-$X394$) each with a small positive effect on hazard.
+```{r error=TRUE}
+fit <- glmnet(x, y3, family = "cox")
+```
+
+## Plotting survival curves
+
+Fitting a regularized Cox model using `glmnet` with `family = "cox"` returns an object of class `"coxnet"`. Class `"coxnet"` objects have a `survfit` method which allows the user to visualize the survival curves from the model. In addition to the `"coxnet"` object, the user must pass the `x` and `y` objects used to fit the model (for computation of the baseline hazard), as well as the lambda value for which the survival curve is desired:
+```{r}
+fit <- glmnet(x, y, family = "cox")
+survival::survfit(fit, s = 0.05, x = x, y = y)
+```
+
+We are unable to provide standard errors for these survival curves, so we do not present the confidence bounds for them. To plot the survival curve, pass the result of the `survfit` call to `plot`:
+```{r}
+plot(survival::survfit(fit, s = 0.05, x = x, y = y))
+```
+
+As noted in the documentation for `survival::survfit.coxph`, without new data a curve is produced for a single "pseudo" subject with covariate values equal to the means of the data set, and the resulting curve(s) almost never make sense. We can instead get survival curves for individual observations by passing a `newx` argument:
+```{r}
+survival::survfit(fit, s = 0.05, x = x, y = y, newx = x[1:3, ])
+plot(survival::survfit(fit, s = 0.05, x = x, y = y, newx = x[1:3, ]))
+```
+
+If the original model was fit with strata, then the `strata` option needs to be specified as well. If `newx` is being passed for such a model, the strata for these new observations need to be passed via `newstrata`.
+```{r} +y2 <- stratifySurv(y, rep(1:2, length.out = nobs)) +fit <- glmnet(x, y2, family = "cox") +survival::survfit(fit, s = 0.01, x = x, y = y2) + +# survival curve plot for first two individuals in dataset +plot(survival::survfit(fit, s = 0.01, x = x, y = y2, + newx = x[1:2, ], newstrata = strata[1:2])) +``` + +To be consistent with other methods in `glmnet`, if the `s` parameter is not specified, survival curves are returned for the entire `lambda` sequence. The survival curves are returned as a list, one element for each `lambda` value. +```{r} +sf <- survival::survfit(fit, x = x, y = y2) +length(sf) +length(fit$lambda) +``` + +The `survfit` method is available for `cv.glmnet` objects as well. By default, the `s` value chosen is the "lambda.1se" value stored in the CV object. The `s` value can also be set to the `"lambda.min"` value stored in the +CV object. +```{r} +cv.fit <- cv.glmnet(x, y2, family = "cox", nfolds = 5) +survival::survfit(cv.fit, x = x, y = y2) +survival::survfit(cv.fit, s = "lambda.min", x = x, y = y2) +``` ## References diff -Nru r-cran-glmnet-4.0-2/vignettes/glmnetFamily.Rmd r-cran-glmnet-4.1/vignettes/glmnetFamily.Rmd --- r-cran-glmnet-4.0-2/vignettes/glmnetFamily.Rmd 2020-05-13 00:21:05.000000000 +0000 +++ r-cran-glmnet-4.1/vignettes/glmnetFamily.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,9 @@ --- -title: "Glm `family` functions in `glmnet` 4.0" -author: "Trevor Hastie and Kenneth Tay" -date: "April 30, 2020" +title: "The `family` Argument for `glmnet`" +author: + - Trevor Hastie + - Kenneth Tay +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -10,7 +12,7 @@ toc: yes toc_depth: 3 vignette: > - %\VignetteIndexEntry{Glm family} + %\VignetteIndexEntry{The family Argument for glmnet} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- @@ -28,106 +30,65 @@ penalized maximum likelihood. Concretely, it solves the problem $$ \min_{\beta_0, \beta} \frac{1}{N}\sum_{i=1}^N w_i l_i(y_i, \beta_0 + \beta^T x_i) + \lambda \left[\frac{1 - \alpha}{2}\|\beta\|_2^2 + \alpha \|\beta\|_1 \right] $$ -over a grid of values of $\lambda$ covering the entire range. In the equation above, $l_i(y, \eta)$ is the negative log-likelihood contribution for observation $i$. $\alpha \in [0,1]$ is a tuning parameter which bridges the gap between the lasso ($\alpha = 1$, the default) and ridge regression ($\alpha = 0$), while $\lambda$ controls the overall strength of the penalty. +over a grid of values of $\lambda$ covering the entire range. In the equation above, $l_i(y_i, \eta_i)$ is the negative log-likelihood contribution for observation $i$. $\alpha \in [0,1]$ is a tuning parameter which bridges the gap between the lasso ($\alpha = 1$, the default) and ridge regression ($\alpha = 0$), while $\lambda$ controls the overall strength of the penalty. -For `glmnet` v3.0-2 and below, `glmnet` could only solve the -minimization problem above for a limited number of built-in -(hardwired) families via its `family` argument. In particular, -`glmnet` could fit penalized Gaussian, binomial, and Poisson GLMs -(along with a few other special cases such as the Cox model, -multinomial regression, and multi-response Gaussian). In v4.0 onwards, -`glmnet` allows the user to fit a penalized regression model for *any* -GLM by allowing for any legitimate GLM family object, as used by the -`stats:glm` function. 
-
-### Additional families
-
-Before v4.0, `glmnet`'s `family` argument had to be one of a limited
-set of strings: `c("gaussian", "binomial", "poisson", "multinomial",
-"cox", "mgaussian")`. This specified which of the *built-in* families
-was to be used. From v4.0 onwards, in addition to these
-strings, the `family argument to `glmnet` can also be the result of a call
-to a `family` function. (To learn more about family functions in R,
-run `?family` in the R console.)
-
-All the functionality of `glmnet` applies to these new families, and
-hence their addition expands the scope of `glmnet` considerably. In
-particular,
-
-* All the methods work as before, such as `plot`, `predict` etc.
-* large and sparse `X` matrices as input ;
-* upper and lower bound constraints on parameters;
-* `cv.glmnet` for selecting the tuning parameters;
-* `relax=TRUE` for fitting the unpenalized models to the
-  active sets;
-* `offsets` can be provided;
-* penalty strengths, standardization, and other options.
+`glmnet` solves the minimization problem above very efficiently for a limited number of built-in (hardwired) families. To fit these model families, one should specify the `family` argument to `glmnet` as a character string. The families that can be fit efficiently this way are the penalized Gaussian (`"gaussian"`), binomial (`"binomial"`), and Poisson (`"poisson"`) GLMs, along with a few other special cases: the Cox model (`"cox"`), multinomial regression (`"multinomial"`), and multi-response Gaussian (`"mgaussian"`). Details for how to fit these models can be found in the vignette ["An Introduction to `glmnet`"](https://glmnet.stanford.edu/articles/glmnet.html).
+
+Apart from these built-in families, `glmnet` also allows the user to fit a penalized regression model for *any* GLM by allowing the `family` argument to be any legitimate GLM family object, as used by the `stats::glm` function.
+
+### Using class "family" objects for the `family` argument
+
+The `family` argument to `glmnet` can be the result of a call to a `family` function. (To learn more about family functions in R, run `?family` in the R console.) All the functionality of `glmnet` applies to these new families, and hence their addition expands the scope of `glmnet` considerably. In particular,
+
+* All the methods, such as `plot` and `predict`, work as before;
+* Large and sparse `x` matrices can be taken as input;
+* The user can put upper and lower bound constraints on parameters;
+* `cv.glmnet` can be used for selecting the tuning parameters;
+* `relax = TRUE` can be specified for fitting unpenalized models to the active sets;
+* `offsets` can be provided;
+* Penalty strengths, standardization, and other options to `glmnet` work as before.
+
+When the `family` argument is a class `"family"` object, `glmnet` fits the model for each value of `lambda` with a proximal Newton algorithm, also known as iteratively reweighted least squares (IRLS). The outer loop of the IRLS algorithm is coded in R, while the inner loop solves the weighted least squares problem with the elastic net penalty, and is implemented in Fortran. The R code exploits warm starts as it iterates down the path, and so is reasonably efficient.

### More on GLM families

-A GLM is linear model for a response variable whose conditional
-distribution belongs to a one-dimensional exponential family. Apart
-from Gaussian, Poisson and binomial, there are other interesting
-members of this family. Some examples are Gamma, inverse Gaussian,
-negative binomial, to name a few.
+A GLM is a linear model for a response variable whose conditional distribution belongs to a one-dimensional exponential family. Apart from the Gaussian, Poisson and binomial families, there are other interesting members of this family, e.g. Gamma, inverse Gaussian, negative binomial, to name a few.
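All of these are available as ready-made family constructors in R (a brief sketch; note that the negative binomial constructor lives in the `MASS` package, not `stats`):
```{r}
# each call returns a class "family" object that glmnet accepts
Gamma(link = "log")
inverse.gaussian()
MASS::negative.binomial(theta = 5)
```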
A GLM consists of 3 parts:

1. A linear predictor: $\eta_i = \sum_{j=1}^p x_{ij} \beta_j$,
2. A link function: $\eta_i = g(\mu_i)$, and
3. A random component: $y_i \sim f(y \mid \mu_i)$.

The user gets to specify the link function $g$ and the family of response distributions $f(\cdot \mid \mu)$, and fitting a GLM amounts to estimating the parameter $\beta$ by maximum likelihood.

-In R, these 3 parts of the GLM are encapsulated in an object of class
-`family` (run `?family` in the R console for more details). A `family`
-object is a list of GLM components which allows functions such as
-`stats:glm` to fit GLMs in R. As an example, the code below shows the
-constituent parts for the binomial GLM, which is what is used to fit
-linear logistic regression:
+In R, these 3 parts of the GLM are encapsulated in an object of class `family` (run `?family` in the R console for more details). A `family` object is a list of GLM components which allows functions such as `stats::glm` to fit GLMs in R. As an example, the code below shows the constituent parts for the binomial GLM, which is what is used to fit linear logistic regression:
```{r}
fam <- binomial()
class(fam)
names(fam)
```

-This is a list of functions and expressions that get used in the
-*iteratively reweighted least squares* algorithm for fitting the GLM.
+This is a list of functions and expressions that get used in the *iteratively reweighted least squares* (IRLS) algorithm for fitting the GLM.

-From v4.0 onwards, `glmnet` can fit penalized GLMs for any family as
-long as the family can be expressed as a `family` object. In fact,
-users can make their own families, or customize existing families,
-just as they can for regular GLMs.
-
-Generally this option should be used if the desired family is not
-included in the built-in list. The reason is that the entire path
-algorithm for the built-in families is implemented in Fortran, and so
-will be faster.
+`glmnet` can fit penalized GLMs for any family as long as the family can be expressed as a `family` object. In fact, users can make their own families, or customize existing families, just as they can for regular GLMs.
+
+Generally this option should be used only if the desired family is not included in the built-in list. The reason is that the entire path algorithm for the built-in families is implemented in Fortran, and so will be faster.

## Fitting Gaussian, binomial and Poisson GLMs

-First we demonstrate how we can use this new version of `glmnet` to fit ordinary least squares with the elastic net penalty. We set up some fake data:
+First, we demonstrate how we can use this new version of `glmnet` to fit ordinary least squares with the elastic net penalty. We set up some fake data:
```{r}
set.seed(1)
x <- matrix(rnorm(500), ncol = 5)
y <- rowSums(x[, 1:2]) + rnorm(100)
```

-The function calls below demonstrate how we would fit the model with
-the old and new `family` parameter options. To fit a linear regression
-by least squares, we want to use the Gaussian family.
There is a -*hard-wired* option for this, specified via `family="gaussian"` (which -is also the defaults for `glmnet`). Now we can also use `family = gaussian()` to fit the same model. +The function calls below demonstrate how we would fit the model with the old and new `family` parameter options. To fit a linear regression by least squares, we want to use the Gaussian family. There is a *hard-wired* option for this, specified via `family="gaussian"` (which is also the default for `glmnet`). Now we can also use `family = gaussian()` to fit the same model. ```{r message = FALSE} library(glmnet) oldfit <- glmnet(x, y, family = "gaussian") newfit <- glmnet(x, y, family = gaussian()) ``` -`glmnet` distinguishes these two cases because the first is a -character string, while the second is a GLM family object. -Of course if we really wanted to fit this model, we would use the -hard-wired version, because it is faster. Here we want to show that -they are equivalent, up to machine precision. +`glmnet` distinguishes these two cases because the first is a character string, while the second is a GLM family object. Of course if we really wanted to fit this model, we would use the hard-wired version, because it is faster. Here we want to show that they are equivalent, up to machine precision. There are slight differences in the algorithms used to compute the solutions, so some of the equality tests run using `testthat::expect_equal` might fail. However, these same tests can be made to pass by decreasing the `thresh` option in both function calls: @@ -144,8 +105,7 @@ } ``` -Next, we demonstrate the function calls for the binomial and Poisson -GLM families. +Next, we demonstrate the function calls for the binomial and Poisson GLM families. ```{r} biny <- ifelse(y > 0, 1, 0) # binary data @@ -164,15 +124,7 @@ In the examples above, the new version is simply replicating existing functionality in `glmnet`. For these GLMs, we recommend specifying the -GLM family as a string for computational efficiency. The Figures 1-2 -illustrate that existing code for these GLM families is more -efficient than the new code, especially for the Gaussian case. - - For the new families, the model is fit for each value -of `lambda` by a "proximal Newton" algorithm, with the outer loops -coded in R. The inner loop is fit by a weighted elastic-net algorithm, -which is implemented in Fortran. However, the R code also exploits -warm starts as it iterates down the path, so is reasonably efficient. +GLM family as a character string for computational efficiency. The figures below illustrate that existing code for these GLM families is more efficient than the new code, especially for the Gaussian case. (In the figures, `n` and `p` denote the number of observations and features respectively. Each point is the mean of 5 simulation runs. Note that both the `x` and `y` axes are on a log scale.) @@ -182,91 +134,82 @@ ## Fitting other GLMs -The real power of the new code is in fitting GLMs other than the three -in the previous section, by passing a GLM `"family"` object as the -`family` argument to `glmnet`. - - - - -For example, performing probit regression with the elastic net penalty is as simple as the code below: +The real power of using class `"family"` objects for the `family` argument is in fitting GLMs other than the three in the previous section. 
For example, performing probit regression with the elastic net penalty is as simple as the code below:
```{r}
newfit <- glmnet(x, biny, family = binomial(link = "probit"))
```

-For the *complementary log-log* link we would specify `family = binomial(link = "cloglog")`
-We can fit nonlinear least-squares models by using a different link
-with the Gaussian family; for example `family=gaussian(link="log")`.
+For the *complementary log-log* link we would specify `family = binomial(link = "cloglog")`.
+
+We can fit nonlinear least-squares models by using a different link with the Gaussian family; for example `family = gaussian(link = "log")`.

For count data, we can fit a quasi-Poisson model that allows for overdispersion:
```{r}
newfit <- glmnet(x, cnty, family = quasipoisson())
```

-Performing negative binomial regression (instead of Poisson regression) is also easy:
+The negative binomial is often used to model over-dispersed count data (as an alternative to Poisson regression), and is just as easy to fit:
```{r, eval=FALSE}
library(MASS)
newfit <- glmnet(x, cnty, family = negative.binomial(theta = 5))
```

-There are many other families, including `quasi` where users can
-customize their own families. In addition, there are additional
-specialized
-families, such as `statmod:tweedie` for overdispersed count data, for example.
+There are many other families, including `quasi` where users can customize their own families. There are additional specialized families such as `statmod::tweedie` for overdispersed count data.

-## `glmnetfit` class
+## Class `"glmnetfit"` objects

-If `glmnet` is called with GLM `family` object as its argument, it
-returns an object with class:
+If `glmnet` is called with a class `"family"` object as its argument, it returns an object with class
```{r}
class(newfit)
```

-This is similar to the hard-wired classes; for example a
-`family="gaussian"` has class
+This is similar to the hard-wired classes; for example a `family = "gaussian"` fit has class
```{r}
-fit=glmnet(x,y,family="gaussian")
+fit <- glmnet(x, y, family = "gaussian")
class(fit)
```

-Importantly, both these inherit from class `"glmnet"`, and so all the S3
-methods such as `plot`, `predict`, `coef`, and `print` will work out
-the box.
+Importantly, both of these inherit from class `"glmnet"`, and so all the S3 methods such as `plot`, `predict`, `coef`, and `print` will work out of the box.

## Step size halving within iteratively reweighted least squares (IRLS)

-Before v4.0, `glmnet` solved the optimization problem for non-Gaussian families via iteratively reweighted least squares (IRLS). In each iteration a unit Newton step was taken, and the algorithm terminated when the unit Newton step did not decrease the deviance sufficiently. Because the algorithm was forced to take a unit step, this could result in non-convergence of the algorithm in some cases.
+For the built-in families, `glmnet` solves the optimization problem
+for non-Gaussian families via iteratively reweighted least squares
+(IRLS). In each iteration a unit Newton step is taken, and the
+algorithm terminates when the unit Newton step fails to decrease the deviance sufficiently. Because the algorithm is forced to take a unit step, this can result in non-convergence of the algorithm in some cases.

Here is an example of the non-convergence for Poisson data. The `stats::glm` function converges and gives us coefficients that are reasonably close to the truth:
```{r}
set.seed(2020)
n <- 100
p <- 4
-x <- matrix(runif(n*p, 5, 10), n)
+x <- matrix(runif(n * p, 5, 10), n)
y <- rpois(n, exp(rowMeans(x)))

# glm fit
-glmfit <- glm(y ~ x-1, family = poisson)
+glmfit <- glm(y ~ x - 1, family = poisson)
coef(glmfit)
```

-Fitting `glmnet` with `lambda = 0` is equivalent to fitting a GLM. If we use `glmnet` version before v4.0, we encounter an issue with non-convergence:
+Fitting `glmnet` with `lambda = 0` is equivalent to fitting an unregularized GLM. If we use `glmnet` with `family = "poisson"`, we encounter an issue with non-convergence:
```{r}
-oldfit <- glmnet(x, y, family = "poisson", standardize = FALSE, intercept = FALSE,
-                 lambda = 0)
+oldfit <- glmnet(x, y, family = "poisson", standardize = FALSE,
+                 intercept = FALSE, lambda = 0)
coef(oldfit)
```

-This divergence happens because the unit Newton step was too large. To address this issue, from v4.0 onwards if a `family` object is passed to the `family` argument of `glmnet`, the IRLS algorithm will perform step size halving. After computing the Newton step, the algorithm checks if the new solution has infinite (or astronomically large) objective function value or if it results in invalid $\eta$ or $\mu$. If so, the algorithm halves the step size repeatedly until these invalid conditions no longer hold.
+This divergence happens because the unit Newton step was too large. However, if a `family` object is passed to the `family` argument of `glmnet`, the IRLS algorithm will perform step size halving. After computing the Newton step, the algorithm checks if the new solution has an infinite (or astronomically large) objective function value, or if it results in invalid $\eta$ or $\mu$. If so, the algorithm halves the step size repeatedly until these invalid conditions no longer hold.

The code below shows that this step size halving avoids the divergence we were experiencing in our running example:
```{r}
glmnet.control(mxitnr = 50) # increase maximum no. of IRLS iterations allowed
-newfit <- glmnet(x, y, family = poisson(), standardize = FALSE, intercept = FALSE,
-                 lambda = 0)
+newfit <- glmnet(x, y, family = poisson(), standardize = FALSE,
+                 intercept = FALSE, lambda = 0)
coef(newfit)
```

-In the process, `glmnet` warns the user that an infinite objective function value was encountered and that step size halving was done to address the issue. The coefficients are close to those obtained by `stats:glm`, and can be made to be numerically indistinguishable by tightening the convergence criterion in both function calls.
+In the process, `glmnet` warns the user that an infinite objective function value was encountered and that step size halving was done to address the issue. The coefficients are close to those obtained by `stats::glm`, and can be made numerically indistinguishable by tightening the convergence criteria in both function calls.
```{r}
thresh <- 1e-15
glmfit <- glm(y ~ x-1, family = poisson,
@@ -278,23 +221,3 @@
 expect_equal(as.numeric(coef(glmfit)),
             as.numeric(coef(newfit))[2:5])
```
-
-
-## Appendix 1: Internal parameters
-
-With this generalization of `glmnet`, we have added two new internal parameters which control some aspects of the model computation. The factory default settings are expected to work in most cases and users do not need to make changes unless there are special requirements.
- -These two parameters are related to the iteratively reweighted least squares (IRLS) loop for solving the optimization problem at each value of $\lambda$: - -- `epsnr`: convergence threshold for the IRLS loop; factory default = 1e-08 -- `mxitnr`: maximum iterations for the IRLS loop for each value of $\lambda$; factory default = 25 - -As with other internal parameters, `epsnr` and `mxitnr` can be changed by calling `glmnet.control`. For example, if we wanted to increase the maximum number of iterations allowed for the IRLS loop for each $\lambda$, we would run -```{r} -glmnet.control(mxitnr = 50) -``` - -Any changes made to these internal parameters will hold for the duration of the R session unless they are changed by the user with a subsequent call to `glmnet.control`. To restore the factory defaults, run -```{r} -glmnet.control(factory = TRUE) -``` diff -Nru r-cran-glmnet-4.0-2/vignettes/glmnet.Rmd r-cran-glmnet-4.1/vignettes/glmnet.Rmd --- r-cran-glmnet-4.0-2/vignettes/glmnet.Rmd 2020-05-06 19:54:31.000000000 +0000 +++ r-cran-glmnet-4.1/vignettes/glmnet.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,10 @@ --- title: "An Introduction to `glmnet`" -author: "Trevor Hastie and Junyang Qian" -date: "September 13, 2016" +author: + - Trevor Hastie + - Junyang Qian + - Kenneth Tay +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -15,106 +18,86 @@ \usepackage[utf8]{inputenc} --- +```{r include=FALSE} +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) +``` + + ## Introduction -Glmnet is a package that fits a generalized linear model via penalized -maximum likelihood. The regularization path is computed for the lasso -or elasticnet penalty at a grid of values for the regularization -parameter lambda. The algorithm is extremely fast, and can exploit -sparsity in the input matrix `x`. It fits linear, logistic and -multinomial, poisson, and Cox regression models. A variety of -predictions can be made from the fitted models. It can also fit -multi-response linear regression. - -The authors of glmnet are Jerome Friedman, Trevor Hastie, Rob -Tibshirani, Balasubramanian Narasimhan, Kenneth Tay and Noah Simon, with -contribution from Junyang Qian, and the R package is maintained by Trevor -Hastie. The matlab version of glmnet is maintained by Junyang -Qian, and the Python version by B. Balakumar (although both are a few -versions behind). This vignette describes the usage of glmnet in R. -There are additional vignettes that should be useful: - -* one that describes in detail the new `relaxed` features in `glmnet`, along with some new capabilities. -* a vignette devoted to Cox models in `glmnet`. -* the newest one that describes using `glm()` family objects. - - -`glmnet` solves the following problem - -$$ -\min_{\beta_0,\beta} \frac{1}{N} \sum_{i=1}^{N} w_i l(y_i,\beta_0+\beta^T x_i) + \lambda\left[(1-\alpha)||\beta||_2^2/2 + \alpha ||\beta||_1\right], -$$ - -over a grid of values of $\lambda$ covering the entire range. 
Here -$l(y,\eta)$ is the negative log-likelihood contribution for -observation $i$; e.g. for the Gaussian case it is -$\frac{1}{2}(y-\eta)^2$. The _elastic-net_ penalty is controlled by -$\alpha$, and bridges the gap between lasso ($\alpha=1$, the default) -and ridge ($\alpha=0$). The tuning parameter $\lambda$ controls the -overall strength of the penalty. - -It is known that the ridge penalty shrinks the coefficients of -correlated predictors towards each other while the lasso tends to pick -one of them and discard the others. The elastic-net penalty mixes -these two; if predictors are correlated in groups, an $\alpha=0.5$ -tends to select the groups in or out together. This is a higher level -parameter, and users might pick a value upfront, else experiment with -a few different values. One use of $\alpha$ is for numerical -stability; for example, the elastic net with $\alpha = 1 - \epsilon$ -for some small $\epsilon > 0$ performs much like the lasso, but -removes any degeneracies and wild behavior caused by extreme -correlations. - -The `glmnet` algorithms use cyclical coordinate descent, which -successively optimizes the objective function over each parameter with -others fixed, and cycles repeatedly until convergence. The package -also makes use of the strong rules for efficient restriction of the -active set. Due to highly efficient updates and techniques such as -warm starts and active-set convergence, our algorithms can compute the -solution path very fast. - -The code can handle sparse input-matrix formats, as well as range -constraints on coefficients. The core of `glmnet` is a set of fortran -subroutines, which make for very fast execution. +Glmnet is a package that fits generalized linear and similar models +via penalized maximum likelihood. The regularization path is computed +for the lasso or elastic net penalty at a grid of values (on the log +scale) for the +regularization parameter lambda. The algorithm is extremely fast, and +can exploit sparsity in the input matrix `x`. It fits linear, logistic +and multinomial, poisson, and Cox regression models. It can also fit +multi-response linear regression, generalized linear models for custom +families, and relaxed lasso regression models. The package includes +methods for prediction and plotting, and functions for cross-validation. + +The authors of glmnet are Jerome Friedman, Trevor Hastie, Rob Tibshirani, Balasubramanian Narasimhan, Kenneth Tay and Noah Simon, with contribution from Junyang Qian, and the R package is maintained by Trevor Hastie. A MATLAB version of glmnet is maintained by Junyang Qian, and a Python version by B. Balakumar (although both are a few versions behind). + +This vignette describes basic usage of glmnet in R. There are additional vignettes that should be useful: + +* ["Regularized Cox Regression"](https://glmnet.stanford.edu/articles/Coxnet.html) describes how to fit regularized Cox models for survival data with `glmnet`. +* ["GLM `family` functions in `glmnet`"](https://glmnet.stanford.edu/articles/glmnetFamily.html) describes how to fit custom generalized linear models (GLMs) with the elastic net penalty via the `family` argument. +* ["The Relaxed Lasso"](https://glmnet.stanford.edu/articles/relax.html) + describes how to fit relaxed lasso regression models using the `relax` argument. + +`glmnet` solves the problem -The package also includes methods for prediction and plotting, and a -function that performs K-fold cross-validation. 
+$$ +\min_{\beta_0,\beta} \frac{1}{N} \sum_{i=1}^{N} w_i l(y_i,\beta_0+\beta^T x_i) + \lambda\left[(1-\alpha)\|\beta\|_2^2/2 + \alpha \|\beta\|_1\right], +$$ + +over a grid of values of $\lambda$ covering the entire range of possible solutions. Here $l(y_i,\eta_i)$ is the negative log-likelihood contribution for observation $i$; e.g. for the Gaussian case it is $\frac{1}{2}(y_i-\eta_i)^2$. The _elastic net_ penalty is controlled by $\alpha$, and bridges the gap between lasso regression ($\alpha=1$, the default) and ridge regression ($\alpha=0$). The tuning parameter $\lambda$ controls the overall strength of the penalty. + +It is known that the ridge penalty shrinks the coefficients of correlated predictors towards each other while the lasso tends to pick one of them and discard the others. The elastic net penalty mixes these two: if predictors are correlated in groups, an $\alpha=0.5$ tends to either select or leave out the entire group of features. This is a higher level parameter, and users might pick a value upfront or experiment with a few different values. One use of $\alpha$ is for numerical stability; for example, the elastic net with $\alpha = 1 - \epsilon$ for some small $\epsilon > 0$ performs much like the lasso, but removes any degeneracies and wild behavior caused by extreme correlations. -The theory and algorithms in this implementation are described in -@glmnet, @coxnet, @strongrules and @block . +The `glmnet` algorithms use cyclical coordinate descent, which successively optimizes the objective function over each parameter with others fixed, and cycles repeatedly until convergence. The package also makes use of the strong rules for efficient restriction of the active set. Due to highly efficient updates and techniques such as warm starts and active-set convergence, our algorithms can compute the solution path very quickly. +The code can handle sparse input-matrix formats, as well as range constraints on coefficients. The core of `glmnet` is a set of Fortran subroutines, which make for very fast execution. + +The theory and algorithms in this implementation are described in @glmnet, @coxnet, @strongrules and @block. ## Installation -Like many other R packages, the simplest way to obtain `glmnet` is to -install it directly from CRAN. Type the following command in R -console: +Like many other R packages, the simplest way to obtain `glmnet` is to install it directly from CRAN. Type the following command in R console: ```{r, eval=FALSE} install.packages("glmnet", repos = "https://cran.us.r-project.org") ``` -Users may change the `repos` options depending on their locations and -preferences. Other options such as the directories where to install -the packages can be altered in the command. For more details, see -`help(install.packages)`. - -Here the R package has been downloaded and installed to the default -directories. - -Alternatively, users can download the package source from -[CRAN](https://cran.r-project.org/package=glmnet) and type Unix -commands to install it to the desired location. +Users may change the `repos` argument depending on their locations and preferences. Other arguments such as the directories to install the packages at can be altered in the command. For more details, see `help(install.packages)`. Alternatively, users can download the package source from [CRAN](https://cran.r-project.org/package=glmnet) and type Unix commands to install it to the desired location. 
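For example, an installation from a downloaded source tarball can also be done from within R (a sketch; the file name shown is hypothetical and depends on the version downloaded):
```{r, eval=FALSE}
# install a source package directly, without contacting a repository
install.packages("glmnet_4.1.tar.gz", repos = NULL, type = "source")
```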
## Quick Start

-The purpose of this section is to give users a general sense of the package, including the components, what they do and some basic usage. We will briefly go over the main functions, see the basic operations and have a look at the outputs. Users may have a better idea after this section what functions are available, which one to choose, or at least where to seek help. More details are given in later sections.
+The purpose of this section is to give users a general sense of the package. We will briefly go over the main functions, basic operations and outputs. After this section, users may have a better idea of what functions are available, which ones to use, or at least where to seek help. More details are given in later sections.

First, we load the `glmnet` package:
```{r}
library(glmnet)
```

-The default model used in the package is the Guassian linear model or "least squares", which we will demonstrate in this section. We load a set of data created beforehand for illustration. Users can either load their own data or use those saved in the workspace.
+The default model used in the package is the Gaussian linear model or "least squares", which we will demonstrate in this section. We load a set of data created beforehand for illustration:
```{r}
data(QuickStartExample)
```

We fit the model using the most basic call to `glmnet`.
```{r}
-fit = glmnet(x, y)
+fit <- glmnet(x, y)
```

-"fit" is an object of class `glmnet` that contains all the relevant information of the fitted model for further use. We do not encourage users to extract the components directly. Instead, various methods are provided for the object such as `plot`, `print`, `coef` and `predict` that enable us to execute those tasks more elegantly.
+`fit` is an object of class `glmnet` that contains all the relevant information of the fitted model for further use. We do not encourage users to extract the components directly. Instead, various methods are provided for the object such as `plot`, `print`, `coef` and `predict` that enable us to execute those tasks more elegantly.

-We can visualize the coefficients by executing the `plot` function:
+We can visualize the coefficients by executing the `plot` method:
```{r}
plot(fit)
```

-Each curve corresponds to a variable. It shows the path of its coefficient against the $\ell_1$-norm of the whole coefficient vector at as $\lambda$ varies. The axis above indicates the number of nonzero coefficients at the current $\lambda$, which is the effective degrees of freedom (_df_) for the lasso. Users may also wish to annotate the curves; this can be done by setting `label = TRUE` in the plot command.
+Each curve corresponds to a variable. It shows the path of its coefficient against the $\ell_1$-norm of the whole coefficient vector as $\lambda$ varies. The axis above indicates the number of nonzero coefficients at the current $\lambda$, which is the effective degrees of freedom (_df_) for the lasso. Users may also wish to annotate the curves: this can be done by setting `label = TRUE` in the plot command.

A summary of the `glmnet` path at each step is displayed if we just enter the object name or use the `print` function:
-```{r height = 4}
+```{r out.lines = 10}
print(fit)
```

-It shows from left to right the number of nonzero coefficients (`Df`), the percent (of null) deviance explained (`%dev`) and the value of $\lambda$ (`Lambda`). Although by default `glmnet` calls for 100 values of `lambda` the program stops early if `%dev% does not change sufficently from one lambda to the next (typically near the end of the path.)
-
-We can obtain the actual coefficients at one or more $\lambda$'s within the range of the sequence:
-```{r}
-coef(fit,s=0.1)
+It shows from left to right the number of nonzero coefficients (`Df`),
+the percent (of null) deviance explained (`%dev`) and the value of
+$\lambda$ (`Lambda`). Although `glmnet` fits the model for 100 values
+of `lambda` by default, it stops early if `%dev` does not change
+sufficiently from one lambda to the next (typically near the end of
+the path). Here we have truncated the printout for brevity.
+
+We can obtain the model coefficients at one or more $\lambda$'s within the range of the sequence:
+```{r out.lines = 10}
+coef(fit, s = 0.1)
```

-(why `s` and not `lambda`? In case later we want to allow one to specify the model size in other ways.)
-Users can also make predictions at specific $\lambda$'s with new input data:
+(Why `s` and not `lambda`? In case we want to allow one to specify the model size in other ways in the future.) Users can also make predictions at specific $\lambda$'s with new input data:
```{r}
set.seed(29)
-nx = matrix(rnorm(10*20),10,20)
-predict(fit,newx=nx,s=c(0.1,0.05))
+nx <- matrix(rnorm(5 * 20), 5, 20)
+predict(fit, newx = nx, s = c(0.1, 0.05))
```

-The function `glmnet` returns a sequence of models for the users to choose from. In many cases, users may prefer the software to select one of them. Cross-validation is perhaps the simplest and most widely used method for that task.
-
-`cv.glmnet` is the main function to do cross-validation here, along with various supporting methods such as plotting and prediction. We still act on the sample data loaded before.
+The function `glmnet` returns a sequence of models for the users to choose from. In many cases, users may prefer the software to select one of them. Cross-validation is perhaps the simplest and most widely used method for that task. `cv.glmnet` is the main function to do cross-validation here, along with various supporting methods such as plotting and prediction.
```{r}
-cvfit = cv.glmnet(x, y)
+cvfit <- cv.glmnet(x, y)
```

-`cv.glmnet` returns a `cv.glmnet` object, which is "cvfit" here, a list with all the ingredients of the cross-validation fit. As for `glmnet`, we do not encourage users to extract the components directly except for viewing the selected values of $\lambda$. The package provides well-designed functions for potential tasks.
-We can plot the object.
+`cv.glmnet` returns a `cv.glmnet` object, a list with all the ingredients of the cross-validated fit. As with `glmnet`, we do not encourage users to extract the components directly except for viewing the selected values of $\lambda$. The package provides well-designed functions for potential tasks. For example, we can plot the object:
```{r}
plot(cvfit)
```

-It includes the cross-validation curve (red dotted line), and upper and lower standard deviation curves along the $\lambda$ sequence (error bars). Two selected $\lambda$'s are indicated by the vertical dotted lines (see below).
+This plots the cross-validation curve (red dotted line) along with upper and lower standard deviation curves along the $\lambda$ sequence (error bars). Two special values along the $\lambda$ sequence are indicated by the vertical dotted lines.
`lambda.min` is the value of $\lambda$ that gives minimum mean cross-validated error, while `lambda.1se` is the value of $\lambda$ that gives the most regularized model such that the cross-validated error is within one standard error of the minimum. -We can view the selected $\lambda$'s and the corresponding coefficients. For example, -```{r} +We can use the following code to get the value of `lambda.min` and the model coefficients at that value of $\lambda$: +```{r out.lines = 10} cvfit$lambda.min -``` -`lambda.min` is the value of $\lambda$ that gives minimum mean cross-validated error. The other $\lambda$ saved is `lambda.1se`, which gives the most regularized model such that error is within one standard error of the minimum. To use that, we only need to replace `lambda.min` with `lambda.1se` above. -```{r} coef(cvfit, s = "lambda.min") ``` -Note that the coefficients are represented in the sparse matrix format. The reason is that the solutions along the regularization path are often sparse, and hence it is more efficient in time and space to use a sparse format. If you prefer non-sparse format, pipe the output through `as.matrix()`. -Predictions can be made based on the fitted `cv.glmnet` object. Let's see a toy example. +To get the corresponding values at `lambda.1se`, simply replace +`lambda.min` with `lambda.1se` above, or omit the `s` argument, since +`lambda.1se` is the default. + +Note that the coefficients are represented in the sparse matrix format. This is because the solutions along the regularization path are often sparse, and hence it is more efficient in time and space to use a sparse format. If you prefer non-sparse format, pipe the output through `as.matrix()`. + +Predictions can be made based on the fitted `cv.glmnet` object as well. The code below gives predictions for the new input matrix `newx` at `lambda.min`: ```{r} predict(cvfit, newx = x[1:5,], s = "lambda.min") ``` -`newx` is for the new input matrix and `s`, as before, is the value(s) of $\lambda$ at which predictions are made. - -That is the end of `glmnet` 101. With the tools introduced so far, users are able to fit the entire elastic net family, including ridge regression, using squared-error loss. In the package, there are many more options that give users a great deal of flexibility. To learn more, move on to later sections. -## Linear Regression +This concludes `glmnet` 101. With the tools introduced so far, users are able to fit the entire elastic net family, including ridge regression, using squared-error loss. There are many more arguments in the package that give users a great deal of flexibility. To learn more, move on to later sections. -Linear regression here refers to two families of models. One is `gaussian`, the Gaussian family, and the other is `mgaussian`, the multiresponse Gaussian family. We first discuss the ordinary Gaussian and the multiresponse one after that. +## Linear Regression: `family = "gaussian"` (default) -### Gaussian Family - -`gaussian ` is the default family option in the function `glmnet`. Suppose we have observations $x_i \in \mathbb{R}^p$ and the responses $y_i \in \mathbb{R}, i = 1, \ldots, N$. The objective function for the Gaussian family is +`"gaussian"` is the default `family` argument for the function `glmnet`. Suppose we have observations $x_i \in \mathbb{R}^p$ and the responses $y_i \in \mathbb{R}, i = 1, \ldots, N$. 
The objective function for the Gaussian family is
$$
-\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}}\frac{1}{2N} \sum_{i=1}^N (y_i -\beta_0-x_i^T \beta)^2+\lambda \left[ (1-\alpha)||\beta||_2^2/2 + \alpha||\beta||_1\right],
+\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}}\frac{1}{2N} \sum_{i=1}^N (y_i -\beta_0-x_i^T \beta)^2+\lambda \left[ (1-\alpha)\|\beta\|_2^2/2 + \alpha\|\beta\|_1\right],
$$
where $\lambda \geq 0$ is a complexity parameter and $0 \leq \alpha \leq 1$ is a compromise between ridge regression ($\alpha = 0$) and lasso regression ($\alpha = 1$).

-Coordinate descent is applied to solve the problem. Specifically, suppose we have current estimates $\tilde{\beta_0}$ and $\tilde{\beta}_\ell$ $\forall j\in 1,]\ldots,p$. By computing the gradient at $\beta_j = \tilde{\beta}_j$ and simple calculus, the update is
+`glmnet` applies coordinate descent to solve the problem. Specifically, suppose we have current estimates $\tilde{\beta_0}$ and $\tilde{\beta}_\ell$ $\forall \ell\in 1,\ldots,p$. By computing the gradient at $\beta_j = \tilde{\beta}_j$ and simple calculus, the update is
$$
\tilde{\beta}_j \leftarrow \frac{S(\frac{1}{N}\sum_{i=1}^N x_{ij}(y_i-\tilde{y}_i^{(j)}),\lambda \alpha)}{1+\lambda(1-\alpha)},
$$
where $\tilde{y}_i^{(j)} = \tilde{\beta}_0 + \sum_{\ell \neq j} x_{i\ell} \tilde{\beta}_\ell$, and $S(z, \gamma)$ is the soft-thresholding operator with value $\text{sign}(z)(|z|-\gamma)_+$.

-This formula above applies when the `x` variables are standardized to have unit variance (the default); it is slightly more complicated when they are not. Note that for "family=gaussian", `glmnet` standardizes $y$ to have unit variance before computing its lambda sequence (and then unstandardizes the resulting coefficients); if you wish to reproduce/compare results with other software, best to supply a standardized $y$ first (Using the "1/N" variance formula).
+This formula applies when the `x` variables are standardized to have unit variance (the default); it is slightly more complicated when they are not. Note that for `family = "gaussian"`, `glmnet` standardizes $y$ to have unit variance before computing its `lambda` sequence (and then unstandardizes the resulting coefficients). If you wish to reproduce or compare results with other software, it is best to supply a standardized $y$ first (using the "1/N" variance formula).

### Commonly used function arguments

-`glmnet` provides various options for users to customize the fit. We introduce some commonly used options here and they can be specified in the `glmnet` function.
+`glmnet` provides various arguments for users to customize the fit: we introduce some commonly used arguments here. (For more information, type `?glmnet`.)

-* `alpha` is for the elastic-net mixing parameter $\alpha$, with range $\alpha \in [0,1]$. $\alpha = 1$ is the lasso (default) and $\alpha = 0$ is the ridge.
+* `alpha` is for the elastic net mixing parameter $\alpha$, with range $\alpha \in [0,1]$. $\alpha = 1$ is lasso regression (default) and $\alpha = 0$ is ridge regression.

-* `weights` is for the observation weights. Default is 1 for each observation. (Note: `glmnet` rescales the weights to sum to N, the sample size.)
+* `weights` is for the observation weights, default is 1 for each observation. (Note: `glmnet` rescales the weights internally to sum to N, the sample size.)

-* `nlambda` is the number of $\lambda$ values in the sequence. Default is 100.
+* `nlambda` is the number of $\lambda$ values in the sequence (default is 100).

-* `lambda` can be provided, but is typically not and the program constructs a sequence. When automatically generated, the $\lambda$ sequence is determined by `lambda.max` and `lambda.min.ratio`. The latter is the ratio of smallest value of the generated $\lambda$ sequence (say `lambda.min`) to `lambda.max`. The program then generated `nlambda` values linear on the log scale from `lambda.max` down to `lambda.min`. `lambda.max` is not given, but easily computed from the input $x$ and $y$; it is the smallest value for `lambda` such that all the coefficients are zero. For `alpha=0` (ridge) `lambda.max` would be $\infty$; hence for this case we pick a value corresponding to a small value for `alpha` close to zero.)
+* `lambda` can be provided if the user wants to specify the lambda sequence, but typical usage is for the program to construct the lambda sequence on its own. When automatically generated, the $\lambda$ sequence is determined by `lambda.max` and `lambda.min.ratio`. The latter is the ratio of the smallest value of the generated $\lambda$ sequence (say `lambda.min`) to `lambda.max`. The program generates `nlambda` values linearly spaced on the log scale from `lambda.max` down to `lambda.min`. `lambda.max` is not user-specified but is computed from the input $x$ and $y$: it is the smallest value for `lambda` such that all the coefficients are zero. (For `alpha = 0` (ridge) `lambda.max` would be $\infty$: in this case we pick a value corresponding to a small value for `alpha` close to zero.)

-For more information, type `help(glmnet)` or simply `?glmnet`.

-* `standardize` is a logical flag for `x` variable standardization, prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is `standardize=TRUE`.
+* `standardize` is a logical flag for `x` variable standardization prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is `standardize = TRUE`.

-As an example, we set $\alpha = 0.2$ (more like a ridge regression), and give double weights to the latter half of the observations. To avoid too long a display here, we set `nlambda` to 20. In practice, however, the number of values of $\lambda$ is recommended to be 100 (default) or more. In most cases, it does not come with extra cost because of the warm-starts used in the algorithm, and for nonlinear models leads to better convergence properties.
+As an example, we set $\alpha = 0.2$ (more like a ridge regression), and give double weight to the latter half of the observations. We set `nlambda` to 20 so that the model fit is only computed for 20 values of $\lambda$. In practice, we recommend `nlambda` to be 100 (default) or more. In most cases, it does not come with extra cost because of the warm-starts used in the algorithm, and for nonlinear models leads to better convergence properties.
```{r}
-fit = glmnet(x, y, alpha = 0.2, weights = c(rep(1,50),rep(2,50)), nlambda = 20)
+wts <- c(rep(1,50), rep(2,50))
+fit <- glmnet(x, y, alpha = 0.2, weights = wts, nlambda = 20)
```

-We can then print the `glmnet` object.
+We can then print the `glmnet` object:
```{r}
print(fit)
```

-This displays the call that produced the object `fit` and a three-column matrix with columns `Df` (the number of nonzero coefficients), `%dev` (the percent deviance explained) and `Lambda` (the corresponding value of $\lambda$).
+This displays the call that produced the object `fit` and a three-column matrix with columns `Df` (the number of nonzero coefficients), `%dev` (the percent deviance explained) and `Lambda` (the corresponding value of $\lambda$). (The `digits` argument can used to specify significant digits in the printout.) -(Note that the `digits` option can used to specify significant digits in the printout.) +Here the actual number of $\lambda$'s is less than that specified in the call. This is because of the algorithm's stopping criteria. According to the default internal settings, the computations stop if either the fractional change in deviance down the path is less than $10^{-5}$ or the fraction of explained deviance reaches $0.999$. From the last few lines of the output, we see the fraction of deviance does not change much and therefore the computation ends before the all 20 models are fit. The internal parameters governing the stopping criteria can be changed. For details, see the Appendix section or type `help(glmnet.control)`. -Here the actual number of $\lambda$'s here is less than specified in the call. The reason lies in the stopping criteria of the algorithm. According to the default internal settings, the computations stop if either the fractional change in deviance down the path is less than $10^{-5}$ or the fraction of explained deviance reaches $0.999$. From the last few lines , we see the fraction of deviance does not change much and therefore the computation ends when meeting the stopping criteria. We can change such internal parameters. For details, see the Appendix section or type `help(glmnet.control)`. +### Predicting and plotting with `glmnet` objects -We can plot the fitted object as in the previous section. There are more options in the `plot` function. +We can extract the coefficients and make predictions for a `glmnet` object at certain values of $\lambda$. Two commonly used arguments are: -Users can decide what is on the X-axis. `xvar` allows three measures: "norm" for the $\ell_1$-norm of the coefficients (default), "lambda" for the log-lambda value and "dev" for %deviance explained. +* `s` for specifiying the value(s) of $\lambda$ at which to extract coefficients/predictions. -Users can also label the curves with variable sequence numbers simply by setting `label = TRUE`. +* `exact` for indicating whether the exact values of coefficients are desired or not. If `exact = TRUE` and predictions are to be made at values of `s` not included in the original fit, these values of `s` are merged with `object$lambda` and the model is refit before predictions are made. If `exact = FALSE` (default), then the `predict` function uses linear interpolation to make predictions for values of `s` that do not coincide with lambdas used in the fitting algorithm. -Let's plot "fit" against the log-lambda value and with each curve labeled. +Here is a simple example illustrating the use of both these function arguments: ```{r} -plot(fit, xvar = "lambda", label = TRUE) -``` - -Now when we plot against %deviance we get a very different picture. This is percent deviance explained on the training data. What we see here is that toward the end of the path this value are not changing much, but the coefficients are "blowing up" a bit. This lets us focus attention on the parts of the fit that matter. This will especially be true for other models, such as logistic regression. 
-```{r}
-plot(fit, xvar = "dev", label = TRUE)
+fit <- glmnet(x, y)
+any(fit$lambda == 0.5)  # 0.5 not in original lambda sequence
+coef.apprx <- coef(fit, s = 0.5, exact = FALSE)
+coef.exact <- coef(fit, s = 0.5, exact = TRUE, x = x, y = y)
+cbind2(coef.exact[which(coef.exact != 0)],
+       coef.apprx[which(coef.apprx != 0)])
 ```
+The left and right columns show the coefficients for `exact = TRUE` and `exact = FALSE` respectively. (For brevity we only show the non-zero coefficients.) We see from the above that 0.5 is not in the sequence and hence there are some small differences in coefficient values. Linear interpolation is usually accurate enough if there are no special requirements. Notice that with `exact = TRUE` we have to supply by named argument any data that was used in creating the original fit, in this case `x` and `y`.
+Users can make predictions from the fitted `glmnet` object. In addition to the arguments in `coef`, the primary argument is `newx`, a matrix of new values for `x` at which predictions are desired. The `type` argument allows users to choose the type of prediction returned:
-We can extract the coefficients and make predictions at certain values of $\lambda$. Two commonly used options are:
+* "link" returns the fitted values (i.e. $\hat\beta_0 + x_i^T\hat\beta$)
-* `s` specifies the value(s) of $\lambda$ at which extraction is made.
+* "response" gives the same output as "link" for the "gaussian" family.
-* `exact` indicates whether the exact values of coefficients are desired or not. That is, if `exact = TRUE`, and predictions are to be made at values of `s` not included in the original fit, these values of `s` are merged with `object$lambda`, and the model is refit before predictions are made. If `exact=FALSE` (default), then the predict function uses linear interpolation to make predictions for values of `s` that do not coincide with lambdas used in the fitting algorithm.
+* "coefficients" returns the model coefficients.
-A simple example is:
+* "nonzero" returns a list of the indices of the nonzero coefficients for each value of `s`.
+For example, the following code gives the fitted values for the first 5 observations at $\lambda = 0.05$:
 ```{r}
-fit = glmnet(x, y)
-any(fit$lambda == 0.5)
-coef.apprx = coef(fit, s = 0.5, exact = FALSE)
-coef.exact = coef(fit, s = 0.5, exact = TRUE, x=x, y=y)
-cbind2(coef.exact, coef.apprx)
+predict(fit, newx = x[1:5,], type = "response", s = 0.05)
 ```
-The left column is for `exact = TRUE` and the right for `FALSE`. We
-see from the above that 0.5 is not in the sequence and that hence
-there are some difference, though not much. Linear interpolation is
-mostly enough if there are no special requirements. Notice that with
-`exact=TRUE` we have to supply by named argument any data that was used in creating the
-original fit, in this case `x` and `y`.
-Users can make predictions from the fitted object. In addition to the options in `coef`, the primary argument is `newx`, a matrix of new values for `x`. The `type` option allows users to choose the type of prediction:
-* "link" gives the fitted values
+If multiple values of `s` are supplied, a matrix of predictions is
+produced. If no value of `s` is supplied, a matrix of predictions is
+returned, with columns corresponding to all the lambdas used in the fit.
-* "response" the sames as "link" for "gaussian" family.
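+
+As a quick illustration of that shape (our own sketch, not part of the original text): requesting three values of `s` returns one column of predictions per value.
+```{r}
+dim(predict(fit, newx = x[1:5,], s = c(0.1, 0.05, 0.01)))
+```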
+We can plot the fitted object as in the Quick Start section. Here we walk through more arguments for the `plot` function. The `xvar` argument allows users to decide what is plotted on the `x`-axis. `xvar` allows three measures: "norm" for the $\ell_1$-norm of the coefficients (default), "lambda" for the log-lambda value and "dev" for %deviance explained. Users can also label the curves with the variable index numbers simply by setting `label = TRUE`.
-* "coefficients" computes the coefficients at values of `s`
-
-* "nonzero" retuns a list of the indices of the nonzero coefficients for each value of `s`.
+For example, let's plot `fit` against the log-lambda value and with each curve labeled:
+```{r}
+plot(fit, xvar = "lambda", label = TRUE)
+```
-For example,
+Now when we plot against %deviance we get a very different
+picture. This is percent deviance explained on the training data, and
+is a measure of complexity of the model. We see that toward the end of the path, %deviance is not changing much but the coefficients are "blowing up" a bit. This enables us to focus attention on the parts of the fit that matter. This will especially be true for other models, such as logistic regression.
 ```{r}
-predict(fit, newx = x[1:5,], type = "response", s = 0.05)
+plot(fit, xvar = "dev", label = TRUE)
 ```
-gives the fitted values for the first 5 observations at $\lambda = 0.05$. If multiple values of `s` are supplied, a matrix of predictions is produced.
-Users can customize K-fold cross-validation. In addition to all the `glmnet` parameters, `cv.glmnet` has its special parameters including `nfolds` (the number of folds), `foldid` (user-supplied folds), `type.measure`(the loss used for cross-validation):
-* "deviance" or "mse" uses squared loss
+### Cross-validation
-* "mae" uses mean absolute error
+K-fold cross-validation can be performed using the `cv.glmnet` function. In addition to all the `glmnet` parameters, `cv.glmnet` has its special parameters including `nfolds` (the number of folds), `foldid` (user-supplied folds), and `type.measure` (the loss used for cross-validation):
+
+* "deviance" or "mse" for squared loss, and
+
+* "mae" for mean absolute error.
 As an example,
 ```{r}
-cvfit = cv.glmnet(x, y, type.measure = "mse", nfolds = 20)
+cvfit <- cv.glmnet(x, y, type.measure = "mse", nfolds = 20)
+```
+does 20-fold cross-validation based on the mean squared error criterion (the default for "gaussian" family). Printing the resulting object gives some basic information on the cross-validation performed:
+```{r}
+print(cvfit)
 ```
-does 20-fold cross-validation, based on mean squared error criterion (default though).
-Parallel computing is also supported by `cv.glmnet`. To make it work, users must register parallel beforehand. We give a simple example of comparison here. Unfortunately, the package `doMC` is not available on Windows platforms (it is on others), so we cannot run the code here, but we make it looks as if we have.
+`cv.glmnet` also supports parallel computing. To make it work, users must register parallel beforehand. We give a simple example of comparison here. Unfortunately, the package `doMC` is not available on Windows platforms (it is on others), so we cannot run the code here, but we present timing information recorded during one of our test runs.
```{r, eval=FALSE}
-require(doMC)
-registerDoMC(cores=2)
-X = matrix(rnorm(1e4 * 200), 1e4, 200)
-Y = rnorm(1e4)
+library(doMC)
+registerDoMC(cores = 2)
+X <- matrix(rnorm(1e4 * 200), 1e4, 200)
+Y <- rnorm(1e4)
 ```

```{r, eval=FALSE}
@@ -323,211 +308,176 @@
 As suggested from the above, parallel computing can significantly speed up the computation process especially for large-scale problems.
-Functions `coef` and `predict` on cv.glmnet object are similar to those for a `glmnet` object, except that two special strings are also supported by `s` (the values of $\lambda$ requested):
-* "lambda.1se": the largest $\lambda$ at which the MSE is within one standard error of the minimal MSE.
+The `coef` and `predict` methods for `cv.glmnet` objects are similar to those for a `glmnet` object, except that two special strings are also supported by `s` (the values of $\lambda$ requested):
-* "lambda.min": the $\lambda$ at which the minimal MSE is achieved.
+* "lambda.min": the $\lambda$ at which the smallest MSE is achieved.
-```{r}
+* "lambda.1se": the largest $\lambda$ at which the MSE is within one standard error of the smallest MSE (default).
+
+```{r out.lines = 10}
 cvfit$lambda.min
-coef(cvfit, s = "lambda.min")
 predict(cvfit, newx = x[1:5,], s = "lambda.min")
+coef(cvfit, s = "lambda.min")
 ```
-Users can control the folds used. Here we use the same folds so we can also select a value for $\alpha$.
-
+Users can explicitly control the fold that each observation is assigned to via the `foldid` argument. This is useful, for example, in using cross-validation to select a value for $\alpha$:
 ```{r}
-foldid=sample(1:10,size=length(y),replace=TRUE)
-cv1=cv.glmnet(x,y,foldid=foldid,alpha=1)
-cv.5=cv.glmnet(x,y,foldid=foldid,alpha=.5)
-cv0=cv.glmnet(x,y,foldid=foldid,alpha=0)
+foldid <- sample(1:10, size = length(y), replace = TRUE)
+cv1 <- cv.glmnet(x, y, foldid = foldid, alpha = 1)
+cv.5 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.5)
+cv0 <- cv.glmnet(x, y, foldid = foldid, alpha = 0)
 ```
+
 There are no built-in plot functions to put them all on the same plot, so we are on our own here:
 ```{r}
-par(mfrow=c(2,2))
-plot(cv1);plot(cv.5);plot(cv0)
-plot(log(cv1$lambda),cv1$cvm,pch=19,col="red",xlab="log(Lambda)",ylab=cv1$name)
-points(log(cv.5$lambda),cv.5$cvm,pch=19,col="grey")
-points(log(cv0$lambda),cv0$cvm,pch=19,col="blue")
-legend("topleft",legend=c("alpha= 1","alpha= .5","alpha 0"),pch=19,col=c("red","grey","blue"))
+par(mfrow = c(2,2))
+plot(cv1); plot(cv.5); plot(cv0)
+plot(log(cv1$lambda) , cv1$cvm , pch = 19, col = "red",
+     xlab = "log(Lambda)", ylab = cv1$name)
+points(log(cv.5$lambda), cv.5$cvm, pch = 19, col = "grey")
+points(log(cv0$lambda) , cv0$cvm , pch = 19, col = "blue")
+legend("topleft", legend = c("alpha = 1", "alpha = 0.5", "alpha = 0"),
+       pch = 19, col = c("red","grey","blue"))
 ```
-We see that lasso (`alpha=1`) does about the best here. We also see that the range of lambdas used differs with alpha.
-
+We see that the lasso (`alpha=1`) does about the best here. We also see that the range of lambdas used differs with `alpha`.
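+
+As a small aside of our own (not in the original vignette), the minimum cross-validated error of the three fits can be compared directly:
+```{r}
+# minimum CV mean squared error for alpha = 1, 0.5 and 0
+sapply(list(cv1, cv.5, cv0), function(cv) min(cv$cvm))
+```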
-
-#### Coefficient upper and lower bounds
+### Other function arguments
-These are recently added features that enhance the scope of the models. Suppose we want to fit our model, but limit the coefficients to be bigger than -0.7 and less than 0.5. This is easily achieved via the `upper.limits` and `lower.limits` arguments:
+In this section we briefly describe some other useful arguments when
+calling `glmnet`: `upper.limits`, `lower.limits`, `penalty.factor`, `exclude` and `intercept`.
+Suppose we want to fit our model but limit the coefficients to be bigger than -0.7 and less than 0.5. This can be achieved by specifying the `upper.limits` and `lower.limits` arguments:
 ```{r}
-tfit=glmnet(x,y,lower=-.7,upper=.5)
+tfit <- glmnet(x, y, lower.limits = -0.7, upper.limits = 0.5)
 plot(tfit)
 ```
-These are rather arbitrary limits; often we want the coefficients to be positive, so we can set only `lower.limit` to be 0.
-(Note, the lower limit must be no bigger than zero, and the upper limit no smaller than zero.)
-These bounds can be a vector, with different values for each coefficient. If given as a scalar, the same number gets recycled for all.
-
-#### Penalty factors
+Often we want the coefficients to be positive: to do so, we just need to specify `lower.limits = 0`. (Note, the lower limit must be no bigger than zero, and the upper limit no smaller than zero.) These bounds can be a vector, with different values for each coefficient. If given as a scalar, the same number gets recycled for all.
-This argument allows users to apply separate penalty factors to each coefficient. Its default is 1 for each parameter, but other values can be specified. In particular, any variable with `penalty.factor` equal to zero is not penalized at all! Let $v_j$ denote the penalty factor for $j$ th variable. The penalty term becomes
+The `penalty.factor` argument allows users to apply separate penalty factors to each coefficient. This is very useful when we have prior knowledge or preference over the variables. Specifically, if $v_j$ denotes the penalty factor for the $j$th variable, the penalty term becomes
$$ \lambda \sum_{j=1}^p \boldsymbol{v_j} P_\alpha(\beta_j) = \lambda \sum_{j=1}^p \boldsymbol{v_j} \left[ (1-\alpha)\frac{1}{2} \beta_j^2 + \alpha |\beta_j| \right]. $$
-Note the penalty factors are internally rescaled to sum to nvars.
-This is very useful when people have prior knowledge or preference over the variables. In many cases, some variables may be so important that one wants to keep them all the time, which can be achieved by setting corresponding penalty factors to 0:
-
-```{r}
-p.fac = rep(1, 20)
-p.fac[c(5, 10, 15)] = 0
-pfit = glmnet(x, y, penalty.factor = p.fac)
+The default is 1 for each coefficient, i.e. coefficients are penalized
+equally. Note that any variable with `penalty.factor` equal to zero is
+not penalized at all! This is useful in the case where some variables
+are always to be included unpenalized in the model, such as the demographic
+variables sex and age in medical studies. Note the penalty factors are internally rescaled to sum to `nvars`, the number of variables in the given `x` matrix.
+
+Here is an example where we set the penalty factors for variables 1, 3 and 5 to be zero:
+```{r}
+p.fac <- rep(1, 20)
+p.fac[c(1, 3, 5)] <- 0
+pfit <- glmnet(x, y, penalty.factor = p.fac)
 plot(pfit, label = TRUE)
 ```
-We see from the labels that the three variables with 0 penalty factors always stay in the model, while the others follow typical regularization paths and shrunken to 0 eventually.
+We see from the labels that the three variables with zero penalty factors always stay in the model, while the others follow typical regularization paths and are shrunk to zero eventually.
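+
+To make the internal rescaling mentioned above concrete, here is a small sketch of our own: the factors actually applied are `penalty.factor * nvars / sum(penalty.factor)`.
+```{r}
+v <- c(rep(1, 17), rep(3, 3))   # hypothetical penalty factors
+v * length(v) / sum(v)          # what glmnet effectively uses internally
+```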
-Some other useful arguments. `exclude` allows one to block certain variables from being the model at all. Of course, one could simply subset these out of `x`, but sometimes `exclude` is more useful, since it returns a full vector of coefficients, just with the excluded ones set to zero. There is also an `intercept` argument which defaults to `TRUE`; if `FALSE` the intercept is forced to be zero.
+`exclude` allows one to block certain variables from being in the model at all. Of course, one could simply subset these out of `x`, but sometimes `exclude` is more useful, since it returns a full vector of coefficients, just with the excluded ones set to zero.
-#### Customizing plots
+The `intercept` argument allows the user to decide if an intercept should be included in the model or not (it is never penalized). The default is `intercept = TRUE`. If `intercept = FALSE` the intercept is forced to be zero.
-Sometimes, especially when the number of variables is small, we want to add variable labels to a plot. Since `glmnet` is intended primarily for wide data, this is not supprted in `plot.glmnet`. However, it is easy to do, as the following little toy example shows.
+## Linear Regression: `family = "mgaussian"` (multi-response)
-We first generate some data, with 10 variables, and for lack of imagination and ease we give them simple character names.
-We then fit a glmnet model, and make the standard plot.
-```{r}
-set.seed(101)
-x=matrix(rnorm(1000),100,10)
-y=rnorm(100)
-vn=paste("var",1:10)
-fit=glmnet(x,y)
-plot(fit)
-```
-
-We wish to label the curves with the variable names. Here s a simple way to do this, using the `axis` command in R (and a little research into how to customize it). We need to have the positions of the coefficients at the end of the path, and we need to make some space using the `par` command, so that our labels will fit in.
-This requires knowing how long your labels are, but here they are all quite short.
-
-```{r}
-par(mar=c(4.5,4.5,1,4))
-plot(fit)
-vnat=coef(fit)
-vnat=vnat[-1,ncol(vnat)] # remove the intercept, and get the coefficients at the end of the path
-axis(4, at=vnat,line=-.5,label=vn,las=1,tick=FALSE, cex.axis=0.5)
-```
-
-We have done nothing here to avoid overwriting of labels, in the event that they are close together. This would be a bit more work, but perhaps best left alone, anyway.
-
-
-### Multiresponse Gaussian Family
+The multi-response Gaussian family is useful when there are a number of (correlated) responses, also known as the "multi-task learning" problem. Here, a variable is either included in the model for all the responses, or excluded for all the responses. Most of the arguments for this family are the same as those for `family = "gaussian"`, so we focus on the differences with the single response model.
-The multiresponse Gaussian family is obtained using `family = "mgaussian"` option in `glmnet`. It is very similar to the single-response case above. This is useful when there are a number of (correlated) responses - the so-called "multi-task learning" problem. Here the sharing involves which variables are selected, since when a variable is selected, a coefficient is fit for each response. Most of the options are the same, so we focus here on the differences with the single response model.
+As the name suggests, the response $y$ is not a vector but a matrix of quantitative responses. As a result, the coefficients at each value of lambda are also a matrix.
-Obviously, as the name suggests, $y$ is not a vector, but a matrix of quantitative responses in this section.
The coefficients at each value of lambda are also a matrix as a result. - -Here we solve the following problem: +`glmnet` solves the problem $$ -\min_{(\beta_0, \beta) \in \mathbb{R}^{(p+1)\times K}}\frac{1}{2N} \sum_{i=1}^N ||y_i -\beta_0-\beta^T x_i||^2_F+\lambda \left[ (1-\alpha)||\beta||_F^2/2 + \alpha\sum_{j=1}^p||\beta_j||_2\right]. +\min_{(\beta_0, \beta) \in \mathbb{R}^{(p+1)\times K}}\frac{1}{2N} \sum_{i=1}^N \|y_i -\beta_0-\beta^T x_i\|^2_F+\lambda \left[ (1-\alpha)\|\beta\|_F^2/2 + \alpha\sum_{j=1}^p\|\beta_j\|_2\right]. $$ -Here $\beta_j$ is the jth row of the $p\times K$ coefficient matrix $\beta$, and we replace the absolute penalty on each single coefficient by a group-lasso penalty on each coefficient K-vector $\beta_j$ for a single predictor $x_j$. +Here $\beta_j$ is the $j$th row of the $p\times K$ coefficient matrix +$\beta$, and we replace the absolute penalty on each single +coefficient by a group-lasso penalty on each coefficient $K$-vector +$\beta_j$ for a single predictor (i.e. column of the `x` matrix). The +group lasso penalty behaves like the lasso, but on the whole group of +coefficients for each response: they are either all zero, or else none +are zero, but are shrunk by an amount depending on $\lambda$. -We use a set of data generated beforehand for illustration. +We use a set of data generated beforehand for illustration. We fit a regularized multi-response Gaussian model to the data, with an object `mfit` returned. ```{r} data(MultiGaussianExample) +mfit <- glmnet(x, y, family = "mgaussian") ``` -We fit the data, with an object "mfit" returned. -```{r} -mfit = glmnet(x, y, family = "mgaussian") -``` -For multiresponse Gaussian, the options in `glmnet` are almost the same as the single-response case, such as `alpha`, `weights`, `nlambda`, `standardize`. A exception to be noticed is that `standardize.response` is only for `mgaussian` family. The default value is `FALSE`. If `standardize.response = TRUE`, it standardizes the response variables. +The `standardize.response` argument is only for `mgaussian` family. If `standardize.response = TRUE`, the response variables are standardized (default is `FALSE`). -To visualize the coefficients, we use the `plot` function. +As before, we can use the `plot` method to visualize the coefficients: ```{r} plot(mfit, xvar = "lambda", label = TRUE, type.coef = "2norm") ``` -Note that we set `type.coef = "2norm"`. Under this setting, a single curve is plotted per variable, with value equal to the $\ell_2$ norm. The default setting is `type.coef = "coef"`, where a coefficient plot is created for each response (multiple figures). +Note that we set `type.coef = "2norm"`. Under this setting, a single curve is plotted per variable, with value equal to the $\ell_2$ norm of the variable's coefficient vector. The default setting is `type.coef = "coef"`, where a coefficient plot is created for each response (multiple figures). `xvar` and `label` are two other arguments which have the same functionality as in the single-response case. -`xvar` and `label` are two other options besides ordinary graphical parameters. They are the same as the single-response case. - -We can extract the coefficients at requested values of $\lambda$ by using the function `coef` and make predictions by `predict`. The usage is similar and we only provide an example of `predict` here. +We can extract the coefficients and make predictions at requested +values of $\lambda$ by using the `coef` and `predict` methods +respectively, as before. 
Here is an example of a `predict` call: ```{r} predict(mfit, newx = x[1:5,], s = c(0.1, 0.01)) ``` -The prediction result is saved in a three-dimensional array with the first two dimensions being the prediction matrix for each response variable and the third indicating the response variables. +The prediction result is saved in a three-dimensional array with the +first two dimensions being the prediction matrix for each response +variable and the third corresponding to the response variables. -We can also do k-fold cross-validation. The options are almost the same as the ordinary Gaussian family and we do not expand here. -```{r} -cvmfit = cv.glmnet(x, y, family = "mgaussian") -``` -We plot the resulting `cv.glmnet` object "cvmfit". -```{r} -plot(cvmfit) -``` +## Logistic Regression: `family = "binomial"` -To show explicitly the selected optimal values of $\lambda$, type -```{r} -cvmfit$lambda.min -cvmfit$lambda.1se -``` -As before, the first one is the value at which the minimal mean squared error is achieved and the second is for the most regularized model whose mean squared error is within one standard error of the minimal. +Logistic regression is a widely-used model when the response is binary. Suppose the response variable takes values in $\mathcal{G}=\{1,2\}$. Denote $y_i = I(g_i=1)$. We model -Prediction for `cv.glmnet` object works almost the same as for `glmnet` object. We omit the details here. - -## Logistic Regression - -Logistic regression is another widely-used model when the response is categorical. If there are two possible outcomes, we use the binomial distribution, else we use the multinomial. +$$\mbox{Pr}(G=2|X=x)=\frac{e^{\beta_0+\beta^Tx}}{1+e^{\beta_0+\beta^Tx}},$$ -### Binomial Models +which can be written in the following form: -For the binomial model, suppose the response variable takes value in $\mathcal{G}=\{1,2\}$. Denote $y_i = I(g_i=1)$. We model -$$\mbox{Pr}(G=2|X=x)=\frac{e^{\beta_0+\beta^Tx}}{1+e^{\beta_0+\beta^Tx}},$$ -which can be written in the following form $$\log\frac{\mbox{Pr}(G=2|X=x)}{\mbox{Pr}(G=1|X=x)}=\beta_0+\beta^Tx,$$ the so-called "logistic" or log-odds transformation. -The objective function for the penalized logistic regression uses the negative binomial log-likelihood, and is +The objective function for logistic regression is the penalized negative binomial log-likelihood, and is $$ -\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}} -\left[\frac{1}{N} \sum_{i=1}^N y_i \cdot (\beta_0 + x_i^T \beta) - \log (1+e^{(\beta_0+x_i^T \beta)})\right] + \lambda \big[ (1-\alpha)||\beta||_2^2/2 + \alpha||\beta||_1\big]. +\min_{(\beta_0, \beta) \in \mathbb{R}^{p+1}} -\left[\frac{1}{N} \sum_{i=1}^N y_i \cdot (\beta_0 + x_i^T \beta) - \log (1+e^{(\beta_0+x_i^T \beta)})\right] + \lambda \big[ (1-\alpha)\|\beta\|_2^2/2 + \alpha\|\beta\|_1\big]. $$ -Logistic regression is often plagued with degeneracies when $p > N$ and exhibits wild behavior even when $N$ is close to $p$; -the elastic-net penalty alleviates these issues, and regularizes and selects variables as well. - -Our algorithm uses a quadratic approximation to the log-likelihood, and then coordinate descent on the resulting penalized weighted least-squares problem. These constitute an outer and inner loop. - +Logistic regression is often plagued with degeneracies when $p > N$ and exhibits wild behavior even when $N$ is close to $p$; the elastic net penalty alleviates these issues, and regularizes and selects variables as well. 
-
+We use a "proximal Newton" algorithm for optimizing this
+criterion. This makes repeated use of a quadratic approximation to the
+log-likelihood, and then weighted coordinate descent on the resulting
+penalized weighted least-squares problem. These constitute an outer
+and inner loop, also known as iteratively reweighted penalized least squares.
+
+For illustration purposes, we load the pre-generated input matrix `x`
+and the response vector `y` from the data file. The input matrix $x$
+is the same as for other families. For binomial logistic regression,
+the response variable $y$ should be either a binary vector, a factor
+with two levels, or a two-column matrix of counts or proportions. The
+latter is useful for grouped binomial data, or in applications where
+we have "soft" class membership, such as occurs in the EM algorithm.
 ```{r}
 data(BinomialExample)
 ```
-The input matrix $x$ is the same as other families. For binomial logistic regression, the response variable $y$ should be either a factor with two levels, or a two-column matrix of counts or proportions.
-Other optional arguments of `glmnet` for binomial regression are almost same as those for Gaussian family. Don't forget to set `family` option to "binomial".
+Other optional arguments of `glmnet` for binomial regression are almost the same as those for the Gaussian family. Don't forget to set the `family` option to "binomial":
 ```{r}
-fit = glmnet(x, y, family = "binomial")
-```
-Like before, we can print and plot the fitted object, extract the coefficients at specific $\lambda$'s and also make predictions. For plotting, the optional arguments such as `xvar` and `label` are similar to the Gaussian. We plot against the deviance explained and show the labels.
-```{r}
-plot(fit, xvar = "dev", label = TRUE)
+fit <- glmnet(x, y, family = "binomial")
 ```
-Prediction is a little different for logistic from Gaussian, mainly in the option `type`. "link" and "response" are never equivalent and "class" is only available for logistic regression. In summary,
-* "link" gives the linear predictors
+As before, we can print and plot the fitted object, extract the coefficients at specific $\lambda$'s and also make predictions. For plotting, the optional arguments such as `xvar` and `label` work in the same way as for `family = "gaussian"`. Prediction is a little different for `family = "binomial"`, mainly in the function argument `type`:
-* "response" gives the fitted probabilities
+* "link" gives the linear predictors.
-* "class" produces the class label corresponding to the maximum probability.
+* "response" gives the fitted probabilities.
-* "coefficients" computes the coefficients at values of `s`
-
-* "nonzero" retuns a list of the indices of the nonzero coefficients for each value of `s`.
+* "class" produces the class label corresponding to the maximum probability.
-For "binomial" models, results ("link", "response", "coefficients", "nonzero") are returned only for the class corresponding to the second level of the factor response.
+As with `family = "gaussian"`, "coefficients" computes the coefficients at values of `s` and "nonzero" returns a list of the indices of the nonzero coefficients for each value of `s`. Note that the results ("link", "response", "coefficients", "nonzero") are returned only for the class corresponding to the second level of the factor response.
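+
+As a quick sketch (ours, not the vignette's): for the binomial family, "response" is just the inverse logit of "link", which we can verify with `plogis`.
+```{r}
+eta <- predict(fit, newx = x[1:3,], type = "link", s = 0.05)
+p <- predict(fit, newx = x[1:3,], type = "response", s = 0.05)
+all.equal(as.numeric(p), as.numeric(plogis(eta)))
+```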
In the following example, we make predictions of the class labels at $\lambda = 0.05, 0.01$.
 ```{r}
 predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
 ```
-For logistic regression, `cv.glmnet` has similar arguments and usage as Gaussian. `nfolds`, `weights`, `lambda`, `parallel` are all available to users. There are some differences in `type.measure`: "deviance" and "mse" do not both mean squared loss and "class" is enabled. Hence,
+
+For logistic regression, `cv.glmnet` has similar arguments and usage as in the Gaussian case. `nfolds`, `weights`, `lambda`, `parallel` are all available to users. There are some differences in `type.measure`: "deviance" and "mse" do not both mean squared loss. Rather,
+
 * "mse" uses squared loss.

 * "deviance" uses actual deviance.
@@ -538,256 +488,260 @@
 * "auc" (for two-class logistic regression ONLY) gives area under the ROC curve.
-For example,
+For example, the code below uses misclassification error as the criterion for 10-fold cross-validation:
 ```{r}
-cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")
+cvfit <- cv.glmnet(x, y, family = "binomial", type.measure = "class")
 ```
-It uses misclassification error as the criterion for 10-fold cross-validation.
-We plot the object and show the optimal values of $\lambda$.
+As before, we can plot the object and show the optimal values of $\lambda$.
 ```{r}
 plot(cvfit)
-```
-```{r}
 cvfit$lambda.min
 cvfit$lambda.1se
 ```
-`coef` and `predict` are simliar to the Gaussian case and we omit the details. We review by some examples.
-```{r}
-coef(cvfit, s = "lambda.min")
-```
-As mentioned previously, the results returned here are only for the second level of the factor response.
+`coef` and `predict` for the `cv.glmnet` object for `family = "binomial"` are similar to the Gaussian case and we omit the details.
-```{r}
-predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")
-```
-
-Like other GLMs, glmnet allows for an "offset". This is a fixed vector of N numbers that is added into the linear predictor.
-For example, you may have fitted some other logistic regression using other variables (and data), and now you want to see if the present variables can add anything. So you use the predicted logit from the other model as an offset in.
+Like other generalized linear models, `glmnet` allows for an
+"offset". This is a fixed vector of $N$ numbers that is added into the
+linear predictor. For example, you may have fitted some other logistic
+regression using other variables (and data), and now you want to see
+if the present variables can add further predictive power. To do this,
+you can use the predicted logit from the other model as an offset in
+the `glmnet` call. Offsets are also useful in Poisson models, which we
+discuss later.
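+
+Here is a hedged sketch of that offset workflow; `oldfit` is a hypothetical previously fitted logistic model on the same observations.
+```{r, eval=FALSE}
+prior <- predict(oldfit, newx = x, type = "link", s = 0.05)  # logits from the old model
+fit2 <- glmnet(x, y, family = "binomial", offset = prior)
+```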
-### Multinomial Models
+## Multinomial Regression: `family = "multinomial"`
-For the multinomial model, suppose the response variable has $K$ levels ${\cal G}=\{1,2,\ldots,K\}$. Here we model
+The multinomial model extends the binomial when the number of classes
+is more than two. Suppose the response variable has $K$ levels ${\cal G}=\{1,2,\ldots,K\}$. Here we model
 $$\mbox{Pr}(G=k|X=x)=\frac{e^{\beta_{0k}+\beta_k^Tx}}{\sum_{\ell=1}^Ke^{\beta_{0\ell}+\beta_\ell^Tx}}.$$
+There is a linear predictor for each class!
-Let ${Y}$ be the $N \times K$ indicator response matrix, with elements $y_{i\ell} = I(g_i=\ell)$. Then the elastic-net penalized negative log-likelihood function becomes
+Let ${Y}$ be the $N \times K$ indicator response matrix, with elements $y_{i\ell} = I(g_i=\ell)$. Then the elastic net penalized negative log-likelihood function becomes
 $$
-\ell(\{\beta_{0k},\beta_{k}\}_1^K) = -\left[\frac{1}{N} \sum_{i=1}^N \Big(\sum_{k=1}^Ky_{il} (\beta_{0k} + x_i^T \beta_k)- \log \big(\sum_{k=1}^K e^{\beta_{0k}+x_i^T \beta_k}\big)\Big)\right] +\lambda \left[ (1-\alpha)||\beta||_F^2/2 + \alpha\sum_{j=1}^p||\beta_j||_q\right].
+\ell(\{\beta_{0k},\beta_{k}\}_1^K) = -\left[\frac{1}{N} \sum_{i=1}^N \Big(\sum_{k=1}^K y_{ik} (\beta_{0k} + x_i^T \beta_k)- \log \big(\sum_{\ell=1}^K e^{\beta_{0\ell}+x_i^T \beta_\ell}\big)\Big)\right] +\lambda \left[ (1-\alpha)\|\beta\|_F^2/2 + \alpha\sum_{j=1}^p\|\beta_j\|_q\right].
 $$
-Here we really abuse notation! $\beta$ is a $p\times K$ matrix of coefficients. $\beta_k$ refers to the kth column (for outcome category k), and $\beta_j$ the jth row (vector of K coefficients for variable j).
-The last penalty term is $||\beta_j||_q$, we have two options for q: $q\in \{1,2\}$.
-When q=1, this is a lasso penalty on each of the parameters. When q=2, this is a grouped-lasso penalty on all the K coefficients for a particular variables, which makes them all be zero or nonzero together.
+Here we really abuse notation! $\beta$ is a $p\times K$ matrix of coefficients. $\beta_k$ refers to the $k$th column (for outcome category $k$), and $\beta_j$ the $j$th row (vector of $K$ coefficients for variable $j$). The last penalty term is $\|\beta_j\|_q$. We support two options for $q$: $q\in \{1,2\}$. When $q=1$, this is a lasso penalty on each of the parameters. When $q=2$, this is a grouped-lasso penalty on all the $K$ coefficients for a particular variable, which makes them all be zero or nonzero together.
+The standard Newton algorithm can be tedious here. Instead, for $q=1$ we use a so-called partial Newton algorithm by making a partial quadratic approximation to the log-likelihood, allowing only $(\beta_{0k}, \beta_k)$ to vary for a single class at a time. For each value of $\lambda$, we first cycle over all classes indexed by $k$, computing each time a partial quadratic approximation about the parameters of the current class. Then, the inner procedure is almost the same as for the binomial case. When $q=2$, we use a different approach that we will not explain here.
-The standard Newton algorithm can be tedious here. Instead, we use a so-called partial Newton algorithm by making a partial quadratic approximation to the log-likelihood, allowing only $(\beta_{0k}, \beta_k)$ to vary for a single class at a time.
-For each value of $\lambda$, we first cycle over all classes indexed by $k$, computing each time a partial quadratic approximation about the parameters of the current class. Then the inner procedure is almost the same as for the binomial case.
-This is the case for lasso (q=1). When q=2, we use a different approach, which we wont dwell on here.
-
-For the multinomial case, the usage is similar to logistic regression, and we mainly illustrate by examples and address any differences. We load a set of generated data.
+For the `family = "multinomial"` case, usage is similar to that for `family = "binomial"`. In this section we describe the differences. First, we load a set of generated data:
 ```{r}
 data(MultinomialExample)
 ```
-The optional arguments in `glmnet` for multinomial logistic regression are mostly similar to binomial regression except for a few cases.
-
-The response variable can be a `nc >= 2` level factor, or a `nc`-column matrix of counts or proportions.
-Internally glmnet will make the rows of this matrix sum to 1, and absorb the total mass into the weight for that observation.
-`offset` should be a `nobs x nc` matrix if there is one.
+The response variable can be a `nc >= 2` level factor, or an `nc`-column matrix of counts or proportions. Internally, `glmnet` will make the rows of this matrix sum to 1, and absorb the total mass into the weight for that observation. `offset` should be a `nobs x nc` matrix if one is provided.
-A special option for multinomial regression is `type.multinomial`, which allows the usage of a grouped lasso penalty if `type.multinomial = "grouped"`. This will ensure that the multinomial coefficients for a variable are all in or out together, just like for the multi-response Gaussian.
-
-```{r}
-fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
-```
-
-We plot the resulting object "fit".
+A special option for multinomial regression is `type.multinomial`, which allows the usage of a grouped lasso penalty ($q = 2$) if `type.multinomial = "grouped"`. The default is `type.multinomial = "ungrouped"` ($q = 1$).
 ```{r}
+fit <- glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
 plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm")
 ```
-The options are `xvar`, `label` and `type.coef`, in addition to other ordinary graphical parameters.
-
-`xvar` and `label` are the same as other families while `type.coef` is only for multinomial regression and multiresponse Gaussian model. It can produce a figure of coefficients for each response variable if `type.coef = "coef"` or a figure showing the $\ell_2$-norm in one figure if `type.coef = "2norm"`
+For the `plot` method, the function arguments are `xvar`, `label` and `type.coef`, in addition to other ordinary graphical parameters. `xvar` and `label` are the same as for other families, while `type.coef` is only for multinomial regression and the multi-response Gaussian model. It can produce a figure of coefficients for each response variable if `type.coef = "coef"` or a figure showing the $\ell_2$-norm in one figure if `type.coef = "2norm"`.
-We can also do cross-validation and plot the returned object.
+We can also do cross-validation and plot the returned object. Note
+that although `type.multinomial` is not a named argument in
+`cv.glmnet`, in fact any argument that can be passed to `glmnet` is
+valid in the argument list of `cv.glmnet`. Such arguments are passed
+via the `...` argument directly to the calls to `glmnet` inside the
+`cv.glmnet` function.
 ```{r}
-cvfit=cv.glmnet(x, y, family="multinomial", type.multinomial = "grouped", parallel = TRUE)
+cvfit <- cv.glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
 plot(cvfit)
 ```
-Note that although `type.multinomial` is not a typical argument in `cv.glmnet`, in fact any argument that can be passed to `glmnet` is valid in the argument list of `cv.glmnet`. We also use parallel computing to accelerate the calculation.
-
 Users may wish to predict at the optimally selected $\lambda$:
 ```{r}
 predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")
 ```
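+
+As a small addendum of our own: with `type = "response"` instead, the multinomial predictions come back as a three-dimensional array, with one slice per value of `s`.
+```{r}
+prob <- predict(cvfit, newx = x[1:3,], s = "lambda.min", type = "response")
+dim(prob)
+```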
-## Poisson Models
+## Poisson Regression: `family = "poisson"`
-Poisson regression is used to model count data under the assumption of Poisson error, or otherwise non-negative data where the mean and variance are proportional. Like the Gaussian and binomial model, the Poisson is a member of the exponential family of distributions. We usually model its positive mean on the log scale: $\log \mu(x) = \beta_0+\beta' x$.
-The log-likelihood for observations $\{x_i,y_i\}_1^N$ is given my
+Poisson regression is used to model count data under the assumption of Poisson error, or otherwise non-negative data where the mean and variance are proportional. Like the Gaussian and binomial models, the Poisson distribution is a member of the exponential family of distributions. We usually model its positive mean on the log scale: $\log \mu(x) = \beta_0+\beta' x$.
+
+The log-likelihood for observations $\{x_i,y_i\}_1^N$ is given by
 $$
-l(\beta|X, Y) = \sum_{i=1}^N \left(y_i (\beta_0+\beta' x_i) - e^{\beta_0+\beta^Tx_i}\right).
+l(\beta|X, Y) = \sum_{i=1}^N \left(y_i (\beta_0+\beta^T x_i) - e^{\beta_0+\beta^Tx_i}\right).
 $$
-As before, we optimize the penalized log-lielihood:
- $$
-\min_{\beta_0,\beta} -\frac1N l(\beta|X, Y) + \lambda \left((1-\alpha) \sum_{i=1}^N \beta_i^2/2) +\alpha \sum_{i=1}^N |\beta_i|\right).
+As before, we optimize the penalized log-likelihood:
+$$
+\min_{\beta_0,\beta} -\frac1N l(\beta|X, Y) + \lambda \left((1-\alpha) \sum_{j=1}^p \beta_j^2/2 +\alpha \sum_{j=1}^p |\beta_j|\right).
 $$
-Glmnet uses an outer Newton loop, and an inner weighted least-squares loop (as in logistic regression) to optimize this criterion.
+`glmnet` uses an outer Newton loop and an inner weighted least-squares loop (as in logistic regression) to optimize this criterion.
-
-
-First, we load a pre-generated set of Poisson data.
+First, we load a pre-generated set of Poisson data:
 ```{r}
 data(PoissonExample)
 ```
-We apply the function `glmnet` with the `"poisson"` option.
+We apply the function `glmnet` with `family = "poisson"`:
 ```{r}
-fit = glmnet(x, y, family = "poisson")
+fit <- glmnet(x, y, family = "poisson")
 ```
-The optional input arguments of `glmnet` for `"poisson"` family are similar to those for others.
-`offset` is a useful argument particularly in Poisson models.
-When dealing with rate data in Poisson models, the counts collected are often based on different exposures, such as length of time observed, area and years. A poisson rate $\mu(x)$ is relative to a unit exposure time, so if an observation $y_i$ was exposed for $E_i$ units of time, then the expected count would be $E_i\mu(x)$, and the log mean would be $\log(E_i)+\log(\mu(x))$. In a case like this, we would supply an *offset* $\log(E_i)$ for each observation.
-Hence `offset` is a vector of length `nobs` that is included in the linear predictor. Other families can also use options, typically for different reasons.
-(Warning: if `offset` is supplied in `glmnet`, offsets must also also be supplied to `predict` to make reasonable predictions.)
+The optional input arguments of `glmnet` for the `"poisson"` family are similar to those for other families.
+
+`offset` is a particularly useful argument for Poisson models. When dealing with rate data in Poisson models, the counts collected are often based on different exposures such as length of time observed, area and years. A Poisson rate $\mu(x)$ is relative to a unit exposure time, so if an observation $y_i$ was exposed for $E_i$ units of time, then the expected count would be $E_i\mu(x)$, and the log mean would be $\log(E_i)+\log(\mu(x))$. In a case like this, we would supply an *offset* $\log(E_i)$ for each observation. Hence `offset` is a vector of length $N$ that is included in the linear predictor.
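+
+To make this concrete, here is a hedged sketch with a hypothetical exposure vector `E`:
+```{r, eval=FALSE}
+E <- runif(nrow(x), 1, 10)   # hypothetical exposure times
+fito <- glmnet(x, y, family = "poisson", offset = log(E))
+```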
(_Warning_: if `offset` is supplied in `glmnet`, offsets must also be supplied to `predict` via the `newoffset` argument to make reasonable predictions.)
 Again, we plot the coefficients to have a first sense of the result.
 ```{r}
 plot(fit)
 ```
-Like before, we can extract the coefficients and make predictions at certain $\lambda$'s by using `coef` and `predict` respectively. The optional input arguments are similar to those for other families. In function `predict`, the option `type`, which is the type of prediction required, has its own specialties for Poisson family. That is,
-* "link" (default) gives the linear predictors like others
-* "response" gives the fitted mean
-* "coefficients" computes the coefficients at the requested values for `s`, which can also be realized by `coef` function
-* "nonzero" returns a a list of the indices of the nonzero coefficients for each value of `s`.
-
-For example, we can do as follows.
-```{r}
+As before, we can extract the coefficients and make predictions at certain $\lambda$'s using `coef` and `predict` respectively. The optional input arguments are similar to those for other families. For the `predict` method, the argument `type` has the same meaning as that for `family = "binomial"`, except that "response" gives the fitted mean (rather than fitted probabilities in the binomial case). For example, we can do the following:
+```{r out.lines = 7}
 coef(fit, s = 1)
 predict(fit, newx = x[1:5,], type = "response", s = c(0.1,1))
 ```
 We may also use cross-validation to find the optimal $\lambda$'s and thus make inferences.
 ```{r}
-cvfit = cv.glmnet(x, y, family = "poisson")
+cvfit <- cv.glmnet(x, y, family = "poisson")
 ```
-Options are almost the same as the Gaussian family except that for `type.measure`,
-* "deviance" (default) gives the deviance
-* "mse" stands for mean squared error
-* "mae" is for mean absolute error.
+Options are almost the same as the Gaussian family except that for `type.measure`:
-We can plot the `cv.glmnet` object.
-```{r}
-plot(cvfit)
-```
+* "deviance" (default) gives the deviance.
+* "mse" is for mean squared error.
+* "mae" is for mean absolute error.
-We can also show the optimal $\lambda$'s and the corresponding coefficients.
-```{r}
-opt.lam = c(cvfit$lambda.min, cvfit$lambda.1se)
-coef(cvfit, s = opt.lam)
-```
-The `predict` method is similar and we do not repeat it here.
+## Cox Regression: `family = "cox"`
-## Cox Models
+The Cox proportional hazards model is commonly used for the study of
+the relationship between predictor variables and survival time. We
+have another vignette (["Regularized Cox
+Regression"](https://glmnet.stanford.edu/articles/Coxnet.html))
+dedicated solely to fitting regularized Cox models with the `glmnet`
+package; please consult that vignette for details.
-The Cox proportional hazards model is commonly used for the study of the relationship beteween predictor variables and survival time. In the usual survival analysis framework, we have data of the form $(y_1, x_1, \delta_1), \ldots, (y_n, x_n, \delta_n)$ where $y_i$, the observed time, is a time of failure if $\delta_i$ is 1 or right-censoring if $\delta_i$ is 0. We also let $t_1 < t_2 < \ldots < t_m$ be the increasing list of unique failure times, and $j(i)$ denote the index of the observation failing at time $t_i$.
+## Programmable GLM families: `family = family()` -The Cox model assumes a semi-parametric form for the hazard -$$ -h_i(t) = h_0(t) e^{x_i^T \beta}, -$$ -where $h_i(t)$ is the hazard for patient $i$ at time $t$, $h_0(t)$ is a shared baseline hazard, and $\beta$ is a fixed, length $p$ vector. In the classic setting $n \geq p$, inference is made via the partial likelihood -$$ -L(\beta) = \prod_{i=1}^m \frac{e^{x_{j(i)}^T \beta}}{\sum_{j \in R_i} e^{x_j^T \beta}}, -$$ -where $R_i$ is the set of indices $j$ with $y_j \geq t_i$ (those at risk at time $t_i$). +Since version 4.0, `glmnet` has the facility to fit any GLM family by +specifying a `family` object, as used by `stats::glm`. For these more +general families, the outer Newton loop is performed in R, while the +inner elastic-net loop is performed in Fortran, for each value of +lambda. The price for this generality is a small hit in speed. +For details, see the vignette +["GLM `family` functions in `glmnet`"](https://glmnet.stanford.edu/articles/glmnetFamily.html) -Note there is no intercept in the Cox mode (its built into the baseline hazard, and like it, would cancel in the partial likelihood.) +## Assessing models on test data -We penalize the negative log of the partial likelihood, just like the other models, with an elastic-net penalty. +Once we have fit a series of models using `glmnet`, we often assess their performance on a set of evaluation or test data. We usually go through the process of building a prediction matrix, deciding on the performance measure, and computing these measures for a series of values for `lambda` (and `gamma` for relaxed fits). `glmnet` provides three functions (`assess.glmnet`, `roc.glmnet` and `confusion.glmnet`) that make these tasks easier. -We use a pre-generated set of sample data and response. Users can load their own data and follow a similar procedure. In this case $x$ must be an $n\times p$ matrix of covariate values - each row corresponds to a patient and each column a covariate. $y$ is an $n \times 2$ matrix, with a column "time" of failure/censoring times, and "status" a 0/1 indicator, with 1 meaning the time is a failure time, and zero a censoring time. +### Performance measures +The function `assess.glmnet` computes the same performance measures produced by `cv.glmnet`, but on a validation or test dataset. ```{r} -data(CoxExample) -y[1:5,] +data(BinomialExample) +itrain <- 1:70 +fit <- glmnet(x[itrain, ], y[itrain], family = "binomial", nlambda = 5) +assess.glmnet(fit, newx = x[-itrain, ], newy = y[-itrain]) ``` -The `Surv` function in the package `survival` can create such a matrix. Note, however, that the `coxph` and related linear models can handle interval and other forms of censoring, while glmnet can only handle right censoring in its present form. -We apply the `glmnet` function to compute the solution path under default settings. -```{r} -fit = glmnet(x, y, family = "cox") +This produces a list with *all* the measures suitable for a binomial model, computed for the entire sequence of lambdas in the fit object. Here the function identifies the model family from the `fit` object. + +A second use case builds the prediction matrix before calling `assess.glmnet`: +```{r, eval=FALSE} +pred <- predict(fit, newx = x[-itrain, ]) +assess.glmnet(pred, newy = y[-itrain], family = "binomial") ``` -All the standard options are available such as `alpha`, `weights`, `nlambda` and `standardize`. Their usage is similar as in the Gaussian case and we omit the details here. 
Users can also refer to the help file `help(glmnet)`.
-We can plot the coefficients.
+Here we have to provide the `family` as an argument; the results (not shown) are the same. Users can see the various measures suitable for each family via
 ```{r}
-plot(fit)
+glmnet.measures()
 ```
-As before, we can extract the coefficients at certain values of $\lambda$.
-```{r}
-coef(fit, s = 0.05)
+`assess.glmnet` can also take the result of `cv.glmnet` as input. In this case the predictions are made at the optimal values for the parameter(s).
+```{r out.lines = 11}
+cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "binomial", nlambda = 30)
+assess.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain])
 ```
-Since the Cox Model is not commonly used for prediction, we do not give an illustrative example on prediction. If needed, users can refer to the help file by typing `help(predict.glmnet)`.
+This uses the default value of `s = "lambda.1se"`, just like `predict` would have done. Users can provide additional arguments that get passed on to `predict`. For example, the code below shows the performance measures for `s = "lambda.min"`:
+```{r out.lines = 11}
+assess.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain], s = "lambda.min")
+```
+
+### Prevalidation
-Also, the function `cv.glmnet` can be used to compute $k$-fold cross-validation for the Cox model. The usage is similar to that for other families except for two main differences.
+One interesting use case for `assess.glmnet` is to get the results of cross-validation using other measures. By specifying `keep = TRUE` in the `cv.glmnet` call, a matrix of prevalidated predictions is stored in the returned output as the `fit.preval` component. We can then use this component in the call to `assess.glmnet`:
+```{r out.lines = 11}
+cfit <- cv.glmnet(x, y, family = "binomial", keep = TRUE, nlambda = 30)
+assess.glmnet(cfit$fit.preval, newy = y, family = "binomial")
+```
-One is that `type.measure` only supports "deviance"(also default), which gives the partial-likelihood.
+Users can verify that the first measure here, `deviance`, is identical to the component `cvm` on the `cfit` object.
-The other is in the option `grouped`. `grouped = TRUE` obtains the CV partial likelihood for the Kth fold by subtraction; by subtracting the log partial likelihood evaluated on the full dataset from that evaluated on the on the (K-1)/K dataset. This makes more efficient use of risk sets. With `grouped=FALSE` the log partial likelihood is computed only on the Kth fold, which is only reasonable if each fold has a large number of observations.
+### ROC curves for binomial data
+
+In the special case of binomial models, users often would like to see the ROC curve for validation or test data. Here the function `roc.glmnet` provides the goodies. Its first argument is as in `assess.glmnet`. Here we illustrate one use case, using the prevalidated CV fit.
 ```{r}
-cvfit = cv.glmnet(x, y, family = "cox")
+cfit <- cv.glmnet(x, y, family = "binomial", type.measure = "auc",
+                  keep = TRUE)
+rocs <- roc.glmnet(cfit$fit.preval, newy = y)
 ```
-Once fit, we can view the optimal $\lambda$ value and a cross validated error plot to help evaluate our model.
+
+`roc.glmnet` returns a list of cross-validated ROC data, one for each model along the path. The code below demonstrates how one can plot the output. The first line identifies the `lambda` value giving the best area under the curve (AUC). Then we plot all the ROC curves in grey and the "winner" in red.
```{r}
-plot(cvfit)
+best <- cvfit$index["min",]
+plot(rocs[[best]], type = "l")
+invisible(sapply(rocs, lines, col = "grey"))
+lines(rocs[[best]], lwd = 2, col = "red")
 ```
-As previously, the left vertical line in our plot shows us where the CV-error curve hits its minimum. The right vertical line shows us the most regularized model with CV-error within 1 standard deviation of the minimum. We also extract such optimal $\lambda$'s.
+### Confusion matrices for classification
+
+For binomial and multinomial models, we often wish to examine the classification performance on new data. The function `confusion.glmnet` will do that for us.
 ```{r}
-cvfit$lambda.min
-cvfit$lambda.1se
+data(MultinomialExample)
+set.seed(101)
+itrain <- sample(1:500, 400, replace = FALSE)
+cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "multinomial")
+cnf <- confusion.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain])
 ```
-We can check the active covariates in our model and see their coefficients.
+
+`confusion.glmnet` produces a table of class "confusion.table" which inherits from class "table", and we also provide a `print` method for it.
 ```{r}
-coef.min = coef(cvfit, s = "lambda.min")
-active.min = which(coef.min != 0)
-index.min = coef.min[active.min]
+print(cnf)
 ```
+
+The first argument to `confusion.glmnet` should be a `glmnet` or `cv.glmnet` object (from which predictions can be made), or a matrix/array of predictions, such as the *kept* `"fit.preval"` component in the output of a `cv.glmnet` call with `keep = TRUE`. When a matrix/array of predictions is provided, we need to specify the `family` option, otherwise *confusion* can exist between "binomial" and "multinomial" prediction matrices.
+
+When predictions for more than one model in the path are provided, `confusion.glmnet` returns a list of confusion tables. For example, the prevalidated predictions from `cv.glmnet` are for the whole `lambda` path, and so we are returned a list of confusion tables. In the code below, we identify and print the one achieving the smallest classification error.
 ```{r}
-index.min
-coef.min
+cfit <- cv.glmnet(x, y, family = "multinomial", type = "class", keep = TRUE)
+cnf <- confusion.glmnet(cfit$fit.preval, newy = y, family = "multinomial")
+best <- cfit$index["min",]
+print(cnf[[best]])
 ```
-## Sparse Matrices
+## Other Package Features
- Our package supports sparse input matrices, which allow efficient storage and operations of large matrices but with only a few nonzero entries. It is available for all families except for the `cox` family. The usage of sparse matrices (inherit from class `"sparseMatrix"` as in package `Matrix`) in `glmnet ` is the same as if a regular matrix is provided.
+In this section, we describe other features in the `glmnet` package that might be of interest to users.
+
+### Sparse matrix support
+
+Our package supports sparse input matrices, which allow the efficient storage and operation of large matrices having only a few nonzero entries. The usage of sparse matrices (inherits from class `"sparseMatrix"` as in the `Matrix` package) in `glmnet` is the same as if a regular matrix is provided.
+
+We load a set of sample data created beforehand. It loads `x`, a 100 x 20 sparse input matrix and `y`, the response vector.
 ```{r}
 data(SparseExample)
-```
-It loads `x`, a 100*20 sparse input matrix and `y`, the response vector.
-
```{r}
class(x)
```
-Users can create a sparse matrix with the function `sparseMatrix` by providing the locations and values of the nonzero entries. Alternatively, `Matrix` function can also be used to contruct a sparse matrix by setting `sparse = TRUE`, but this defeats the purpose somewhat.
+Users can create a sparse matrix with the function `sparseMatrix` by
+providing the locations and values of the nonzero
+entries. Alternatively, the `Matrix` function from the `Matrix`
+package can also be used to construct a sparse matrix by setting
+`sparse = TRUE`, but this defeats the purpose somewhat if the matrix
+is large.
 We can fit the model the same way as before.
 ```{r}
-fit = glmnet(x, y)
+fit <- glmnet(x, y)
 ```
+
-We also do the cross-validation and plot the resulting object.
+We can also do the cross-validation and plot the resulting object.
 ```{r}
 cvfit = cv.glmnet(x, y)
 plot(cvfit)
@@ -797,50 +751,135 @@
 Note that sparse matrices can also be used for `newx`, the new input matrix in the `predict` function. For example,
 ```{r}
-i = sample(1:5, size = 25, replace = TRUE)
-j = sample(1:20, size = 25, replace = TRUE)
-x = rnorm(25)
-nx = sparseMatrix(i = i, j = j, x = x, dims = c(5, 20))
+i <- sample(1:5, size = 25, replace = TRUE)
+j <- sample(1:20, size = 25, replace = TRUE)
+x <- rnorm(25)
+nx <- sparseMatrix(i = i, j = j, x = x, dims = c(5, 20))
 predict(cvfit, newx = nx, s = "lambda.min")
 ```
+### Fitting big and/or sparse unpenalized generalized linear models
+
+The `glmnet` package includes a function `bigGlm` for fitting a single _unpenalized_ generalized linear model (GLM), but allowing all the options of `glmnet`. In other words, the user can set coefficient upper and/or lower bounds, and can provide the `x` matrix in sparse matrix format. This is not too much more than fitting a model with a single value of `lambda = 0` (with some protection from edge cases). `predict` and `print` methods can be called on the output.
+```{r}
+data(BinomialExample)
+fit <- bigGlm(x, y, family = "binomial", lower.limits = -1)
+print(fit)
+```
+
+### Creating `x` from mixed variables and/or missing data
+
+The `glmnet` package includes a function `makeX` that makes it easy to create the model matrix `x` needed as input to `glmnet`. It takes as input a data frame, which can contain vectors, matrices and factors. Some of the features are:
+
+* Factors are *one-hot* encoded to form indicator matrices.
+* Missing values in the resultant matrix can be replaced by the column means.
+* The `sparse` option returns a matrix in column-sparse format. This is useful if the data are large, and factors have many levels.
+* Two data frames can be provided, `train` and `test`. This ensures the factor levels correspond, and also imputes missing data in the test data from means in the training data.
+
+Our first example demonstrates how `makeX` works with factors:
+```{r}
+set.seed(101)
+X <- matrix(rnorm(5), nrow = 5)
+X2 <- sample(letters[1:3], 5, replace = TRUE)
+X3 <- sample(LETTERS[1:3], 5, replace = TRUE)
+df <- data.frame(X, X2, X3)
+makeX(df)
+```
+
+Include the option `sparse = TRUE` if a sparse output is desired:
+```{r}
+makeX(df, sparse = TRUE)
+```
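+
+(A brief aside of our own: the matrix returned by `makeX` can be passed straight to `glmnet`; here `yy` is a made-up response of matching length.)
+```{r, eval=FALSE}
+yy <- rnorm(nrow(df))
+fitm <- glmnet(makeX(df), yy)
+```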
+Next, let us add some missing values to our data matrix. By default, `makeX` leaves `NA`s as is: +```{r} +Xn <- X ; Xn[3,1] <- NA +X2n <- X2; X2n[1] <- NA +X3n <- X3; X3n[5] <- NA +dfn <- data.frame(Xn, X2n, X3n) +dfn +makeX(dfn) +``` + +We can impute the missing values with column means by passing the option `na.impute = TRUE`: +```{r} +makeX(dfn, na.impute = TRUE, sparse = TRUE) +``` + +Finally, if a test set is available as well, both the training and test sets can be passed to `makeX` at the same time so that all the levels of factors present in the training and test sets will be represented correctly in the output matrix. In the example below, the third column of the training set only contains "B" and "C" while that of the test set only contains "A" and "C". By passing both data sets to `makeX` at the same time, this third column is correctly expanded into 3 feature columns for both the training and test sets. +```{r} +set.seed(102) +X <- matrix(rnorm(5), nrow = 5) +X2 <- sample(letters[1:3], 5, replace = TRUE) +X3 <- sample(LETTERS[1:3], 5, replace = TRUE) +Xn <- X ; Xn[5,1] <- NA +X2n <- X2; X2n[1] <- NA +X3n <- X3; X3n[2] <- NA +dftn <- data.frame(Xn, X2n, X3n) +dftn +makeX(dfn, dftn, sparse = TRUE) ``` + +### Progress bar + +Ever run a job on a big dataset, and wonder how long it will take? `glmnet` and `cv.glmnet` come equipped with a progress bar, which can be displayed by passing `trace.it = TRUE` to these functions. +```{r, eval=FALSE} +fit <- glmnet(x, y, trace.it = TRUE) +``` + +``##`` + +`` |================================== |65%`` + +This display changes in place as the fit is produced. The progress bar is also very helpful with `cv.glmnet`: +```{r, eval=FALSE} +fit <- cv.glmnet(x, y, trace.it = TRUE) +``` ``##`` + +`` Training`` + +`` |=============================================| 100%`` + +`` Fold: 1/10`` + +`` |=============================================| 100%`` + +`` Fold: 2/10`` + +`` |=============================================| 100%`` + +`` Fold: 3/10`` + +`` |============================= | 70%`` + +Tracing of the folds works a little differently when distributed computing is used. + +If the user wants `glmnet` and `cv.glmnet` to always print the progress bar, this can be achieved (for a session) via a call to `glmnet.control` with the `itrace` argument: +```{r, eval=FALSE} +glmnet.control(itrace = 1) +``` + +To reset it, one makes a similar call and sets `itrace = 0`. + ## Appendix 0: Convergence Criteria -Glmnet uses a convergence criterion that focuses not on coefficient -change but rather the impact of the change on the fitted values, and -hence the loss part of the objective. The net result is a -weighted norm of the coefficient change vector. +Glmnet uses a convergence criterion that focuses not on coefficient change but rather the impact of the change on the fitted values, and hence the loss part of the objective. The net result is a weighted norm of the coefficient change vector. -For gaussian models it uses the following. Suppose observation $i$ -has weight $w_i$. Let $v_j$ be the (weighted) +For Gaussian models it uses the following. Suppose observation $i$ has weight $w_i$. Let $v_j$ be the (weighted) sum-of-squares for variable $x_j$: $$v_j=\sum_{i=1}^Nw_ix_{ij}^2.$$ -If there is an intercept in the model, these $x_j$ will be centered by -the weighted mean, and hence this would be a weighted variance. -After $\hat\beta_j^o$ has been updated to $\hat\beta_j^n$, we compute -$\Delta_j=v_j(\hat\beta_j^o-\hat\beta_j^n)^2$.
After a complete cycle of coordinate descent, we look at -$\Delta_{max}=\max_j\Delta_j$. Why this measure? -We can write +If there is an intercept in the model, these $x_j$ will be centered by the weighted mean, and hence this would be a weighted variance. After $\hat\beta_j^o$ has been updated to $\hat\beta_j^n$, we compute $\Delta_j=v_j(\hat\beta_j^o-\hat\beta_j^n)^2$. After a complete cycle of coordinate descent, we look at $\Delta_{max}=\max_j\Delta_j$. Why this measure? We can write $$\Delta_j=\frac1N\sum_{i=1}^N w_i(x_{ij}\hat\beta_j^o-x_{ij}\hat\beta_j^n)^2,$$ -which measures the weighted sum of squares of changes in fitted values -for this term. This measures the impact of the change in this -coefficient on the fit. If the largest such change is negligible, we stop. - - -For logistic regression, and other non-Gaussian models it is similar -for the inner loop. Only now the weights for each observation are more -complex. For example, for logisitic regression the weights are those -that arise from the current Newton step, namely $w_i^*=w_i\hat p_i(1-\hat p_i)$. Here $\hat p_i$ are the fitted probabilities as we -entered the current inner loop. The intuition is the same --- it -measures the impact of the coefficient change on the current weighted -least squares loss, or quadratic approximation to the log-likelihood -loss. - -What about outer-loop convergence? We use the same measure, except now -$\hat\beta^o$ is the coefficient vector before we entered this inner -loop, and $\hat\beta^n$ the converged solution for this inner -loop. Hence if this Newton step had no impact, we declare outer-loop convergence. +which measures the weighted sum of squares of changes in fitted values for this term. This measures the impact of the change in this coefficient on the fit. If the largest such change is negligible, we stop. +For logistic regression and other non-Gaussian models it is similar for the inner loop, only now the weights for each observation are more complex. For example, for logistic regression the weights are those that arise from the current Newton step, i.e. $w_i^*=w_i\hat p_i(1-\hat p_i)$, where the $\hat p_i$'s are the fitted probabilities as we entered the current inner loop. The intuition is the same: it measures the impact of the coefficient change on the current weighted least squares loss, or quadratic approximation to the log-likelihood loss. + +What about outer-loop convergence? + +* If the argument `family` was a character string, we use the same measure, except now $\hat\beta^o$ is the coefficient vector before we entered this inner loop, and $\hat\beta^n$ the converged solution for this inner loop. Hence if this Newton step had no impact, we declare outer-loop convergence. + +* If the argument `family` was a class "family" object, outer-loop convergence is determined by the change in the objective function value. If the fractional change in the objective function value is less than the `epsnr` control parameter, we declare outer-loop convergence. `epsnr` can be changed via a call to `glmnet.control`. ## Appendix 1: Internal Parameters @@ -848,115 +887,123 @@ There are several parameters that users can change: -`fdev` - minimum fractional change in deviance for stopping path; factory default = 1.0e-5 +* `fdev` - minimum fractional change in deviance for stopping path; factory default = 1.0e-5. + +* `devmax` - maximum fraction of explained deviance for stopping path; factory default = 0.999.
-`devmax` - maximum fraction of explained deviance for stopping path; factory default = 0.999 +* `eps` - minimum value of `lambda.min.ratio` (see `glmnet` documentation); factory default = 1.0e-6. -* `eps` - minimum value of lambda.min.ratio (see glmnet); factory default= 1.0e-6 +* `big` - large floating point number; factory default = 9.9e35. Inf in definition of `upper.limits` is set to `big`. -* `big` - large floating point number; factory default = 9.9e35. Inf in definition of upper.limit is set to big +* `mnlam` - minimum number of path points (lambda values) allowed; factory default = 5. -* `mnlam` - minimum number of path points (lambda values) allowed; factory default = 5 +* `pmin` - minimum null probability for any class; factory default = 1.0e-5. -* `pmin` - minimum null probability for any class; factory default = 1.0e-5 +* `exmx` - maximum allowed exponent; factory default = 250.0. -* `exmx` - maximum allowed exponent; factory default = 250.0 +* `prec` - convergence threshold for multi-response bounds adjustment solution; factory default = 1.0e-10. -* `prec` - convergence threshold for multi-response bounds adjustment solution; factory default = 1.0e-10 +* `mxit` - maximum iterations for multi-response bounds adjustment solution; factory default = 100. -* `mxit` - maximum iterations for multiresponse bounds adjustment solution; factory default = 100 +* `epsnr` - convergence threshold for the iteratively reweighted least squares loop (see "The `family` Argument for `glmnet`" vignette); factory default = 1e-08. -* `factory` - If `TRUE`, reset all the parameters to the factory default; default is `FALSE` +* `mxitnr` - maximum iterations for the iteratively reweighted least squares loop for each value of $\lambda$ (see "The `family` Argument for `glmnet`" vignette); factory default = 25. -We illustrate the usage by an example. Note that any changes made hold for the duration of the R session, or unless they are changed by the user with a subsequent call to `glmnet.control`. +* `factory` - If `TRUE`, reset all the parameters to the factory default; default is `FALSE`. +We illustrate how to change these control parameters through an example. Note that any changes made hold for the duration of the R session unless they are changed by the user with a subsequent call to `glmnet.control`. ```{r} data(QuickStartExample) -fit = glmnet(x, y) -print(fit) +fit <- glmnet(x, y) +length(fit$lambda) # number of lambda values fit ``` -We can change the minimum fractional change in deviance for stopping path and compare the results. + +We can change the minimum fractional change in deviance for stopping path and compare the results. By setting `fdev` to be larger than the default, we see that the computation stopped earlier in the path. ```{r} -glmnet.control(fdev = 0) -fit = glmnet(x, y) -print(fit) +glmnet.control(fdev = 0.1) +fit <- glmnet(x, y) +length(fit$lambda) # number of lambda values fit ``` -We set `fdev = 0` to continue all along the path, even without much change. The length of the sequence becomes 100, which is the default of `nlambda`. -Users can also reset to the default settings. +Users can reset to the default settings with the following code: ```{r} glmnet.control(factory = TRUE) ``` -The current settings are obtained as follows.
-```{r} + +To view current settings, call `glmnet.control` without any arguments: +```{r out.lines = 8} glmnet.control() ``` ## Appendix 2: Comparison with Other Packages -Some people may want to use `glmnet` to solve the Lasso or elastic-net problem at a single $\lambda$. We compare here the solution by `glmnet` with other packages (such as CVX), and also as an illustration of parameter settings in this situation. -__Warning__: Though such problems can be solved by `glmnet`, it is __not recommended__ and is not the spirit of the package. `glmnet` fits the __entire__ solution path for Lasso or elastic-net problems efficiently with various techniques such as warm start. Those advantages will disappear if the $\lambda$ sequence is forced to be only one value. +Some may want to use `glmnet` to solve the lasso or elastic net problem at a single $\lambda$. We compare here the solution by `glmnet` with other packages (such as CVX), and also as an illustration of parameter settings in this situation. -Nevertheless, we still illustrate with a typical example in linear model in the following for the purpose of comparison. Given $X, Y$ and $\lambda_0 > 0$, we want to find $\beta$ such that +(__Warning__: Though such problems can be solved by `glmnet`, it is __not recommended__ and is not the spirit of the package. `glmnet` fits the __entire__ solution path for the lasso or elastic net problems efficiently with various techniques such as using warm starts and strong rules. Those advantages will disappear if the $\lambda$ sequence is forced to be only one value.) + +We illustrate with a typical example in linear models for the purpose of comparison. Given $X$ and $Y$, we want to find $\beta$ such that $$ -\min_{\beta} ||Y - X\beta||_2^2 + \lambda_0 ||\beta||_1, +\min_{\beta} \|Y - X\beta\|_2^2 + \lambda_0 \|\beta\|_1, $$ where, say, $\lambda_0 = 8$. -We first solve using `glmnet`. Notice that there is no intercept term in the objective function, and the columns of $X$ are not necessarily standardized. Corresponding parameters have to be set to make it work correctly. In addition, there is a $1/(2n)$ factor before the quadratic term by default, we need to adjust $\lambda$ accordingly. For the purpose of comparison, the `thresh` option is specified to be 1e-20. However, this is not necessary in many practical applications. +We first solve this using `glmnet`. Notice that there is no intercept term in the objective function, and the columns of $X$ are not necessarily standardized. Corresponding parameters have to be set to make it work correctly. In addition, there is a $1/(2n)$ factor before the quadratic term by default, so we need to adjust $\lambda$ accordingly. For the purpose of comparison, we set `thresh = 1e-20`. However, this is not necessary in many practical applications. ```{r, echo=FALSE} data(QuickStartExample) ``` ```{r} +np <- dim(x); n <- np[1]; p <- np[2] + +fit <- glmnet(x, y, intercept = F, standardize = F, + lambda = 8 / (2 * n), thresh = 1e-20) ``` -We then extract the coefficients (with no intercept). + +We then extract the coefficients (with no intercept): ```{r,eval=FALSE} -beta_glmnet = as.matrix(predict(fit, type = "coefficients")[-1,]) +beta_glmnet <- as.matrix(predict(fit, type = "coefficients")[-1,]) ``` -In linear model as here this approach worked because we were using squared error loss, but with any nonlinear family, it will probably fail.
The reason is we are not using step length optimization, and so rely on very good warm starts to put us in the quadratic region of the loss function. - -Alternatively, a more stable and __strongly recommended__ way to perform this task is to first fit the entire Lasso or elastic-net path without specifying `lambda`, but then provide the requested $\lambda_0$ to `predict` function to extract the corresponding coefficients. In fact, if $\lambda_0$ is not in the $\lambda$ sequence generated by `glmnet`, the path will be refitted along a new $\lambda$ sequence that includes the requested value $\lambda_0$ and the old sequence, and the coefficients will be returned at $\lambda_0$ based on the new fit. Remember to set `exact = TRUE` in `predict` function to get the exact solution. Otherwise, it will be approximated by linear interpolation. +Alternatively, a more stable and __strongly recommended__ way to perform this task is to first fit the entire lasso or elastic net path without specifying `lambda`, but then provide the requested $\lambda_0$ to a `predict` call to extract the corresponding coefficients. (Remember to set `exact = TRUE` in the `predict` call to get the exact solution. Otherwise, it will be approximated by linear interpolation.) ```{r} -fit = glmnet(x, y, intercept = F, standardize = F, thresh = 1e-20) -beta_glmnet = as.matrix(predict(fit, s = 8/(2*dim(x)[1]), type = "coefficients", - exact = TRUE, x=x, y=y)[-1,]) +fit <- glmnet(x, y, intercept = F, standardize = F, thresh = 1e-20) +beta_glmnet <- as.matrix(predict(fit, s = 8 / (2 * n), + type = "coefficients", + exact = TRUE, x = x, y = y)[-1,]) ``` -We also use CVX, a general convex optimization solver, to solve this specific Lasso problem. Users could also call CVX from R using the `CVXfromR` package and solve the problem as follows. +Next, we use CVX, a general convex optimization solver, to solve this +specific lasso problem. CVX is implemented in the CVXR package on +CRAN. + ```{r, eval=FALSE} -library(CVXfromR) -setup.dir = "change/this/to/your/cvx/directory" -n = dim(x)[1]; p = dim(x)[2] -cvxcode = paste("variables beta(p)", - "minimize(square_pos(norm(y - x * beta, 2)) + lambda * norm(beta, 1))", - sep = ";") -Lasso = CallCVX(cvxcode, const.var = list(p = p, x = x, y = y, lambda = 8), opt.var.names = "beta", setup.dir = setup.dir, matlab.call = "change/this/to/path/to/matlab") -beta_CVX = Lasso$beta +library(CVXR) +beta <- Variable(p) +loss <- sum((y - x %*% beta)^2) / (2 * n) +lassoPenalty <- function(beta, lambda) lambda * p_norm(beta, 1) +obj <- loss + lassoPenalty(beta, lambda = 8 / (2 * n)) +prob <- Problem(Minimize(obj)) +result <- solve(prob) +beta_CVX <- result$getValue(beta) ``` -For convenience here, the results were saved in `CVXResult.RData`, and we simply load in the results. - +For convenience, the results were saved in `CVXResult.RData`, and we simply load in the results. ```{r} data(CVXResults) ``` -In addition, we use `lars` to solve the same problem. -```{r,message=FALSE} -require(lars) +Finally, we solve the same problem with the `lars` package: ```{r, message=FALSE} +library(lars) +fit_lars <- lars(x, y, type = "lasso", intercept = F, normalize = F) +beta_lars <- predict(fit_lars, s = 8 / 2, type = "coefficients", + mode = "lambda")$coefficients ``` -```{r} -fit_lars = lars(x, y, type = "lasso", intercept = F, normalize = F) -beta_lars = predict(fit_lars, s = 8/2, type = "coefficients", mode = "lambda")$coefficients ``` - -The results are listed below up to 6 decimal digits (due to convergence thresholds).
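Before comparing coefficient estimates, one can also sanity-check the solutions by evaluating the original objective $\|Y - X\beta\|_2^2 + 8\|\beta\|_1$ at each of them; solvers that agree should give objective values matching to many digits. A minimal sketch, assuming `x`, `y` and the three `beta_*` vectors computed above are in the workspace:
```r
# minimal sketch: evaluate ||y - x b||_2^2 + 8 * ||b||_1 at each solution
obj <- function(b) sum((y - x %*% b)^2) + 8 * sum(abs(b))
c(glmnet = obj(beta_glmnet), lars = obj(beta_lars), CVX = obj(beta_CVX))
```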
+The results are listed below up to 6 decimal digits (due to convergence thresholds). We see that all three packages give the same result. ```{r} -cmp = round(cbind(beta_glmnet, beta_lars, beta_CVX), digits = 6) -colnames(cmp) = c("beta_glmnet", "beta_lars", "beta_CVX") +cmp <- round(cbind(beta_glmnet, beta_lars, beta_CVX), digits = 6) +colnames(cmp) <- c("beta_glmnet", "beta_lars", "beta_CVX") cmp ``` diff -Nru r-cran-glmnet-4.0-2/vignettes/relax.Rmd r-cran-glmnet-4.1/vignettes/relax.Rmd --- r-cran-glmnet-4.0-2/vignettes/relax.Rmd 2019-11-07 00:14:41.000000000 +0000 +++ r-cran-glmnet-4.1/vignettes/relax.Rmd 2021-01-06 22:06:55.000000000 +0000 @@ -1,7 +1,10 @@ --- -title: "Relaxed fits and other additions in `glmnet` 3.0" -author: "Trevor Hastie, Balasubramanian Narasimhan and Rob Tibshirani" -date: "October 15, 2019" +title: "The Relaxed Lasso" +author: + - Trevor Hastie + - Balasubramanian Narasimhan + - Rob Tibshirani +date: "`r format(Sys.time(), '%B %d, %Y')`" bibliography: assets/glmnet_refs.bib link-citations: true output: @@ -10,470 +13,137 @@ toc: yes toc_depth: 3 vignette: > - %\VignetteIndexEntry{Relaxed fits} + %\VignetteIndexEntry{The Relaxed Lasso} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- +```{r include=FALSE} +# the code in this chunk enables us to truncate the print output for each +# chunk using the `out.lines` option +# save the built-in output hook +hook_output <- knitr::knit_hooks$get("output") + +# set a new output hook to truncate text output +knitr::knit_hooks$set(output = function(x, options) { + if (!is.null(n <- options$out.lines)) { + x <- xfun::split_lines(x) + if (length(x) > n) { + # truncate the output + x <- c(head(x, n), "....\n") + } + x <- paste(x, collapse = "\n") + } + hook_output(x, options) +}) +``` + ## Introduction -In our vignette "glmnet" we give details for fitting lasso and -elastic-net regularized models, for -CV and various aspects of glmnet modeling. In this vignette, we -highlight some of the new tools and features in the major revision glmnet 3.0. - -The main edition is the introduction of the *relaxed lasso*. The idea -is to take a glmnet fitted object, and then for each lambda, refit the -variables in the active set without any penalization. This gives the -`relaxed` fit (note, there have been other definitions of a relaxed -fit, but this is the one we prefer). -This could of course be done for elastic net fits as well as lasso. -However, if the number of variables gets too close to the sample size -N, the relaxed path will be truncated. -Furthermore, for binomial and other nonlinear GLMs convergence can be -an issue with our current implementation if the number of variables is -too large, and perversely if the relaxed fit is too strong. - -Suppose the `glmnet` fitted linear predictor at $\lambda$ is -$\hat\eta_\lambda(x)$ and the relaxed version is $\tilde -\eta_\lambda(x)$. We also allow for shrinkage between the two: +In this vignette, we describe how the `glmnet` package can be used to fit the *relaxed lasso*. + +The idea of the relaxed lasso is to take a `glmnet` fitted object, and then for each lambda, refit the variables in the active set without any penalization. This gives the "relaxed" fit. (We note that there have been other definitions of a relaxed fit, but this is the one we prefer.) This could of course be done for elastic net fits as well as lasso. However, if the number of variables gets too close to the sample size $N$, the relaxed path will be truncated. 
Furthermore, for binomial and other nonlinear generalized linear models (GLMs) convergence can be an issue with our current implementation if the number of variables is too large, and perversely if the relaxed fit is too strong. + +Suppose the `glmnet` fitted linear predictor at $\lambda$ is $\hat\eta_\lambda(x)$ and the relaxed version is $\tilde\eta_\lambda(x)$. We also allow for shrinkage between the two: $$\tilde \eta_{\lambda,\gamma}=(1-\gamma)\tilde \eta_\lambda(x)+\gamma\hat\eta_\lambda(x).$$ -$\gamma\in[0,1]$ is an additional tuning parameter which can be -selected by cross validation. -The debiasing will potentially improve prediction performance, and CV -will typically select a model with a smaller number of variables. -This procedure is very competitive with forward-stepwise and -best-subset regression, and has a considerable speed advantage when -the number of variables is large. This is especially true for -best-subset, but even so for forward stepwise. The latter has to plod -through the variables one-at-a-time, while glmnet will just plunge in -and find a good active set. +$\gamma\in[0,1]$ is an additional tuning parameter which can be selected by cross-validation (CV). The debiasing will potentially improve prediction performance, and CV will typically select a model with a smaller number of variables. -Further details may be found in @glmnet, @coxnet, @strongrules, @block -and @best_subset. +This procedure is very competitive with forward-stepwise and best-subset regression, and has a considerable speed advantage when the number of variables is large. This is especially true for best-subset, but even so for forward stepwise. The latter has to plod through the variables one-at-a-time, while `glmnet` will just plunge in +and find a good active set. -## Simple relaxed fit +Further details on this form of relaxed fitting can be found in +@best_subset; more information on `glmnet` and elastic-net models in +general is given in @glmnet, +@coxnet, @strongrules, and @block. -To get things going, we show the most basic use. -We use the same data used in the `glmnet` vignette. +## Simple relaxed fitting -```{r} +We demonstrate the most basic relaxed lasso fit as a first example. We load some pre-generated data and fit the relaxed lasso on it by calling `glmnet` with `relax = TRUE`: +```{r out.lines = 15} library(glmnet) data(QuickStartExample) -fit=glmnet(x,y, relax=TRUE) +fit <- glmnet(x, y, relax = TRUE) print(fit) ``` -There is an extra column `%Dev R` where the `R` stands for "relaxed", -which is the percent deviance explained by the relaxed fit. This is -always higher than its neighboring column, which is the same for the -penalized fit (on the training data). +In addition to the three columns usually printed for `glmnet` objects +(`Df`, `%Dev` and `Lambda`), there is an extra column `%Dev R` (`R` +stands for "relaxed") which is the percent deviance explained by the +relaxed fit. This is always higher than its neighboring column, which +is the percent deviance explained by the penalized fit (on the +training data). Notice that when the `Df` stays the same, the `%Dev R` +does not change, since this typically means the active set is the +same. (The code is also smart enough to only fit such models once, so +in the truncated display shown, 9 lasso models are fit, but only 4 +relaxed fits are computed). -The fit object is class `relaxed`, which inherits from class `glmnet`. -One can plot it, with additional flexibility.
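Since the relaxed prediction is just the $\gamma$-blend of two linear predictors, the identity above can be checked numerically. A minimal sketch, assuming the `fit` object from the chunk above (for the Gaussian family, `predict` returns the linear predictor; the lambda index chosen is an arbitrary assumption):
```r
# minimal sketch: predictions at gamma = 0.5 are the average of the
# gamma = 0 (relaxed) and gamma = 1 (penalized) linear predictors
s0  <- fit$lambda[20]                              # arbitrary lambda on the path
p1  <- predict(fit, newx = x, s = s0, gamma = 1)   # penalized fit
p0  <- predict(fit, newx = x, s = s0, gamma = 0)   # relaxed (unpenalized) fit
p05 <- predict(fit, newx = x, s = s0, gamma = 0.5) # blended fit
max(abs(p05 - (p0 + p1) / 2))                      # should be essentially zero
```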
+The fit object is of class `"relaxed"`, which inherits from class `"glmnet"`. Hence, the usual `plot` method for `"glmnet"` objects can be used. The code below demonstrates some additional flexibility that `"relaxed"` objects have for plotting. ```{r} -par(mfrow=c(1,3)) -plot(fit) -plot(fit,gamma=0.5) -plot(fit,gamma=0) +par(mfrow = c(1, 3), mar=c(4,4,5.5,1)) +plot(fit, main = "gamma = 1") +plot(fit, gamma = 0.5, main = "gamma = 0.5") +plot(fit, gamma = 0, main = "gamma = 0") ``` -So again, `gamma=1` is the traditional `glmnet` fit, while `gamma=0` -is the unpenalized fit, and `gamma=0.5` is a mixture of the two (at -the coefficient level, and hence also the linear predictors). - -We can also select `gamma` using `cv.glmnet`, which by default uses -the 5 values `c(0, 0.25, 0.5, 0.75, 1)`. +`gamma = 1` is the traditional `glmnet` fit (also `relax = FALSE`, the default), `gamma = 0` is the unpenalized fit, and `gamma = 0.5` is a mixture of the two (at the coefficient level, and hence also the linear predictors). +We can also select `gamma` using `cv.glmnet`, which by default uses the 5 values `c(0, 0.25, 0.5, 0.75, 1)`. This returns an object of class `"cv.relaxed"`. ```{r} -cfit=cv.glmnet(x,y,relax=TRUE) +set.seed(1) +cfit <- cv.glmnet(x, y, relax = TRUE) plot(cfit) ``` -The plot command has an `se.bands` option if you don't like the -default shading of these bands. - -Just like before, you can make predictions from a CV object, and it -uses the selected values for `lambda` and `gamma`. - -```{r, eval=FALSE} -predict(cvfit,newx) +To remove the shading of the standard error bands, pass `se.bands = FALSE`: +```{r} +plot(cfit, se.bands = FALSE) ``` -A new feature in `glmnet` is a print method for `cv.glmnet` and a -`cv.relaxed` object. +As with regular `"cv.glmnet"` objects, you can make predictions from a relaxed CV object. Just as the `s` option (for `lambda`) admits two special strings `"lambda.1se"` and `"lambda.min"` for special values of `lambda`, the `gamma` option admits two special strings `"gamma.1se"` and `"gamma.min"` for special values of `gamma`. For example, the code below makes predictions for `newx` at the `lambda` and `gamma` values that have the smallest CV error: +```{r} +predict(cfit, newx = x[1:5, ], s = "lambda.min", gamma = "gamma.min") +``` + +Printing class `"cv.relaxed"` objects gives some basic information on the cross-validation: ```{r} print(cfit) ``` ## More details on relaxed fitting -Although `glmnet` has a `relax` option, you can created a relaxed -version by post-processing a `glmnet` object. +While we only demonstrate relaxed fits for the default Gaussian family, *any* of the families fit by `glmnet` can also be fit with the `relax` option. +Although `glmnet` has a `relax` option, you can also fit relaxed lasso models by post-processing a `glmnet` object with the `relax.glmnet` function. ```{r `relaxed`} -fit=glmnet(x,y) -fitr=relax.glmnet(fit,x=x,y=y) +fit <- glmnet(x, y) +fitr <- relax.glmnet(fit, x = x, y = y) ``` -This will rarely need to be done; one use case is if the original fit -took a long time, and the user wanted to avoid refitting it. -Note that in the call the arguments are named, since they are -passed in via the `...` argument to `relax.glmnet`. - -Needless to say, *any* of the families fit by `glmnet` can also be fit -with the `relaxed` option. - -As mentioned, a `relaxed` object is also a `glmnet` object.
Apart from -the class modification, it has an additional componet named `relaxed` -which is itself a `glmnet` object, but with the relaxed coefficients. -The default behavior of extractor functions like `predict` and `coef`, -as well as `plot` will be to present results from the `glmnet` fit, -unless a value of `gamma` is given different from the default value -`gamma=1` (see the plots above). The `print` method gives additional -info on the relaxed fit. - -Likewise, a `cv.relaxed` object inherits from class `cv.glmnet`. -Here the `predict` method by default uses the optimal relaxed fit; if -predictions from the CV-optimal *original* `glmnet` fit are desired, one -can directly use `predict.cv.glmnet`. Similarly for the `print` -command, which we illustrate here. +This will rarely need to be done; one use case is if the original fit took a long time, and the user wants to avoid refitting it. Note that the arguments are named in the call in order for them to be passed correctly via the `...` argument in `relax.glmnet`. + +As mentioned, a `"relaxed"` object inherits from class `"glmnet"`. Apart from the class modification, it has an additional component named `relaxed` which is itself a `glmnet` object, but with the relaxed coefficients. The default behavior of extractor functions like `predict` and `coef`, as well as `plot` will be to present results from the `glmnet` fit, unless a value of `gamma` is given different from the default value `gamma = 1` (see the plots above). The `print` method gives additional info on the relaxed fit. +Likewise, a `cv.relaxed` object inherits from class `cv.glmnet`. Here the `predict` method by default uses the optimal relaxed fit; if predictions from the CV-optimal *original* `glmnet` fit are desired, one can directly use `predict.cv.glmnet`. Similarly, use `print` to print information for cross-validation on the relaxed fit, and `print.cv.glmnet` for information on the cross-validation for the original `glmnet` fit. ```{r} print(cfit) print.cv.glmnet(cfit) ``` -## Relaxed fits and glms - -`glmnet` itself is used to fit the relaxed fits, by using a single -value of zero -for `lambda`. However, for nonlinear models such as binomial, -multinomial and poisson, there can be convergence issues. This is -because `glmnet` does not do stepsize optimization, rather relying on -the pathwise fit to stay in the "quadratic" zone of the log -likelihood. We have an optional `path=TRUE` option for `relax.glmnet`, which actually -fits a regurized path toward the `lambda=0` solution, and thus avoids -the issue. The default is `path=FALSE` since this option adds to the -computing time. - -### Forward stepwise and relaxed fit - -One use case for a relaxed fit is as a faster version of forward -stepwise regression. With a large number `p` of variables, forward-stepwise regression can be tedious. Lasso on the other hand, because -of its convexity, can plunge in and identify good candidate sets of -variables over 100 values of `lambda`, even though `p` could be in the -10s of thousands. In a case like this, one can have `cv.glmnet` do the -selection. - -```{r} -fitr=cv.glmnet(x,y,gamma=0,relax=TRUE) -plot(fitr) -``` - -Notice that we only allow `gamma=0`, so in this case we are not considering the blended fits. - - - -## Progress bar - -We finally have a progress bar for `glmnet` and `cv.glmnet`. Ever run a -job on a big dataset, and wonder how long it will take? Now you can -use the `trace.it = TRUE` argument to these functions. 
-```{r, eval=FALSE} -fit=glmnet(x,y,trace=TRUE) -``` - -``##`` - -`` |================================== |65%`` - -Here we abbreviated the argument to `trace`. This display changes in -place as the fit is produced. -Also very helpful with `cv.glmnet` - -```{r, eval=FALSE} -fit=cv.glmnet(x,y,trace=TRUE) -``` -``##`` - -`` Training`` - -`` |=============================================| 100%`` - -`` Fold: 1/10`` - -`` |=============================================| 100%`` - -`` Fold: 2/10`` - -`` |=============================================| 100%`` - -`` Fold: 3/10`` +### Possible convergence issues for relaxed fits -`` |=============================================| 100%`` +`glmnet` itself is used to fit the relaxed fits by using a single value of zero for `lambda`. However, for nonlinear models such as `family = "binomial"`, `family = "multinomial"` and `family = "poisson"`, there can be convergence issues. This is because `glmnet` does not do step size optimization, rather relying on +the pathwise fit to stay in the "quadratic" zone of the log-likelihood. We have an optional `path = TRUE` option for `relax.glmnet`, which actually fits a regularized path toward the `lambda = 0` solution, and thus avoids +the issue. The default is `path = FALSE` since this option adds to the computing time. -`` Fold: 4/10`` - -`` |=============================================| 100%`` - -`` Fold: 5/10`` - -`` |=============================================| 100%`` - -`` Fold: 6/10`` - -`` |============================= | 70%`` - - -Tracing of the folds works a little differently when distributed -computing is used. - -Here the `trace` argument should be used in each call to `glmnet` or -`cv.glmnet`. One can set this option session wide via a call to -`glmnet.control` with its new `itrace` argument: - -```{r, eval=FALSE} -glmnet.control(itrace=1) -``` - -To reset it, one makes a similar call and sets `itrace=0`. - -## C index for Cox models - - We have a new performance measure for the Cox model: the Harrel *C index*. - This is like the AUC measure of concordance for survival - data, but only considers comparable pairs. Pure concordance would - record the fraction of pairs for which the order of the death times - agree with the order of the predicted risk. But with survival data, - if an observation is right censored at a time *before* another - observation's death time, they are not comparable. - -```{r} - data(CoxExample) -``` - -```{r} - cvfit=cv.glmnet(x,y,family="cox",type.measure="C") - plot(cvfit) -``` - -## Assessing models on test data - -Once we have fit a series of models using `glmnet`, we often assess -their performance on a set of evaluation or test data. We usually go -through the process of building a prediction matrix, and then deciding -on the measure, and computing the values for a series of values for -`lambda` and now `gamma`. Here we provide three functions for making -these tasks easier. - -### Performance measures - -The function `assess.glmnet` computes the same performance measures produced by -`cv.glmnet`, but on a validation or test dataset. +## Application to forward stepwise regression +One use case for a relaxed fit is as a faster version of forward stepwise regression. With a large number `p` of variables, forward stepwise regression can be tedious. On the other hand, because the lasso solves a convex problem, it can plunge in and identify good candidate sets of variables over 100 values of `lambda`, even though `p` could be in the tens of thousands.
In a case like this, one can have `cv.glmnet` do the selection of variables. ```{r} -data(BinomialExample) -itrain=1:70 -fit=glmnet(x[itrain,],y[itrain],family="binomial",nlambda=20) -assess.glmnet(fit,newx=x[-itrain,],newy=y[-itrain]) -``` - -This produces a list with *all* the measures suitable for a binomial -model, computed for the entire sequence of lambdas in the fit object. -Here the function identifies the model family from the fit object. - -A second use case builds the prediction matrix first - -```{r, eval=FALSE} -pred=predict(fit,newx=x[-itrain,]) -assess.glmnet(pred,newy=y[-itrain],family="binomial") -``` - -Here we have to provide the `family` as an argument; the results (not -shown) are the same. Users can see the various measures suitable for -each family via - -```{r} -glmnet.measures() -``` - -The assess function can also take the result of `cv.glmnet` as input. -In this case the predictions are made at the optimal values for the -parameter(s). - -```{r} -cfit=cv.glmnet(x[itrain,],y[itrain],family="binomial", nlambda = 30) -assess.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain]) -``` - -This used the default value of `s=lambda.1se`, just like `predict` -would have done. -Users can provide additional arguments that get passed on to predict: - -```{r} -assess.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain], s="lambda.min") -``` - - - -One interesting use case is to get the results of CV using other -measures, via the `keep` argument. In this case the `fit.preval` -object is a matrix of prevalidated predictions made using the folds `foldid` - -```{r} -cfit=cv.glmnet(x,y,family="binomial",keep=TRUE, nlambda = 30) -assess.glmnet(cfit$fit.preval,newy=y,family="binomial") -``` - -Users can verify that the first measure here `deviance` is identical -to the component `cvm` on the `cfit` object. - -### ROC curves for binomial data - -In the special case of binomial models, users often would like to see -the ROC curve for validation or test data. Here the function -`roc.glmnet` provides the goodies. Its first argument is as in -`assess.glmnet`. Here we illustrate one use case, using the -prevlidated CV fit as before. - - -```{r} -cfit=cv.glmnet(x,y,family="binomial", type.measure="auc", keep=TRUE) -rocs=roc.glmnet(cfit$fit.preval,newy=y) -which=match(cfit$lambda.min,cfit$lambda) -plot(rocs[[which]],type="l") -nopr=sapply(rocs,lines,col="grey") -lines(rocs[[which]],lwd=2,col="red") -``` - -In this case `roc.glmnet` returns a list of cross-validated ROC data, one for each -model along the path. In the third line we identify the CV -winner. Then we plot all the curves in grey, and the winner in red. - -### Confusion matrices for classification - -For binomial and multinomial models, we often which to examine the -classification performance on new data. The function -`confusion.glmnet` will do that. - -```{r} -data(MultinomialExample) -set.seed(101) -itrain=sample(1:500,400,replace=FALSE) -cfit=cv.glmnet(x[itrain,],y[itrain],family="multinomial") -cnf=confusion.glmnet(cfit,newx=x[-itrain,],newy=y[-itrain]) -print(cnf) -``` - -It produces a table of class `confusion.table` which inherits from -calss `table`, and we also provide a print method. - -The first argument to `confusion.glmnet` should be either a `glmnet` object, or a -`cv.glmnet` object, from which predictions can be made, or a -matrix/array of predictions, such as the *kept* `fit.predval` object -from `cv.glmnet`. 
- -In the second case we need to specify the `family`, -otherwise *confusion* can exist between `binomial` and `multinomial` -prediction matrices. -Here we show a multinomial example - -```{r} -cfit=cv.glmnet(x,y,family="multinomial",type="class",keep=TRUE) -cnf=confusion.glmnet(cfit$fit.preval,newy=y,family="multinomial") -which=match(cfit$lambda.min,cfit$lambda) -print(cnf[[which]]) -``` -Since the `fit.preval` object has predictions for the whole path, the -result of `confusion.glmnet` here is a list of confusion tables. -We identify and print the one corresponding to the minimum -classification error. - - -## Fitting big and/or sparse GLMs - -We include a function `bigGlm` for fitting a single GLM model -(unpenalized), but allowing all the options of `glmnet`. -In other words, coefficient upper and/or lower bounds and sparse `x` -matrices. This is not too much more than fitting a model with a single -value of `lambda=0` (with some protection from edge cases). -There is also a `predict` and `print` method. - -```{r} -data(BinomialExample) -fit=bigGlm(x,y,family="binomial",lower.limits=-1) -print(fit) -``` - -## Producing x from mixed variables, and missing data - -We have created a function `makeX` that makes it easy to create the -model matrix `x` needed as input to `glmnet`. It takes as input a data -frame, which can contain vectors, matrices and factors. Some of the features are - -* Factors are *one-hot* encoded to form indicator matrices -* Missing values in the resultant matrix can be replaced by the column - means -* The `sparse` option returns a matrix in column-sparse format. This - is useful if the data are large, and factors have many levels. -* Two dataframes can be provided, `train` and `test`. This ensures the - factor levels correspond, and also imputes missing data in the test - data from means in the training data. - - We start with a simple case with some factors. - -```{r} -set.seed(101) -X = matrix(rnorm(20),10,2) -X3=sample(letters[1:3],10,replace=TRUE) -X4=sample(LETTERS[1:3],10,replace=TRUE) -df=data.frame(X,X3,X4) -makeX(df) -``` - -Or if a sparse output was desired: -```{r} -makeX(df,sparse=TRUE) -``` - -And now some missing values - -```{r} -Xn=X -Xn[3,1]=NA;Xn[5,2]=NA -X3n=X3; -X3n[6]=NA -X4n=X4 -X4n[9]=NA -dfn=data.frame(Xn,X3n,X4n) -makeX(dfn) -``` -which we can replace with column-mean imputations (and make sparse, if -we like) - -```{r} -makeX(dfn,na.impute=TRUE,sparse=TRUE) +fitr <- cv.glmnet(x, y, gamma = 0, relax = TRUE) +plot(fitr) ``` -Finally if a test set is available as well - -```{r} -X = matrix(rnorm(10),5,2) -X3=sample(letters[1:3],5,replace=TRUE) -X4=sample(LETTERS[1:3],5,replace=TRUE) -Xn=X -Xn[3,1]=NA;Xn[5,2]=NA -X3n=X3; -X3n[1]=NA -X4n=X4 -X4n[2]=NA -dftn=data.frame(Xn,X3n,X4n) -makeX(dfn,dftn,na.impute=TRUE, sparse=TRUE) -``` - +Notice that we only allow `gamma = 0`, so in this case we are not considering the blended fits. ## References