More `model_fun` examples for `pminternal::validate`
vignette("pminternal") gives an introduction to the package and to writing user-defined model and score functions. This vignette provides more examples of user-defined model functions for what I expect are the most commonly used modeling approaches that would not be supported by the fit argument.
library(pminternal)
library(Hmisc)
#>
#> Attaching package: 'Hmisc'
#> The following objects are masked from 'package:base':
#>
#> format.pval, units
getHdata("gusto")
gusto <- gusto[, c("sex", "age", "hyp", "htn", "hrt", "pmi", "ste", "day30")]
gusto$y <- gusto$day30; gusto$day30 <- NULL
set.seed(234)
gusto <- gusto[sample(1:nrow(gusto), size = 5000),]
Backward Selection
The function below implements a model selected via backward elimination using AIC.
stepglm <- function(data, ...){
  # fit the full model, then select a model via backward elimination on AIC
  m <- glm(y ~ ., data = data, family = "binomial")
  step(m, trace = 0)
}

steppred <- function(model, data, ...){
  # predicted probabilities from the selected model
  predict(model, newdata = data, type = "response")
}
validate(data = gusto, outcome = "y", model_fun = stepglm,
         pred_fun = steppred, method = "cv_optimism", B = 10)
#>                C    Brier Intercept Slope    Eavg     E50     E90   Emax     ECI
#> Apparent  0.7996  0.05992     0.000 1.000  0.0039  0.0023  0.0076  0.093  0.0052
#> Optimism  0.0066 -0.00059     0.077 0.029 -0.0133 -0.0066 -0.0334 -0.137 -0.1292
#> Corrected 0.7930  0.06051    -0.077 0.971  0.0171  0.0089  0.0410  0.229  0.1344
In this situation it is probably best to stick with lrm, fastbw, and validate from the rms package (though note differences with the default step behavior) unless you want the additional calibration metrics offered by pminternal or want to specify your own score function (see vignette("pminternal")).
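For reference, a minimal sketch of that rms workflow, assuming rms is installed (not run; rule and type are passed through to fastbw, whose default AIC-based rule differs from step):

# library(rms)
fit <- rms::lrm(y ~ ., data = gusto, x = TRUE, y = TRUE) # x, y needed by validate
rms::validate(fit, method = "crossvalidation", B = 10, bw = TRUE,
              rule = "aic", type = "individual")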
Ridge
vignette("pminternal") gives an example of a glm with lasso (L1) penalization. It is simple to modify this to implement ridge (L2) penalization by setting alpha = 0.
#library(glmnet)
ridge_fun <- function(data, ...){
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  # glmnet requires a numeric matrix, so recode the factors
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  # select lambda via 10-fold CV, then refit at the selected value
  cv <- glmnet::cv.glmnet(x = x, y = y, alpha = 0, nfolds = 10, family = "binomial")
  lambda <- cv$lambda.min
  glmnet::glmnet(x = x, y = y, alpha = 0, lambda = lambda, family = "binomial")
}
ridge_predict <- function(model, data, ...){
  # note this is identical to lasso_predict from the "pminternal" vignette
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  # predictions are on the linear predictor (link) scale; plogis converts to probabilities
  plogis(glmnet::predict.glmnet(model, newx = x)[, 1])
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = ridge_fun,
         pred_fun = ridge_predict, B = 10)
#> C Brier Intercept Slope Eavg E50 E90 Emax ECI
#> Apparent 0.7998 0.05991 0.204 1.095 0.006 0.0054 0.013 0.048 0.0052
#> Optimism 0.0054 -0.00047 0.066 0.025 -0.011 -0.0052 -0.024 -0.144 -0.1100
#> Corrected 0.7944 0.06037 0.138 1.071 0.016 0.0105 0.037 0.192 0.1152
# the use of package::function in user-defined functions
# is especially important if you want to run
# boot_* or .632 in parallel via the cores argument
# e.g.
# validate(method = ".632", data = gusto,
#          outcome = "y", model_fun = ridge_fun,
#          pred_fun = ridge_predict, B = 100, cores = 4)
Rather than have two separate functions, we could specify an optional argument, alpha, that could be supplied to validate. If this argument isn't supplied, the function below defaults to alpha = 0. The chunk below is not evaluated so no output is printed.
lognet_fun <- function(data, ...){
  dots <- list(...)
  if ("alpha" %in% names(dots)){
    alpha <- dots[["alpha"]]
  } else{
    alpha <- 0
  }
  # cat("Using alpha =", alpha, "\n")
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  cv <- glmnet::cv.glmnet(x = x, y = y, alpha = alpha, nfolds = 10, family = "binomial")
  lambda <- cv$lambda.min
  glmnet::glmnet(x = x, y = y, alpha = alpha, lambda = lambda, family = "binomial")
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = lognet_fun,
         pred_fun = ridge_predict, B = 10, alpha = 0.5)
Elastic Net
To implement a model with an elastic net penalty we need to add steps to select alpha. The function below evaluates nalpha equally spaced values of alpha between 0 and 1 (inclusive) and selects the values of lambda and alpha that result in the minimum CV binomial deviance (this could be changed via type.measure). nalpha is an optional argument. Note that we don't need a new predict function here, so ridge_predict is used. To save build time the chunk below is not evaluated.
enet_fun <- function(data, ...){
  dots <- list(...)
  if ("nalpha" %in% names(dots)){
    nalpha <- dots[["nalpha"]]
  } else{
    nalpha <- 21 # 0 to 1 in steps of 0.05
  }
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  # use the same 10-fold assignments for every alpha so that
  # CV deviances are comparable across alphas (see ?glmnet::cv.glmnet)
  foldid <- sample(rep(1:10, length.out = nrow(x)))
  # run 10-fold CV for each alpha
  alphas <- seq(0, 1, length.out = nalpha)
  res <- lapply(alphas, function(a){
    cv <- glmnet::cv.glmnet(x = x, y = y, alpha = a, foldid = foldid, family = "binomial")
    list(lambda = cv$lambda.min, bin.dev = min(cv$cvm))
  })
  # select the (alpha, lambda) pair with minimum CV binomial deviance
  j <- which.min(sapply(res, function(x) x$bin.dev))
  # refit the 'final' model with the selected alpha and lambda
  glmnet::glmnet(x = x, y = y, alpha = alphas[j], lambda = res[[j]][["lambda"]],
                 family = "binomial")
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = enet_fun,
         pred_fun = ridge_predict, B = 10)
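As with alpha above, optional arguments pass straight through validate to the model function; e.g., a coarser grid of 11 alpha values (not run):

validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = enet_fun,
         pred_fun = ridge_predict, B = 10, nalpha = 11)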
Random Forest
In the example below we use the ranger package to create our model_fun and allow for the optional arguments num.trees, max.depth, and min.node.size; others could be added (see ?ranger).
# library(ranger)
rf_fun <- function(data, ...){
  dots <- list(...)
  num.trees <- if ("num.trees" %in% names(dots)) dots[["num.trees"]] else 500
  max.depth <- if ("max.depth" %in% names(dots)) dots[["max.depth"]] else NULL
  min.node.size <- if ("min.node.size" %in% names(dots)) dots[["min.node.size"]] else 1
  # best to make sure y is a factor where '1' is level 2
  data$y <- factor(data$y, levels = 0:1)
  ranger::ranger(y ~ ., data = data, probability = TRUE,
                 num.trees = num.trees,
                 max.depth = max.depth,
                 min.node.size = min.node.size)
}

rf_predict <- function(model, data, ...){
  # second column = predicted probability of the second level of y ('1')
  predict(model, data = data)$predictions[, 2]
}
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = rf_fun,
         pred_fun = rf_predict, B = 10)
#> C Brier Intercept Slope Eavg E50 E90 Emax ECI
#> Apparent 0.96 0.045 3.60 3.06 0.055 0.031 0.067 0.56 1.094
#> Optimism 0.18 -0.017 3.82 2.19 0.042 0.024 0.036 0.40 1.082
#> Corrected 0.78 0.062 -0.22 0.88 0.014 0.007 0.031 0.16 0.012
# instead of unlimited tree depth...
validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = rf_fun,
         pred_fun = rf_predict, B = 10, max.depth = 3)
#> C Brier Intercept Slope Eavg E50 E90 Emax ECI
#> Apparent 0.817 0.0603 2.22 1.99 0.03055 0.02283 0.0533 0.38 0.216
#> Optimism 0.022 -0.0012 0.46 0.21 0.00069 0.00064 -0.0013 0.12 -0.011
#> Corrected 0.794 0.0615 1.76 1.78 0.02987 0.02218 0.0545 0.26 0.227
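The other hyperparameters defined in rf_fun can be supplied in the same way, singly or together (not run):

validate(method = "cv_optimism", data = gusto,
         outcome = "y", model_fun = rf_fun,
         pred_fun = rf_predict, B = 10,
         num.trees = 1000, max.depth = 3, min.node.size = 20)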