
vignette("pminternal") gives an introduction to the package and writing user-defined model and score functions. This vignette provides more examples of user-defined model functions for what I expect are the most commonly used modeling approaches that would not be supported by the fit argument.

library(pminternal)
library(Hmisc)
#> 
#> Attaching package: 'Hmisc'
#> The following objects are masked from 'package:base':
#> 
#>     format.pval, units

# load the GUSTO-I trial data (from the Hmisc data repository)
getHdata("gusto")
gusto <- gusto[, c("sex", "age", "hyp", "htn", "hrt", "pmi", "ste", "day30")]

# use 30-day mortality (day30) as the outcome, renamed to y
gusto$y <- gusto$day30; gusto$day30 <- NULL

# subsample 5000 rows to keep run time down
set.seed(234)
gusto <- gusto[sample(1:nrow(gusto), size = 5000), ]
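
A quick look at the outcome distribution in the subsample is a useful sanity check (this check is my own addition, not part of the original example):

# outcome counts and event proportion in the 5000-row subsample
table(gusto$y)
prop.table(table(gusto$y))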

Backward Selection

The function below fits a full logistic regression and then selects a model via backward elimination using AIC (the default criterion for step); steppred is the accompanying prediction function.


stepglm <- function(data, ...){
  # fit the full logistic model, then backward-eliminate terms by AIC
  m <- glm(y ~ ., data = data, family = "binomial")
  step(m, trace = 0)
}

steppred <- function(model, data, ...){
  # predicted probabilities from the selected model
  predict(model, newdata = data, type = "response")
}

validate(data = gusto, outcome = "y", model_fun = stepglm, 
         pred_fun = steppred, method = "cv_optimism", B = 10)
#>                C    Brier Intercept Slope    Eavg     E50     E90   Emax
#> Apparent  0.7996  0.05992     0.000 1.000  0.0039  0.0023  0.0076  0.093
#> Optimism  0.0066 -0.00059     0.077 0.029 -0.0133 -0.0066 -0.0334 -0.137
#> Corrected 0.7930  0.06051    -0.077 0.971  0.0171  0.0089  0.0410  0.229
#>               ECI
#> Apparent   0.0052
#> Optimism  -0.1292
#> Corrected  0.1344
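
To see which predictors survive the backward elimination in the apparent (full-data) model, the model function can be run directly (this check is my own addition):

# inspect the model selected on the full data
m <- stepglm(gusto)
formula(m)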

In this situation it is probably best to stick with lrm, fastbw, and validate from the rms package (though note the differences from default step behavior), unless you want the additional calibration metrics offered by pminternal or want to specify your own score function (see vignette("pminternal")).
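
For reference, a minimal sketch of the rms approach (assuming rms is installed; bw = TRUE with rule = "aic" and type = "individual" approximates, but does not exactly reproduce, step's default behavior):

# library(rms)
m <- rms::lrm(y ~ sex + age + hyp + htn + hrt + pmi + ste, 
              data = gusto, x = TRUE, y = TRUE)  # x, y needed for validate
rms::validate(m, method = "boot", B = 100, bw = TRUE, 
              rule = "aic", type = "individual")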

Ridge

vignette("pminternal") gives an example of a glm with lasso (L1) penalization. It is simple to modify this to implement ridge (L2) penalization by setting alpha = 0.

# library(glmnet)

ridge_fun <- function(data, ...){
  y <- data$y
  # build the numeric design matrix expected by glmnet
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  
  # select lambda by 10-fold cross-validation
  cv <- glmnet::cv.glmnet(x = x, y = y, alpha = 0, nfolds = 10, family = "binomial")
  lambda <- cv$lambda.min
  
  # refit at the selected lambda
  glmnet::glmnet(x = x, y = y, alpha = 0, lambda = lambda, family = "binomial")
}

ridge_predict <- function(model, data, ...){
  # note this is identical to lasso_predict from the "pminternal" vignette
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  
  # predictions are on the link scale; plogis converts to probabilities
  plogis(glmnet::predict.glmnet(model, newx = x)[, 1])
}

validate(method = "cv_optimism", data = gusto, 
         outcome = "y", model_fun = ridge_fun, 
         pred_fun = ridge_predict, B = 10)
#>                C    Brier Intercept Slope   Eavg     E50    E90   Emax     ECI
#> Apparent  0.7998  0.05991     0.204 1.095  0.006  0.0054  0.013  0.048  0.0052
#> Optimism  0.0054 -0.00047     0.066 0.025 -0.011 -0.0052 -0.024 -0.144 -0.1100
#> Corrected 0.7944  0.06037     0.138 1.071  0.016  0.0105  0.037  0.192  0.1152

# the use of package::function in user defined functions 
# is especially important if you want to run 
# boot_* or .632 in parallel via cores argument

# e.g.
# validate(method = ".632", data = gusto, 
#          outcome = "y", model_fun = ridge_fun, 
#          pred_fun = ridge_predict, B = 100, cores = 4)

Rather than having two separate functions, we could specify an optional argument, alpha, that can be supplied to validate and passed through to the model function via the ... argument. If this argument isn't supplied, the function below defaults to alpha = 0 (ridge). The chunk below is not evaluated, so no output is printed.

lognet_fun <- function(data, ...){
  
  # read optional alpha from ...; default to ridge (alpha = 0)
  dots <- list(...)
  if ("alpha" %in% names(dots)){
    alpha <- dots[["alpha"]]
  } else{
    alpha <- 0
  }
  # cat("Using alpha =", alpha, "\n")
  
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  
  cv <- glmnet::cv.glmnet(x=x, y=y, alpha = alpha, nfolds = 10, family="binomial")
  lambda <- cv$lambda.min
  
  glmnet::glmnet(x=x, y=y, alpha = alpha, lambda = lambda, family="binomial")
}

validate(method = "cv_optimism", data = gusto, 
         outcome = "y", model_fun = lognet_fun, 
         pred_fun = ridge_predict, B = 10, alpha = 0.5)

Elastic Net

To implement a model with an elastic net penalty we need to add steps to select alpha. The function below evaluates nalpha equally spaced values of alpha between 0 and 1 (inclusive) and selects the combination of lambda and alpha that gives the minimum cross-validated binomial deviance (this criterion could be changed via type.measure). nalpha is an optional argument. Note that we don't need a new prediction function here, so ridge_predict is reused. To save build time the chunk below is not evaluated.

enet_fun <- function(data, ...){
  
  dots <- list(...)
  if ("nalpha" %in% names(dots)){
    nalpha <- dots[["nalpha"]]
  } else{
    nalpha <- 21 # 0 to 1 in steps of 0.05
  }
  
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  
  # run 10 fold CV for each alpha
  alphas <- seq(0, 1, length.out = nalpha)
  res <- lapply(alphas, function(a){
    cv <- glmnet::cv.glmnet(x=x, y=y, alpha = a, nfolds = 10, family="binomial")
    list(lambda = cv$lambda.min, bin.dev = min(cv$cvm))
  })
  # select result with min binomial deviance
  j <- which.min(sapply(res, function(x) x$bin.dev))
  # produce 'final' model with alpha and lambda
  glmnet::glmnet(x=x, y=y, alpha = alphas[j], lambda = res[[j]][["lambda"]], family="binomial")
}

validate(method = "cv_optimism", data = gusto, 
         outcome = "y", model_fun = enet_fun, 
         pred_fun = ridge_predict, B = 10)
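
One caveat with the loop above: cv.glmnet redraws the cross-validation folds on every call, so some of the apparent differences between alpha values are fold-to-fold noise. The glmnet documentation suggests fixing foldid when comparing values of alpha; below is a minimal sketch of that modification (the foldid construction and the name enet_fun2 are my own, not part of the original):

enet_fun2 <- function(data, ...){
  y <- data$y
  x <- data[, c('sex', 'age', 'hyp', 'htn', 'hrt', 'pmi', 'ste')]
  x$sex <- as.numeric(x$sex == "male")
  x$pmi <- as.numeric(x$pmi == "yes")
  x <- as.matrix(x)
  
  # assign each observation to one of 10 folds, reused for every alpha
  foldid <- sample(rep(1:10, length.out = nrow(x)))
  alphas <- seq(0, 1, length.out = 21)
  res <- lapply(alphas, function(a){
    cv <- glmnet::cv.glmnet(x = x, y = y, alpha = a, foldid = foldid, family = "binomial")
    list(lambda = cv$lambda.min, bin.dev = min(cv$cvm))
  })
  j <- which.min(sapply(res, function(x) x$bin.dev))
  glmnet::glmnet(x = x, y = y, alpha = alphas[j], lambda = res[[j]][["lambda"]], family = "binomial")
}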

Random Forest

In the example below we use the ranger package to create our model_fun, allowing for the optional arguments num.trees, max.depth, and min.node.size; others could be added (see ?ranger).

# library(ranger)

rf_fun <- function(data, ...){
  
  # optional hyperparameters supplied via ..., with defaults otherwise
  dots <- list(...)
  num.trees <- if ("num.trees" %in% names(dots)) dots[["num.trees"]] else 500
  max.depth <- if ("max.depth" %in% names(dots)) dots[["max.depth"]] else NULL
  min.node.size <- if ("min.node.size" %in% names(dots)) dots[["min.node.size"]] else 1
  
  # best to make sure y is a factor where '1' is level 2
  data$y <- factor(data$y, levels = 0:1)
  
  ranger::ranger(y ~ ., data = data, probability = TRUE, 
                 num.trees = num.trees, 
                 max.depth = max.depth, 
                 min.node.size = min.node.size)
}

rf_predict <- function(model, data, ...){
  # column 2 is the predicted probability of class '1' (the second factor level)
  predict(model, data = data)$predictions[, 2]
}

validate(method = "cv_optimism", data = gusto, 
         outcome = "y", model_fun = rf_fun, 
         pred_fun = rf_predict, B = 10)
#>              C  Brier Intercept Slope  Eavg   E50   E90 Emax   ECI
#> Apparent  0.96  0.045      3.60  3.06 0.055 0.031 0.067 0.56 1.094
#> Optimism  0.18 -0.017      3.82  2.19 0.042 0.024 0.036 0.40 1.082
#> Corrected 0.78  0.062     -0.22  0.88 0.014 0.007 0.031 0.16 0.012

# instead of unlimited tree depth...
validate(method = "cv_optimism", data = gusto, 
         outcome = "y", model_fun = rf_fun, 
         pred_fun = rf_predict, B = 10, max.depth = 3)
#>               C   Brier Intercept Slope    Eavg     E50     E90 Emax    ECI
#> Apparent  0.817  0.0603      2.22  1.99 0.03055 0.02283  0.0533 0.38  0.216
#> Optimism  0.022 -0.0012      0.46  0.21 0.00069 0.00064 -0.0013 0.12 -0.011
#> Corrected 0.794  0.0615      1.76  1.78 0.02987 0.02218  0.0545 0.26  0.227