class: center, middle, inverse, title-slide .title[ # 00 CV for many models ] .author[ ### STAT 406 ] .author[ ### Daniel J. McDonald ] .date[ ### Last modified - 2022-10-12 ] --- ## Some data and 4 models ```r data(mobility, package = "Stat406") ``` **Model 1:** Lasso on all predictors, use CV min **Model 2:** Ridge on all predictors, use CV min **Model 3:** OLS on all predictors (no tuning parameters) **Model 4:** (1) Lasso on all predictors, then (2) OLS on those chosen at CV min .emphasis[ How do I decide between these 4 models? ] -- ```r kfold_cv <- function(data, estimator, predictor, error_fun, kfolds = 5) { fold_labels <- sample(rep(seq(kfolds), length.out = nrow(data))) errors <- double(kfolds) for (fold in seq_len(kfolds)) { test_rows <- fold_labels == fold train <- data[!test_rows, ] test <- data[test_rows, ] current_model <- estimator(train) test$.preds <- predictor(current_model, test) errors[fold] <- error_fun(test) } mean(errors) } ``` --- ## Experiment setup ```r # prepare our data # note that mob has only continuous predictors, otherwise could be trouble mob <- mobility[complete.cases(mobility), ] %>% select(-ID, -State, -Name) # avoid doing this same operation a bunch xmat <- function(dataset) as.matrix(select(dataset, !Mobility)) # set up our model functions library(glmnet) mod1 <- function(dataset, ...) cv.glmnet(xmat(dataset), dataset$Mobility, type.measure = "mae", ...) mod2 <- function(dataset, ...) cv.glmnet(xmat(dataset), dataset$Mobility, alpha = 0, type.measure = "mae", ...) mod3 <- function(dataset, ...) glmnet(xmat(dataset), dataset$Mobility, lambda = 0, ...) # just does lm() mod4 <- function(dataset, ...) cv.glmnet(xmat(dataset), dataset$Mobility, relax = TRUE, gamma = 1, type.measure = "mae", ...) # this will still "work" on mod3, because there's only 1 s predictor <- function(modle, dataset) drop(predict(modle, newx = xmat(dataset), s = "lambda.min")) # chose mean absolute error just 'cause error_fun <- function(testdata) mean(abs(testdata$Mobility - testdata$.preds)) # not necessarily useful for choosing in this context, but good for illustration loo_cv <- function(dataset) { mdl <- lm(Mobility ~ ., data = dataset) mean( abs(residuals(mdl)) / abs(1 - hatvalues(mdl)) ) # MAE version } ``` --- ## Run the experiment * I'm using `purrr` functions to do this without loops, 'cause it's prettier ```r library(purrr) all_model_funs <- list(mod1 = mod1, mod2 = mod2, mod3 = mod3, mod4 = mod4) all_fits <- map(all_model_funs, ~ do.call(.x, list(dataset = mob))) # unfortunately, does different splits for each method, so we use 10, # it would be better to use the _SAME_ splits ten_fold_cv <- map_dbl(all_model_funs, ~ kfold_cv(mob, .x, predictor, error_fun, 10)) in_sample_cv <- c( mod1 = min(all_fits[[1]]$cvm), mod2 = min(all_fits[[2]]$cvm), mod3 = loo_cv(mob), mod4 = min(all_fits[[4]]$cvm) ) tib <- bind_rows(in_sample_cv, ten_fold_cv) tib$method = c("in_sample", "out_of_sample") kableExtra::kable(tib, booktabs = TRUE) ``` <table> <thead> <tr> <th style="text-align:right;"> mod1 </th> <th style="text-align:right;"> mod2 </th> <th style="text-align:right;"> mod3 </th> <th style="text-align:right;"> mod4 </th> <th style="text-align:left;"> method </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0.0158509 </td> <td style="text-align:right;"> 0.0160802 </td> <td style="text-align:right;"> 0.0163659 </td> <td style="text-align:right;"> 0.0155659 </td> <td style="text-align:left;"> in_sample </td> </tr> <tr> <td style="text-align:right;"> 0.0157503 </td> <td style="text-align:right;"> 0.0161216 </td> <td style="text-align:right;"> 0.0165424 </td> <td style="text-align:right;"> 0.0160729 </td> <td style="text-align:left;"> out_of_sample </td> </tr> </tbody> </table>