'Variable importance from a tidymodels/stacks output?

Is it possible to retrieve the variable importance for one, many, or the full stacked model after running tidymodels/stacks? This is not yet supported by the VIP package, but is there an alternative method to extracting that information?

Using the bulk of the blog from Simon Couch here this is what I am generally trying to attempt. Instead I will use random forests and SVMs to then try to retrieve a variable importance.

library(tidyverse)
library(tidymodels)
library(stacks)
library(vip)

wind_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')

wind <-
  wind_raw %>%
  dplyr::select(
    province_territory, 
    total_project_capacity_mw,
    turbine_rated_capacity_kw = turbine_rated_capacity_k_w,
    rotor_diameter_m,
    hub_height_m,
    year = commissioning_date
  ) %>%
  group_by(province_territory) %>%
  mutate(
    year = as.numeric(year),
    province_territory = case_when(
      n() < 50 ~ "Other",
      TRUE ~ province_territory
    )
  ) %>%
  filter(!is.na(year)) %>%
  ungroup() %>%
  drop_na(turbine_rated_capacity_kw)

# split into training and testing sets
set.seed(1)
wind_split <- initial_split(wind)
wind_train <- training(wind_split)
wind_test  <- testing(wind_split)

# use a 5-fold cross-validation
set.seed(1)
folds <- rsample::vfold_cv(wind_train, v = 5)

# set up a basic recipe
wind_rec <- 
  recipe(turbine_rated_capacity_kw ~ ., data = wind_train) %>%
  step_impute_knn(all_predictors()) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors())

# define a minimal workflow
wind_wflow <- 
  workflow() %>% 
  add_recipe(wind_rec)

ctrl_res <- control_stack_resamples()

rf_spec <- 
  rand_forest(mtry = tune(), 
              min_n = tune(), 
              trees = 1000) %>%
  set_mode('regression') %>%
  set_engine("ranger", importance = "impurity")

# add it to a workflow
rf_wflow <- 
  wind_wflow %>% 
  add_model(rf_spec)

# tune cost and rand_forest and fit to the 5-fold cv
set.seed(1)
rf_res <- 
  tune_grid(
    rf_wflow , 
    resamples = folds, 
    grid = 5,
    control = ctrl_grid
  )

# define a model using parsnip
svm_spec <- 
  svm_rbf(
    cost = tune(), 
    rbf_sigma = tune()
  ) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

# add it to a workflow
svm_wflow <- 
  wind_wflow %>% 
  add_model(svm_spec)

# tune cost and rbf_sigma and fit to the 5-fold cv
set.seed(1)
svm_res <- 
  tune_grid(
    svm_wflow, 
    resamples = folds, 
    grid = 5,
    control = ctrl_grid
  )

# add the models to the stack
wind_data_st <- 
  stacks() %>%
  add_candidates(rf_res) %>%
  add_candidates(svm_res) %>%
  blend_predictions() %>%
  fit_members()

# attempt to plot the variable importance of the stacked model
wind_data_st %>%
  vip()

I return Error: Model-specific variable importance scores are currently not available for this type of model., which is self explanatory, but is there a work around to extract this information? Maybe outside of VIP? Is it possible to pluck out one of the viable models that went into the stack to evaluated? Does anyone know if VIP is planning on putting out a solution to this? Thanks in advance!



Solution 1:[1]

I've had a similar issue, and what I've done is make a tibble of variable importance for each member of the stack, then normalize them onto the same scale, and multiply by their relative weight in the stack to have a summed total relative importance.

I couldn't reproduce your code, but here's an example of what you can try...

After you've ran blend_predictions(), you can extract the weights. Then create a tibble for each that includes a column for Variable and a column for importance. Then join together and you'll have the weight importance.

library("DALEX")
library("dplyr")
library("tidymodels")


colnames(fifa)
fifa_small <- fifa %>% 
  select(value_eur, age, 
         attacking_crossing:attacking_volleys, 
         defending_marking:defending_sliding_tackle) %>% 
  as_tibble() %>% dplyr::slice_sample(n = 1000)


fifa_small_folds <- vfold_cv(fifa_small, v = 8, repeats = 1)
fifa_small_folds


basic_rec <-   
  recipe(value_eur ~ ., data = fifa_small) %>%
  step_nzv(all_numeric_predictors()) %>% 
  step_normalize(all_numeric(), -all_outcomes()) 


model1 <- 
  boost_tree(trees = 1000) %>%
  set_engine('xgboost', importance = TRUE) %>%
  set_mode('regression')


model2 <-
  linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine('glmnet')

model3 <-
  linear_reg(penalty = tune(), mixture = 0) %>%
  set_engine('glmnet')

wfs <- 
  workflow_set(
    preproc = list(basic_rec),  
    models = list(model1, model2, model3), 
    cross = T  )

wfs

doParallel::registerDoParallel()

wfs_rs <-
  workflow_map(
    wfs,
    "tune_grid",
    resamples = fifa_small_folds,
    grid = 10,
    control = control_grid(save_pred = TRUE,
                           parallel_over = "everything",
                           save_workflow = TRUE
                           
    ) )


doParallel::stopImplicitCluster()


library(stacks)
tidymodels_prefer()

wfs_stack <- 
  stacks() %>% 
  add_candidates(wfs_rs)


blend_ens <- blend_predictions(wfs_stack, penalty = 10^seq(-2, 0, length = 10))

blend_ens

ens1_wt <- stacks:::top_coefs(blend_ens) %>% slice(1) %>%  pull(weight)
ens2_wt <- stacks:::top_coefs(blend_ens) %>% slice(2) %>% pull(weight)

## Get the workflowset
individ_ens1_best_fit <- extract_workflow(wfs_rs, id = "recipe_boost_tree")

## extract the tuned results from the workflow
individ_ens1_best_tuned <- wfs_rs[wfs_rs$wflow_id == "recipe_boost_tree",
                                            "result"][[1]][[1]]

individ_ens1_lowest_rmse <- individ_ens1_best_tuned %>%
  show_best("rmse") %>% 
  slice(1)

## fit the final model
individ_ens1_best_final <- finalize_workflow(individ_ens1_best_fit, individ_ens1_lowest_rmse)
individ_ens1_bestfinal_1 <- individ_ens1_best_final %>% fit(fifa_small)


individ_ens1_vi_tbl <- individ_ens1_bestfinal_1 %>%
  extract_fit_parsnip() %>%
  vip::vi() %>%
  mutate(
    ens1_Importance = abs(Importance),
    Variable = factor(Variable), .keep = "unused") 



blend_ens

ens2_config <- ens_name_fn(blend_ens, 2)
ens2_id <- ens_id_fn(blend_ens, 2)

## Get the  workflowset
individ_ens2_best_fit <- extract_workflow(wfs_rs, id = "recipe_linear_reg_3")


## extract the tuned results from the best workflow
individ_ens2_best_tuned <- wfs_rs[wfs_rs$wflow_id == "recipe_linear_reg_3",
                                            "result"][[1]][[1]]


individ_ens2_lowest_rmse <- individ_ens2_best_tuned %>%
  show_best("rmse") %>% filter(.config == "Preprocessor1_Model01") %>%  slice(1)

## fit the final model
individ_ens2_best_final <- finalize_workflow(individ_ens2_best_fit, individ_ens2_lowest_rmse)

individ_ens2_bestfinal_1 <- individ_ens2_best_final %>% fit(fifa_small)


individ_ens2_vi_tbl <- individ_ens2_bestfinal_1 %>%
  extract_fit_parsnip() %>%
  vip::vi(lambda = individ_ens2_lowest_rmse$penalty) %>%   # include lambda for lasso or ridge
  mutate(
    ens2_Importance = abs(Importance),
    Variable = factor(Variable), .keep = "unused")


ens_vi_joined <- individ_ens1_vi_tbl %>% 
  left_join(individ_ens2_vi_tbl, by = c("Variable")) %>%
  mutate(across(2:ncol(.), ~ifelse(is.na(.), 0, .)),
         ens1_normed = ens1_Importance/ sum(ens1_Importance),
         ens2_normed =  ens2_Importance/ sum(ens2_Importance),
         ens1_wted = ens1_normed * ens1_wt,
         ens2_wted = ens2_normed * ens2_wt,
  )  %>% 
  rowwise() %>% 
  mutate(summed_importance = sum(c_across(ends_with("wted")))  ) %>% 
  ungroup() %>% 
  mutate(
    total_importance = summed_importance/ sum(summed_importance),   #normalized
  ) 


ens_vi_joined %>% select(Variable, total_importance) %>% 
  ggplot(aes(total_importance, fct_reorder(Variable, total_importance)))+
  geom_col()

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Jeff