# ==============================================================================
# Example of complete `tidymodels` workflow for classification
# ------------------------------------------------------------------------------
# Dataset: modeldata::mlc_churn
# Author: Prof. Dr. Michael Bücker
# ==============================================================================
# Load packages
# ------------------------------------------------------------------------------
library(tidymodels)
tidymodels_prefer()
library(modeldata)
library(themis)
library(future)
library(doFuture)
library(parallel)
# Data preparation
# ------------------------------------------------------------------------------
# Create oversampling recipe with additional ratio variables
# Recipe: oversample the minority class with SMOTE-NC, then derive
# per-call ratio features from the raw minute/charge/call totals.
# NOTE(review): the template data is the full `mlc_churn` set rather than
# the training split defined later in this script; the recipe is only
# prepped on analysis sets during tuning, but using the training split as
# template would be the safer convention — confirm intent.
mlc_recipe <-
recipe(churn ~ ., data = mlc_churn) %>%
# Synthesize minority-class rows until classes are balanced 1:1
# (over_ratio = 1). SMOTE-NC handles nominal predictors; themis steps
# skip at bake time by default, so only analysis/training data is
# resampled.
step_smotenc(churn, over_ratio = 1) %>%
# Average minutes and charge per call for each period; the `== 0` checks
# guard against division by zero for customers with no calls in a period.
# NOTE(review): these ratios are computed AFTER SMOTE, so synthetic rows
# carry ratios of interpolated totals rather than interpolated ratios —
# confirm this ordering is intended.
step_mutate(
ratio_day_minutes_per_call = ifelse(total_day_calls == 0, 0,
total_day_minutes/total_day_calls),
ratio_eve_minutes_per_call = ifelse(total_eve_calls == 0, 0,
total_eve_minutes/total_eve_calls),
ratio_night_minutes_per_call = ifelse(total_night_calls == 0, 0,
total_night_minutes/total_night_calls),
ratio_intl_minutes_per_call = ifelse(total_intl_calls == 0, 0,
total_intl_minutes/total_intl_calls),
ratio_day_charge_per_call = ifelse(total_day_calls == 0, 0,
total_day_charge/total_day_calls),
ratio_eve_charge_per_call = ifelse(total_eve_calls == 0, 0,
total_eve_charge/total_eve_calls),
ratio_night_charge_per_call = ifelse(total_night_calls == 0, 0,
total_night_charge/total_night_calls),
ratio_intl_charge_per_call = ifelse(total_intl_calls == 0, 0,
total_intl_charge/total_intl_calls)
)
# Numeric variant of the base recipe: one-hot encode every nominal
# predictor for engines (xgboost) that require numeric input.
mlc_recipe_num <- step_dummy(mlc_recipe,
                             all_nominal_predictors(),
                             one_hot = TRUE)
# Named list of preprocessors; the first three models share the base
# recipe, xgboost gets the dummy-encoded one. The names become part of
# the workflow-set ids (e.g. "xgb_recipe_xgb_model").
recipes <- c(
  setNames(rep(list(mlc_recipe), 3),
           c("lr_recipe", "dt_recipe", "rf_recipe")),
  list(xgb_recipe = mlc_recipe_num)
)
# Data splitting
# ------------------------------------------------------------------------------
# Reproducible 80/20 train/test split, stratified on the outcome so the
# churn rate is preserved in both partitions.
# FIX: the split was previously unseeded, so train/test membership (and
# every downstream result) changed between runs.
set.seed(123)
mlc_churn_split <- initial_split(mlc_churn,
                                 prop = 0.80,
                                 strata = churn)
mlc_churn_train <- training(mlc_churn_split)
mlc_churn_test <- testing(mlc_churn_split)
# Resampling
# ------------------------------------------------------------------------------
# 10-fold cross-validation on the training data.
# FIX: stratify the folds on `churn` — the outcome is imbalanced (the
# recipe oversamples for exactly that reason), so unstratified folds can
# leave assessment sets with very few churners and noisy metrics.
set.seed(123)
mlc_folds <- vfold_cv(mlc_churn_train, v = 10, strata = churn)
# Modeling
# ------------------------------------------------------------------------------
# Logistic Regression: baseline model, no tuning parameters
lr_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")
# Decision Tree: tune the pruning penalty and minimum node size
dt_model <- decision_tree(cost_complexity = tune(),
                          min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("rpart")
# Random Forest: tune the number of trees
rf_model <- rand_forest(trees = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger")
# XGBoost: tune the learning rate and maximum tree depth
xgb_model <- boost_tree(learn_rate = tune(),
                        tree_depth = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
# Named list of model specifications; names feed into the workflow ids.
models <- list(
  lr_model  = lr_model,
  dt_model  = dt_model,
  rf_model  = rf_model,
  xgb_model = xgb_model
)
# Pair each recipe with its matching model. cross = FALSE zips the two
# lists elementwise instead of building the full cartesian product.
mlc_workflows <- workflow_set(
  preproc = recipes,
  models  = models,
  cross   = FALSE
)
# Parallelization
# ------------------------------------------------------------------------------
# Create a PSOCK cluster for parallel tuning.
# FIX: the worker count was hard-coded to 12 regardless of the machine;
# size it to the available cores, leaving one free for the main session.
n_workers <- max(1L, parallel::detectCores() - 1L)
cl <- makeCluster(n_workers, type = "PSOCK")
# Export custom objects/functions to the workers (none needed here)
# clusterExport(cl, c("my_own_function"))
# Load the required packages on every worker; the trailing NULL keeps
# clusterEvalQ from returning (and printing) the library search paths.
clusterEvalQ(cl, {
  library(tidymodels)
  library(modeldata)
  library(themis)
  NULL
})
# Point the future framework at the cluster and route foreach through it
plan(cluster, workers = cl)
registerDoFuture()
# Fit models
# ------------------------------------------------------------------------------
# WARNING: This step will consume the entire
# computation power and take > 10 minutes
# Grid-tuning control: keep workflows and out-of-fold predictions so the
# best workflow can be finalized later; ship the required packages to
# the parallel workers.
tune_ctrl <- control_grid(
  parallel_over = "everything",
  allow_par = TRUE,
  save_workflow = TRUE,
  save_pred = TRUE,
  pkgs = c("tidymodels", "modeldata", "themis")
)
# Run tune_grid() over every workflow in the set, evaluating accuracy
# and ROC AUC on the cross-validation folds.
mlc_fit <- workflow_map(mlc_workflows,
                        fn = "tune_grid",
                        seed = 123,
                        verbose = TRUE,
                        metrics = metric_set(accuracy, roc_auc),
                        resamples = mlc_folds,
                        control = tune_ctrl)
# Evaluation
# ------------------------------------------------------------------------------
# Cross-validated metrics for every workflow in the set
collect_metrics(mlc_fit)
# Plot each workflow's best configuration, ranked by ROC AUC
autoplot(mlc_fit,
         rank_metric = "roc_auc",
         metric = "roc_auc",
         select_best = TRUE)
# Tabulate the best ROC-AUC result per workflow
filter(rank_results(mlc_fit,
                    rank_metric = "roc_auc",
                    select_best = TRUE),
       .metric == "roc_auc")
# Get best result
# ------------------------------------------------------------------------------
# Best hyperparameter combination of the winning workflow (xgboost)
best_results <-
  mlc_fit %>%
  extract_workflow_set_result("xgb_recipe_xgb_model") %>%
  select_best(metric = "roc_auc")
best_results
# Test results
# Finalize the xgboost workflow with those parameters, refit it on the
# full training set, and evaluate it once on the held-out test set.
best_test_results <-
  mlc_fit %>%
  extract_workflow("xgb_recipe_xgb_model") %>%
  finalize_workflow(best_results) %>%
  last_fit(split = mlc_churn_split)
best_test_results %>%
  collect_metrics()
# Predictions
# FIX: use the collect_predictions() accessor instead of reaching into
# the internal `.predictions` list-column.
best_test_results %>%
  collect_predictions()
# Clean up the parallel backend.
# FIX: the PSOCK cluster created above was never stopped, leaking one R
# worker process per core; return future to sequential first so no plan
# still references the cluster.
plan(sequential)
stopCluster(cl)