# Code from the lecture
#
# Supervised Machine Learning: complete MLC churn example
# (NOTE: these header lines were bare prose from the rendered page —
#  commented out so the script can be sourced)

# ==============================================================================
# Example of complete `tidymodels` workflow for classification
# ------------------------------------------------------------------------------
# Dataset: modeldata::mlc_churn
# Author:  Prof. Dr. Michael Bücker
# ==============================================================================

# Load packages
# ------------------------------------------------------------------------------


library(tidymodels)
tidymodels_prefer()

library(modeldata)
library(themis)

library(future)
library(doFuture)
library(parallel)


# Data preparation
# ------------------------------------------------------------------------------

# Create oversampling recipe with additional ratio variables
mlc_recipe <- 
  recipe(churn ~ ., data = mlc_churn) %>% 
  # Balance the classes with SMOTE-NC (handles mixed numeric/nominal
  # predictors); over_ratio = 1 oversamples the minority class to parity
  step_smotenc(churn, over_ratio = 1) %>% 
  # Per-call ratio features; a call count of zero maps to a ratio of 0
  step_mutate(
    ratio_day_minutes_per_call =
      ifelse(total_day_calls > 0, total_day_minutes / total_day_calls, 0),
    ratio_eve_minutes_per_call =
      ifelse(total_eve_calls > 0, total_eve_minutes / total_eve_calls, 0),
    ratio_night_minutes_per_call =
      ifelse(total_night_calls > 0, total_night_minutes / total_night_calls, 0),
    ratio_intl_minutes_per_call =
      ifelse(total_intl_calls > 0, total_intl_minutes / total_intl_calls, 0),
    ratio_day_charge_per_call =
      ifelse(total_day_calls > 0, total_day_charge / total_day_calls, 0),
    ratio_eve_charge_per_call =
      ifelse(total_eve_calls > 0, total_eve_charge / total_eve_calls, 0),
    ratio_night_charge_per_call =
      ifelse(total_night_calls > 0, total_night_charge / total_night_calls, 0),
    ratio_intl_charge_per_call =
      ifelse(total_intl_calls > 0, total_intl_charge / total_intl_calls, 0)
  )

# Variant recipe for xgboost: one-hot encode all nominal predictors,
# since the xgboost engine requires purely numeric input
mlc_recipe_num <- 
  mlc_recipe %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# One recipe per model; list names pair up with the model list
recipes <- list(
  lr_recipe  = mlc_recipe,
  dt_recipe  = mlc_recipe,
  rf_recipe  = mlc_recipe,
  xgb_recipe = mlc_recipe_num
)

# Data splitting
# ------------------------------------------------------------------------------

# Reproducible 80/20 train/test split, stratified on the outcome.
# FIX: the original seeded only the CV folds, so the initial split was
# not reproducible across runs.
set.seed(123)
mlc_churn_split <- initial_split(mlc_churn, 
                                 prop = 0.80, 
                                 strata = churn)
mlc_churn_train <- training(mlc_churn_split)
mlc_churn_test <- testing(mlc_churn_split)


# Resampling
# ------------------------------------------------------------------------------

# 10-fold cross-validation on the training data.
# Stratify the folds on churn for consistency with the initial split
# (churn is imbalanced, so unstratified folds can vary in class mix).
set.seed(123)
mlc_folds <- vfold_cv(mlc_churn_train, v = 10, strata = churn)


# Modeling
# ------------------------------------------------------------------------------


# Logistic regression (no tuning parameters)
lr_model <- logistic_reg() %>% 
  set_mode("classification") %>% 
  set_engine("glm")

# Decision tree: tune pruning penalty and minimum node size
dt_model <- decision_tree(cost_complexity = tune(), min_n = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("rpart")

# Random forest: tune the number of trees
rf_model <- rand_forest(trees = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("ranger")

# XGBoost: tune learning rate and tree depth
xgb_model <- boost_tree(learn_rate = tune(), tree_depth = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost")

# Named list of model specifications; names pair up with the recipe list
models <- list(
  lr_model  = lr_model,
  dt_model  = dt_model,
  rf_model  = rf_model,
  xgb_model = xgb_model
)

# Pair each recipe with its model one-to-one (cross = FALSE) instead of
# crossing every recipe with every model
mlc_workflows <- workflow_set(
  preproc = recipes,
  models  = models,
  cross   = FALSE
)




# Parallelization
# ------------------------------------------------------------------------------


# Create a PSOCK cluster for parallel tuning.
# FIX: derive the worker count from the machine instead of hard-coding 12
# (which oversubscribes smaller machines and underuses larger ones);
# leave one core free for the main session.
n_workers <- max(1L, parallel::detectCores() - 1L)
cl <- makeCluster(n_workers, type = "PSOCK")

# Export custom objects/functions to the workers (none needed here)
# clusterExport(cl, c("my_own_function"))

## Load the required libraries on every worker
clusterEvalQ(cl, {
  library(tidymodels)
  library(modeldata)
  library(themis)
  NULL
})

# Route future-based parallelism through the cluster and register it as
# the foreach backend that tune/workflow_map will use
plan(cluster, workers = cl)
registerDoFuture()

# NOTE(review): the cluster is never released in this script — call
# stopCluster(cl) (and plan(sequential)) once tuning is finished.




# Fit models
# ------------------------------------------------------------------------------



# WARNING: This step will consume the entire 
# computation power and take > 10 minutes


# Control settings for grid tuning: allow parallel execution across both
# resamples and parameter combinations, and retain the fitted workflows
# plus out-of-sample predictions for later inspection
tune_ctrl <- control_grid(
  allow_par     = TRUE,
  parallel_over = "everything",
  save_pred     = TRUE,
  save_workflow = TRUE,
  pkgs          = c("tidymodels", "modeldata", "themis")
)

# Tune every workflow in the set on the CV folds
mlc_fit <- 
  workflow_map(
    mlc_workflows,
    "tune_grid",
    seed      = 123,
    verbose   = TRUE,
    resamples = mlc_folds,
    metrics   = metric_set(accuracy, roc_auc),
    control   = tune_ctrl
  )

# Evaluation
# ------------------------------------------------------------------------------

# Resampling metrics across all workflows
mlc_fit %>% collect_metrics()

# Plot the best configuration of each workflow, ranked by ROC AUC
mlc_fit %>% 
  autoplot(rank_metric = "roc_auc",
           metric = "roc_auc",
           select_best = TRUE)

# Best result per workflow, restricted to ROC AUC
rank_results(mlc_fit,
             rank_metric = "roc_auc",
             select_best = TRUE) %>% 
  filter(.metric == "roc_auc")


# Best hyperparameter combination of the winning XGBoost workflow
best_results <- mlc_fit %>% 
  extract_workflow_set_result("xgb_recipe_xgb_model") %>% 
  select_best(metric = "roc_auc")

best_results

# Finalize the winning workflow with those parameters and evaluate it
# exactly once on the held-out test set
best_test_results <- mlc_fit %>% 
  extract_workflow("xgb_recipe_xgb_model") %>% 
  finalize_workflow(best_results) %>% 
  last_fit(split = mlc_churn_split)

collect_metrics(best_test_results)

# Test-set predictions
best_test_results$.predictions
# Zurück nach oben ("back to top" — leftover navigation text from the
# rendered page, commented out so the script can be sourced)