This post covers the tune and dials packages, which focus on defining and optimizing model hyperparameters.

Setup

Packages

The following packages are required:

Code

library(dials)
library(tune)
library(yardstick)
library(rsample)
library(recipes)
library(parsnip)

library(ggplot2)

Data

For these packages, I utilized the car_prices data set from the modeldata package in a few examples.

# Load the car_prices data set that ships with the modeldata package
data("car_prices", package = "modeldata")

# Coerce to a tibble (printed below)
car_prices_tbl <- as_tibble(car_prices)

car_prices_tbl

# A tibble: 804 × 18
    Price Mileage Cylin…¹ Doors Cruise Sound Leather Buick Cadil…² Chevy Pontiac
    <dbl>   <int>   <int> <int>  <int> <int>   <int> <int>   <int> <int>   <int>
 1 22661.   20105       6     4      1     0       0     1       0     0       0
 2 21725.   13457       6     2      1     1       0     0       0     1       0
 3 29143.   31655       4     2      1     1       1     0       0     0       0
 4 30732.   22479       4     2      1     0       0     0       0     0       0
 5 33359.   17590       4     2      1     1       1     0       0     0       0
 6 30315.   23635       4     2      1     0       0     0       0     0       0
 7 33382.   17381       4     2      1     1       1     0       0     0       0
 8 30251.   27558       4     2      1     0       1     0       0     0       0
 9 30167.   25049       4     2      1     0       0     0       0     0       0
10 27060.   17319       4     4      1     0       1     0       0     0       0
# … with 794 more rows, 7 more variables: Saab <int>, Saturn <int>,
#   convertible <int>, coupe <int>, hatchback <int>, sedan <int>, wagon <int>,
#   and abbreviated variable names ¹Cylinder, ²Cadillac

Referencing previous entries in this series on the rsample and recipes packages, a 70/30 train/test initial_split() of the data set is taken, a few pre-processing steps are applied to the training data to create a recipe() object, and that object is passed to the prep() and juice() functions. This creates a processed training data set. The same recipe is then applied to the testing data with bake() to create a processed testing data set.

# Fix the RNG state so the random 70/30 train/test split is reproducible
set.seed(1914)
car_prices_split_obj <- initial_split(car_prices_tbl, prop = 0.7)

# Pre-processing recipe, defined on the training portion of the split only:
#   1. convert every column from Cylinder through wagon to a factor
#   2. one-hot encode the resulting nominal predictors
#   3. drop rows with missing predictor values (skip = FALSE, so the step is
#      also applied when the recipe is baked on new data)
car_prices_recipe_obj <- recipe(Price ~ ., training(car_prices_split_obj)) %>%
  step_mutate_at(Cylinder:wagon, fn = as.factor) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_naomit(all_predictors(), skip = FALSE)

# Estimate the recipe on the training data and return the processed training set
car_prices_train_tbl <- car_prices_recipe_obj %>%
  prep() %>%
  juice()

car_prices_train_tbl

# A tibble: 562 × 35
   Mileage  Price Cylinder_X4 Cylinder…¹ Cylin…² Doors…³ Doors…⁴ Cruis…⁵ Cruis…⁶
     <int>  <dbl>       <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1    3828 37089.           1          0       0       1       0       0       1
 2   21778 36210.           0          1       0       0       1       0       1
 3   28408 12230.           1          0       0       1       0       1       0
 4    4463 17418.           1          0       0       0       1       0       1
 5   21020 13991.           1          0       0       1       0       1       0
 6   25218 23329.           1          0       0       0       1       0       1
 7   32914  8871.           1          0       0       0       1       0       1
 8   18419 20127.           0          1       0       0       1       0       1
 9   21128 14305.           1          0       0       0       1       1       0
10    1169 15636.           1          0       0       1       0       1       0
# … with 552 more rows, 26 more variables: Sound_X0 <dbl>, Sound_X1 <dbl>,
#   Leather_X0 <dbl>, Leather_X1 <dbl>, Buick_X0 <dbl>, Buick_X1 <dbl>,
#   Cadillac_X0 <dbl>, Cadillac_X1 <dbl>, Chevy_X0 <dbl>, Chevy_X1 <dbl>,
#   Pontiac_X0 <dbl>, Pontiac_X1 <dbl>, Saab_X0 <dbl>, Saab_X1 <dbl>,
#   Saturn_X0 <dbl>, Saturn_X1 <dbl>, convertible_X0 <dbl>,
#   convertible_X1 <dbl>, coupe_X0 <dbl>, coupe_X1 <dbl>, hatchback_X0 <dbl>,
#   hatchback_X1 <dbl>, sedan_X0 <dbl>, sedan_X1 <dbl>, wagon_X0 <dbl>, …

# Estimate the recipe on the training data again, then apply the estimated
# steps to the held-out testing portion of the split
car_prices_test_tbl <- car_prices_recipe_obj %>% 
  prep() %>% 
  bake(new_data = testing(car_prices_split_obj))

car_prices_test_tbl

# A tibble: 242 × 35
   Mileage  Price Cylinder_X4 Cylinder…¹ Cylin…² Doors…³ Doors…⁴ Cruis…⁵ Cruis…⁶
     <int>  <dbl>       <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1   27558 30251.           1          0       0       1       0       0       1
 2   22814 24852.           1          0       0       0       1       0       1
 3   10014 27826.           1          0       0       0       1       0       1
 4   18464 29987.           1          0       0       0       1       0       1
 5   19830 29908.           1          0       0       0       1       0       1
 6   25357 26792.           1          0       0       0       1       0       1
 7   12090 38325.           1          0       0       1       0       0       1
 8   21167 35580.           1          0       0       1       0       0       1
 9   14568 30122.           1          0       0       0       1       0       1
10   11273 30354.           1          0       0       0       1       0       1
# … with 232 more rows, 26 more variables: Sound_X0 <dbl>, Sound_X1 <dbl>,
#   Leather_X0 <dbl>, Leather_X1 <dbl>, Buick_X0 <dbl>, Buick_X1 <dbl>,
#   Cadillac_X0 <dbl>, Cadillac_X1 <dbl>, Chevy_X0 <dbl>, Chevy_X1 <dbl>,
#   Pontiac_X0 <dbl>, Pontiac_X1 <dbl>, Saab_X0 <dbl>, Saab_X1 <dbl>,
#   Saturn_X0 <dbl>, Saturn_X1 <dbl>, convertible_X0 <dbl>,
#   convertible_X1 <dbl>, coupe_X0 <dbl>, coupe_X1 <dbl>, hatchback_X0 <dbl>,
#   hatchback_X1 <dbl>, sedan_X0 <dbl>, sedan_X1 <dbl>, wagon_X0 <dbl>, …

Models

Again referencing previous entries in this series, this time on the parsnip and yardstick packages, a basic Random Forest model is built on the training data using the rand_forest() function and “random” values for its hyperparameters. We have no guarantee that these values are any good, but we will explore that more later.

# Seed the RNG before fitting (the ranger call shown below draws its own seed
# via sample.int(), so this keeps the fit reproducible)
set.seed(1915)

# Baseline random forest with arbitrarily chosen hyperparameter values
# (mtry = 2, trees = 10, min_n = 5), fit on the processed training set
mod_rf_fit <- rand_forest() %>% 
  set_mode("regression") %>% 
  set_engine("ranger") %>% 
  set_args(mtry = 2, trees = 10, min_n = 5) %>% 
  fit(Price ~ ., data = car_prices_train_tbl)

mod_rf_fit

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~2,      x), num.trees = ~10, min.node.size = min_rows(~5, x), num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 

Type:                             Regression 
Number of trees:                  10 
Sample size:                      562 
Number of independent variables:  34 
Mtry:                             2 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       20428937 
R squared (OOB):                  0.7873951

Model fit statistics are calculated using the yardstick::metrics() function using the default regression metrics.

# Predict on the processed testing set, attach the predictions (.pred) to the
# observed data, and compute the default regression metrics against Price
mod_rf_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    4042.   
2 rsq     standard       0.882
3 mae     standard    2853.

These metrics are pretty good, but could they be better? The Random Forest model was built using completely arbitrary values for its hyperparameters, which directly impact how accurate the model can (and cannot) be. Can better performance be obtained by “tuning” or “optimizing” these hyperparameters?

Background

While not all modeling algorithms have hyperparameters, those that do can achieve impressive performance if, among many other variables, the hyperparameters are “tuned” appropriately. “Regular” model parameters, such as the coefficients in a plain-vanilla linear regression, are estimated directly from the data and are internal to the model. Hyperparameters, on the other hand, are external to the model and must be explicitly specified before the model can be built. Examples include the mtry, trees, and min_n arguments to the Random Forest previously shown.

Different values or combinations of values of hyperparameters may or may not be better for different data sets. How do we know which values or combinations of values are “best”? Also, what does “best” mean? The short answer is, at least at first, you cannot be sure of the “best” hyperparameter values. “Best” usually means “optimal” for some defined metric. For example, we may want the hyperparameter values that result in a minimized RMSE value for a regression model or a maximized Area Under the ROC Curve for a classification model.

Often, the process involving finding the “optimal” set of hyperparameter values consists of the following steps:

1. Split the training data set into n pieces, called “folds”
2. Create a grid of possible hyperparameter value combinations
3. For each combination, fit a model on n-1 of the folds and calculate the performance metric on the remaining held-out fold
4. Repeat step 3 until each fold has been used to calculate the performance metric
5. Take the average of the n performance metrics
6. For each combination, compare the averaged performance metric
7. Select the combination with the optimized averaged performance metric

Hyperparameter Definitions (`dials`)

For each model algorithm in the parsnip package, the associated tunable hyperparameters are defined in the dials package. They are functions with the same name as the parameter argument. The functions below define the possible range of values the hyperparameters of the Random Forest could take. Each function contains a range of possible values that the hyperparameter can take. Note that these ranges are somewhat subjective, and are based on research done by the tidymodels team.

`trees`

The value of the trees hyperparameter in the Random Forest can range from 1 tree to 2,000 trees. These are the number of individual trees built for the forest.

trees()

# Trees (quantitative)
Range: [1, 2000]

`min_n`

The value of the min_n hyperparameter in the Random Forest can range from 2 to 40. These are the number of observations required to be in a node for that node to be allowed to split further.

min_n()

# Minimal Node Size (quantitative)
Range: [2, 40]

`mtry`

The mtry hyperparameter is different than the previous two. While the lower bound of the range is 1, the default upper bound is unknown. mtry is the number of variables that are randomly sampled at each split in a tree. Since a data set can have a few, dozens, hundreds, or even thousands of variables, there are many possibilities for the maximum number of variables to randomly sample at each split. Hence, the ? as an upper bound.

mtry()

# Randomly Selected Predictors (quantitative)
Range: [1, ?]

Fortunately, the dials package does provide tools for determining this upper bound for a specific data set. This will be explored later.

Hyperparameter Tuning (`tune`)

The tune package contains a suite of functions for tuning the hyperparameters housed in the dials package. The two packages are specifically designed to work together.

Typically, the process of hyperparameter tuning follows these steps:

Specify which hyperparameters of a parsnip model are to be tuned
Create a grid of possible values for each hyperparameter
Find the “optimal” set of hyperparameters (as outlined previously)
Finalize the model specification with the optimal hyperparameters
Fit the model

Prepare Tuning Parameters

The first step in tuning the hyperparameters of a parsnip model is to specify which hyperparameters for a specific algorithm are to be tuned. This can be done by setting each hyperparameter argument equal to the tune function within the set_args() function. The example below shows a standard Random Forest model specification as before, but this time with the mtry, trees, and min_n arguments set to tune().

# Random forest specification in which all three hyperparameters are set to
# tune() placeholders instead of fixed values, marking them for tuning
mod_rf_spec <- rand_forest() %>% 
  set_mode("regression") %>% 
  set_engine("ranger") %>% 
  set_args(
    mtry  = tune(),
    trees = tune(),
    min_n = tune()
  )

mod_rf_spec

Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  trees = tune()
  min_n = tune()

Computational engine: ranger

Passing this model specification into the tune_args() function confirms that the three hyperparameters are tunable and that they were specified correctly.

tune_args(mod_rf_spec)

# A tibble: 3 × 6
  name  tunable id    source     component   component_id
  <chr> <lgl>   <chr> <chr>      <chr>       <chr>       
1 mtry  TRUE    mtry  model_spec rand_forest <NA>        
2 trees TRUE    trees model_spec rand_forest <NA>        
3 min_n TRUE    min_n model_spec rand_forest <NA>

As mentioned earlier, the mtry hyperparameter has an unknown upper bound of possible values. This upper bound can be calculated using the training data set. Since mtry is the number of predictor variables randomly sampled at each node, it is therefore limited by the number of predictor variables in the data set. Passing mtry() and the data set of predictor variables into the finalize() function will “finalize” the hyperparameter with an upper bound so that it is ready to be tuned.

# Drop the outcome by name (rather than by column position) so that only the
# predictor columns are used to determine the upper bound of mtry
finalize(mtry(), select(car_prices_train_tbl, -Price))

# Randomly Selected Predictors (quantitative)
Range: [1, 34]

Once finalized, the three hyperparameters are ready for the next step of tuning. Passing each of them (with mtry() being passed to the finalize() function) into the parameters() function will gather them together in a format that can be used to generate combinations of potential values to explore.

# Collect the three hyperparameters into one parameter set. mtry is finalized
# against the predictor columns only; the outcome is dropped by name rather
# than by column position so the call does not depend on column order.
params <- parameters(
  finalize(mtry(), select(car_prices_train_tbl, -Price)),
  trees(),
  min_n()
)

params

Collection of 3 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[+]
      trees trees nparam[+]
      min_n min_n nparam[+]

Grid Search

Perhaps the most well-known method of hyperparameter tuning is the grid search. Essentially, a grid of possible hyperparameter value combinations is created and a model is fit with each of them. A performance statistic is then calculated to determine which combination of hyperparameter values is “optimal”.

This process will often use K-fold cross-validation to fit the models and calculate the performance statistic. These folds can be created using the vfold_cv() function from the rsample package. A previous post in this series covers cross-validation and how this function works.

# Seed the RNG so the random fold assignment is reproducible
set.seed(1916)
# Split the processed training set into 5 cross-validation folds
folds_tbl <- vfold_cv(car_prices_train_tbl, v = 5)

folds_tbl

#  5-fold cross-validation 
# A tibble: 5 × 2
  splits            id   
  <list>            <chr>
1 <split [449/113]> Fold1
2 <split [449/113]> Fold2
3 <split [450/112]> Fold3
4 <split [450/112]> Fold4
5 <split [450/112]> Fold5

The dials package contains two main functions for creating a grid of potential hyperparameter values, as shown below.

Regular

The grid_regular() function creates a grid of hyperparameter combinations that span the range of possible values outlined in the hyperparameter functions defined in the dials package. The levels argument (the default value is 3) determines the number of distinct values for each hyperparameter. All possible combinations of those values of hyperparameters are included in the grid. For example, the Random Forest has 3 hyperparameters, so 3 levels of each would result in $3^{3} = 27$ combinations.

grid_regular_tbl <- grid_regular(params)

grid_regular_tbl

# A tibble: 27 × 3
    mtry trees min_n
   <int> <int> <int>
 1     1     1     2
 2    17     1     2
 3    34     1     2
 4     1  1000     2
 5    17  1000     2
 6    34  1000     2
 7     1  2000     2
 8    17  2000     2
 9    34  2000     2
10     1     1    21
# … with 17 more rows

Summarizing by hyperparameter shows that each one has three distinct values in the tuning grid.

grid_regular_tbl %>% 
  summarize(across(everything(), n_distinct))

# A tibble: 1 × 3
   mtry trees min_n
  <int> <int> <int>
1     3     3     3

If the levels argument was changed to 7, there would be 7 distinct values for each hyperparameter resulting in $7^{3} = 343$ combinations in the tuning grid.

grid_regular_7_tbl <- grid_regular(params, levels = 7)

grid_regular_7_tbl

# A tibble: 343 × 3
    mtry trees min_n
   <int> <int> <int>
 1     1     1     2
 2     6     1     2
 3    12     1     2
 4    17     1     2
 5    23     1     2
 6    28     1     2
 7    34     1     2
 8     1   334     2
 9     6   334     2
10    12   334     2
# … with 333 more rows

Again, summarizing by hyperparameter shows that each one has seven distinct values in the tuning grid.

grid_regular_7_tbl %>% 
  summarize(across(everything(), n_distinct))

# A tibble: 1 × 3
   mtry trees min_n
  <int> <int> <int>
1     7     7     7

Once the folds have been created and the size of the tuning grid has been determined, the tune_grid() function can be used to determine the optimal hyperparameter combination. The function requires:

a parsnip model specification,
a formula() (or a recipe()),
resamples (e.g., the result of vfold_cv() from earlier),
the grid of hyperparameters, and
the metrics used to determine the optimal hyperparameter combination

Note that the metric_set() function from the yardstick package can be used to set one or more metrics that will be calculated on each out-of-sample fold during cross-validation. Below, rmse() is used.

# Evaluate every hyperparameter combination in the regular grid on each
# cross-validation fold, scoring the out-of-sample predictions with RMSE
mod_rf_tuned_regular_tbl <- tune_grid(
  mod_rf_spec, Price ~ .,
  resamples = folds_tbl,
  grid      = grid_regular_tbl,
  metrics   = metric_set(rmse)
)

mod_rf_tuned_regular_tbl

# Tuning results
# 5-fold cross-validation 
# A tibble: 5 × 4
  splits            id    .metrics          .notes          
  <list>            <chr> <list>            <list>          
1 <split [449/113]> Fold1 <tibble [27 × 7]> <tibble [0 × 3]>
2 <split [449/113]> Fold2 <tibble [27 × 7]> <tibble [0 × 3]>
3 <split [450/112]> Fold3 <tibble [27 × 7]> <tibble [0 × 3]>
4 <split [450/112]> Fold4 <tibble [27 × 7]> <tibble [0 × 3]>
5 <split [450/112]> Fold5 <tibble [27 × 7]> <tibble [0 × 3]>

The result can be passed to the show_best() function to show the “best” hyperparameter combinations based on the metric that was used in tuning. By default, it shows the top five optimal combinations.

show_best(mod_rf_tuned_regular_tbl)

# A tibble: 5 × 9
   mtry trees min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1    17  2000     2 rmse    standard   2142.     5    73.5 Preprocessor1_Model08
2    17  1000     2 rmse    standard   2144.     5    75.8 Preprocessor1_Model05
3    34  2000     2 rmse    standard   2299.     5   122.  Preprocessor1_Model09
4    34  1000     2 rmse    standard   2312.     5   129.  Preprocessor1_Model06
5    17  2000    21 rmse    standard   2455.     5    89.5 Preprocessor1_Model17

The select_best() function can be used to extract the top performing hyperparameter combination. This can then be passed to the finalize_model() function, along with the parsnip model specification from earlier, to replace the hyperparameters marked with tune() with the values determined to be “optimal”.

# Extract the top performing hyperparameter combination from the tuning results
mod_rf_best_regular_tbl <- select_best(mod_rf_tuned_regular_tbl)

# Replace the tune() placeholders in the model specification with those values
mod_rf_best_regular_spec <- mod_rf_spec %>% 
  finalize_model(mod_rf_best_regular_tbl)

mod_rf_best_regular_spec

Random Forest Model Specification (regression)

Main Arguments:
  mtry = 17
  trees = 2000
  min_n = 2

Computational engine: ranger

Finally, the model with declared hyperparameter values can be passed to the parsnip::fit() function.

# Fit the finalized specification on the processed training set
mod_rf_best_regular_fit <- mod_rf_best_regular_spec %>% 
  fit(Price ~ ., data = car_prices_train_tbl)

mod_rf_best_regular_fit

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~17L,      x), num.trees = ~2000L, min.node.size = min_rows(~2L, x),      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  2000 
Sample size:                      562 
Number of independent variables:  34 
Mtry:                             17 
Target node size:                 2 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       4512026 
R squared (OOB):                  0.9530431

Now that we have a model with, theoretically, optimal hyperparameters, we can compare the performance on the testing set of the original model to this new model. Below, we can see that rmse() and mae() decrease while rsq() increases. This suggests that the hyperparameter combination resulting from tuning results in a better performing model than the “randomly chosen” values used originally.

mod_rf_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    4042.   
2 rsq     standard       0.882
3 mae     standard    2853.

mod_rf_best_regular_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    2408.   
2 rsq     standard       0.943
3 mae     standard    1634.

Random

The grid_random() function also creates a grid of hyperparameter combinations. The difference is that it randomly samples the range of values outlined in the hyperparameter functions defined in the dials package. The size argument (the default is 5) determines the number of randomly sampled combinations that are produced.

Creating a random grid with the default size value of 5 results in a five-row tibble. Each row is a randomly sampled combination of the hyperparameter values. As seen below, some values may be repeated across random samples.

set.seed(1917)
grid_random_tbl <- grid_random(params)

grid_random_tbl

# A tibble: 5 × 3
   mtry trees min_n
  <int> <int> <int>
1     1    41     2
2    27  1936    21
3    15   972     2
4     9  1492    10
5    18    78    14

Changing the size argument to 10 results in a ten-row tibble with each row being a randomly sampled combination of hyperparameter values.

set.seed(1917)
grid_random_10_tbl <- grid_random(params, size = 10)

grid_random_10_tbl

# A tibble: 10 × 3
    mtry trees min_n
   <int> <int> <int>
 1     1  1915    37
 2    27  1684     8
 3    15  1323     6
 4     9  1845    26
 5    18  1409    39
 6    16   585     9
 7    12  1037    32
 8    20  1883    12
 9    14   110    30
10     1   412    37

Following the same instructions as before, the output of grid_random() can be passed to the tune_grid() function to determine the optimal hyperparameter combination. The other inputs to the function are the same as they were previously.

# Evaluate each of the 10 randomly sampled hyperparameter combinations on each
# cross-validation fold, scoring the out-of-sample predictions with RMSE
mod_rf_tuned_random_tbl <- tune_grid(
  mod_rf_spec, Price ~ .,
  resamples = folds_tbl,
  grid      = grid_random_10_tbl,
  metrics   = metric_set(rmse)
)

mod_rf_tuned_random_tbl

# Tuning results
# 5-fold cross-validation 
# A tibble: 5 × 4
  splits            id    .metrics          .notes          
  <list>            <chr> <list>            <list>          
1 <split [449/113]> Fold1 <tibble [10 × 7]> <tibble [0 × 3]>
2 <split [449/113]> Fold2 <tibble [10 × 7]> <tibble [0 × 3]>
3 <split [450/112]> Fold3 <tibble [10 × 7]> <tibble [0 × 3]>
4 <split [450/112]> Fold4 <tibble [10 × 7]> <tibble [0 × 3]>
5 <split [450/112]> Fold5 <tibble [10 × 7]> <tibble [0 × 3]>

The show_best() function can once again be used to show the top five optimal combinations from the random grid.

show_best(mod_rf_tuned_random_tbl)

# A tibble: 5 × 9
   mtry trees min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1    15  1323     6 rmse    standard   2170.     5    62.3 Preprocessor1_Model03
2    16   585     9 rmse    standard   2212.     5    59.9 Preprocessor1_Model06
3    27  1684     8 rmse    standard   2270.     5   112.  Preprocessor1_Model02
4    20  1883    12 rmse    standard   2299.     5    91.1 Preprocessor1_Model08
5     9  1845    26 rmse    standard   2569.     5    65.0 Preprocessor1_Model04

The select_best() function is then used to extract the top performing hyperparameter combination and the finalize_model() function is used to finish the parsnip model specification.

mod_rf_best_random_tbl <- select_best(mod_rf_tuned_random_tbl)

mod_rf_best_random_spec <- mod_rf_spec %>% 
  finalize_model(mod_rf_best_random_tbl)

mod_rf_best_random_spec

Random Forest Model Specification (regression)

Main Arguments:
  mtry = 15
  trees = 1323
  min_n = 6

Computational engine: ranger

Finally, the parsnip::fit() function is used to fit the model.

mod_rf_best_random_fit <- mod_rf_best_random_spec %>% 
  fit(Price ~ ., data = car_prices_train_tbl)

mod_rf_best_random_fit

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~15L,      x), num.trees = ~1323L, min.node.size = min_rows(~6L, x),      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  1323 
Sample size:                      562 
Number of independent variables:  34 
Mtry:                             15 
Target node size:                 6 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       4682992 
R squared (OOB):                  0.9512639

We can now compare the performance on the testing set of the original model, the model with the optimal hyperparameter combination from the regular grid, and the model with the optimal hyperparameter combination from the random grid.

As seen before, the regular grid model has lower rmse() and mae() and a higher rsq() than the original model. The random grid model’s performance is slightly worse than the regular grid model’s, suggesting that the regular grid found a better hyperparameter combination.

mod_rf_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    4042.   
2 rsq     standard       0.882
3 mae     standard    2853.

mod_rf_best_regular_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    2408.   
2 rsq     standard       0.943
3 mae     standard    1634.

mod_rf_best_random_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    2464.   
2 rsq     standard       0.940
3 mae     standard    1668.

Bayesian Search

Another option for hyperparameter tuning is Bayesian Optimization. While a detailed discussion would be too voluminous for this post, the overall process iteratively “discovers” new candidate values for the hyperparameters by using a model to predict which values to try next. The previous link goes into further detail on the specifics of this method.

In the tune package, the tune_bayes() function can be used to perform Bayesian search. The function can take a previously tuned grid of model hyperparameters, such as from grid search, and use that information to attempt to predict “better” values. Therefore, since the random grid resulted in lesser performance than the regular grid, we will use the random grid as a starting point to see if Bayesian search can discover more optimal hyperparameter values than the regular grid. In addition to passing the previously tuned random grid as the initial argument, the number of iterations is passed as the iter argument and the parameter set created earlier is passed as the param_info argument. The remaining arguments are the same as in tune_grid(). Further control can be gained through the control_bayes() function passed to the control argument.

# Seed the RNG before running the Bayesian search
set.seed(1918)
# Bayesian search over the same model specification, formula and folds:
#   initial    - the random-grid tuning results, used as the starting point
#   param_info - the parameter set built earlier (with mtry finalized)
#   iter       - the number of search iterations requested
# NOTE(review): the printed result below has 105 rows (5 initial + 5 x 20),
# i.e. 20 iterations rather than 25 - presumably the search stopped early;
# confirm against the control_bayes() defaults.
mod_rf_tuned_bayes_tbl <- tune_bayes(
  mod_rf_spec, Price ~ .,
  resamples  = folds_tbl,
  initial    = mod_rf_tuned_random_tbl,
  param_info = params,
  iter       = 25,
  metrics    = metric_set(rmse)
)

mod_rf_tuned_bayes_tbl

# Tuning results
# 5-fold cross-validation 
# A tibble: 105 × 5
   splits            id    .metrics          .notes           .iter
   <list>            <chr> <list>            <list>           <int>
 1 <split [449/113]> Fold1 <tibble [10 × 7]> <tibble [0 × 3]>     0
 2 <split [449/113]> Fold2 <tibble [10 × 7]> <tibble [0 × 3]>     0
 3 <split [450/112]> Fold3 <tibble [10 × 7]> <tibble [0 × 3]>     0
 4 <split [450/112]> Fold4 <tibble [10 × 7]> <tibble [0 × 3]>     0
 5 <split [450/112]> Fold5 <tibble [10 × 7]> <tibble [0 × 3]>     0
 6 <split [449/113]> Fold1 <tibble [1 × 7]>  <tibble [0 × 3]>     1
 7 <split [449/113]> Fold2 <tibble [1 × 7]>  <tibble [0 × 3]>     1
 8 <split [450/112]> Fold3 <tibble [1 × 7]>  <tibble [0 × 3]>     1
 9 <split [450/112]> Fold4 <tibble [1 × 7]>  <tibble [0 × 3]>     1
10 <split [450/112]> Fold5 <tibble [1 × 7]>  <tibble [0 × 3]>     1
# … with 95 more rows

The show_best() function can once again be used to show the top five optimal combinations from the Bayes search.

show_best(mod_rf_tuned_bayes_tbl)

# A tibble: 5 × 10
   mtry trees min_n .metric .estimator  mean     n std_err .config .iter
  <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>   <int>
1    14  1979     2 rmse    standard   2137.     5    66.0 Iter10     10
2    15  1958     3 rmse    standard   2137.     5    67.4 Iter19     19
3    16  1808     2 rmse    standard   2143.     5    70.4 Iter8       8
4    13  1873     2 rmse    standard   2143.     5    63.4 Iter14     14
5    14  1839     3 rmse    standard   2144.     5    66.1 Iter7       7

The select_best() function is then used to extract the top performing hyperparameter combination and the finalize_model() function is used to finish the parsnip model specification.

mod_rf_best_bayes_tbl <- select_best(mod_rf_tuned_bayes_tbl)

mod_rf_best_bayes_spec <- mod_rf_spec %>% 
  finalize_model(mod_rf_best_bayes_tbl)

mod_rf_best_bayes_spec

Random Forest Model Specification (regression)

Main Arguments:
  mtry = 14
  trees = 1979
  min_n = 2

Computational engine: ranger

Finally, the parsnip::fit() function is used to fit the model.

mod_rf_best_bayes_fit <- mod_rf_best_bayes_spec %>% 
  fit(Price ~ ., data = car_prices_train_tbl)

mod_rf_best_bayes_fit

parsnip model object

Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~14L,      x), num.trees = ~1979L, min.node.size = min_rows(~2L, x),      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 

Type:                             Regression 
Number of trees:                  1979 
Sample size:                      562 
Number of independent variables:  34 
Mtry:                             14 
Target node size:                 2 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       4530378 
R squared (OOB):                  0.9528521

We can now compare the performance on the testing set of the original model, the model with the optimal hyperparameter combination from the regular grid, the model with the optimal hyperparameter combination from the random grid, and the model with the optimal hyperparameter combination from the random grid further optimized through Bayesian search.

As seen before, the optimized hyperparameter models outperform the original model. Further, the model built from Bayesian search does have slightly improved performance over the random search model. This shows that the Bayesian search worked in further optimizing the random search. That said, the regular grid still has the best performance for this particular data set and algorithm.

mod_rf_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    4042.   
2 rsq     standard       0.882
3 mae     standard    2853.

mod_rf_best_regular_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    2408.   
2 rsq     standard       0.943
3 mae     standard    1634.

mod_rf_best_random_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    2464.   
2 rsq     standard       0.940
3 mae     standard    1668.

mod_rf_best_bayes_fit %>% 
  predict(car_prices_test_tbl) %>% 
  bind_cols(car_prices_test_tbl) %>% 
  metrics(truth = Price, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard    2420.   
2 rsq     standard       0.942
3 mae     standard    1645.

Notes

This post is based on a presentation that was given on the date listed. It may be updated from time to time to fix errors, detail new functions, and/or remove deprecated functions so the packages and R version will likely be newer than what was available at the time.

The R session information used for this post:

sessionInfo()

R version 4.2.1 (2022-06-23)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.1.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
[1] ggplot2_3.4.0   parsnip_1.0.3   recipes_1.0.4   dplyr_1.0.10   
[5] rsample_1.1.1   yardstick_1.1.0 tune_1.0.1      dials_1.1.0    
[9] scales_1.2.1   

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.9          lubridate_1.9.0     lattice_0.20-45    
 [4] listenv_0.8.0       tidyr_1.2.1         class_7.3-20       
 [7] digest_0.6.29       ipred_0.9-13        foreach_1.5.2      
[10] utf8_1.2.2          parallelly_1.32.1   ranger_0.14.1      
[13] R6_2.5.1            hardhat_1.2.0       evaluate_0.16      
[16] pillar_1.8.1        rlang_1.1.1         rstudioapi_0.14    
[19] furrr_0.3.1         DiceDesign_1.9      rpart_4.1.16       
[22] Matrix_1.4-1        rmarkdown_2.16      splines_4.2.1      
[25] gower_1.0.1         stringr_1.5.0       munsell_0.5.0      
[28] compiler_4.2.1      xfun_0.40           pkgconfig_2.0.3    
[31] globals_0.16.2      htmltools_0.5.3     nnet_7.3-17        
[34] tidyselect_1.2.0    tibble_3.1.8        prodlim_2019.11.13 
[37] codetools_0.2-18    workflows_1.1.2     GPfit_1.0-8        
[40] future_1.29.0       fansi_1.0.3         withr_2.5.0        
[43] MASS_7.3-57         grid_4.2.1          jsonlite_1.8.0     
[46] gtable_0.3.1        lifecycle_1.0.3     magrittr_2.0.3     
[49] future.apply_1.10.0 cli_3.6.1           stringi_1.7.12     
[52] renv_0.16.0         timeDate_4022.108   ellipsis_0.3.2     
[55] lhs_1.1.6           generics_0.1.3      vctrs_0.6.3        
[58] lava_1.7.1          iterators_1.0.14    tools_4.2.1        
[61] glue_1.6.2          purrr_0.3.5         parallel_4.2.1     
[64] fastmap_1.1.0       survival_3.3-1      yaml_2.3.5         
[67] timechange_0.1.1    colorspace_2.0-3    knitr_1.40