Objective

Predict the capacity of wind turbines in Canada based on turbine features

Algos

Type: Supervised Machine Learning

  1. Random Forests
  2. XGBoost (Gradient Boosted Trees)

Learning Goals

  1. Learn how to use the ‘xgboost’ package within the tidymodels framework
  2. Compare implementations / algo differences of rf vs xgb

Get Data

#a = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv")
#saveRDS(a, 'a.rds')
a = readRDS('a.rds') %>% 
  clean_names() %>% 
  mutate(across(where(is.character),factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )

a %>% sample_n(5)
#remove unnecessary cols

a = a %>% select(
    -notes,
    -turbine_identifier,
    -turbine_number_in_project,
    -objectid,
    -total_project_capacity_mw,
    #removing these vars below b/c it would be unrealistic to use them for predictions
    #we likely wouldn't have such information with new data
    -model, 
    -latitude,
    -longitude,
    -project_name
)
a %>% sample_n(5)

5 min EDA

viz missing

a %>% visdat::vis_dat()

a %>% miss_var_summary()
#since the missing feature is the predictor/dependent var, let's remove those rows
#remove the single missing row for 'project_name'

a = a %>% na.omit()

check missing, na, inf

a %>% funModeling::status()

EDA: nom vars

names

nom.vars = a %>% select(where(is.factor)) %>% colnames()

sample

a %>% select(nom.vars) %>% sample_n(5)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nom.vars)` instead of `nom.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

skim

a %>% select(all_of(nom.vars)) %>% skim_without_charts()
Data summary
Name Piped data
Number of rows 6478
Number of columns 3
_______________________
Column type frequency:
factor 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
commissioning_date 0 1 FALSE 35 201: 744, 201: 621, 201: 590, 201: 544
manufacturer 0 1 FALSE 23 Ves: 1829, GE: 1725, Sie: 1033, Ene: 960
province_territory 0 1 FALSE 12 Ont: 2443, Que: 1991, Alb: 900, Nov: 310

viz: level counts distribution

a %>% funModeling::freq()

##    commissioning_date frequency percentage cumulative_perc
## 1                2014       744      11.49           11.49
## 2                2013       621       9.59           21.08
## 3                2011       590       9.11           30.19
## 4                2015       544       8.40           38.59
## 5                2009       485       7.49           46.08
## 6                2006       455       7.02           53.10
## 7                2012       404       6.24           59.34
## 8                2010       334       5.16           64.50
## 9                2016       266       4.11           68.61
## 10          2014/2015       207       3.20           71.81
## 11               2008       193       2.98           74.79
## 12               2018       179       2.76           77.55
## 13          2013/2014       154       2.38           79.93
## 14          2011/2012       141       2.18           82.11
## 15               2017       137       2.11           84.22
## 16          2006/2008       133       2.05           86.27
## 17               2007       133       2.05           88.32
## 18               1999       132       2.04           90.36
## 19               2003       116       1.79           92.15
## 20               2019       101       1.56           93.71
## 21               2004        74       1.14           94.85
## 22     2005/2006/2012        73       1.13           95.98
## 23          2000/2001        59       0.91           96.89
## 24               2001        50       0.77           97.66
## 25          2004/2005        47       0.73           98.39
## 26               2005        35       0.54           98.93
## 27          2006/2007        35       0.54           99.47
## 28          2001/2003        16       0.25           99.72
## 29               2002         8       0.12           99.84
## 30               1998         3       0.05           99.89
## 31          2002/2006         3       0.05           99.94
## 32               1993         2       0.03           99.97
## 33               2000         2       0.03          100.00
## 34               1995         1       0.02          100.02
## 35               1997         1       0.02          100.00

##                manufacturer frequency percentage cumulative_perc
## 1                    Vestas      1829      28.23           28.23
## 2                        GE      1725      26.63           54.86
## 3                   Siemens      1033      15.95           70.81
## 4                   Enercon       960      14.82           85.63
## 5                   Senvion       643       9.93           95.56
## 6                 NEG Micon       132       2.04           97.60
## 7        Acciona Wind Power        40       0.62           98.22
## 8                   Acciona        34       0.52           98.74
## 9                    Nordex        20       0.31           99.05
## 10                   Suzlon        15       0.23           99.28
## 11                   Vensys         9       0.14           99.42
## 12                   Gamesa         8       0.12           99.54
## 13                Windmatic         6       0.09           99.63
## 14                   DeWind         5       0.08           99.71
## 15 Samsung Renewable Energy         4       0.06           99.77
## 16                Northwind         3       0.05           99.82
## 17               Turbowinds         3       0.05           99.87
## 18                    Bonus         2       0.03           99.90
## 19                      EWT         2       0.03           99.93
## 20                 Lagerwey         2       0.03           99.96
## 21                 Leitwind         1       0.02           99.98
## 22               Pfleiderer         1       0.02          100.00
## 23                    Tacke         1       0.02          100.00

##           province_territory frequency percentage cumulative_perc
## 1                    Ontario      2443      37.71           37.71
## 2                     Quebec      1991      30.73           68.44
## 3                    Alberta       900      13.89           82.33
## 4                Nova Scotia       310       4.79           87.12
## 5           British Columbia       292       4.51           91.63
## 6               Saskatchewan       153       2.36           93.99
## 7                   Manitoba       133       2.05           96.04
## 8              New Brunswick       119       1.84           97.88
## 9       Prince Edward Island       104       1.61           99.49
## 10 Newfoundland and Labrador        27       0.42           99.91
## 11     Northwest Territories         4       0.06           99.97
## 12                     Yukon         2       0.03          100.00
## [1] "Variables processed: commissioning_date, manufacturer, province_territory"

Due to high number of levels, consider lumping levels for some of these features

viz: mosaic plots

#check out relationship of most frequent/largest levels of manufacturer and commissioning_date
a %>% select(manufacturer, commissioning_date) %>% 
  mutate(
    commissioning_date = fct_lump_prop(commissioning_date, prop = 0.05),
    manufacturer = fct_lump_prop(manufacturer, prop = 0.05)
    ) %>%
  filter(
    commissioning_date != 'Other',
    manufacturer != 'Other'
    ) %>% 
  ggpairs(
    columns = c('manufacturer','commissioning_date'),
      mapping = aes(color = manufacturer)
  )

It’s apparent that turbine manufacturing volume differs by manufacturer (1) relative to other manufacturers in the same year (2) as well as to itself over time’

EDA: num vars

names

num.vars = a %>% select(where(is.numeric)) %>% colnames()

sample

a %>% select(num.vars) %>% sample_n(5)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(num.vars)` instead of `num.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

skim

a %>% select(num.vars) %>% skim_without_charts()
Data summary
Name Piped data
Number of rows 6478
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
hub_height_m 0 1 82.79 14.37 24.5 80 80 85 132
rotor_diameter_m 0 1 88.20 16.57 15.0 77 90 100 141
turbine_rated_capacity_k_w 0 1 1967.31 605.93 65.0 1600 1880 2300 3750

viz: distribution - density

#a %>% select(num.vars) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)
a %>% select(num.vars) %>% DataExplorer::plot_density(nrow = 2, ncol = 1)

viz: outliers

a %>% select(num.vars) %>% dlookr::plot_outlier()

viz: normality

a %>% select(num.vars) %>% dlookr::plot_normality()

viz: normality - outcome var only

a %>% plot_ly(x = ~turbine_rated_capacity_k_w, nbinsx = 50)
a %>% plot_ly(x = ~turbine_rated_capacity_k_w) %>% add_boxplot() %>% layout(title = 'Distribution of turbine_rated_capacity_k_w')
a %>% mutate(manufacturer = fct_lump_prop(manufacturer, prop = 0.05)) %>%
  filter(manufacturer != 'Other') %>%
  mutate(manufacturer = fct_reorder(manufacturer, turbine_rated_capacity_k_w)) %>%
  plot_ly(x = ~turbine_rated_capacity_k_w, color = ~manufacturer) %>% add_boxplot() %>% layout(title = 'Distribution of turbine_rated_capacity_k_w by Largest Manufacturers')

viz: correlations

a %>% select(num.vars) %>% dlookr::plot_correlate()

a %>% select(num.vars) %>% DataExplorer::plot_correlation()

a %>% select(num.vars) %>% funModeling::correlation_table('turbine_rated_capacity_k_w')

Preprocessing

1) split data

split = initial_split(a)
train = training(split)
test = testing(split)
vfold = vfold_cv(train, v = 10, strata = turbine_rated_capacity_k_w) #for supervised continuous algo, stratify on the outcome var
a %>% use_ranger(turbine_rated_capacity_k_w ~ .)

2) create recipe spec

rec =
  train %>% recipe(turbine_rated_capacity_k_w ~ .) %>% 
  step_zv(all_numeric(),-all_outcomes()) %>%
  step_nzv(all_numeric(),-all_outcomes()) %>%
  step_corr(all_numeric(),-all_outcomes()) %>% 
  step_other(all_nominal(),-all_outcomes(), threshold =  0.01) %>% 
  step_normalize(all_numeric(),-all_outcomes()) %>% 
  step_dummy(all_nominal(),-all_outcomes(),one_hot = TRUE)

3) preprocess

rec %>% prep %>% juice %>% head
rec %>% prep %>% bake(new_data = test) %>% head
rec %>% prep %>% juice %>% miss_var_summary
rec %>% prep %>% bake(new_data = test) %>% miss_var_summary()

4) create model spec

mdl.rf = rand_forest(
  trees = 500,
  min_n = tune(), #complexity
  mtry = tune(), #randomness
  ) %>% set_mode("regression") %>%
  set_engine("ranger", importance = "impurity_corrected") 
#----------------------------
#https://bradleyboehmke.github.io/HOML/gbm.html#basic-gbm
mdl.xgb = boost_tree(
  trees = 1000,
  #complexity
  min_n = tune(),
  tree_depth = tune(),
  loss_reduction = tune(),
  #randomness
  mtry = tune(),
  sample_size = tune(),
  #step size
  learn_rate = tune() #i.e. 'shrinkage', smaller values require more trees 
) %>% set_mode("regression") %>%
  set_engine("xgboost", objective = "reg:squarederror") 

5) create workflow spec

wf.rf = workflow() %>% 
  add_recipe(rec) %>% 
  add_model(mdl.rf)
#----------------------------
wf.xgb = workflow() %>% 
  add_recipe(rec) %>% 
  add_model(mdl.xgb)

6) execute wf on vfold using tunegrid

set.seed(345)
doParallel::registerDoParallel()

tg.rf = tune_grid(
  object = wf.rf,
  resamples = vfold,
  grid = 10,
  metrics = metric_set(mae, mape, rmse, rsq)
)

autoplot(tg.rf) %>% ggplotly() %>% layout(title = 'rf hyperparmeter mixes & performance')
#----------------------------
set.seed(345)
doParallel::registerDoParallel()

tg.xgb = tune_grid(
  object = wf.xgb,
  resamples = vfold,
  grid = 5,
  metrics = metric_set(mae, mape, rmse, rsq)
)

autoplot(tg.xgb) %>% ggplotly() %>% layout(title = 'xgb hyperparmeter mixes & performance')

7) select best hps

(sb.hps.rf = tg.rf %>% select_best('rmse'))
tg.rf %>% collect_metrics()
tg.rf %>% collect_metrics() %>% filter(.config == 'Preprocessor1_Model08')
#----------------------------
(sb.hps.xgb = tg.xgb %>% select_best('rmse'))
tg.xgb %>% collect_metrics()
tg.xgb %>% collect_metrics() %>% filter(.config == 'Preprocessor1_Model2')

8) finalize workflow & fit model

wf.rf.fin = wf.rf %>% finalize_workflow(
  parameters = sb.hps.rf
)
#----------------------------
wf.xgb.fin = wf.xgb %>% finalize_workflow(
  parameters = sb.hps.xgb
)

9) check variable importance

wf.rf.fin %>%
  fit(train) %>% 
  pull_workflow_fit() %>% 
  vip::vip(aesthetics = list(alpha = 0.75, fill = 'forestgreen'))

#----------------------------
wf.xgb.fin %>%
  fit(train) %>% 
  pull_workflow_fit() %>% 
  vip::vip(aesthetics = list(alpha = 0.75, fill = 'slateblue4'))

10) evaluate performance on test

#wf.rf.fin %>% last_fit(split) %>% collect_metrics()

wf.rf.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  select(.pred, turbine_rated_capacity_k_w) %>% 
  metric_set(mae, mape, rmse, rsq)(truth = turbine_rated_capacity_k_w, estimate = .pred)
#----------------------------
#wf.xgb.fin %>% last_fit(split) %>% collect_metrics()

wf.xgb.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  select(.pred, turbine_rated_capacity_k_w) %>% 
  metric_set(mae, mape, rmse, rsq)(truth = turbine_rated_capacity_k_w, estimate = .pred)

11) viz predictions vs test

ggplotly(
wf.rf.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  ggplot(aes(x = turbine_rated_capacity_k_w, y = .pred)) +
  geom_point(
    alpha = 0.75,
    color = 'forestgreen'
    ) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'darkgrey', alpha = 0.5) +
  labs(title = 'RF: Test Dataset: Actuals vs Predictions')
)
#----------------------------
ggplotly(
wf.xgb.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  ggplot(aes(x = turbine_rated_capacity_k_w, y = .pred)) +
  geom_point(
    alpha = 0.75,
    color = 'slateblue4'
    ) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'darkgrey', alpha = 0.5) +
  labs(title = 'XGB: Test Dataset: Actuals vs Predictions')
)