Objective

Predict the capacity of wind turbines in Canada based on turbine features

Algos

Type: Supervised Machine Learning

Random Forests
XGBoost (Gradient Boosted Trees)

Learning Goals

Learn how to use the ‘xgboost’ package within the tidymodels framework
Compare implementations / algo differences of rf vs xgb

Get Data

#a = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv")
#saveRDS(a, 'a.rds')
a = readRDS('a.rds') %>% 
  clean_names() %>% 
  mutate(across(where(is.character),factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )

a %>% sample_n(5)

#remove unnecessary cols

a = a %>% select(
    -notes,
    -turbine_identifier,
    -turbine_number_in_project,
    -objectid,
    -total_project_capacity_mw,
    #removing these vars below b/c it would be unrealistic to use them for predictions
    #we likely wouldn't have such information with new data
    -model, 
    -latitude,
    -longitude,
    -project_name
)
a %>% sample_n(5)

5 min EDA

viz missing

a %>% visdat::vis_dat()

a %>% miss_var_summary()

#since the missing feature is the predictor/dependent var, let's remove those rows
#remove the single missing row for 'project_name'

a = a %>% na.omit()

check missing, na, inf

a %>% funModeling::status()

EDA: nom vars

names

nom.vars = a %>% select(where(is.factor)) %>% colnames()

sample

a %>% select(nom.vars) %>% sample_n(5)

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nom.vars)` instead of `nom.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

skim

a %>% select(all_of(nom.vars)) %>% skim_without_charts()

Data summary

Name	Piped data
Number of rows	6478
Number of columns	3
_______________________
Column type frequency:
factor	3
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
commissioning_date	1	FALSE	35	201: 744, 201: 621, 201: 590, 201: 544
manufacturer	1	FALSE	23	Ves: 1829, GE: 1725, Sie: 1033, Ene: 960
province_territory	1	FALSE	12	Ont: 2443, Que: 1991, Alb: 900, Nov: 310

viz: level counts distribution

a %>% funModeling::freq()

##    commissioning_date frequency percentage cumulative_perc
## 1                2014       744      11.49           11.49
## 2                2013       621       9.59           21.08
## 3                2011       590       9.11           30.19
## 4                2015       544       8.40           38.59
## 5                2009       485       7.49           46.08
## 6                2006       455       7.02           53.10
## 7                2012       404       6.24           59.34
## 8                2010       334       5.16           64.50
## 9                2016       266       4.11           68.61
## 10          2014/2015       207       3.20           71.81
## 11               2008       193       2.98           74.79
## 12               2018       179       2.76           77.55
## 13          2013/2014       154       2.38           79.93
## 14          2011/2012       141       2.18           82.11
## 15               2017       137       2.11           84.22
## 16          2006/2008       133       2.05           86.27
## 17               2007       133       2.05           88.32
## 18               1999       132       2.04           90.36
## 19               2003       116       1.79           92.15
## 20               2019       101       1.56           93.71
## 21               2004        74       1.14           94.85
## 22     2005/2006/2012        73       1.13           95.98
## 23          2000/2001        59       0.91           96.89
## 24               2001        50       0.77           97.66
## 25          2004/2005        47       0.73           98.39
## 26               2005        35       0.54           98.93
## 27          2006/2007        35       0.54           99.47
## 28          2001/2003        16       0.25           99.72
## 29               2002         8       0.12           99.84
## 30               1998         3       0.05           99.89
## 31          2002/2006         3       0.05           99.94
## 32               1993         2       0.03           99.97
## 33               2000         2       0.03          100.00
## 34               1995         1       0.02          100.02
## 35               1997         1       0.02          100.00

##                manufacturer frequency percentage cumulative_perc
## 1                    Vestas      1829      28.23           28.23
## 2                        GE      1725      26.63           54.86
## 3                   Siemens      1033      15.95           70.81
## 4                   Enercon       960      14.82           85.63
## 5                   Senvion       643       9.93           95.56
## 6                 NEG Micon       132       2.04           97.60
## 7        Acciona Wind Power        40       0.62           98.22
## 8                   Acciona        34       0.52           98.74
## 9                    Nordex        20       0.31           99.05
## 10                   Suzlon        15       0.23           99.28
## 11                   Vensys         9       0.14           99.42
## 12                   Gamesa         8       0.12           99.54
## 13                Windmatic         6       0.09           99.63
## 14                   DeWind         5       0.08           99.71
## 15 Samsung Renewable Energy         4       0.06           99.77
## 16                Northwind         3       0.05           99.82
## 17               Turbowinds         3       0.05           99.87
## 18                    Bonus         2       0.03           99.90
## 19                      EWT         2       0.03           99.93
## 20                 Lagerwey         2       0.03           99.96
## 21                 Leitwind         1       0.02           99.98
## 22               Pfleiderer         1       0.02          100.00
## 23                    Tacke         1       0.02          100.00

##           province_territory frequency percentage cumulative_perc
## 1                    Ontario      2443      37.71           37.71
## 2                     Quebec      1991      30.73           68.44
## 3                    Alberta       900      13.89           82.33
## 4                Nova Scotia       310       4.79           87.12
## 5           British Columbia       292       4.51           91.63
## 6               Saskatchewan       153       2.36           93.99
## 7                   Manitoba       133       2.05           96.04
## 8              New Brunswick       119       1.84           97.88
## 9       Prince Edward Island       104       1.61           99.49
## 10 Newfoundland and Labrador        27       0.42           99.91
## 11     Northwest Territories         4       0.06           99.97
## 12                     Yukon         2       0.03          100.00

## [1] "Variables processed: commissioning_date, manufacturer, province_territory"

Due to high number of levels, consider lumping levels for some of these features

viz: mosaic plots

#check out relationship of most frequent/largest levels of manufacturer and commissioning_date
a %>% select(manufacturer, commissioning_date) %>% 
  mutate(
    commissioning_date = fct_lump_prop(commissioning_date, prop = 0.05),
    manufacturer = fct_lump_prop(manufacturer, prop = 0.05)
    ) %>%
  filter(
    commissioning_date != 'Other',
    manufacturer != 'Other'
    ) %>% 
  ggpairs(
    columns = c('manufacturer','commissioning_date'),
      mapping = aes(color = manufacturer)
  )

It’s apparent that turbine manufacturing volume differs by manufacturer (1) relative to other manufacturers in the same year (2) as well as to itself over time’

EDA: num vars

names

num.vars = a %>% select(where(is.numeric)) %>% colnames()

sample

a %>% select(num.vars) %>% sample_n(5)

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(num.vars)` instead of `num.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

skim

a %>% select(num.vars) %>% skim_without_charts()

Data summary

Name	Piped data
Number of rows	6478
Number of columns	3
_______________________
Column type frequency:
numeric	3
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
hub_height_m	1	82.79	14.37	24.5	80	80	85	132
rotor_diameter_m	1	88.20	16.57	15.0	77	90	100	141
turbine_rated_capacity_k_w	1	1967.31	605.93	65.0	1600	1880	2300	3750

viz: distribution - density

#a %>% select(num.vars) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)
a %>% select(num.vars) %>% DataExplorer::plot_density(nrow = 2, ncol = 1)

viz: outliers

a %>% select(num.vars) %>% dlookr::plot_outlier()

viz: normality

a %>% select(num.vars) %>% dlookr::plot_normality()

viz: normality - outcome var only

a %>% plot_ly(x = ~turbine_rated_capacity_k_w, nbinsx = 50)

a %>% plot_ly(x = ~turbine_rated_capacity_k_w) %>% add_boxplot() %>% layout(title = 'Distribution of turbine_rated_capacity_k_w')

a %>% mutate(manufacturer = fct_lump_prop(manufacturer, prop = 0.05)) %>%
  filter(manufacturer != 'Other') %>%
  mutate(manufacturer = fct_reorder(manufacturer, turbine_rated_capacity_k_w)) %>%
  plot_ly(x = ~turbine_rated_capacity_k_w, color = ~manufacturer) %>% add_boxplot() %>% layout(title = 'Distribution of turbine_rated_capacity_k_w by Largest Manufacturers')

viz: correlations

a %>% select(num.vars) %>% dlookr::plot_correlate()

a %>% select(num.vars) %>% DataExplorer::plot_correlation()

a %>% select(num.vars) %>% funModeling::correlation_table('turbine_rated_capacity_k_w')

Preprocessing

1) split data

split = initial_split(a)
train = training(split)
test = testing(split)
vfold = vfold_cv(train, v = 10, strata = turbine_rated_capacity_k_w) #for supervised continuous algo, stratify on the outcome var

a %>% use_ranger(turbine_rated_capacity_k_w ~ .)

2) create recipe spec

rec =
  train %>% recipe(turbine_rated_capacity_k_w ~ .) %>% 
  step_zv(all_numeric(),-all_outcomes()) %>%
  step_nzv(all_numeric(),-all_outcomes()) %>%
  step_corr(all_numeric(),-all_outcomes()) %>% 
  step_other(all_nominal(),-all_outcomes(), threshold =  0.01) %>% 
  step_normalize(all_numeric(),-all_outcomes()) %>% 
  step_dummy(all_nominal(),-all_outcomes(),one_hot = TRUE)

3) preprocess

rec %>% prep %>% juice %>% head

rec %>% prep %>% bake(new_data = test) %>% head

rec %>% prep %>% juice %>% miss_var_summary

rec %>% prep %>% bake(new_data = test) %>% miss_var_summary()

4) create model spec

mdl.rf = rand_forest(
  trees = 500,
  min_n = tune(), #complexity
  mtry = tune(), #randomness
  ) %>% set_mode("regression") %>%
  set_engine("ranger", importance = "impurity_corrected") 
#----------------------------
#https://bradleyboehmke.github.io/HOML/gbm.html#basic-gbm
mdl.xgb = boost_tree(
  trees = 1000,
  #complexity
  min_n = tune(),
  tree_depth = tune(),
  loss_reduction = tune(),
  #randomness
  mtry = tune(),
  sample_size = tune(),
  #step size
  learn_rate = tune() #i.e. 'shrinkage', smaller values require more trees 
) %>% set_mode("regression") %>%
  set_engine("xgboost", objective = "reg:squarederror")

5) create workflow spec

wf.rf = workflow() %>% 
  add_recipe(rec) %>% 
  add_model(mdl.rf)
#----------------------------
wf.xgb = workflow() %>% 
  add_recipe(rec) %>% 
  add_model(mdl.xgb)

6) execute wf on vfold using tunegrid

set.seed(345)
doParallel::registerDoParallel()

tg.rf = tune_grid(
  object = wf.rf,
  resamples = vfold,
  grid = 10,
  metrics = metric_set(mae, mape, rmse, rsq)
)

autoplot(tg.rf) %>% ggplotly() %>% layout(title = 'rf hyperparmeter mixes & performance')

#----------------------------
set.seed(345)
doParallel::registerDoParallel()

tg.xgb = tune_grid(
  object = wf.xgb,
  resamples = vfold,
  grid = 5,
  metrics = metric_set(mae, mape, rmse, rsq)
)

autoplot(tg.xgb) %>% ggplotly() %>% layout(title = 'xgb hyperparmeter mixes & performance')

7) select best hps

(sb.hps.rf = tg.rf %>% select_best('rmse'))

tg.rf %>% collect_metrics()

tg.rf %>% collect_metrics() %>% filter(.config == 'Preprocessor1_Model08')

#----------------------------
(sb.hps.xgb = tg.xgb %>% select_best('rmse'))

tg.xgb %>% collect_metrics()

tg.xgb %>% collect_metrics() %>% filter(.config == 'Preprocessor1_Model2')

8) finalize workflow & fit model

wf.rf.fin = wf.rf %>% finalize_workflow(
  parameters = sb.hps.rf
)
#----------------------------
wf.xgb.fin = wf.xgb %>% finalize_workflow(
  parameters = sb.hps.xgb
)

9) check variable importance

wf.rf.fin %>%
  fit(train) %>% 
  pull_workflow_fit() %>% 
  vip::vip(aesthetics = list(alpha = 0.75, fill = 'forestgreen'))

#----------------------------
wf.xgb.fin %>%
  fit(train) %>% 
  pull_workflow_fit() %>% 
  vip::vip(aesthetics = list(alpha = 0.75, fill = 'slateblue4'))

10) evaluate performance on test

#wf.rf.fin %>% last_fit(split) %>% collect_metrics()

wf.rf.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  select(.pred, turbine_rated_capacity_k_w) %>% 
  metric_set(mae, mape, rmse, rsq)(truth = turbine_rated_capacity_k_w, estimate = .pred)

#----------------------------
#wf.xgb.fin %>% last_fit(split) %>% collect_metrics()

wf.xgb.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  select(.pred, turbine_rated_capacity_k_w) %>% 
  metric_set(mae, mape, rmse, rsq)(truth = turbine_rated_capacity_k_w, estimate = .pred)

11) viz predictions vs test

ggplotly(
wf.rf.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  ggplot(aes(x = turbine_rated_capacity_k_w, y = .pred)) +
  geom_point(
    alpha = 0.75,
    color = 'forestgreen'
    ) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'darkgrey', alpha = 0.5) +
  labs(title = 'RF: Test Dataset: Actuals vs Predictions')
)

#----------------------------
ggplotly(
wf.xgb.fin %>%
  last_fit(split) %>%
  collect_predictions() %>%
  ggplot(aes(x = turbine_rated_capacity_k_w, y = .pred)) +
  geom_point(
    alpha = 0.75,
    color = 'slateblue4'
    ) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'darkgrey', alpha = 0.5) +
  labs(title = 'XGB: Test Dataset: Actuals vs Predictions')
)

Wind Turbines

Jeremiah Wang

December 2020

Objective

Algos

Learning Goals

Get Data

5 min EDA

viz missing

check missing, na, inf

EDA: nom vars

names

sample

skim

viz: level counts distribution

viz: mosaic plots

EDA: num vars

names

sample

skim

viz: distribution - density

viz: outliers

viz: normality

viz: normality - outcome var only

viz: correlations

Preprocessing

1) split data

2) create recipe spec

3) preprocess

4) create model spec

5) create workflow spec

6) execute wf on vfold using tunegrid

7) select best hps

8) finalize workflow & fit model

9) check variable importance

10) evaluate performance on test

11) viz predictions vs test