Predict the capacity of wind turbines in Canada based on turbine features
Type: Supervised Machine Learning
#a = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv")
#saveRDS(a, 'a.rds')
a = readRDS('a.rds') %>%
clean_names() %>%
mutate(across(where(is.character),factor)) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
where(is.factor),
where(is.numeric)
)
a %>% sample_n(5)#remove unnecessary cols
a = a %>% select(
-notes,
-turbine_identifier,
-turbine_number_in_project,
-objectid,
-total_project_capacity_mw,
#removing these vars below b/c it would be unrealistic to use them for predictions
#we likely wouldn't have such information with new data
-model,
-latitude,
-longitude,
-project_name
)
a %>% sample_n(5)## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nom.vars)` instead of `nom.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
| Name | Piped data |
| Number of rows | 6478 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| commissioning_date | 0 | 1 | FALSE | 35 | 201: 744, 201: 621, 201: 590, 201: 544 |
| manufacturer | 0 | 1 | FALSE | 23 | Ves: 1829, GE: 1725, Sie: 1033, Ene: 960 |
| province_territory | 0 | 1 | FALSE | 12 | Ont: 2443, Que: 1991, Alb: 900, Nov: 310 |
## commissioning_date frequency percentage cumulative_perc
## 1 2014 744 11.49 11.49
## 2 2013 621 9.59 21.08
## 3 2011 590 9.11 30.19
## 4 2015 544 8.40 38.59
## 5 2009 485 7.49 46.08
## 6 2006 455 7.02 53.10
## 7 2012 404 6.24 59.34
## 8 2010 334 5.16 64.50
## 9 2016 266 4.11 68.61
## 10 2014/2015 207 3.20 71.81
## 11 2008 193 2.98 74.79
## 12 2018 179 2.76 77.55
## 13 2013/2014 154 2.38 79.93
## 14 2011/2012 141 2.18 82.11
## 15 2017 137 2.11 84.22
## 16 2006/2008 133 2.05 86.27
## 17 2007 133 2.05 88.32
## 18 1999 132 2.04 90.36
## 19 2003 116 1.79 92.15
## 20 2019 101 1.56 93.71
## 21 2004 74 1.14 94.85
## 22 2005/2006/2012 73 1.13 95.98
## 23 2000/2001 59 0.91 96.89
## 24 2001 50 0.77 97.66
## 25 2004/2005 47 0.73 98.39
## 26 2005 35 0.54 98.93
## 27 2006/2007 35 0.54 99.47
## 28 2001/2003 16 0.25 99.72
## 29 2002 8 0.12 99.84
## 30 1998 3 0.05 99.89
## 31 2002/2006 3 0.05 99.94
## 32 1993 2 0.03 99.97
## 33 2000 2 0.03 100.00
## 34 1995 1 0.02 100.02
## 35 1997 1 0.02 100.00
## manufacturer frequency percentage cumulative_perc
## 1 Vestas 1829 28.23 28.23
## 2 GE 1725 26.63 54.86
## 3 Siemens 1033 15.95 70.81
## 4 Enercon 960 14.82 85.63
## 5 Senvion 643 9.93 95.56
## 6 NEG Micon 132 2.04 97.60
## 7 Acciona Wind Power 40 0.62 98.22
## 8 Acciona 34 0.52 98.74
## 9 Nordex 20 0.31 99.05
## 10 Suzlon 15 0.23 99.28
## 11 Vensys 9 0.14 99.42
## 12 Gamesa 8 0.12 99.54
## 13 Windmatic 6 0.09 99.63
## 14 DeWind 5 0.08 99.71
## 15 Samsung Renewable Energy 4 0.06 99.77
## 16 Northwind 3 0.05 99.82
## 17 Turbowinds 3 0.05 99.87
## 18 Bonus 2 0.03 99.90
## 19 EWT 2 0.03 99.93
## 20 Lagerwey 2 0.03 99.96
## 21 Leitwind 1 0.02 99.98
## 22 Pfleiderer 1 0.02 100.00
## 23 Tacke 1 0.02 100.00
## province_territory frequency percentage cumulative_perc
## 1 Ontario 2443 37.71 37.71
## 2 Quebec 1991 30.73 68.44
## 3 Alberta 900 13.89 82.33
## 4 Nova Scotia 310 4.79 87.12
## 5 British Columbia 292 4.51 91.63
## 6 Saskatchewan 153 2.36 93.99
## 7 Manitoba 133 2.05 96.04
## 8 New Brunswick 119 1.84 97.88
## 9 Prince Edward Island 104 1.61 99.49
## 10 Newfoundland and Labrador 27 0.42 99.91
## 11 Northwest Territories 4 0.06 99.97
## 12 Yukon 2 0.03 100.00
## [1] "Variables processed: commissioning_date, manufacturer, province_territory"
#check out relationship of most frequent/largest levels of manufacturer and commissioning_date
a %>% select(manufacturer, commissioning_date) %>%
mutate(
commissioning_date = fct_lump_prop(commissioning_date, prop = 0.05),
manufacturer = fct_lump_prop(manufacturer, prop = 0.05)
) %>%
filter(
commissioning_date != 'Other',
manufacturer != 'Other'
) %>%
ggpairs(
columns = c('manufacturer','commissioning_date'),
mapping = aes(color = manufacturer)
)## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(num.vars)` instead of `num.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
| Name | Piped data |
| Number of rows | 6478 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| hub_height_m | 0 | 1 | 82.79 | 14.37 | 24.5 | 80 | 80 | 85 | 132 |
| rotor_diameter_m | 0 | 1 | 88.20 | 16.57 | 15.0 | 77 | 90 | 100 | 141 |
| turbine_rated_capacity_k_w | 0 | 1 | 1967.31 | 605.93 | 65.0 | 1600 | 1880 | 2300 | 3750 |
#a %>% select(num.vars) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)
a %>% select(num.vars) %>% DataExplorer::plot_density(nrow = 2, ncol = 1)a %>% plot_ly(x = ~turbine_rated_capacity_k_w) %>% add_boxplot() %>% layout(title = 'Distribution of turbine_rated_capacity_k_w')a %>% mutate(manufacturer = fct_lump_prop(manufacturer, prop = 0.05)) %>%
filter(manufacturer != 'Other') %>%
mutate(manufacturer = fct_reorder(manufacturer, turbine_rated_capacity_k_w)) %>%
plot_ly(x = ~turbine_rated_capacity_k_w, color = ~manufacturer) %>% add_boxplot() %>% layout(title = 'Distribution of turbine_rated_capacity_k_w by Largest Manufacturers')rec =
train %>% recipe(turbine_rated_capacity_k_w ~ .) %>%
step_zv(all_numeric(),-all_outcomes()) %>%
step_nzv(all_numeric(),-all_outcomes()) %>%
step_corr(all_numeric(),-all_outcomes()) %>%
step_other(all_nominal(),-all_outcomes(), threshold = 0.01) %>%
step_normalize(all_numeric(),-all_outcomes()) %>%
step_dummy(all_nominal(),-all_outcomes(),one_hot = TRUE)mdl.rf = rand_forest(
trees = 500,
min_n = tune(), #complexity
mtry = tune(), #randomness
) %>% set_mode("regression") %>%
set_engine("ranger", importance = "impurity_corrected")
#----------------------------
#https://bradleyboehmke.github.io/HOML/gbm.html#basic-gbm
mdl.xgb = boost_tree(
trees = 1000,
#complexity
min_n = tune(),
tree_depth = tune(),
loss_reduction = tune(),
#randomness
mtry = tune(),
sample_size = tune(),
#step size
learn_rate = tune() #i.e. 'shrinkage', smaller values require more trees
) %>% set_mode("regression") %>%
set_engine("xgboost", objective = "reg:squarederror") wf.rf.fin %>%
fit(train) %>%
pull_workflow_fit() %>%
vip::vip(aesthetics = list(alpha = 0.75, fill = 'forestgreen'))#----------------------------
wf.xgb.fin %>%
fit(train) %>%
pull_workflow_fit() %>%
vip::vip(aesthetics = list(alpha = 0.75, fill = 'slateblue4'))#wf.rf.fin %>% last_fit(split) %>% collect_metrics()
wf.rf.fin %>%
last_fit(split) %>%
collect_predictions() %>%
select(.pred, turbine_rated_capacity_k_w) %>%
metric_set(mae, mape, rmse, rsq)(truth = turbine_rated_capacity_k_w, estimate = .pred)#----------------------------
#wf.xgb.fin %>% last_fit(split) %>% collect_metrics()
wf.xgb.fin %>%
last_fit(split) %>%
collect_predictions() %>%
select(.pred, turbine_rated_capacity_k_w) %>%
metric_set(mae, mape, rmse, rsq)(truth = turbine_rated_capacity_k_w, estimate = .pred)ggplotly(
wf.rf.fin %>%
last_fit(split) %>%
collect_predictions() %>%
ggplot(aes(x = turbine_rated_capacity_k_w, y = .pred)) +
geom_point(
alpha = 0.75,
color = 'forestgreen'
) +
geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'darkgrey', alpha = 0.5) +
labs(title = 'RF: Test Dataset: Actuals vs Predictions')
)#----------------------------
ggplotly(
wf.xgb.fin %>%
last_fit(split) %>%
collect_predictions() %>%
ggplot(aes(x = turbine_rated_capacity_k_w, y = .pred)) +
geom_point(
alpha = 0.75,
color = 'slateblue4'
) +
geom_abline(intercept = 0, slope = 1, linetype = 'dashed', color = 'darkgrey', alpha = 0.5) +
labs(title = 'XGB: Test Dataset: Actuals vs Predictions')
)