Dataset

Canadian Wind Turbines from Government of Canada, shared by TidyTuesday.

Contents

Section 1: Data visualization exercise

Section 2: Modeling exercise (turbine capacity prediction)

#load packages
library(tidyverse)
library(tidymodels)
library(doParallel)
library(vip)
library(parttree) 
library(ggsci)
library(wesanderson)
library(psych)

#load data
turbines <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')

Section 1: Data visualization

Summary of turbines in a wind farm (project)

turbines %>% group_by(project_name) %>% tally(sort=T) %>% rename(turbine_n=n) %>%  psych::describe(quant=c(.25,.75))
#histogram
#wind_turbine %>% group_by(project_name) %>% tally(sort=T) %>% rename(turbine_n=n) %>% group_by(turbine_n) %>% tally() %>% rename(projects_n=n) %>% ggplot(aes(x=turbine_n, y=projects_n)) + geom_col() + theme_light()

Project capacity by province

(inspired by toeb18)

turbines %>% group_by(province_territory) %>% summarise(project_capacity = sum(total_project_capacity_mw)) %>%
  ggplot(aes(x = province_territory,y = project_capacity)) +  geom_segment(aes(x=reorder(province_territory, project_capacity), xend=province_territory, y=0, yend=project_capacity), color="#6096ba") + geom_point(size=2, color="#277da1") +
  coord_flip() + theme_light() + theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()) +
    labs(x= "Canadian Province", y="Cummulative turbine projects capacity in MW", title="Candian Wind Turbines", subtitle="Cummulative project capacity by province")

Correlation between number of turbines and wind farm capacity (inspired by kustav_sen)

turbines %>% group_by(province_territory, project_name) %>% summarise(turbines_n = n(), project_cap = mean(total_project_capacity_mw)) %>% ggplot(aes(x=project_cap, y=turbines_n, color=province_territory)) + geom_point(size=1.5) + facet_wrap(~province_territory) + labs(x= "Number of Turbines", y = "Project Capacity in MW") + theme_minimal() + theme(legend.position="none") + labs(title= "Project capacity and number of turbines by Canadian province") + scale_color_futurama()

Manufacturer

#find top manufacturers
manufacturers <- turbines %>% 
  group_by(manufacturer) %>% 
  summarise(num_turbines = n()) %>%
  mutate(manu_ranking = min_rank(desc(num_turbines))) %>% 
  arrange(desc(num_turbines))

topmanufacturers = as_vector(manufacturers[1:4, 'manufacturer'])
wind_turbine2 = turbines %>% mutate(manufacturer = replace(manufacturer, manufacturer %in% topmanufacturers, "Others"))
wind_turbine2 <- turbines %>% 
  left_join(manufacturers) %>% 
  select(-num_turbines) %>% 
  mutate(manu_ranking = replace(manu_ranking, manufacturer == "Others", 6))

The top four manufacturers with the most turbines are Vestas, GE, Siemens, Enercon

#separate strings in commission date 
wind_turbine2 <- wind_turbine2 %>% 
  separate(commissioning_date, sep = "/", into = c("com_date_1", "com_date_2", "com_date_3")) %>%
  mutate(com_date_1 = as.numeric(com_date_1),
         com_date_2 = as.numeric(com_date_2),
         com_date_3 = as.numeric(com_date_3), 
         most_recent_com_date = pmax(com_date_1, com_date_2, com_date_3, na.rm = TRUE)
         )
#get annual capacity added and annual turbines added
wind_turbine2 <- wind_turbine2 %>% 
  group_by(most_recent_com_date) %>% 
  mutate(annual_capacity_added = sum(turbine_rated_capacity_k_w, na.rm = TRUE),
         annual_turbines_added = n()
         ) %>% 
  ungroup()
#get proportion by manufacturer
wind_turbine2 <-  wind_turbine2 %>% 
  group_by(most_recent_com_date, manufacturer) %>%
  summarise(capacity_added = sum(turbine_rated_capacity_k_w, na.rm = TRUE),
            turbines_added = n(),
            prop_capacity_added = capacity_added / annual_capacity_added,
            prop_turbines_added = turbines_added / annual_turbines_added,
            annual_capacity_added = annual_capacity_added,
            annual_turbines_added = annual_turbines_added,
            manu_ranking = manu_ranking
            ) %>% 
  distinct() %>% 
  ungroup()
#ggplot(wind_turbine2, aes(x=most_recent_com_date, y= annual_turbines_added)) + geom_col(fill="#2a9d8f") + theme_light() + labs(x="Most recent comissioning date", y= "Turbines added", title= "Annual wind turbines added", subtitle = "Canadian wind turbines")
#ggplot(wind_turbine2, aes(x=most_recent_com_date, y= annual_capacity_added)) + geom_col(fill="#15616d") + theme_light() + labs(x="Most recent comissioning date", y= "Capacity added", title= "Annual capacity added", subtitle = "Canadian wind turbines")

Proportion of the total wind energy generation capacity commissioned in a given year

(inspired by analytics_urban)

wind_turbine2$manu_ranking[wind_turbine2$manu_ranking >4] <- 'Others'
ggplot(wind_turbine2, aes(x = most_recent_com_date, y= prop_capacity_added, fill=factor(manu_ranking, labels=c("1:Vestas", "2:GE", "3:Siemens", "4:Enercon", "Others")))) + geom_col()  + scale_fill_jama() + theme_light() + scale_x_continuous(breaks = seq(from=1993, to=2019, by=5)) + labs(fill="Manufacturer", title= "Canadian wind turbines: Four largest manufacturers", subtitle="Proportion of the total wind energy generation capacity commissioned in a given year", x="", y= "Proportion") + theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())

Total capacity commissioned across the years

(inspired by analytics_urban)

ggplot(wind_turbine2, aes(x=most_recent_com_date, y=annual_capacity_added)) + geom_line(color="slategrey") + geom_point(color="white",shape=24,aes(fill=annual_capacity_added, size=3)) + labs(y="Total capacity commissioned (megawatts)", x="", title="Canadian wind turbines",subtitle="Total capacity commissioned across the years") + theme_light() + scale_x_continuous(breaks = seq(from=1993, to=2019, by=5)) + theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position="none") + scale_fill_viridis(option="cividis")

Section 2: Modeling

Clean data

turbines_df <- turbines %>%
  transmute(
    turbine_capacity = turbine_rated_capacity_k_w,
    rotor_diameter_m,
    hub_height_m,
    commissioning_date = parse_number(commissioning_date),
    province_territory = fct_lump_n(province_territory, 10),
    model = fct_lump_n(model, 10)
  ) %>%
  filter(!is.na(turbine_capacity)) %>%
  mutate_if(is.character, factor)

Visualize capacity with year, hub height and rotor diameter

turbines_df %>%
  select(turbine_capacity:commissioning_date) %>%
  pivot_longer(rotor_diameter_m:commissioning_date) %>%
  ggplot(aes(turbine_capacity, value)) +
  geom_hex(bins = 15, alpha = 0.8) +
  geom_smooth(method = "lm",color="orange") +
  facet_wrap(~name, scales = "free_y") +
  labs(y = NULL) +
  scale_fill_gradient(high = "olivedrab3") + theme_minimal()

Partition data

set.seed(123)
wind_split <- initial_split(turbines_df, strata = turbine_capacity)
wind_train <- training(wind_split)
wind_test <- testing(wind_split)

set.seed(234)
wind_folds <- vfold_cv(wind_train, strata = turbine_capacity)
wind_folds
#  10-fold cross-validation using stratification 

Decision tree base model

tree_spec <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("regression")

tree_spec
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = tune()
  tree_depth = tune()
  min_n = tune()

Computational engine: rpart 

Grid

tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
tree_grid

Tune model specs

doParallel::registerDoParallel()

set.seed(345)
tree_rs <- tune_grid(
  tree_spec,
  turbine_capacity ~ .,
  resamples = wind_folds,
  grid = tree_grid,
  metrics = metric_set(rmse, rsq, mae, mape)
)

Attaching package: ‘rlang’

The following objects are masked from ‘package:purrr’:

    %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, flatten_raw, invoke,
    list_along, modify, prepend, splice


Attaching package: ‘vctrs’

The following object is masked from ‘package:dplyr’:

    data_frame

The following object is masked from ‘package:tibble’:

    data_frame


Attaching package: ‘rpart’

The following object is masked from ‘package:dials’:

    prune
tree_rs
# Tuning results
# 10-fold cross-validation using stratification 

Model evaluation

collect_metrics(tree_rs)
autoplot(tree_rs) + theme_minimal() + scale_color_jco()

show_best(tree_rs, "mape")
select_best(tree_rs, "rmse")

Finalize tree

final_tree <- finalize_model(tree_spec, select_best(tree_rs, "rmse"))
final_tree
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = 1e-10
  tree_depth = 15
  min_n = 2

Computational engine: rpart 

Fit and predict

final_fit <- fit(final_tree, turbine_capacity ~ ., wind_train)
final_rs <- last_fit(final_tree, turbine_capacity ~ ., wind_split)

predict(final_fit, wind_train[144, ])
predict(final_rs$.workflow[[1]], wind_train[144, ])

Variable importance

final_fit %>%
  vip(geom = "col", aesthetics = list(fill = "#264653", alpha = 0.8)) +
  scale_y_continuous(expand = c(0, 0)) + theme_minimal()

Visualize tree results

ex_fit <- fit(
  final_tree,
  turbine_capacity ~ rotor_diameter_m + commissioning_date,
  wind_train
)

pal <- wes_palette("Zissou1", 100, type = "continuous")

wind_train %>%
  ggplot(aes(rotor_diameter_m, commissioning_date)) +
  geom_parttree(data = ex_fit, aes(fill = turbine_capacity), alpha = 0.3) +
  geom_jitter(alpha = 0.7, width = 1, height = 0.5, aes(color = turbine_capacity)) +
  scale_fill_gradientn(aesthetics = c("color", "fill"),colors=pal)

collect_metrics(final_rs)

final_rs %>%
  collect_predictions() %>%
  ggplot(aes(turbine_capacity, .pred)) +
  geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
  geom_point(alpha = 0.6, color = "#006d77") +
  coord_fixed() + theme_minimal()

---
title: "Wind Turbines - EDA and Modeling Exercise"
date: "Dec 2020"
output: html_notebook
---

### Dataset 
[Canadian Wind Turbines](https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-10-27/readme.md) from [Government of Canada](https://open.canada.ca/data/en/dataset/79fdad93-9025-49ad-ba16-c26d718cc070), shared by TidyTuesday.

### Contents

Section 1: Data visualization exercise

Section 2: Modeling exercise (turbine capacity prediction)   

  * reference: [Julia Silge's tidymodels tutorial](https://juliasilge.com/blog/wind-turbine/) 


```{r setup, message=FALSE}
#load packages
library(tidyverse)
library(tidymodels)
library(doParallel)
library(vip)
library(parttree) 
library(ggsci)
library(wesanderson)
library(psych)

#load data
turbines <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
```

### Section 1: Data visualization
#### Summary of turbines in a wind farm (project)
```{r}
turbines %>% group_by(project_name) %>% tally(sort=T) %>% rename(turbine_n=n) %>%  psych::describe(quant=c(.25,.75))
#histogram
#wind_turbine %>% group_by(project_name) %>% tally(sort=T) %>% rename(turbine_n=n) %>% group_by(turbine_n) %>% tally() %>% rename(projects_n=n) %>% ggplot(aes(x=turbine_n, y=projects_n)) + geom_col() + theme_light()
```

#### Project capacity by province 
(inspired by toeb18)
```{r, message = FALSE}
turbines %>% group_by(province_territory) %>% summarise(project_capacity = sum(total_project_capacity_mw)) %>%
  ggplot(aes(x = province_territory,y = project_capacity)) +  geom_segment(aes(x=reorder(province_territory, project_capacity), xend=province_territory, y=0, yend=project_capacity), color="#6096ba") + geom_point(size=2, color="#277da1") +
  coord_flip() + theme_light() + theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()) +
    labs(x= "Canadian Province", y="Cummulative turbine projects capacity in MW", title="Candian Wind Turbines", subtitle="Cummulative project capacity by province")
```

#### Correlation between number of turbines and wind farm capacity (inspired by kustav_sen)
```{r, message = FALSE}
turbines %>% group_by(province_territory, project_name) %>% summarise(turbines_n = n(), project_cap = mean(total_project_capacity_mw)) %>% ggplot(aes(x=project_cap, y=turbines_n, color=province_territory)) + geom_point(size=1.5) + facet_wrap(~province_territory) + labs(x= "Number of Turbines", y = "Project Capacity in MW") + theme_minimal() + theme(legend.position="none") + labs(title= "Project capacity and number of turbines by Canadian province") + scale_color_futurama()
```

#### Manufacturer
```{r, message = FALSE}
#find top manufacturers
manufacturers <- turbines %>% 
  group_by(manufacturer) %>% 
  summarise(num_turbines = n()) %>%
  mutate(manu_ranking = min_rank(desc(num_turbines))) %>% 
  arrange(desc(num_turbines))

topmanufacturers = as_vector(manufacturers[1:4, 'manufacturer'])
wind_turbine2 = turbines %>% mutate(manufacturer = replace(manufacturer, manufacturer %in% topmanufacturers, "Others"))
wind_turbine2 <- turbines %>% 
  left_join(manufacturers) %>% 
  select(-num_turbines) %>% 
  mutate(manu_ranking = replace(manu_ranking, manufacturer == "Others", 6))
```
The top four manufacturers with the most turbines are Vestas, GE, Siemens, Enercon

```{r, warning=FALSE,message = FALSE}
#separate strings in commission date 
wind_turbine2 <- wind_turbine2 %>% 
  separate(commissioning_date, sep = "/", into = c("com_date_1", "com_date_2", "com_date_3")) %>%
  mutate(com_date_1 = as.numeric(com_date_1),
         com_date_2 = as.numeric(com_date_2),
         com_date_3 = as.numeric(com_date_3), 
         most_recent_com_date = pmax(com_date_1, com_date_2, com_date_3, na.rm = TRUE)
         )
#get annual capacity added and annual turbines added
wind_turbine2 <- wind_turbine2 %>% 
  group_by(most_recent_com_date) %>% 
  mutate(annual_capacity_added = sum(turbine_rated_capacity_k_w, na.rm = TRUE),
         annual_turbines_added = n()
         ) %>% 
  ungroup()
#get proportion by manufacturer
wind_turbine2 <-  wind_turbine2 %>% 
  group_by(most_recent_com_date, manufacturer) %>%
  summarise(capacity_added = sum(turbine_rated_capacity_k_w, na.rm = TRUE),
            turbines_added = n(),
            prop_capacity_added = capacity_added / annual_capacity_added,
            prop_turbines_added = turbines_added / annual_turbines_added,
            annual_capacity_added = annual_capacity_added,
            annual_turbines_added = annual_turbines_added,
            manu_ranking = manu_ranking
            ) %>% 
  distinct() %>% 
  ungroup()
#ggplot(wind_turbine2, aes(x=most_recent_com_date, y= annual_turbines_added)) + geom_col(fill="#2a9d8f") + theme_light() + labs(x="Most recent comissioning date", y= "Turbines added", title= "Annual wind turbines added", subtitle = "Canadian wind turbines")
#ggplot(wind_turbine2, aes(x=most_recent_com_date, y= annual_capacity_added)) + geom_col(fill="#15616d") + theme_light() + labs(x="Most recent comissioning date", y= "Capacity added", title= "Annual capacity added", subtitle = "Canadian wind turbines")
```

#### Proportion of the total wind energy generation capacity commissioned in a given year 
(inspired by analytics_urban)
```{r, message = FALSE, warning = FALSE}
wind_turbine2$manu_ranking[wind_turbine2$manu_ranking >4] <- 'Others'
ggplot(wind_turbine2, aes(x = most_recent_com_date, y= prop_capacity_added, fill=factor(manu_ranking, labels=c("1:Vestas", "2:GE", "3:Siemens", "4:Enercon", "Others")))) + geom_col()  + scale_fill_jama() + theme_light() + scale_x_continuous(breaks = seq(from=1993, to=2019, by=5)) + labs(fill="Manufacturer", title= "Canadian wind turbines: Four largest manufacturers", subtitle="Proportion of the total wind energy generation capacity commissioned in a given year", x="", y= "Proportion") + theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())
```

#### Total capacity commissioned across the years 
(inspired by analytics_urban)
```{r}
ggplot(wind_turbine2, aes(x=most_recent_com_date, y=annual_capacity_added)) + geom_line(color="slategrey") + geom_point(color="white",shape=24,aes(fill=annual_capacity_added, size=3)) + labs(y="Total capacity commissioned (megawatts)", x="", title="Canadian wind turbines",subtitle="Total capacity commissioned across the years") + theme_light() + scale_x_continuous(breaks = seq(from=1993, to=2019, by=5)) + theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position="none") + scale_fill_viridis(option="cividis")
```


### Section 2: Modeling

#### Clean data
```{r}
turbines_df <- turbines %>%
  transmute(
    turbine_capacity = turbine_rated_capacity_k_w,
    rotor_diameter_m,
    hub_height_m,
    commissioning_date = parse_number(commissioning_date),
    province_territory = fct_lump_n(province_territory, 10),
    model = fct_lump_n(model, 10)
  ) %>%
  filter(!is.na(turbine_capacity)) %>%
  mutate_if(is.character, factor)
```

#### Visualize capacity with year, hub height and rotor diameter
```{r, fig.width=5,fig.height=1.8}
turbines_df %>%
  select(turbine_capacity:commissioning_date) %>%
  pivot_longer(rotor_diameter_m:commissioning_date) %>%
  ggplot(aes(turbine_capacity, value)) +
  geom_hex(bins = 15, alpha = 0.8) +
  geom_smooth(method = "lm",color="orange") +
  facet_wrap(~name, scales = "free_y") +
  labs(y = NULL) +
  scale_fill_gradient(high = "olivedrab3") + theme_minimal()
```

#### Partition data
```{r}
set.seed(123)
wind_split <- initial_split(turbines_df, strata = turbine_capacity)
wind_train <- training(wind_split)
wind_test <- testing(wind_split)

set.seed(234)
wind_folds <- vfold_cv(wind_train, strata = turbine_capacity)
wind_folds
```

#### Decision tree base model
```{r}
tree_spec <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("regression")

tree_spec
```

#### Grid
```{r}
tree_grid <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)
tree_grid
```

#### Tune model specs
```{r}
doParallel::registerDoParallel()

set.seed(345)
tree_rs <- tune_grid(
  tree_spec,
  turbine_capacity ~ .,
  resamples = wind_folds,
  grid = tree_grid,
  metrics = metric_set(rmse, rsq, mae, mape)
)

tree_rs
```

#### Model evaluation
```{r}
collect_metrics(tree_rs)
autoplot(tree_rs) + theme_minimal() + scale_color_jco()
```
```{r}
show_best(tree_rs, "mape")
select_best(tree_rs, "rmse")
```

#### Finalize tree
```{r}
final_tree <- finalize_model(tree_spec, select_best(tree_rs, "rmse"))
final_tree
```

#### Fit and predict 
```{r}
final_fit <- fit(final_tree, turbine_capacity ~ ., wind_train)
final_rs <- last_fit(final_tree, turbine_capacity ~ ., wind_split)

predict(final_fit, wind_train[144, ])
predict(final_rs$.workflow[[1]], wind_train[144, ])
```

#### Variable importance
```{r}
final_fit %>%
  vip(geom = "col", aesthetics = list(fill = "#264653", alpha = 0.8)) +
  scale_y_continuous(expand = c(0, 0)) + theme_minimal()
```

#### Visualize tree results
```{r}
ex_fit <- fit(
  final_tree,
  turbine_capacity ~ rotor_diameter_m + commissioning_date,
  wind_train
)

pal <- wes_palette("Zissou1", 100, type = "continuous")

wind_train %>%
  ggplot(aes(rotor_diameter_m, commissioning_date)) +
  geom_parttree(data = ex_fit, aes(fill = turbine_capacity), alpha = 0.3) +
  geom_jitter(alpha = 0.7, width = 1, height = 0.5, aes(color = turbine_capacity)) +
  scale_fill_gradientn(aesthetics = c("color", "fill"),colors=pal)
```

```{r}
collect_metrics(final_rs)

final_rs %>%
  collect_predictions() %>%
  ggplot(aes(turbine_capacity, .pred)) +
  geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
  geom_point(alpha = 0.6, color = "#006d77") +
  coord_fixed() + theme_minimal()

```

