library(tidyverse)
library(ggplot2)
library(ggResidpanel)
library(patchwork)
library(tidymodels)
library(scales)
library(plotly)
library(corrplot)
library(RColorBrewer)
library(viridis)
library(dplyr)
library(stargazer)
library(rpart)
library(reactable)
library(knitr)
library(kableExtra)
library(vip)
options(scipen = 999)
df_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
df = df_raw
df[1:20,] %>%
reactable(
pageSizeOptions = c(4, 8, 12),
resizable = TRUE,
wrap = FALSE,
bordered = TRUE,
compact = T,
defaultColDef = colDef(minWidth = 150)
)
t1= data.frame(unique.count = apply(df,2,function(x) length(unique(x)))) #unique values data
t2 = data.frame(na.count = colSums(is.na(df_raw)))
cbind(column = colnames(df),unique_count = t1$unique.count,na_count = t2$na.count) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
kable_paper(full_width = F, font_size =20)
| column | unique_count | na_count |
|---|---|---|
| objectid | 6698 | 0 |
| province_territory | 12 | 0 |
| project_name | 269 | 1 |
| total_project_capacity_mw | 154 | 0 |
| turbine_identifier | 6683 | 0 |
| turbine_number_in_project | 4300 | 0 |
| turbine_rated_capacity_k_w | 48 | 220 |
| rotor_diameter_m | 42 | 0 |
| hub_height_m | 42 | 0 |
| manufacturer | 23 | 0 |
| model | 92 | 0 |
| commissioning_date | 35 | 0 |
| latitude | 6623 | 0 |
| longitude | 6632 | 0 |
| notes | 28 | 6064 |
na_ind = which(is.na(df$turbine_rated_capacity_k_w))
View(df[na_ind,])
df = df[-na_ind,]
df %>%
count(province_territory) %>%
rename(count = n) %>%
mutate(pct = round(count/nrow(df),2)) %>% # turbines in each province
arrange(desc(count)) %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
kable_paper(full_width = F)
| province_territory | count | pct |
|---|---|---|
| Ontario | 2443 | 0.38 |
| Quebec | 1991 | 0.31 |
| Alberta | 900 | 0.14 |
| Nova Scotia | 310 | 0.05 |
| British Columbia | 292 | 0.05 |
| Saskatchewan | 153 | 0.02 |
| Manitoba | 133 | 0.02 |
| New Brunswick | 119 | 0.02 |
| Prince Edward Island | 104 | 0.02 |
| Newfoundland and Labrador | 27 | 0.00 |
| Northwest Territories | 4 | 0.00 |
| Yukon | 2 | 0.00 |
df = df %>% rename(rotor.d = rotor_diameter_m,
hub.h = hub_height_m,
commission.y = commissioning_date,
turbine_capacity = turbine_rated_capacity_k_w,
project_capacity=total_project_capacity_mw)
num.dat = df %>% select_if(is.numeric) %>% select(-objectid)
apply(num.dat,2,function(x) round(summary(x))) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped","hover","bordered")) %>%
kable_paper(full_width = F )
| project_capacity | turbine_capacity | rotor.d | hub.h | latitude | longitude | |
|---|---|---|---|---|---|---|
| Min. | 0 | 65 | 15 | 24 | 42 | -135 |
| 1st Qu. | 69 | 1600 | 77 | 80 | 44 | -84 |
| Median | 101 | 1880 | 90 | 80 | 47 | -81 |
| Mean | 132 | 1967 | 88 | 83 | 47 | -83 |
| 3rd Qu. | 179 | 2300 | 100 | 85 | 49 | -68 |
| Max. | 350 | 3750 | 141 | 132 | 64 | -53 |
df = df %>% mutate(province_territory = fct_lump_n(province_territory, 3))
prov.coord = df %>%
group_by(province_territory) %>%
summarise(prov.lat = (mean(latitude) + 4),
prov.lon = (mean(longitude))) %>%
slice(1:3)
rownames(prov.coord) = c("Alberta","Ontario","Quebec") # So we can add annotations of these provinces to the map
canada = map_data("world", region = "canada")
canada %>%
ggplot(aes(x = long, y = lat))+
geom_polygon(aes(group = group), fill = "grey") +
geom_density2d(df,mapping = aes(longitude,latitude),
alpha = 0.4,
size = 0.8,
color = "cyan3")+
geom_hex(df,mapping = aes(longitude,latitude,
fill = province_territory,
alpha = (..count..)),
bins = 35)+
geom_text(prov.coord,mapping = aes(prov.lon,prov.lat,
color = province_territory,
label = rownames(prov.coord)),
fontface = "italic",
fontface = "bold")+ # Adding an annotation for the main provinces
scale_fill_brewer(palette = "Dark2")+
scale_color_brewer(palette = "Dark2")+
theme(legend.position = "right", legend.text = element_text(size = 10))+
labs(title = "Canadian Wind Turbines")+
guides(colour = FALSE)+
theme_classic()
df %>%
count(commission.y)
## # A tibble: 35 x 2
## commission.y n
## * <chr> <int>
## 1 1993 2
## 2 1995 1
## 3 1997 1
## 4 1998 3
## 5 1999 132
## 6 2000 2
## 7 2000/2001 59
## 8 2001 50
## 9 2001/2003 16
## 10 2002 8
## # ... with 25 more rows
df$commission.y = as.numeric(str_sub(df$commission.y,1,4))
Some observations have multiple commission.y (2001/2003,2000/2001…ect) for each such observation I take the first year of commissioning (2001/2003 = 2001).
\[\textit{Generally, a turbine output is given by the following closed equation:}\\ P = 0.5 \cdot \rho \cdot A \cdot c_p \cdot V^3 \\ \textit{Where,}\\ \textit{ρ = Air density in kg/m3}\\ \textit{A = Rotor swept area (m2)}\\ \textit{Cp = Coefficient of performance}\\ \textit{V = wind velocity (m/s)}\]
Since the turbine’s output (and therefore its capacity) is determined in a deterministic system there’s little inference to make about it. My goal in this notebook is to analyze some aspects of this dataset and evaluate if I can predict a turbine’s capacity reasonably well.
hist(df$turbine_capacity,col = "darkblue")
library(corrplot)
cor.dat = df %>%
select_if(is.numeric) %>%
select(-c(objectid,
project_capacity))
corrplot(cor(cor.dat),
method = "color",
type = "upper",
addCoef.col = "black",
tl.col="black",
tl.srt=45)
library(plotly)
library(viridis)
p = df %>% ggplot(aes(longitude,latitude,size = turbine_capacity,color = turbine_capacity))+
geom_jitter(alpha = 0.1 )+
scale_color_viridis()+
theme_light()
ggplotly(p)
\[\left\{ \begin{array}{ll} High\ Capacity & if,\ turbine\ capacity\leq Q_1 \\ Medium\ Capacity &if,\ Q_1< turbine\ capacity< Q_2 \\ Low\ Capacity &if,\ turbine\ capacity\leq Q_3 \\ \end{array} \right.\]
quants = quantile(df$turbine_capacity,seq(0.25,0.75,0.25))
quants
## 25% 50% 75%
## 1600 1880 2300
Q_1 = quants[1]
Q_2 = quants[2]
Q_3 = quants[3]
df = df %>% mutate(rank = ifelse(
df$turbine_capacity<=Q_1,
yes = "Low Capacity",
no = ifelse(
df$turbine_capacity>Q_1 &
df$turbine_capacity<Q_3,
"Medium Capacity",
"High Capacity")))
df$rank = factor(df$rank, levels = c("High Capacity",
"Medium Capacity",
"Low Capacity" )) # So facet plots appear in this order
In the following plots we confirm that a turbine’s capacity is indeed independent of its location:
ggplot(df,aes(longitude,latitude))+
geom_density_2d_filled(alpha = 0.7,df,mapping = aes(longitude,latitude))+
facet_grid(~rank)+
ylim(c(40,60))+
theme_minimal()
We saw already that our data is not sparse, therefore we need a plot which gives us some sense about the density of observations in a given point (interval); We can use geom_hex to achieve that.
p1 =ggplot(df,aes(rotor.d,turbine_capacity))+
scale_fill_viridis(option = "C")+
geom_hex(bins = 20)+
geom_smooth(method="lm") +
theme(legend.position = "none")
p2 = ggplot(df,aes(commission.y,turbine_capacity))+
scale_fill_viridis(option = "C")+
geom_hex(bins = 20)+
geom_smooth(method="lm")+
theme(legend.position = "none")+
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p3 = ggplot(df,aes(hub.h,turbine_capacity))+
scale_fill_viridis(option = "C")+
geom_hex(bins = 20)+
geom_smooth(method="lm")+
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p1+p2+p3
df$interaction.hub.rotor = df$rotor.d*df$hub.h
p = ggplot(df,aes(interaction.hub.rotor,turbine_capacity))+
scale_fill_viridis(option = "C")+
geom_hex(bins = 20)+
geom_smooth(method="lm")
ggplotly(p)
## `geom_smooth()` using formula 'y ~ x'
df %>%
mutate(manufacturer = fct_lump_prop(manufacturer, 0.05)) %>%
plot_ly(y = ~turbine_capacity,
color = ~manufacturer,
type = "box") %>%
layout(title = 'Manufacturers Comparison',
xaxis = list(title = "Manufacturer"),
yaxis = list(title = "capacity_k_w"))
library(reactable)
manufacturer_model_table = df %>%
group_by(manufacturer,model) %>%
summarise(
count = n(),
avg.capacity = round(mean(turbine_capacity)),
sd.capacity = round(sd(turbine_capacity))
) %>%
ungroup() %>%
mutate(sd.capacity = ifelse(is.na(sd.capacity),0,sd.capacity)) %>%
arrange(desc(count))
reactable(manufacturer_model_table,
filterable = TRUE,
showPageSizeOptions = TRUE,
bordered = TRUE,
striped = T,
compact = T)
\[1.\textit{turbine capacity}_i =\beta_0 + \beta_1\textit{rotor diameter} + \beta_2\textit{hub height}+\epsilon\\\]
\[2.\textit{turbine capacity}_i =\beta_0 + \beta_1\textit{rotor diameter} + \beta_2\textit{hub height}+\beta_3\textit{commission date}+\epsilon\\\]
\[3.\textit{turbine capacity}_i =\beta_0 + \beta_1\textit{rotor diameter} + \beta_2\textit{hub height}+\beta_3\textit{commission date}+\beta_4\textit{manufacturer}+\epsilon\\\]
library(stargazer)
# Create new data frame which includes only the varibles we will use in the model and standardize it.
df = df %>%
mutate(manufacturer = fct_lump_prop(manufacturer, 0.05))
dat = df %>%
select(which(sapply(.,class)=="numeric"),
manufacturer,
-c(objectid,project_capacity,longitude,latitude)) %>%
mutate(sqrt.rotor.d = sqrt(rotor.d)) %>%
mutate_if(is.numeric,scale) %>%
as.data.frame()
set.seed(1)
train_ind = sample(1:nrow(dat),round(0.75*nrow(dat)))
train_ols = dat[train_ind,]
test_ols = dat[-train_ind,]
basic_model1 = lm(turbine_capacity ~
rotor.d+
hub.h
,data =train_ols)
basic_model2 = lm(turbine_capacity ~
rotor.d+
hub.h+
commission.y
,data =train_ols)
base_model3 = lm(turbine_capacity ~
rotor.d+
hub.h+
commission.y+
manufacturer
,data =train_ols)
stargazer(basic_model1,basic_model2,base_model3,
digits = 2,
type = "html",
font.size = "tiny",
single.row = T,
header=FALSE,
title = "Regression Results")
| Dependent variable: | |||
| turbine_capacity | |||
| (1) | (2) | (3) | |
| rotor.d | 0.50*** (0.01) | 0.48*** (0.02) | 0.76*** (0.02) |
| hub.h | 0.32*** (0.01) | 0.31*** (0.02) | 0.20*** (0.01) |
| commission.y | 0.03** (0.02) | -0.18*** (0.02) | |
| manufacturerGE | -1.21*** (0.03) | ||
| manufacturerSenvion | -0.61*** (0.03) | ||
| manufacturerSiemens | -0.83*** (0.03) | ||
| manufacturerVestas | -0.81*** (0.03) | ||
| manufacturerOther | -0.82*** (0.05) | ||
| Constant | -0.001 (0.01) | -0.001 (0.01) | 0.78*** (0.02) |
| Observations | 4,858 | 4,858 | 4,858 |
| R2 | 0.60 | 0.60 | 0.72 |
| Adjusted R2 | 0.60 | 0.60 | 0.72 |
| Residual Std. Error | 0.63 (df = 4855) | 0.63 (df = 4854) | 0.53 (df = 4849) |
| F Statistic | 3,657.08*** (df = 2; 4855) | 2,441.09*** (df = 3; 4854) | 1,546.55*** (df = 8; 4849) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
We include interactions and check if the square root transformation of the rotor variable is justified.
\[1.\textit{turbine capacity}_i =\beta_0 + \beta_1\sqrt{\textit{rotor diameter}}\times\textit{hub height}\times\textit{manufacturer}\times\textit{commission date}+\epsilon\\\]
\[2.\textit{turbine capacity}_i =\beta_0 + \beta_1\textit{rotor diameter}\times\textit{hub height}\times\textit{commission date}\times\textit{manufacturer}+\epsilon\\\]
\[\textit{Intercept}+\underbrace{5}_{dummies}+6\sum_{k=1}^{3}{{3}\choose{k}} = \text{48 coefficients}\]
model1 = lm(turbine_capacity ~
sqrt.rotor.d*hub.h*manufacturer*commission.y,data =train_ols)
model2 = lm(turbine_capacity ~
rotor.d*hub.h*manufacturer*commission.y,data =train_ols)
stargazer(model1,model2,
style = "qje",
digits = 2,
type = "html",
header=FALSE,
title = "Regression Results",omit = c("manufacturer"))
| turbine_capacity | ||
| (1) | (2) | |
| sqrt.rotor.d | 0.19*** | |
| (0.05) | ||
| rotor.d | 0.20*** | |
| (0.05) | ||
| hub.h | -0.18*** | -0.19*** |
| (0.06) | (0.06) | |
| commission.y | 0.01 | -0.01 |
| (0.04) | (0.04) | |
| sqrt.rotor.d:hub.h | -0.20*** | |
| (0.05) | ||
| rotor.d:hub.h | -0.21*** | |
| (0.06) | ||
| sqrt.rotor.d:commission.y | 0.09** | |
| (0.04) | ||
| rotor.d:commission.y | 0.03 | |
| (0.04) | ||
| hub.h:commission.y | 0.35*** | 0.36*** |
| (0.07) | (0.08) | |
| sqrt.rotor.d:hub.h:commission.y | 0.28*** | |
| (0.02) | ||
| rotor.d:hub.h:commission.y | 0.30*** | |
| (0.03) | ||
| Constant | 0.49*** | 0.51*** |
| (0.03) | (0.04) | |
| N | 4,858 | 4,858 |
| R2 | 0.82 | 0.82 |
| Adjusted R2 | 0.81 | 0.82 |
| Residual Std. Error (df = 4810) | 0.43 | 0.43 |
| F Statistic (df = 47; 4810) | 453.71*** | 461.61*** |
| Notes: | ***Significant at the 1 percent level. | |
| **Significant at the 5 percent level. | ||
| *Significant at the 10 percent level. | ||
library(ggResidpanel)
p1 = resid_panel(model1, plots = "R")
p2 = resid_panel(model2,plots = "R")
p1+p2
\[1. \textit{RMSE} =\sqrt{\frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2 }{n}}\]
\[ 2.R^2 =1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\sum_{i=1}^{n} (y_i - \bar{y_i})^2}\]
#Monte Carlo Cross Validation
#Custon function to extract rmse
rmse.fun <- function(model) {sqrt(mean(residuals(model)^2))}
rmse.ols.model1 = rep(0,100)
rsq.ols.model1 = rep(0,100)
rmse.ols.model2 = rep(0,100)
rsq.ols.model2 = rep(0,100)
set.seed(2021)
for (i in 1:100) {
#Sample and split the data
train_ind = sample(1:nrow(df),round(0.75*nrow(df)))
train_ols = dat[train_ind,]
#extract the metrics model.1
ols.model1 = lm(turbine_capacity ~
sqrt.rotor.d*hub.h*manufacturer*commission.y,data =train_ols)
rmse.ols.model1[i] = rmse.fun(ols.model1)
rsq.ols.model1[i] = summary(ols.model1)$r.squared
#extract the metrics model.2
ols_model2 = lm(turbine_capacity ~
rotor.d*hub.h*manufacturer*commission.y,data =train_ols)
rmse.ols.model2[i] = rmse.fun(ols_model2)
rsq.ols.model2[i] = summary(ols_model2)$r.squared
}
metrics = round(
cbind.data.frame(rmse.ols.model1,
rsq.ols.model1,
rmse.ols.model2,
rsq.ols.model2),2)
apply(metrics,2,function(x) round(summary(x),5)) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped","hover","bordered")) %>%
kable_paper(full_width = F)
| rmse.ols.model1 | rsq.ols.model1 | rmse.ols.model2 | rsq.ols.model2 | |
|---|---|---|---|---|
| Min. | 0.4200 | 0.8100 | 0.4100 | 0.8100 |
| 1st Qu. | 0.4200 | 0.8200 | 0.4200 | 0.8200 |
| Median | 0.4300 | 0.8200 | 0.4200 | 0.8200 |
| Mean | 0.4258 | 0.8183 | 0.4222 | 0.8207 |
| 3rd Qu. | 0.4300 | 0.8200 | 0.4200 | 0.8200 |
| Max. | 0.4400 | 0.8300 | 0.4300 | 0.8300 |
ols.test = lm(turbine_capacity ~
rotor.d*hub.h*manufacturer*commission.y,data =test_ols)
rmse.ols.test = rmse.fun(ols.test)
rsq.ols.test = summary(ols.test)$r.squared
rmse.rsq.ols.test = rbind(rmse.ols.test,rsq.ols.test)
data.frame(OLS = rmse.rsq.ols.test)
## OLS
## rmse.ols.test 0.4171117
## rsq.ols.test 0.8313867
p.ols = test_ols %>%
mutate(preds=predict(ols.model1,test_ols)) %>%
select(c(turbine_capacity, preds)) %>%
ggplot(aes(turbine_capacity, preds)) +
geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "darkblue") +
ggtitle("Regression - Actual vs Fitted")+
coord_fixed()
p.ols
library(rpart)
dat2 = df %>%
select_if(is.numeric) %>%
select(-c(objectid,
latitude,
longitude,
project_capacity)) %>%
mutate_if(is.numeric,scale)
set.seed(1)
split = initial_split(dat2)
train_tree = training(split)
test_tree = testing(split)
# Vfold cross validation
set.seed(1)
folds = vfold_cv(v = 5 ,train_tree,strata = turbine_capacity)
tree_specification = decision_tree(cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()) %>%
set_engine("rpart") %>%
set_mode("regression")
tree_grid = grid_regular(cost_complexity(),
tree_depth(),
min_n(),
levels = 3)
# generate different combintions of cost - complexity & tree depths parameters
# min_n(): The minimum number of data points in a node that is required for the node to be split further.
set.seed(1)
tree_rs = tune_grid(
tree_specification,
turbine_capacity ~.,
resamples = folds,
grid = tree_grid,
metrics = metric_set(rmse, rsq)
)
# tune_grid() computes a set of performance metrics (e.g. accuracy or RMSE) for a pre-defined set of tuning parameters that correspond to a model or recipe across one or more resamples of the data.
autoplot(tree_rs)
final_tree = finalize_model(tree_specification, select_best(tree_rs, "rmse"))
final.fit = fit(final_tree, turbine_capacity ~ ., train_tree)
final.rs = last_fit(final_tree, turbine_capacity ~ ., split)
library(vip)
final.fit %>%
vip(geom = "col", aesthetics = list(fill = "darkblue"))+
ggtitle("Rank of variables by prediction importance")
collect_metrics(final.rs) %>%
rename(Tree = .estimate) %>%
select(-c(.estimator,.config)) %>%
mutate(OLS = rmse.rsq.ols.test) %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
kable_paper(html_font = "arial",full_width = F,font_size = 38)
| .metric | Tree | OLS |
|---|---|---|
| rmse | 0.1322390 | 0.4171117 |
| rsq | 0.9827031 | 0.8313867 |
p.tree = final.rs %>%
collect_predictions() %>%
ggplot(aes(turbine_capacity, .pred)) +
geom_abline(slope = 1, lty = 2, color = "gray50", alpha = 0.5) +
geom_point(alpha = 0.6, color = "darkblue") +
ggtitle("Tree - Actual vs Fitted")+
coord_fixed()
p.ols + p.tree