The Tidy Tuesday dataset for 2020-10-27, Canadian Wind Turbines, has been released. First, we will explore the data and see what insights we can obtain; then we will fit a model to predict the rated capacity of the turbines.
Link: Canadian Wind Turbines
wind_turbine <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
##
## -- Column specification --------------------------------------------------------
## cols(
## objectid = col_double(),
## province_territory = col_character(),
## project_name = col_character(),
## total_project_capacity_mw = col_double(),
## turbine_identifier = col_character(),
## turbine_number_in_project = col_character(),
## turbine_rated_capacity_k_w = col_double(),
## rotor_diameter_m = col_double(),
## hub_height_m = col_double(),
## manufacturer = col_character(),
## model = col_character(),
## commissioning_date = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## notes = col_character()
## )
skimr::skim(wind_turbine)
| Name | wind_turbine |
| Number of rows | 6698 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| province_territory | 0 | 1.00 | 5 | 25 | 0 | 12 | 0 |
| project_name | 1 | 1.00 | 5 | 41 | 0 | 268 | 0 |
| turbine_identifier | 0 | 1.00 | 4 | 6 | 0 | 6683 | 0 |
| turbine_number_in_project | 0 | 1.00 | 3 | 7 | 0 | 4300 | 0 |
| manufacturer | 0 | 1.00 | 2 | 24 | 0 | 23 | 0 |
| model | 0 | 1.00 | 3 | 12 | 0 | 92 | 0 |
| commissioning_date | 0 | 1.00 | 4 | 14 | 0 | 35 | 0 |
| notes | 6064 | 0.09 | 20 | 196 | 0 | 27 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| objectid | 0 | 1.00 | 3349.50 | 1933.69 | 1.00 | 1675.25 | 3349.50 | 5023.75 | 6698.00 | ▇▇▇▇▇ |
| total_project_capacity_mw | 0 | 1.00 | 134.76 | 90.50 | 0.30 | 70.50 | 102.00 | 181.50 | 350.00 | ▆▇▅▂▂ |
| turbine_rated_capacity_k_w | 220 | 0.97 | 1967.31 | 605.93 | 65.00 | 1600.00 | 1880.00 | 2300.00 | 3750.00 | ▁▃▇▅▁ |
| rotor_diameter_m | 0 | 1.00 | 88.62 | 16.45 | 15.00 | 80.00 | 90.00 | 100.00 | 141.00 | ▁▁▇▇▁ |
| hub_height_m | 0 | 1.00 | 83.34 | 14.44 | 24.50 | 80.00 | 80.00 | 92.00 | 132.00 | ▁▂▇▃▁ |
| latitude | 0 | 1.00 | 46.76 | 3.27 | 42.00 | 43.98 | 46.67 | 49.17 | 64.49 | ▇▇▁▁▁ |
| longitude | 0 | 1.00 | -83.03 | 17.59 | -135.23 | -84.41 | -80.67 | -67.85 | -52.97 | ▁▂▁▇▅ |
wind_turbine %>%
filter(!is.na(turbine_rated_capacity_k_w)) %>%
ggplot(aes(x = turbine_rated_capacity_k_w)) +
geom_histogram(
bins = 7,
fill = "palegreen3",
col = "white",
alpha = .9
) +
theme_classic() +
theme(panel.grid.major.y = element_line(),
panel.grid.minor.y = element_line()) +
labs(title = "Histogram Of Rated Turbine Capacity", x = "Rated Capacity (kW)", y =
"Count")+
scale_y_continuous(expand=c(0,0))
We see that the most common turbine rated capacity is around 1,500-2,000 kW.
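To put a rough number on this, we can tabulate the ratings in 500 kW bands (a quick check of my own, not part of the original figure; the band width is an arbitrary choice):
wind_turbine %>%
  filter(!is.na(turbine_rated_capacity_k_w)) %>%
  # bin the ratings into 500 kW bands and count turbines per band
  count(capacity_band = cut(turbine_rated_capacity_k_w, breaks = seq(0, 4000, by = 500)),
        sort = TRUE)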
wind_turbine %>%
count(province_territory,sort = TRUE,name = "Count") %>%
ggplot(aes(y=reorder(province_territory,Count),x=Count))+
geom_col(fill="cadetblue3")+
geom_text(aes(label=Count),hjust=-.1)+
theme_classic()+
theme(panel.grid.major.x = element_line(),
panel.grid.minor.x = element_line(),
axis.ticks.y = element_blank())+
labs(title="Number of Wind Turbines by Province",x="Number of Wind Turbines",y="Province")+
scale_x_continuous(expand=c(0,0),limits = c(0,2900))
Here we see the breakdown of the number of wind turbines by province and territory. Ontario has the most, with 2,663 turbines, and Yukon has the fewest, with just two.
wind_turbine %>%
mutate(commissioning_date = parse_number(commissioning_date)) %>%
mutate(commissioning_date = as.numeric(commissioning_date))%>%
ggplot(aes(x=commissioning_date))+
#geom_histogram(binwidth = 1,fill="cadetblue3",col="black")+
geom_bar(fill="cadetblue3",col="white")+
geom_text(stat="count", aes(label=..count.. ),vjust=-.5,size=3)+
theme_classic()+
theme(panel.grid.major = element_line(),
panel.grid.minor = element_line())+
labs(title="Despite A Large Number of Turbines Commissioned\nDuring The Early Part Of The 2010s,\nThere has Been A Sharp Decline Over Recent Years", subtitle="Number of Commissioned Wind Turbines by Year",x="Year",y="Number of Wind Turbines")+
scale_y_continuous(expand = c(0,0),limits = c(0,1200))
The data suggest that wind turbines started coming into use around 1999/2000. The number commissioned gradually increased, reaching a peak in 2014, and has since declined.
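To pin down the peak year, we can count commissionings per year directly (a small check of my own; as in the plot above, parse_number() keeps only the first year of ranges such as "2014/2015"):
wind_turbine %>%
  mutate(year = parse_number(commissioning_date)) %>%
  count(year, sort = TRUE) %>%  # years with the most turbines commissioned
  slice_head(n = 5)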
wind_turbine %>%
mutate( model= fct_lump_n(model,n=10)) %>%
count(model,sort = TRUE) %>%
ggplot(aes(y=reorder(model,n),x=n ))+
geom_col(fill="cadetblue3")+
geom_text(aes(label=n),hjust=-.1)+
theme_classic()+
theme(panel.grid.major.x = element_line(),
panel.grid.minor.x = element_line(),
axis.ticks.y = element_blank())+
labs(title="Top 10 Most Common Wind Turbine Models",x="Number of Models",y="Model")+
scale_x_continuous(expand=c(0,0),limits = c(0,3000))
We reduce the 92 model types in the dataset to the top 10 most common, categorising the remainder as “Other”.
Let’s see if there is any difference in rated capacity between the models.
counts = wind_turbine %>%
mutate( model= fct_lump_n(model,n=10)) %>%
count(model,sort = TRUE) # Allows us to order factors by count
wind_turbine %>%
mutate( model= fct_lump_n(model,n=10)) %>%
left_join(counts,by="model") %>%
filter(!is.na(turbine_rated_capacity_k_w) ) %>%
ggplot(aes(x= fct_reorder(model,-n) ,y=turbine_rated_capacity_k_w))+
#geom_violin(fill="palegreen3",draw_quantiles = c(0.25,0.5,0.75) )+
geom_boxplot(fill="palegreen3")+
theme_classic()+
theme(panel.grid.major.y = element_line(),
panel.grid.minor.y = element_line())+
labs(title = "Differences of Rated Capacity of Turbines Between Models",subtitle="Models Ordered From Most To Least Common" , x="Model",y="Turbine Rated Capacity (KW)")
Let’s look at the differences between regions
counts = wind_turbine %>%
mutate( province_territory = fct_lump_n(province_territory ,n=8)) %>%
count(province_territory,sort = TRUE)
wind_turbine %>%
mutate( province_territory= fct_lump_n(province_territory,n=8)) %>%
left_join(counts,by="province_territory") %>%
filter(!is.na(turbine_rated_capacity_k_w) ) %>%
ggplot(aes(x= fct_reorder(province_territory,-n) ,y=turbine_rated_capacity_k_w))+
#geom_violin(fill="palegreen3",draw_quantiles = c(0.25,0.5,0.75) )+
geom_boxplot(fill="palegreen3")+
theme_classic()+
theme(panel.grid.major.x = element_line(),
panel.grid.minor.x = element_line())+
labs(title = "Differences of Rated Capacity of Turbines Between Regions",subtitle="Models Ordered From Most To Least Common" , x="Region",y="Turbine Rated Capacity (KW)")+
coord_flip()
The boxplots hint that the values may be normally distributed within each region, but let’s use another plot to visually confirm whether this is really the case.
wind_turbine %>%
mutate( province_territory= fct_lump_n(province_territory,n=8)) %>%
left_join(counts,by="province_territory") %>%
filter(!is.na(turbine_rated_capacity_k_w) ) %>%
ggplot(aes(y= fct_reorder(province_territory,-n) ,x= turbine_rated_capacity_k_w ,fill = stat(x) ))+
#geom_violin(fill="palegreen3",draw_quantiles = c(0.25,0.5,0.75) )+
#geom_boxplot(fill="palegreen3")+
geom_density_ridges(fill="palegreen3",rel_min_height = 0.01)+
#geom_density_ridges_gradient(rel_min_height = 0.01) +
#scale_fill_viridis_c( option = "D", begin = .2) +
theme_classic()+
theme(panel.grid.major.y = element_line(),
panel.grid.minor.y = element_line())+
labs(title = "Differences of Rated Capacity of Turbines Between Regions",subtitle="Models Ordered From Most To Least Common" , y="Region",x="Turbine Rated Capacity (kW)")
## Picking joint bandwidth of 142
We see several distinct modes within each region, indicating that the rated capacity is not normally distributed.
Also of note are the differences in rated capacity between regions: some regions are clearly associated with particular ratings. For example, in New Brunswick many of the turbines are rated at 3,000 kW, while in Saskatchewan most turbines are rated at just under 2,000 kW.
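If we wanted a formal check to go alongside the visual one, a per-region Shapiro-Wilk test is one option. This is a minimal sketch of my own, not part of the original analysis; note that shapiro.test() only accepts between 3 and 5,000 non-constant values, hence the extra filter:
wind_turbine %>%
  filter(!is.na(turbine_rated_capacity_k_w)) %>%
  group_by(province_territory) %>%
  # shapiro.test() needs 3-5000 observations that are not all identical
  filter(n() >= 3, n() <= 5000,
         n_distinct(turbine_rated_capacity_k_w) > 1) %>%
  summarise(shapiro_p = shapiro.test(turbine_rated_capacity_k_w)$p.value)
Very small p-values here would be consistent with the multimodal shapes seen in the ridgeline plot.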
wind_turbine %>%
filter(!is.na(turbine_rated_capacity_k_w) ) %>%
mutate(commissioning_date = parse_number(commissioning_date)) %>%
mutate(commissioning_date = as.numeric(commissioning_date))%>%
ggplot(aes(x=commissioning_date,y=turbine_rated_capacity_k_w) )+
geom_jitter(height = 250,width=.5,alpha=0.25,col="palegreen3")+
# geom_bin2d()+
theme_classic()+
theme(panel.grid.major = element_line(),
panel.grid.minor = element_line())+
#scale_fill_viridis_c(option="B",begin=.2,end=.8)
labs(title = "Turbines Are Rated With Higher Capacities As Time Passes",x="Commisioning Date",y="Turbine Rated Capacity (kW)")
Newer turbines tend to have a higher rated capacity.
table = wind_turbine %>%
  mutate(province_territory = fct_lump_n(province_territory, n = 8),
         model = fct_lump_n(model, n = 10)) %>%
  janitor::tabyl(province_territory, model) %>%   # cross-tabulate region by model
  janitor::adorn_totals(c("row", "col"))
table %>%
knitr::kable()
| province_territory | E-82 | GE 1.5SLE | GE 1.6-100 | MM82 | MM92 | SWT 2.3-101 | V47/660 | V80 | V82/1650 | V90/1800 | Other | Total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alberta | 0 | 87 | 51 | 0 | 0 | 0 | 177 | 0 | 0 | 83 | 502 | 900 |
| British Columbia | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 292 | 292 |
| Manitoba | 0 | 0 | 0 | 0 | 0 | 60 | 0 | 0 | 0 | 0 | 73 | 133 |
| New Brunswick | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 119 | 119 |
| Nova Scotia | 5 | 54 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 251 | 310 |
| Ontario | 0 | 325 | 319 | 10 | 49 | 780 | 1 | 0 | 175 | 196 | 808 | 2663 |
| Quebec | 255 | 545 | 0 | 170 | 357 | 0 | 0 | 60 | 0 | 0 | 604 | 1991 |
| Saskatchewan | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 83 | 0 | 0 | 63 | 153 |
| Other | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 55 | 0 | 0 | 82 | 137 |
| Total | 267 | 1011 | 370 | 180 | 406 | 840 | 178 | 198 | 175 | 279 | 2794 | 6698 |
We can see a breakdown of the turbines by model and region. The most common combination, excluding the “Other” category, is the SWT 2.3-101 model in Ontario, with 780 turbines.
Next, let’s look at how the physical dimensions of the turbines relate to their rated capacity.
wind_turbine %>%
filter(!is.na(turbine_rated_capacity_k_w)) %>%
pivot_longer ( cols = c(rotor_diameter_m,hub_height_m) ) %>%
ggplot(aes(y=turbine_rated_capacity_k_w,x=value))+
geom_point(col="palegreen3",size=4)+
facet_wrap(name~.,scales = "free")+
theme_bw()+
labs(title="Bigger Turbines Have A Higher Rating",x="Value (m)",y="Rated Capacity (kW)")
We clearly see that as the hub height and rotor diameter of a turbine increase, so does its rated capacity.
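To put rough numbers on this trend, we can compute the correlations directly (my own quick check, not in the original post):
wind_turbine %>%
  filter(!is.na(turbine_rated_capacity_k_w)) %>%
  summarise(cor_rotor_diameter = cor(rotor_diameter_m, turbine_rated_capacity_k_w),
            cor_hub_height     = cor(hub_height_m, turbine_rated_capacity_k_w))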
I will now clean up the data for modelling.
turbines = wind_turbine %>%
select( turbine_rated_capacity_k_w, rotor_diameter_m, hub_height_m, commissioning_date, province_territory,model )%>%
filter( !is.na(turbine_rated_capacity_k_w) ) %>%
transmute( capacity = turbine_rated_capacity_k_w/1000, # convert from kW to MW
rotor_diameter = rotor_diameter_m,
hub_height = hub_height_m,
year = parse_number(commissioning_date),
year = as.numeric(year),
territory = fct_lump_n(province_territory,n=8),
model= fct_lump_n(model,n=10)
) %>%
mutate( territory = fct_relevel(territory,"Other"),
model = fct_relevel(model,"Other") )
slice_sample(turbines,n=10) %>%
knitr::kable(caption = "Random Sample of 10 rows of the newly created turbines dataset")
| capacity | rotor_diameter | hub_height | year | territory | model |
|---|---|---|---|---|---|
| 1.500 | 82.5 | 80.0 | 2012 | Ontario | Other |
| 2.221 | 93.0 | 80.0 | 2013 | Ontario | Other |
| 1.650 | 82.0 | 80.0 | 2010 | Ontario | V82/1650 |
| 2.000 | 82.0 | 85.0 | 2013 | Quebec | E-82 |
| 1.500 | 77.0 | 80.0 | 2006 | Ontario | GE 1.5SLE |
| 2.850 | 103.0 | 98.0 | 2014 | Ontario | Other |
| 2.350 | 92.0 | 98.0 | 2014 | Quebec | Other |
| 1.500 | 77.0 | 80.0 | 2011 | Quebec | GE 1.5SLE |
| 0.660 | 50.0 | 47.0 | 2003 | Alberta | V47/660 |
| 3.200 | 113.0 | 99.5 | 2018 | Ontario | Other |
Let’s fit some models using tidymodels.
set.seed(411)
turbines_split = initial_split(turbines,strata = capacity)
train = training(turbines_split)
test = testing(turbines_split)
Since we can see roughly linear relationships in our data, we will use a linear regression model and an SVM with a degree-one (linear) polynomial kernel.
linear_rec = recipe( capacity ~ . , data = train ) %>%
step_dummy(all_nominal() ) %>%
step_zv(all_predictors())
svm_rec = recipe(capacity ~ ., data=train) %>%
step_dummy(all_nominal()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lin_mod = linear_reg() %>%
set_engine("lm")
svm_mod = svm_poly(degree = 1, cost = tune(), scale_factor = tune() ) %>%
set_engine("kernlab") %>%
set_mode("regression")
lin_mod
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
svm_mod
## Polynomial Support Vector Machine Specification (regression)
##
## Main Arguments:
## cost = tune()
## degree = 1
## scale_factor = tune()
##
## Computational engine: kernlab
lin_wf = workflow() %>%
add_model(lin_mod)%>%
add_recipe(linear_rec)
svm_wf = workflow() %>%
add_model( svm_mod) %>%
add_recipe( svm_rec )
lin_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: linear_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 2 Recipe Steps
##
## * step_dummy()
## * step_zv()
##
## -- Model -----------------------------------------------------------------------
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
svm_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: svm_poly()
##
## -- Preprocessor ----------------------------------------------------------------
## 3 Recipe Steps
##
## * step_dummy()
## * step_zv()
## * step_normalize()
##
## -- Model -----------------------------------------------------------------------
## Polynomial Support Vector Machine Specification (regression)
##
## Main Arguments:
## cost = tune()
## degree = 1
## scale_factor = tune()
##
## Computational engine: kernlab
lin_fit = fit(lin_wf,data=train)
lin_coefs = lin_fit %>% pull_workflow_fit() %>% tidy(conf.int =TRUE)
lin_coefs %>%
knitr::kable(caption="Summary of Linear Regression Fit")
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 9.2130122 | 4.4434301 | 2.073401 | 0.0381878 | 0.5018699 | 17.9241544 |
| rotor_diameter | 0.0222306 | 0.0006204 | 35.835154 | 0.0000000 | 0.0210145 | 0.0234468 |
| hub_height | 0.0096676 | 0.0006042 | 16.001133 | 0.0000000 | 0.0084831 | 0.0108521 |
| year | -0.0047685 | 0.0022314 | -2.136990 | 0.0326488 | -0.0091431 | -0.0003939 |
| territory_Alberta | -0.2894111 | 0.0355609 | -8.138454 | 0.0000000 | -0.3591268 | -0.2196955 |
| territory_British.Columbia | -0.1745982 | 0.0389854 | -4.478558 | 0.0000077 | -0.2510272 | -0.0981692 |
| territory_Manitoba | -0.4018365 | 0.0466016 | -8.622799 | 0.0000000 | -0.4931969 | -0.3104761 |
| territory_New.Brunswick | 0.2757164 | 0.0458280 | 6.016333 | 0.0000000 | 0.1858727 | 0.3655601 |
| territory_Nova.Scotia | -0.2597243 | 0.0382819 | -6.784511 | 0.0000000 | -0.3347743 | -0.1846743 |
| territory_Ontario | -0.3523646 | 0.0349704 | -10.076080 | 0.0000000 | -0.4209225 | -0.2838067 |
| territory_Quebec | -0.1623267 | 0.0338899 | -4.789820 | 0.0000017 | -0.2287664 | -0.0958871 |
| territory_Saskatchewan | -0.3235824 | 0.0415129 | -7.794746 | 0.0000000 | -0.4049665 | -0.2421982 |
| model_E.82 | 0.0010192 | 0.0249514 | 0.040847 | 0.9674195 | -0.0478970 | 0.0499354 |
| model_GE.1.5SLE | -0.3703821 | 0.0147416 | -25.124938 | 0.0000000 | -0.3992823 | -0.3414818 |
| model_GE.1.6.100 | -0.6705772 | 0.0221897 | -30.220162 | 0.0000000 | -0.7140792 | -0.6270753 |
| model_MM82 | -0.0572432 | 0.0293861 | -1.947969 | 0.0514765 | -0.1148533 | 0.0003669 |
| model_MM92 | -0.2835722 | 0.0205366 | -13.808167 | 0.0000000 | -0.3238332 | -0.2433112 |
| model_SWT.2.3.101 | -0.1370336 | 0.0180071 | -7.609991 | 0.0000000 | -0.1723357 | -0.1017316 |
| model_V47.660 | -0.2687488 | 0.0327449 | -8.207359 | 0.0000000 | -0.3329436 | -0.2045540 |
| model_V80 | -0.1372679 | 0.0322460 | -4.256902 | 0.0000211 | -0.2004846 | -0.0740512 |
| model_V82.1650 | -0.2381672 | 0.0282525 | -8.429943 | 0.0000000 | -0.2935550 | -0.1827794 |
| model_V90.1800 | -0.3162529 | 0.0223391 | -14.156896 | 0.0000000 | -0.3600478 | -0.2724581 |
Graphically, we have the following:
lin_coefs %>%
filter( term !="(Intercept)" ) %>%
filter( !str_starts(term,"territory|model" ) ) %>%
ggplot(aes(x=estimate,y=term))+
geom_point(size=3)+
geom_errorbar(aes(xmin=conf.low, xmax=conf.high),width=0.1)+
geom_vline(xintercept = 0,lty=2,col="firebrick3")+
theme_classic()+
theme(panel.grid.major = element_line(),
panel.grid.minor.x = element_line())+
labs(title ="Coefficient Estimates of:\n Year Commisioned, Rotor Diameter and Hub Height")
We see that, holding all other variables constant, each additional metre of rotor diameter is associated with an increase of roughly 0.022 MW in rated capacity, each additional metre of hub height with an increase of roughly 0.01 MW, and each later year of commissioning with a small decrease of roughly 0.005 MW.
Next, we will see if there is a difference between the territories, relative to the base level of “Other”.
lin_coefs %>%
filter( term !="(Intercept)" ) %>%
filter( str_starts(term,"territory" ) ) %>%
mutate ( term = str_remove_all(term,"territory_")) %>%
ggplot(aes(x=estimate,y=reorder(term,-estimate) ))+
geom_point(size=3)+
geom_errorbar(aes(xmin=conf.low, xmax=conf.high),width=0.2)+
geom_vline(xintercept = 0,lty=2,col="firebrick3")+
theme_classic()+
theme(panel.grid.major = element_line(),
panel.grid.minor.x = element_line())+
labs(title = "Coefficient Estimates For Territory",x="Coefficent Estimate",y="Territory")
We see that all territories except New Brunswick are associated with a decrease in capacity relative to the “Other” baseline, whereas New Brunswick is associated with a higher rated capacity than the “Other” territories.
lin_coefs %>%
filter( term !="(Intercept)" ) %>%
filter( str_starts(term,"model" ) ) %>%
mutate ( term = str_remove_all(term,"model_")) %>%
ggplot(aes(x=estimate,y=reorder(term,-estimate) ))+
geom_point(size=3)+
geom_errorbar(aes(xmin=conf.low, xmax=conf.high),width=0.2)+
geom_vline(xintercept = 0,lty=2,col="firebrick3")+
theme_classic()+
theme(panel.grid.major = element_line(),
panel.grid.minor.x = element_line())+
labs(title = "Coefficient Estimates For Turbine Model",x="Coefficent Estimate",y="Turbine Model")
We can immediately see that the E-82 and MM82 models are not statistically different from the “Other” baseline. All other models, however, are associated with a decrease in rated capacity, holding all other variables constant.
We can get an estimate of model performance using cross-validation.
set.seed(411)
train_folds = vfold_cv(data=train,v = 10,strata = capacity)
lin_train_res =
fit_resamples(
lin_wf,
resamples = train_folds,
control = control_resamples(save_pred = TRUE)
)
lin_train_res %>%
collect_metrics() %>%
knitr::kable(caption="Performance with 10-fold CV")
| .metric | .estimator | mean | n | std_err |
|---|---|---|---|---|
| rmse | standard | 0.3016236 | 10 | 0.0029794 |
| rsq | standard | 0.7533284 | 10 | 0.0066304 |
We can also see how it performed on each of the 10 folds.
lin_train_res %>%
unnest(.metrics) %>%
ggplot(aes(x=.metric,y=.estimate))+
#geom_violin(aes(fill=.metric),alpha=0.2,show.legend = FALSE,draw_quantiles = c(.25,.5,.75) )+
geom_boxplot(aes(fill=.metric),alpha=0.2,show.legend = FALSE )+
#geom_jitter(aes(col=.metric),height = 0 , width = .1,size=4)+
geom_point(aes(col=.metric),size=4,show.legend = FALSE)+
facet_wrap(.~.metric,scales="free")+
theme_bw()+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())+
labs(title = "Estimated Linear Regression Performace Using Resampling", x = NULL,y="Estimate")
Here we will tune the hyperparameters and try to find the values that give the lowest RMSE.
start = Sys.time()
set.seed(411)
svm_res = svm_wf %>%
tune_grid( resamples = train_folds ,
grid = grid_max_entropy(cost(),scale_factor(),size=12),
control = control_resamples(save_pred = TRUE)
)
end = Sys.time()
end - start
## Time difference of 4.815124 mins
#svm_res %>% collect_metrics() %>%
# pivot_longer( cols = c(cost,scale_factor) ) %>%
# mutate( value = log(value)) %>%
# ggplot(aes(x=value,y=mean))+
# geom_point(aes(col=.config) )+
# facet_grid(name~.metric, scales = "free" )
svm_res %>% autoplot( )
It seems that the higher the scale factor, the better the model does. Not much can be said about the cost, however.
The table below shows the best-performing models in terms of RMSE.
svm_res %>% collect_metrics() %>%
filter(.metric == "rmse") %>%
arrange(mean) %>%
slice_head(n=5) %>%
knitr::kable()
| cost | scale_factor | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 0.1684447 | 0.0402025 | rmse | standard | 0.3180527 | 10 | 0.0037822 | Model08 |
| 2.2370083 | 0.0863493 | rmse | standard | 0.3233718 | 10 | 0.0041345 | Model02 |
| 29.3020047 | 0.0068977 | rmse | standard | 0.3233871 | 10 | 0.0041254 | Model05 |
| 1.7578389 | 0.0001275 | rmse | standard | 0.3643379 | 10 | 0.0034872 | Model07 |
| 0.0090896 | 0.0144006 | rmse | standard | 0.4052306 | 10 | 0.0035779 | Model04 |
svm_res %>%
select_best("rmse") %>%
knitr::kable()
| cost | scale_factor | .config |
|---|---|---|
| 0.1684447 | 0.0402025 | Model08 |
Let’s use the best parameters
final_svm = svm_wf %>% finalize_workflow( svm_res %>% select_best("rmse") )
final_svm
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: svm_poly()
##
## -- Preprocessor ----------------------------------------------------------------
## 3 Recipe Steps
##
## * step_dummy()
## * step_zv()
## * step_normalize()
##
## -- Model -----------------------------------------------------------------------
## Polynomial Support Vector Machine Specification (regression)
##
## Main Arguments:
## cost = 0.168444715797351
## degree = 1
## scale_factor = 0.0402025484551983
##
## Computational engine: kernlab
trained_svm = final_svm %>%
fit(data = train)
lin_test_res = test %>% bind_cols(
predict(lin_fit,new_data = test)
)%>%
metrics(truth = capacity , .pred) %>%
filter(.metric !="mae") %>%
mutate(model = "Linear")
svm_test_res = test %>% bind_cols(
predict(trained_svm ,new_data = test)
)%>%
metrics(truth = capacity , .pred) %>%
filter(.metric !="mae") %>%
mutate(model = "SVM")
lin_test_res %>% bind_rows(
svm_test_res) %>%
select(-.estimator) %>%
pivot_wider( names_from = .metric, values_from = .estimate ) %>%
knitr::kable(caption="Performance on Test Set")
| model | rmse | rsq |
|---|---|---|
| Linear | 0.3021730 | 0.7491079 |
| SVM | 0.3118845 | 0.7325560 |
We see that the linear regression model did slightly better in terms of both RMSE and R\(^2\).
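For reference, the two metrics compared here are the root mean squared error and R\(^2\). As far as I am aware, yardstick’s default rsq() is the squared correlation between the observed and predicted capacities (rsq_trad() gives the traditional sum-of-squares version):

\[
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}, \qquad
R^2 = \operatorname{cor}(y, \hat{y})^2
\]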
all_preds = test %>% bind_cols(
predict(lin_fit,new_data = test)
) %>% bind_cols(
predict(trained_svm ,new_data = test)
) %>%
rename( Linear = .pred...7, SVM = .pred...8 ) %>%
pivot_longer(cols = c(Linear,SVM), names_to="Model",values_to="Pred" )
## New names:
## * .pred -> .pred...7
## * .pred -> .pred...8
all_preds %>%
ggplot(aes(x=capacity,y=Pred))+
geom_abline(slope=1,lty=2,col="grey70",size=1)+
geom_point(col="palegreen4",size=2,alpha=0.4)+
facet_wrap(.~Model)+
theme_bw()+
theme(panel.grid.major = element_line(),
panel.grid.minor = element_line())+
labs(title="Final Performance on Test Set",x="True Capacity (mW)",y="Predicted Capacity (mW)")
We notice that the linear regression model and the SVM performed almost identically, and that they both struggled at the same points, namely around true capacities of 2.5 and 3 MW. We can investigate this further.
Let’s see what caused these large residuals.
all_res = all_preds %>%
mutate( residual = (capacity - Pred)^2 ) %>% # squared residuals
arrange(-residual)
all_res %>%
# filter(Model == "SVM") %>%
ggplot(aes(x=residual,y=territory,)) +
geom_jitter(aes(col=territory),width=0.05,height=0.1,size=3, show.legend = FALSE,)+
# geom_boxplot(aes(fill=territory),alpha=0.4,show.legend = FALSE)+
theme_bw()+
facet_wrap(.~Model,nrow = 2)+
labs(title="Residual of Turbine Locations",x="Residual", y="Territory")
We see that the SVM model has some trouble with turbines from British Columbia.
all_res %>%
# filter(Model == "SVM") %>%
ggplot(aes(x=residual,y=model,)) +
geom_jitter(aes(col=model),width=0.05,height=0.1,size=3 ,show.legend = FALSE,)+
# geom_boxplot(aes(fill=territory),alpha=0.4,show.legend = FALSE)+
theme_bw()+
facet_wrap(.~Model,nrow = 2)+
labs(title="Residual of Turbine Models",x="Residual", y="Turbine Model")
It seems that the “Other” models are causing issues for both regression models. This shouldn’t be too surprising: we originally had 92 model types and binned everything outside the top 10 most common into “Other”, so we may be losing information that would help us predict the observations with large residuals.
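To get a sense of how much detail the lumping hides, we can count how many distinct raw model names end up in “Other” (a quick check of my own):
wind_turbine %>%
  mutate(model_lumped = fct_lump_n(model, n = 10)) %>%
  filter(model_lumped == "Other") %>%
  summarise(n_turbines = n(),
            n_distinct_models = n_distinct(model))  # how many raw model types are hidden in "Other"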
all_res %>%
# filter(Model == "SVM") %>%
ggplot(aes(x=residual,y=year,)) +
geom_jitter(width=0.05,height=0.1,size=3 ,alpha=0.1,show.legend = FALSE,)+
# geom_boxplot(aes(fill=territory),alpha=0.4,show.legend = FALSE)+
theme_bw()+
facet_wrap(.~Model,nrow = 2)+
labs(title="Residual of Year Commissioned",x="Residual", y="Year")
Both models seem to perform less well for turbines commissioned around 2010.
all_res %>%
ggplot( aes(x=rotor_diameter,y=hub_height))+
geom_point(aes(col=residual)) +
facet_wrap(.~Model)+
theme_bw()+
theme(panel.grid.major = element_line(),
panel.grid.minor = element_line())+
scale_color_viridis_c(option="C",begin=1,end=0)+
labs(title="Residual of Turbine Physical Characteristics",x="Rotor Diameter (m)", y="Hub Height (m)")
There appears to be no clear pattern of larger residuals with respect to the physical characteristics of the turbines themselves.
Let’s see if there were any major changes in the physical characteristics over the years.
turbines %>%
pivot_longer( cols=c(rotor_diameter,hub_height) ) %>%
ggplot(aes(x=year,y=value))+
geom_jitter(width=0.1,height=2,alpha=0.1)+
#geom_smooth(method="lm", col="red")+ # for predicting future lengths
facet_wrap(.~name)+
theme_bw()+
theme(panel.grid.major = element_line(),
panel.grid.minor = element_line())+
labs(title="Turbines Are Becoming Ever Larger",x="Year", y= "Value (m)")
We observe a pretty strong trend; newer turbines tend to be bigger than older ones.
turbines %>%
select( where(is.numeric) ) %>%
cor() %>%
corrplot::corrplot(type="upper", addCoef.col = "white")
The figure above shows the correlations between the numeric variables. We note that year and rotor diameter are highly correlated, so removing one of them could improve model performance.
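One way to act on this inside the modelling pipeline would be a correlation filter in the recipe. A minimal sketch, where step_corr() and the 0.9 threshold are my additions and not part of the workflows fitted above:
linear_rec_filtered = recipe(capacity ~ ., data = train) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors()) %>%
  # drop one variable from each pair of predictors with correlation above 0.9
  step_corr(all_predictors(), threshold = 0.9)
This recipe could then be swapped into the existing workflow with update_recipe() and re-fitted to check whether the cross-validated RMSE improves.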
Let’s see what the model predicts when we produce sensible fake data for the years 2020 to 2025. We will focus on the regions of Ontario, Quebec, and Alberta, and on the model type “Other”, since this has been the recent trend in those regions. We will also vary the sizes of the turbines.
turbines %>%
filter(territory %in% c("Ontario","Quebec","Alberta") )%>%
group_by(year) %>%
count(model,territory) %>%
ggplot(aes(x=year,y=n,col=model))+
geom_point(size=5)+
geom_line(size=2)+
facet_wrap(.~territory,nrow=3)+
theme_bw()+
theme(panel.grid.major = element_line(),
panel.grid.minor = element_line())+
labs(title= "Trend of Turbine Model Used in Alberta, Ontario, and Quebec",x="Year",y="Count")
We use a simple linear model to predict the turbines’ physical characteristics, with year as the only predictor.
phys_pred <-
function(var) {
# Helper Function to get the estimated physical dimensions
predict(lm(var ~ year , data = turbines) , newdata = data.frame(year =
seq(2020, 2025, 1))) %>%
as_tibble() %>%
bind_cols(year = seq(2020, 2025, 1))
}
physical_preds = phys_pred(turbines$rotor_diameter) %>%
rename(rotor_diameter = value) %>%
select(-year) %>%
bind_cols(phys_pred (turbines$hub_height) %>%
rename(hub_height = value))
fake_data = crossing(
year = (2020:2025),
territory = c("Ontario", "Quebec", "Alberta"),
model = "Other"
) %>%
left_join(physical_preds, by = "year")
fake_data %>%
knitr::kable(caption="Fake Data")
| year | territory | model | rotor_diameter | hub_height |
|---|---|---|---|---|
| 2020 | Alberta | Other | 115.7408 | 105.3071 |
| 2020 | Ontario | Other | 115.7408 | 105.3071 |
| 2020 | Quebec | Other | 115.7408 | 105.3071 |
| 2021 | Alberta | Other | 118.8070 | 107.8143 |
| 2021 | Ontario | Other | 118.8070 | 107.8143 |
| 2021 | Quebec | Other | 118.8070 | 107.8143 |
| 2022 | Alberta | Other | 121.8733 | 110.3214 |
| 2022 | Ontario | Other | 121.8733 | 110.3214 |
| 2022 | Quebec | Other | 121.8733 | 110.3214 |
| 2023 | Alberta | Other | 124.9395 | 112.8286 |
| 2023 | Ontario | Other | 124.9395 | 112.8286 |
| 2023 | Quebec | Other | 124.9395 | 112.8286 |
| 2024 | Alberta | Other | 128.0057 | 115.3357 |
| 2024 | Ontario | Other | 128.0057 | 115.3357 |
| 2024 | Quebec | Other | 128.0057 | 115.3357 |
| 2025 | Alberta | Other | 131.0719 | 117.8428 |
| 2025 | Ontario | Other | 131.0719 | 117.8428 |
| 2025 | Quebec | Other | 131.0719 | 117.8428 |
Now we shall create predictions using the linear regression model, and also obtain prediction intervals.
lin_fake = fake_data %>% bind_cols(
predict(lin_fit, new_data = fake_data, type = "pred_int"),
predict(lin_fit, new_data = fake_data ) )
ggplot(lin_fake, aes(year, .pred, col = territory)) +
geom_ribbon(
aes(ymin = .pred_lower, ymax = .pred_upper, fill = territory),
alpha = 0.1,
lty = 2
) +
geom_point(size = 3) +
geom_text(aes(label= round(.pred,2) ),vjust=-.5,size=4,col="black")+
geom_line(aes(group = 1), size = 1.4) +
facet_wrap(. ~ territory, nrow = 3) +
labs(
title = "Alberta, Ontario, and Quebec Can Expect To See\nHigher Rated Turbine Capacity In The Near Future",
subtitle = 'Predicted Future Rated Capacity for Model Type: "Other" ',
x = "Year",
y = "Predicted Capacity (mW)"
) +
theme_bw() +
theme(legend.position = "none")
This is based on the assumption that the physical dimensions of turbines will continue to increase linearly over time. The predictions tell us that a turbine in Quebec will have a higher rated capacity than one in Alberta or Ontario.