library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Go to Tidy Tuesday and get the wind turbine data from Oct. 27, 2020.
wind_turbine <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv')
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## objectid = col_double(),
## province_territory = col_character(),
## project_name = col_character(),
## total_project_capacity_mw = col_double(),
## turbine_identifier = col_character(),
## turbine_number_in_project = col_character(),
## turbine_rated_capacity_k_w = col_double(),
## rotor_diameter_m = col_double(),
## hub_height_m = col_double(),
## manufacturer = col_character(),
## model = col_character(),
## commissioning_date = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## notes = col_character()
## )
Run glimpse() on the dataframe.
glimpse(wind_turbine)
## Rows: 6,698
## Columns: 15
## $ objectid <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ province_territory <chr> "Alberta", "Alberta", "Alberta", "Alberta"…
## $ project_name <chr> "Optimist Wind Energy", "Castle River Wind…
## $ total_project_capacity_mw <dbl> 0.90, 44.00, 3.78, 3.78, 3.78, 3.78, 19.50…
## $ turbine_identifier <chr> "OWE1", "CRW1", "WWT1", "WWT2", "WWT3", "W…
## $ turbine_number_in_project <chr> "1/2", "1/60", "1/6", "2/6", "3/6", "4/6",…
## $ turbine_rated_capacity_k_w <dbl> 150, 600, 600, 600, 600, 660, 1300, 1300, …
## $ rotor_diameter_m <dbl> 23, 44, 44, 44, 44, 47, 60, 60, 60, 60, 60…
## $ hub_height_m <dbl> 30, 40, 50, 50, 50, 50, 46, 46, 46, 46, 46…
## $ manufacturer <chr> "Bonus", "Vestas", "Vestas", "Vestas", "Ve…
## $ model <chr> "AN 150/30", "V44/600", "V44/600", "V44/60…
## $ commissioning_date <chr> "1993", "1997", "1998", "1998", "1998", "2…
## $ latitude <dbl> 49.52535, 49.51264, 49.23307, 49.23303, 49…
## $ longitude <dbl> -114.0548, -114.0357, -113.6549, -113.6302…
## $ notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Use select to keep only the interesting variables. Make the date variable numeric. Make appropriate variables factors. Glimpse the resulting dataframe, which should be named “turbines”
turbines = wind_turbine %>%
select(objectid, province_territory, turbine_rated_capacity_k_w, rotor_diameter_m,
hub_height_m, manufacturer, commissioning_date, latitude, longitude) %>%
mutate(province_territory = factor(province_territory),
manufacturer = factor(manufacturer),
commissioning_date = as.numeric(commissioning_date)) %>%
glimpse()
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Rows: 6,698
## Columns: 9
## $ objectid <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ province_territory <fct> Alberta, Alberta, Alberta, Alberta, Albert…
## $ turbine_rated_capacity_k_w <dbl> 150, 600, 600, 600, 600, 660, 1300, 1300, …
## $ rotor_diameter_m <dbl> 23, 44, 44, 44, 44, 47, 60, 60, 60, 60, 60…
## $ hub_height_m <dbl> 30, 40, 50, 50, 50, 50, 46, 46, 46, 46, 46…
## $ manufacturer <fct> Bonus, Vestas, Vestas, Vestas, Vestas, Ves…
## $ commissioning_date <dbl> 1993, 1997, 1998, 1998, 1998, 2000, 2001, …
## $ latitude <dbl> 49.52535, 49.51264, 49.23307, 49.23303, 49…
## $ longitude <dbl> -114.0548, -114.0357, -113.6549, -113.6302…
Run summary() on the dataframe turbines.
summary(turbines)
## objectid province_territory turbine_rated_capacity_k_w
## Min. : 1 Ontario :2663 Min. : 65
## 1st Qu.:1675 Quebec :1991 1st Qu.:1600
## Median :3350 Alberta : 900 Median :1880
## Mean :3350 Nova Scotia : 310 Mean :1967
## 3rd Qu.:5024 British Columbia: 292 3rd Qu.:2300
## Max. :6698 Saskatchewan : 153 Max. :3750
## (Other) : 389 NA's :220
## rotor_diameter_m hub_height_m manufacturer commissioning_date
## Min. : 15.00 Min. : 24.50 Vestas :1834 Min. :1993
## 1st Qu.: 80.00 1st Qu.: 80.00 GE :1725 1st Qu.:2009
## Median : 90.00 Median : 80.00 Siemens :1248 Median :2012
## Mean : 88.62 Mean : 83.34 Enercon : 960 Mean :2011
## 3rd Qu.:100.00 3rd Qu.: 92.00 Senvion : 643 3rd Qu.:2014
## Max. :141.00 Max. :132.00 NEG Micon: 132 Max. :2019
## (Other) : 156 NA's :868
## latitude longitude
## Min. :42.00 Min. :-135.23
## 1st Qu.:43.98 1st Qu.: -84.41
## Median :46.67 Median : -80.67
## Mean :46.76 Mean : -83.03
## 3rd Qu.:49.17 3rd Qu.: -67.85
## Max. :64.49 Max. : -52.97
##
How has the number of turbines installed per year increased over time.
count_by_year = turbines %>%
group_by(commissioning_date) %>%
summarize(count = n())
ggplot(count_by_year,aes(x = commissioning_date,y = count)) +
geom_line() +
geom_point(size = 2, color = "Red")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
How has the amount of capacity installed per year inceased over time.
cap_by_year = turbines %>%
group_by(commissioning_date) %>%
summarize(cap = sum(turbine_rated_capacity_k_w))
ggplot(cap_by_year,aes(x = commissioning_date,y = cap)) +
geom_line() +
geom_point(size = 2, color = "Red")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 3 rows containing missing values (geom_point).
Repeat the previous questions showing the differences among the provinces.
One possibility is to add facet_wrap() to the existing graph. I’ll just do it with the count.
count_by_year = turbines %>%
group_by(commissioning_date,province_territory) %>%
summarize(count = n())
## `summarise()` has grouped output by 'commissioning_date'. You can override using the `.groups` argument.
ggplot(count_by_year,aes(x = commissioning_date,y = count)) +
geom_line() +
geom_point(size = 2, color = "Red") +
facet_wrap(~province_territory)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## Warning: Removed 6 rows containing missing values (geom_point).
What about plotting all of the provinces in one graph?
count_by_year = turbines %>%
group_by(commissioning_date,province_territory) %>%
summarize(count = n())
## `summarise()` has grouped output by 'commissioning_date'. You can override using the `.groups` argument.
ggplot(count_by_year,aes(x = commissioning_date,y = count)) +
geom_line(aes(color = province_territory)) +
geom_point(size = 1, aes(color = province_territory))
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 rows containing missing values (geom_point).
Show how current installed capacity varies among the provinces. Put the province names on the y-axis.
cop_by_province = turbines %>%
group_by(province_territory) %>%
summarize(cap = sum(turbine_rated_capacity_k_w))
ggplot(cop_by_province,aes(x = cap,y = province_territory)) +
geom_col()
## Warning: Removed 1 rows containing missing values (position_stack).
cop_by_province = turbines %>%
group_by(province_territory) %>%
summarize(cap = sum(turbine_rated_capacity_k_w))
ggplot(cop_by_province,aes(x = cap,y = reorder(province_territory,cap))) +
geom_col()
## Warning: Removed 1 rows containing missing values (position_stack).