library(tidyverse)
library(dplyr)
library(ggplot2)
library(gt)607_Project2A_DylanGold
Codebase #1
Importing data
In this file I will tidy and analyze the dataset I posted myself in discussion 5A. This source is at https://www.kaggle.com/datasets/eugeniyosetrov/renewable-power-plants?resource=download&select=renewable_capacity_timeseries.csv
url <- "https://raw.githubusercontent.com/DylanGoldJ/607-Project-2/refs/heads/main/FileA/renewable_capacity_timeseries.csv"
df <- read_csv(
file = url,
col_names = TRUE
)
# Head is 0 values, show tail for non 0 values.
tail(df, 8)# A tibble: 8 × 40
day CH_bioenergy_capacity CH_solar_capacity CH_wind_onshore_capacity
<date> <dbl> <dbl> <dbl>
1 2020-11-25 87.2 620. 64.2
2 2020-11-26 87.2 620. 64.2
3 2020-11-27 87.2 620. 64.2
4 2020-11-28 87.2 620. 64.2
5 2020-11-29 87.2 620. 64.2
6 2020-11-30 87.2 620. 64.2
7 2020-12-01 87.2 620. 64.2
8 2020-12-02 87.2 620. 64.2
# ℹ 36 more variables: DE_bioenergy_capacity <dbl>,
# DE_geothermal_capacity <dbl>, DE_solar_capacity <dbl>,
# DE_wind_capacity <dbl>, DE_wind_offshore_capacity <dbl>,
# DE_wind_onshore_capacity <dbl>, DK_solar_capacity <dbl>,
# DK_wind_capacity <dbl>, DK_wind_offshore_capacity <dbl>,
# DK_wind_onshore_capacity <dbl>, FR_bioenergy_capacity <dbl>,
# FR_geothermal_capacity <dbl>, FR_hydro_capacity <dbl>, …
This data set has the capacity of different types of energy sources in different countries.
Tidying the data
I will use glimpse to see the formatting of the columns.
glimpse(df)Rows: 44,166
Columns: 40
$ day <date> 1900-01-01, 1900-01-02, 1900-01-03, 1…
$ CH_bioenergy_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ CH_solar_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ CH_wind_onshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DE_bioenergy_capacity <dbl> 0.00, 0.08, 0.08, 0.08, 0.08, 0.08, 0.…
$ DE_geothermal_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DE_solar_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DE_wind_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DE_wind_offshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DE_wind_onshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DK_solar_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DK_wind_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DK_wind_offshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ DK_wind_onshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FR_bioenergy_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FR_geothermal_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FR_hydro_capacity <dbl> 122.496, 122.496, 122.496, 122.496, 12…
$ FR_marine_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FR_solar_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FR_wind_onshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_bioenergy_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_hydro_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_marine_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_solar_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_wind_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_wind_offshore_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-GBN_wind_onshore_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-NIR_bioenergy_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-NIR_solar_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-NIR_wind_onshore_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_bioenergy_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_hydro_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_marine_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_solar_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_wind_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_wind_offshore_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `GB-UKM_wind_onshore_capacity` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SE_wind_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SE_wind_offshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ SE_wind_onshore_capacity <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
We can see that there are 40 columns when we realistically just need 4 columns. We can try to convert our columns into day, country, energy_type, energy_amount. I will try to separate the names by the first appearance of ‘_’.
energy_data <- df %>%
pivot_longer(
cols = !c("day"),
names_to = c("country", "energy_type"),
names_pattern = "^([^_]+)_(.*)_capacity", # This regex is starting with : first grouping - characters that are not _, then _, second grouping - rest of string without capcacity.
values_to = "energy_amount"
)
tail(energy_data, 8)# A tibble: 8 × 4
day country energy_type energy_amount
<date> <chr> <chr> <dbl>
1 2020-12-02 GB-UKM marine 34.5
2 2020-12-02 GB-UKM solar 8473.
3 2020-12-02 GB-UKM wind 23020
4 2020-12-02 GB-UKM wind_offshore 9693.
5 2020-12-02 GB-UKM wind_onshore 13327.
6 2020-12-02 SE wind 9514.
7 2020-12-02 SE wind_offshore 191.
8 2020-12-02 SE wind_onshore 9323.
We now have our data in a longer format with just 4 columns. I will check to see NA values that we may need to supplement.
colSums(is.na(energy_data)) day country energy_type energy_amount
0 0 0 0
We don’t have an NA values.
I will also convert the country codes to full names
country_names <- c(
"DE" = "Germany",
"GB-UKM" = "United Kingdom",
"GB-GBN" = "Great Britain",
"FR" = "France",
"DK" = "Denmark",
"SE" = "Sweden",
"GB-NIR" = "Northern Ireland",
"CH" = "Switzerland"
)
energy_data$country <- country_names[energy_data$country]
head(energy_data)# A tibble: 6 × 4
day country energy_type energy_amount
<date> <chr> <chr> <dbl>
1 1900-01-01 Switzerland bioenergy 0
2 1900-01-01 Switzerland solar 0
3 1900-01-01 Switzerland wind_onshore 0
4 1900-01-01 Germany bioenergy 0
5 1900-01-01 Germany geothermal 0
6 1900-01-01 Germany solar 0
Now that the data is in a tidy format we can start to analyze it.
Analysis
My first interest is finding the difference in each energy type in terms of raw generation among all countries. We can do this per each year.
First lets group by the year, country and energy type
yearly_energy_data <- energy_data %>%
mutate(year = year(day)) %>%
group_by(year, country, energy_type) %>%
summarise(
energy_yearly = sum(energy_amount),
.groups = "keep"
)
head(yearly_energy_data) #Has 0 values, in 1900s# A tibble: 6 × 4
# Groups: year, country, energy_type [6]
year country energy_type energy_yearly
<dbl> <chr> <chr> <dbl>
1 1900 Denmark solar 0
2 1900 Denmark wind 0
3 1900 Denmark wind_offshore 0
4 1900 Denmark wind_onshore 0
5 1900 France bioenergy 0
6 1900 France geothermal 0
tail(yearly_energy_data) #Shows modern values# A tibble: 6 × 4
# Groups: year, country, energy_type [6]
year country energy_type energy_yearly
<dbl> <chr> <chr> <dbl>
1 2020 United Kingdom hydro 1160156.
2 2020 United Kingdom marine 11626.
3 2020 United Kingdom solar 2853635.
4 2020 United Kingdom wind 7714480.
5 2020 United Kingdom wind_offshore 3223240.
6 2020 United Kingdom wind_onshore 4491240.
We can also just group by year and country to combine all the energy types.
yearly_energy_total <- yearly_energy_data %>%
group_by(year, country) %>%
summarise(
energy_total = sum(energy_yearly),
.groups = "keep"
)
yearly_energy_total# A tibble: 968 × 3
# Groups: year, country [968]
year country energy_total
<dbl> <chr> <dbl>
1 1900 Denmark 0
2 1900 France 44849.
3 1900 Germany 29.1
4 1900 Great Britain 0
5 1900 Northern Ireland 0
6 1900 Sweden 0
7 1900 Switzerland 0
8 1900 United Kingdom 0
9 1901 Denmark 0
10 1901 France 45211.
# ℹ 958 more rows
To display a table for this I will show the total energy for each country across all years. Keep in mind we dropped na values which may have been unreported years. For the most part it should be fairly accurate. We can also arrange it
energy_total_by_country <- yearly_energy_total %>%
group_by(country) %>%
summarise(energy_total = sum(energy_total)) %>%
arrange(desc(energy_total))
energy_total_by_country# A tibble: 8 × 2
country energy_total
<chr> <dbl>
1 Germany 507154526.
2 United Kingdom 214517654.
3 Great Britain 207324910.
4 France 82867640.
5 Denmark 59064658.
6 Sweden 45247120.
7 Northern Ireland 3777263.
8 Switzerland 2218045.
We can create a table to show this data
energy_total_by_country %>%
gt() %>%
cols_label(
country = "Country",
energy_total = "Energy Total(megawatt)"
) %>%
tab_header(title = md("Energy Generation By Country"))| Energy Generation By Country | |
| Country | Energy Total(megawatt) |
|---|---|
| Germany | 507154526 |
| United Kingdom | 214517654 |
| Great Britain | 207324910 |
| France | 82867640 |
| Denmark | 59064658 |
| Sweden | 45247120 |
| Northern Ireland | 3777263 |
| Switzerland | 2218045 |
Now that we showed a table we can also show some time series to show more analysis.
I was interested in seeing how certain types of power generation would change over time. We can look at Germany’s Energy Generation.
germany_energy <- energy_data %>%
filter(country == "Germany")
ggplot(data = germany_energy,
aes(x = day,
y = energy_amount,
color = energy_type)
) +
geom_line() +
ggtitle("Germany Energy Generation By Type Overtime") +
xlab("Year") +
ylab("Energy Generated (megawatt)") +
labs(color = "Energy Type")We can just use the data from 1990 on wards. We can see until the 1990’s energy generation was basically nothing it was compared to today in Germany. This is likely true for all the countries.
germany_energy <- germany_energy %>%
filter(day > as.Date("1990-1-1"))
ggplot(data = germany_energy,
aes(x = day,
y = energy_amount,
color = energy_type)
) +
geom_line() +
ggtitle("Germany Energy Generation By Type Overtime") +
xlab("Year") +
ylab("Energy Generated (megawatt)") +
labs(color = "Energy Type")We can see above the energy generation of Germany over time. I will also look at United Kingdom, they are producing the next most reported energy in our table.
UK_energy <- energy_data %>%
filter(country == "United Kingdom") %>%
filter(day > as.Date("1940-1-1")) #increased the range, they had some earlier reported energy generation.
ggplot(data = UK_energy,
aes(x = day,
y = energy_amount,
color = energy_type)
) +
geom_line() +
ggtitle("United Kingdom Generation By Type Overtime") +
xlab("Year") +
ylab("Energy Generated (megawatt)") +
labs(color = "Energy Type")I increased the range for United Kingdom. We can see that they have a much higher amount of certain energies like hydro electric energy. Something I thought was interesting is how the UK has not used much solar energy when compared to Germany, as well as their early adoption of hydro type energy generation. We can see both countries do rely on wind for a large portion of their energy generated.
I will show one more country, France.
france_energy <- energy_data %>%
filter(country == "France") %>%
filter(day > as.Date("1980-1-1")) #increased the range, they had some earlier reported energy generation.
ggplot(data = france_energy,
aes(x = day,
y = energy_amount,
color = energy_type)
) +
geom_line() +
ggtitle("France Energy Generation By Type Overtime") +
xlab("Year") +
ylab("Energy Generated (megawatt)")We can see for France solar energy is also a large portion of their energy gernation. We dont see some type’s of energy generation. This could indicate they may have not reported this energy type. It seems that all countries have solar as a decent portion of their energy generation. I will show a graph of just solar energy now.
solar_energy <- energy_data %>%
filter(energy_type == "solar") %>%
filter(day > as.Date("2000-1-1"))
ggplot(data = solar_energy,
aes(x = day,
y = energy_amount,
color = country)
) +
geom_line() +
ggtitle("France Energy Generation By Type Overtime") +
xlab("Year") +
ylab("Energy Generated (megawatt)")We can see most countries have some portion of their energy gerneation from solar, Switzerland and Northern Ireland being the exceptions. Germany has by far the most solar in addition to general energy generation.
Summarizing
To summarize, We were able to convert our data into a longer format and perform various types of analysis. We looked at the raw generation output in a table and saw that Germany generated by far the most energy. We looked closer at the change of different types of energy over time for different countries like Germany and the United Kingdom. There were some interesting features, like how the UK has a noticeably higher and earlier usage of hydro electric energy generation and saw that wind and solar are both popular forms of energy generation in many of these countries.
Some ways we could add on to this is to look at non-renewable energy sources and see if countries with a noticeably lower generated energy like Switzerland and Northern Ireland suppliment with fossil fuels or if it is just a result of a lower population/landmass.