#load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(broom)
library(splines)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-10
library(leaps)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(neuralnet)
##
## Attaching package: 'neuralnet'
##
## The following object is masked from 'package:dplyr':
##
## compute
#set global chunk options
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 8,
fig.height = 5
)
#set seed for reproducibility
set.seed(712)
#load owid co2 data directly from github
emissions_raw <- read_csv("https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv") %>%
clean_names()
#view structure
glimpse(emissions_raw)
## Rows: 50,411
## Columns: 79
## $ country <chr> "Afghanistan", "Afghanistan"…
## $ year <dbl> 1750, 1751, 1752, 1753, 1754…
## $ iso_code <chr> "AFG", "AFG", "AFG", "AFG", …
## $ population <dbl> 2802560, NA, NA, NA, NA, NA,…
## $ gdp <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cement_co2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cement_co2_per_capita <dbl> 0, NA, NA, NA, NA, NA, NA, N…
## $ co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_growth_abs <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_growth_prct <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_growth_abs <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_growth_prct <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_gdp <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_unit_energy <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_gdp <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_unit_energy <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ coal_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ coal_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2_per_gdp <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_cement_co2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cumulative_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_co2_including_luc <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_coal_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_flaring_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_gas_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_luc_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_oil_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_other_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ energy_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ energy_per_gdp <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ flaring_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ flaring_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ gas_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ gas_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ ghg_excluding_lucf_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ ghg_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ land_use_change_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ land_use_change_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ methane <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ methane_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ nitrous_oxide <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ nitrous_oxide_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ oil_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ oil_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ other_co2_per_capita <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ other_industry_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ primary_energy_consumption <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cement_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_co2_including_luc <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_coal_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_cement_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_co2_including_luc <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_coal_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_flaring_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_gas_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_luc_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_oil_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_other_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_flaring_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_gas_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_luc_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_oil_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_other_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_of_temperature_change_from_ghg <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_ch4 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_ghg <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_n2o <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ total_ghg <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ total_ghg_excluding_lucf <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ trade_co2 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ trade_co2_share <dbl> NA, NA, NA, NA, NA, NA, NA, …
#remove aggregated owid region codes and keep country-level iso codes
emissions_country <- emissions_raw %>%
filter(!is.na(iso_code)) %>%
filter(!str_detect(iso_code, "^OWID_"))
#create latest year snapshot
latest_year <- max(emissions_country$year, na.rm = TRUE)
emissions_latest <- emissions_country %>%
filter(year == latest_year)
#select variables used throughout the project
emissions_model <- emissions_latest %>%
select(
country,
iso_code,
year,
co2,
co2_per_capita,
methane,
nitrous_oxide,
population,
gdp,
primary_energy_consumption,
coal_co2,
oil_co2,
gas_co2
) %>%
drop_na(co2, co2_per_capita, methane, nitrous_oxide, population)
#check data dimensions
dim(emissions_model)
## [1] 199 13
#Descriptive Statistics
#summary statistics for key variables
summary_table <- emissions_model %>%
summarize(
n_countries = n(),
mean_co2 = mean(co2, na.rm = TRUE),
median_co2 = median(co2, na.rm = TRUE),
sd_co2 = sd(co2, na.rm = TRUE),
mean_co2_per_capita = mean(co2_per_capita, na.rm = TRUE),
median_co2_per_capita = median(co2_per_capita, na.rm = TRUE),
mean_methane = mean(methane, na.rm = TRUE),
mean_nitrous_oxide = mean(nitrous_oxide, na.rm = TRUE)
)
kable(summary_table, digits = 3, caption = "Summary statistics for latest-year country emissions data")
Summary statistics for latest-year country emissions
data
| 199 |
187.847 |
12.855 |
973.097 |
4.347 |
2.959 |
47.733 |
14.753 |
#top 10 countries by total co2
top_total <- emissions_latest %>%
filter(!is.na(co2)) %>%
arrange(desc(co2)) %>%
slice_head(n = 10) %>%
select(country, co2, co2_per_capita, population)
kable(top_total, digits = 3, caption = "Top 10 countries by total CO2 emissions")
Top 10 countries by total CO2 emissions
| China |
12289.037 |
8.658 |
1419321284 |
| United States |
4904.120 |
14.197 |
345426566 |
| India |
3193.478 |
2.201 |
1450935785 |
| Russia |
1780.524 |
12.295 |
144820421 |
| Japan |
961.867 |
7.772 |
123753041 |
| Indonesia |
812.220 |
2.865 |
283487933 |
| Iran |
792.631 |
8.656 |
91567737 |
| Saudi Arabia |
692.133 |
20.379 |
33962751 |
| South Korea |
583.679 |
11.286 |
51717587 |
| Germany |
572.319 |
6.769 |
84552234 |
#plot top total emitters
top_total %>%
ggplot(aes(x = reorder(country, co2), y = co2)) +
geom_col() +
coord_flip() +
labs(
title = "Top 10 Countries by Total CO2 Emissions",
x = "Country",
y = "CO2 emissions"
)

#top 10 countries by co2 per capita
top_per_capita <- emissions_latest %>%
filter(!is.na(co2_per_capita), population > 1000000) %>%
arrange(desc(co2_per_capita)) %>%
slice_head(n = 10) %>%
select(country, co2_per_capita, co2, population)
kable(top_per_capita, digits = 3, caption = "Top 10 countries by CO2 emissions per capita")
Top 10 countries by CO2 emissions per capita
| Qatar |
41.271 |
125.812 |
3048429 |
| Kuwait |
26.248 |
129.519 |
4934508 |
| Bahrain |
24.270 |
39.003 |
1607059 |
| Trinidad and Tobago |
22.932 |
34.576 |
1507778 |
| Saudi Arabia |
20.379 |
692.133 |
33962751 |
| United Arab Emirates |
20.131 |
221.988 |
11027135 |
| Oman |
15.651 |
82.662 |
5281542 |
| Australia |
14.477 |
386.732 |
26713206 |
| United States |
14.197 |
4904.120 |
345426566 |
| Kazakhstan |
13.937 |
287.006 |
20592574 |
#plot top per capita emitters
top_per_capita %>%
ggplot(aes(x = reorder(country, co2_per_capita), y = co2_per_capita)) +
geom_col() +
coord_flip() +
labs(
title = "Top 10 Countries by CO2 Emissions Per Capita",
x = "Country",
y = "CO2 emissions per capita"
)
#Hypothesis Testing/ Multiple Testing
#select countries for pairwise comparison
test_data <- emissions_country %>%
filter(country %in% c("Australia", "Brazil", "China", "India", "United States")) %>%
select(country, year, co2) %>%
drop_na(co2)
#example two sample t-test
australia_brazil_test <- t.test(
co2 ~ country,
data = test_data %>% filter(country %in% c("Australia", "Brazil"))
)
australia_brazil_test
##
## Welch Two Sample t-test
##
## data: co2 by country
## t = -2.3982, df = 292.39, p-value = 0.0171
## alternative hypothesis: true difference in means between group Australia and group Brazil is not equal to 0
## 95 percent confidence interval:
## -63.382939 -6.243336
## sample estimates:
## mean in group Australia mean in group Brazil
## 72.87153 107.68467
#create pairwise t-tests across selected countries
country_pairs <- combn(unique(test_data$country), 2, simplify = FALSE)
pairwise_tests <- map_dfr(country_pairs, function(pair){
temp <- test_data %>%
filter(country %in% pair)
test_result <- t.test(co2 ~ country, data = temp)
tibble(
country_1 = pair[1],
country_2 = pair[2],
p_value = test_result$p.value
)
})
#adjust p-values using bonferroni and benjamini-hochberg
pairwise_tests <- pairwise_tests %>%
mutate(
p_bonferroni = p.adjust(p_value, method = "bonferroni"),
p_bh = p.adjust(p_value, method = "BH")
)
kable(pairwise_tests, digits = 5, caption = "Pairwise t-tests with multiple testing adjustment")
Pairwise t-tests with multiple testing adjustment
| Australia |
Brazil |
0.01710 |
0.17103 |
0.01900 |
| Australia |
China |
0.00000 |
0.00000 |
0.00000 |
| Australia |
India |
0.00000 |
0.00000 |
0.00000 |
| Australia |
United States |
0.00000 |
0.00000 |
0.00000 |
| Brazil |
China |
0.00000 |
0.00000 |
0.00000 |
| Brazil |
India |
0.00000 |
0.00000 |
0.00000 |
| Brazil |
United States |
0.00000 |
0.00000 |
0.00000 |
| China |
India |
0.00000 |
0.00000 |
0.00000 |
| China |
United States |
0.17373 |
1.00000 |
0.17373 |
| India |
United States |
0.00000 |
0.00000 |
0.00000 |
#Spline Regression and Loess Smoothing
#filter china time series
china_data <- emissions_country %>%
filter(country == "China") %>%
select(year, co2) %>%
drop_na(co2)
#loess plot for china
ggplot(china_data, aes(x = year, y = co2)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", se = TRUE) +
labs(
title = "LOESS Trend of CO2 Emissions in China",
x = "Year",
y = "CO2 emissions"
)

#natural cubic spline model for china
china_spline <- lm(co2 ~ ns(year, df = 5), data = china_data)
#add predictions
china_data <- china_data %>%
mutate(spline_pred = predict(china_spline, newdata = china_data))
#plot spline fit
ggplot(china_data, aes(x = year, y = co2)) +
geom_point(alpha = 0.5) +
geom_line(aes(y = spline_pred), linewidth = 1) +
labs(
title = "Natural Cubic Spline Model for China CO2 Emissions",
x = "Year",
y = "CO2 emissions"
)

summary(china_spline)
##
## Call:
## lm(formula = co2 ~ ns(year, df = 5), data = china_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1059.55 -95.02 2.71 41.07 1570.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.56 151.85 0.261 0.794935
## ns(year, df = 5)1 111.52 194.47 0.573 0.567479
## ns(year, df = 5)2 841.75 246.81 3.410 0.000903 ***
## ns(year, df = 5)3 3513.35 191.76 18.322 < 2e-16 ***
## ns(year, df = 5)4 9792.00 385.02 25.433 < 2e-16 ***
## ns(year, df = 5)5 12999.46 167.46 77.627 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 379.4 on 112 degrees of freedom
## Multiple R-squared: 0.989, Adjusted R-squared: 0.9885
## F-statistic: 2005 on 5 and 112 DF, p-value: < 2.2e-16
#filter belgium time series
belgium_data <- emissions_country %>%
filter(country == "Belgium") %>%
select(year, co2) %>%
drop_na(co2)
#loess plot for belgium
ggplot(belgium_data, aes(x = year, y = co2)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", se = TRUE) +
labs(
title = "LOESS Trend of CO2 Emissions in Belgium",
x = "Year",
y = "CO2 emissions"
)
#Principal Component Analysis
#prepare pca variables
pca_data <- emissions_model %>%
select(
country,
co2,
co2_per_capita,
methane,
nitrous_oxide,
population,
primary_energy_consumption,
coal_co2,
oil_co2,
gas_co2
) %>%
drop_na()
#store country names
pca_countries <- pca_data$country
#scale numeric variables
pca_numeric <- pca_data %>%
select(-country) %>%
scale()
#run pca
pca_fit <- prcomp(pca_numeric, center = TRUE, scale. = TRUE)
#pca summary
summary(pca_fit)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.5935 1.0895 0.81470 0.53703 0.26742 0.22192 0.11273
## Proportion of Variance 0.7473 0.1319 0.07375 0.03205 0.00795 0.00547 0.00141
## Cumulative Proportion 0.7473 0.8792 0.95298 0.98503 0.99297 0.99844 0.99986
## PC8 PC9
## Standard deviation 0.03560 0.004683
## Proportion of Variance 0.00014 0.000000
## Cumulative Proportion 1.00000 1.000000
#scree plot
fviz_eig(pca_fit, addlabels = TRUE) +
labs(title = "PCA Scree Plot")

#pca biplot
fviz_pca_biplot(
pca_fit,
repel = TRUE,
label = "var"
) +
labs(title = "PCA Biplot of Country Emissions Variables")

#create pca scores table
pca_scores <- as_tibble(pca_fit$x) %>%
mutate(country = pca_countries)
#show countries with highest pc1
pca_scores %>%
arrange(desc(PC1)) %>%
select(country, PC1, PC2) %>%
slice_head(n = 10) %>%
kable(digits = 3, caption = "Countries with highest PC1 scores")
Countries with highest PC1 scores
| China |
17.686 |
-1.522 |
| United States |
9.474 |
4.160 |
| India |
7.106 |
-2.847 |
| Russia |
2.980 |
1.872 |
| Brazil |
2.001 |
-1.178 |
| Indonesia |
1.273 |
-0.938 |
| Iran |
0.770 |
0.924 |
| Japan |
0.545 |
0.383 |
| Saudi Arabia |
0.430 |
2.270 |
| Canada |
0.380 |
1.198 |
#K-Means Clustering
#use same scaled pca numeric data for clustering
set.seed(712)
#elbow method
fviz_nbclust(pca_numeric, kmeans, method = "wss") +
labs(title = "Elbow Method for Choosing Number of Clusters")

#fit k-means with 3 clusters
set.seed(712)
kmeans_3 <- kmeans(pca_numeric, centers = 3, nstart = 25)
#add clusters to pca scores
cluster_results <- pca_scores %>%
mutate(cluster = factor(kmeans_3$cluster))
#plot clusters using first two principal components
ggplot(cluster_results, aes(x = PC1, y = PC2, color = cluster, label = country)) +
geom_point(size = 3) +
labs(
title = "K-Means Clustering of Countries Using PCA Scores",
x = "PC1",
y = "PC2",
color = "Cluster"
)

#combine clusters with original country data
cluster_summary_data <- pca_data %>%
mutate(cluster = factor(kmeans_3$cluster))
#summarize clusters
cluster_summary <- cluster_summary_data %>%
group_by(cluster) %>%
summarize(
n_countries = n(),
mean_co2 = mean(co2, na.rm = TRUE),
mean_co2_per_capita = mean(co2_per_capita, na.rm = TRUE),
mean_methane = mean(methane, na.rm = TRUE),
mean_population = mean(population, na.rm = TRUE),
.groups = "drop"
)
kable(cluster_summary, digits = 3, caption = "Cluster summary statistics")
Cluster summary statistics
| 1 |
1 |
12289.037 |
8.658 |
1590.674 |
1419321284 |
| 2 |
2 |
4048.799 |
8.199 |
838.821 |
898181176 |
| 3 |
73 |
214.753 |
6.972 |
62.322 |
43836576 |
#show countries in each cluster
cluster_summary_data %>%
select(country, cluster, co2, co2_per_capita, methane, nitrous_oxide) %>%
arrange(cluster, desc(co2)) %>%
kable(digits = 3, caption = "Countries by K-means cluster")
Countries by K-means cluster
| China |
1 |
12289.037 |
8.658 |
1590.674 |
475.534 |
| United States |
2 |
4904.120 |
14.197 |
818.348 |
239.599 |
| India |
2 |
3193.478 |
2.201 |
859.294 |
310.004 |
| Russia |
3 |
1780.524 |
12.295 |
474.072 |
85.242 |
| Japan |
3 |
961.867 |
7.772 |
28.820 |
17.964 |
| Indonesia |
3 |
812.220 |
2.865 |
412.009 |
70.983 |
| Iran |
3 |
792.631 |
8.656 |
170.996 |
32.055 |
| Saudi Arabia |
3 |
692.133 |
20.379 |
117.014 |
13.056 |
| South Korea |
3 |
583.679 |
11.286 |
39.670 |
13.689 |
| Germany |
3 |
572.319 |
6.769 |
56.136 |
32.159 |
| Canada |
3 |
533.340 |
13.420 |
115.338 |
44.735 |
| Turkey |
3 |
513.035 |
5.865 |
111.797 |
40.169 |
| Brazil |
3 |
483.012 |
2.278 |
579.990 |
203.971 |
| Mexico |
3 |
460.988 |
3.523 |
151.802 |
39.548 |
| South Africa |
3 |
439.831 |
6.872 |
79.290 |
20.294 |
| Australia |
3 |
386.732 |
14.477 |
135.639 |
67.779 |
| Vietnam |
3 |
370.931 |
3.673 |
94.982 |
30.310 |
| United Kingdom |
3 |
312.906 |
4.526 |
43.812 |
28.062 |
| Italy |
3 |
301.930 |
5.088 |
40.757 |
15.218 |
| Malaysia |
3 |
290.224 |
8.162 |
28.313 |
8.866 |
| Kazakhstan |
3 |
287.006 |
13.937 |
54.973 |
11.967 |
| Poland |
3 |
272.862 |
7.080 |
44.502 |
26.926 |
| Thailand |
3 |
267.760 |
3.736 |
95.815 |
20.060 |
| France |
3 |
264.156 |
3.969 |
62.809 |
32.581 |
| Taiwan |
3 |
262.345 |
11.301 |
15.465 |
5.727 |
| Egypt |
3 |
258.368 |
2.217 |
49.838 |
22.805 |
| Iraq |
3 |
233.635 |
5.074 |
156.213 |
7.054 |
| United Arab Emirates |
3 |
221.988 |
20.131 |
37.574 |
2.635 |
| Spain |
3 |
220.341 |
4.599 |
44.595 |
18.848 |
| Algeria |
3 |
198.203 |
4.234 |
69.212 |
8.114 |
| Pakistan |
3 |
179.800 |
0.716 |
228.614 |
71.433 |
| Philippines |
3 |
174.878 |
1.510 |
78.930 |
18.405 |
| Argentina |
3 |
171.059 |
3.743 |
114.124 |
52.294 |
| Ukraine |
3 |
142.498 |
3.764 |
52.380 |
20.267 |
| Uzbekistan |
3 |
139.234 |
3.829 |
52.573 |
15.741 |
| Kuwait |
3 |
129.519 |
26.248 |
39.526 |
1.225 |
| Qatar |
3 |
125.812 |
41.271 |
22.491 |
0.914 |
| Venezuela |
3 |
116.042 |
4.085 |
47.634 |
11.111 |
| Netherlands |
3 |
114.785 |
6.297 |
19.054 |
8.976 |
| Bangladesh |
3 |
108.318 |
0.624 |
95.567 |
32.007 |
| Colombia |
3 |
92.659 |
1.752 |
91.299 |
23.444 |
| Belgium |
3 |
85.456 |
7.280 |
11.920 |
8.379 |
| Oman |
3 |
82.662 |
15.651 |
32.877 |
1.417 |
| Turkmenistan |
3 |
81.006 |
10.809 |
23.489 |
7.195 |
| Chile |
3 |
78.726 |
3.983 |
18.641 |
7.305 |
| Czechia |
3 |
75.623 |
7.044 |
12.622 |
6.691 |
| Peru |
3 |
70.247 |
2.053 |
31.677 |
9.523 |
| Morocco |
3 |
69.062 |
1.814 |
30.335 |
8.667 |
| Romania |
3 |
68.551 |
3.605 |
21.646 |
10.187 |
| Austria |
3 |
56.368 |
6.180 |
9.310 |
3.561 |
| Belarus |
3 |
55.818 |
6.163 |
18.577 |
15.353 |
| Singapore |
3 |
53.919 |
9.245 |
5.801 |
1.337 |
| Greece |
3 |
53.361 |
5.311 |
9.912 |
3.958 |
| Israel |
3 |
52.637 |
5.607 |
8.964 |
2.557 |
| Hungary |
3 |
40.017 |
4.136 |
9.494 |
5.427 |
| Azerbaijan |
3 |
39.830 |
3.853 |
12.679 |
2.977 |
| Sweden |
3 |
38.097 |
3.592 |
7.925 |
6.066 |
| Norway |
3 |
37.183 |
6.668 |
8.397 |
4.177 |
| Portugal |
3 |
35.539 |
3.409 |
10.832 |
2.789 |
| Hong Kong |
3 |
33.324 |
4.494 |
5.148 |
0.442 |
| Ireland |
3 |
33.311 |
6.339 |
17.634 |
8.668 |
| New Zealand |
3 |
32.479 |
6.229 |
34.596 |
13.252 |
| Switzerland |
3 |
32.072 |
3.595 |
6.025 |
2.191 |
| Bulgaria |
3 |
31.622 |
4.679 |
8.962 |
4.010 |
| Finland |
3 |
29.775 |
5.301 |
5.699 |
6.373 |
| Slovakia |
3 |
29.091 |
5.283 |
5.804 |
3.272 |
| Denmark |
3 |
28.369 |
4.746 |
10.216 |
4.691 |
| Croatia |
3 |
18.465 |
4.765 |
3.160 |
2.124 |
| Slovenia |
3 |
12.751 |
6.018 |
2.344 |
0.751 |
| Lithuania |
3 |
12.542 |
4.387 |
2.618 |
4.993 |
| Estonia |
3 |
8.307 |
6.105 |
1.330 |
1.703 |
| Cyprus |
3 |
7.299 |
5.374 |
2.370 |
0.361 |
| Luxembourg |
3 |
7.040 |
10.460 |
0.535 |
0.244 |
| North Macedonia |
3 |
6.620 |
3.631 |
1.528 |
0.458 |
| Latvia |
3 |
6.462 |
3.452 |
2.285 |
2.185 |
| Iceland |
3 |
3.803 |
9.667 |
0.557 |
0.346 |
#Best Subset Selection
#prepare safer data for best subset selection
subset_data <- emissions_model %>%
select(
co2,
methane,
nitrous_oxide,
population,
primary_energy_consumption
) %>%
mutate(across(everything(), as.numeric)) %>%
drop_na()
#log transform variables
subset_data_log <- subset_data %>%
mutate(across(everything(), ~log1p(.x)))
#remove rows with infinite values
subset_data_log <- subset_data_log %>%
filter(if_all(everything(), is.finite))
#remove columns with zero variance or invalid standard deviation
subset_data_log <- subset_data_log %>%
select(where(~!is.na(sd(.x, na.rm = TRUE)) && sd(.x, na.rm = TRUE) > 0))
#create train/test split
set.seed(712)
train_index <- sample(
1:nrow(subset_data_log),
size = floor(0.7 * nrow(subset_data_log))
)
train_data_subset <- subset_data_log[train_index, ]
test_data_subset <- subset_data_log[-train_index, ]
#fit best subset selection
subset_fit <- regsubsets(
co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
data = train_data_subset,
nvmax = 4,
method = "exhaustive"
)
#model summary
subset_summary <- summary(subset_fit)
#plot bic
plot(
subset_summary$bic,
type = "b",
xlab = "Number of predictors",
ylab = "BIC",
main = "Best Subset Selection Using BIC"
)

#best model by bic
best_size_bic <- which.min(subset_summary$bic)
#show best model coefficients
coef(subset_fit, best_size_bic)
## (Intercept) methane
## -1.4621262 0.2119569
## primary_energy_consumption
## 0.8495087
#best model by bic
best_size_bic <- which.min(subset_summary$bic)
#coefficients for best bic model
coef(subset_fit, best_size_bic)
## (Intercept) methane
## -1.4621262 0.2119569
## primary_energy_consumption
## 0.8495087
Ridge and Lasso Regression
#prepare data for ridge and lasso regression
glmnet_data <- emissions_model %>%
select(
co2,
methane,
nitrous_oxide,
population,
primary_energy_consumption
) %>%
mutate(across(everything(), as.numeric)) %>%
drop_na()
#log transform variables
glmnet_data <- glmnet_data %>%
mutate(across(everything(), ~log1p(.x)))
#remove infinite values
glmnet_data <- glmnet_data %>%
filter(if_all(everything(), is.finite))
#check dimensions
dim(glmnet_data)
## [1] 79 5
#create train/test split
set.seed(712)
train_index <- sample(
1:nrow(glmnet_data),
size = floor(0.7 * nrow(glmnet_data))
)
train_glmnet <- glmnet_data[train_index, ]
test_glmnet <- glmnet_data[-train_index, ]
#create x and y matrices
x_train <- model.matrix(
co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
data = train_glmnet
)[, -1]
y_train <- train_glmnet$co2
x_test <- model.matrix(
co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
data = test_glmnet
)[, -1]
y_test <- test_glmnet$co2
#check that y is not constant
summary(y_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.569 3.620 4.694 4.673 5.593 9.417
sd(y_train)
## [1] 1.531922
#ridge regression
set.seed(712)
ridge_cv <- cv.glmnet(
x_train,
y_train,
alpha = 0
)
#best lambda
ridge_lambda <- ridge_cv$lambda.min
ridge_lambda
## [1] 0.1482516
#ridge predictions
ridge_pred <- predict(
ridge_cv,
s = ridge_lambda,
newx = x_test
)
#ridge mse
ridge_mse <- mean((y_test - ridge_pred)^2)
ridge_mse
## [1] 0.05863233
#lasso regression alpha equals 1
set.seed(712)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
#best lambda
lasso_lambda <- lasso_cv$lambda.min
lasso_lambda
## [1] 0.002005915
#lasso predictions
lasso_pred <- predict(lasso_cv, s = lasso_lambda, newx = x_test)
#lasso test mse
lasso_mse <- mean((y_test - lasso_pred)^2)
lasso_mse
## [1] 0.08058013
#lasso coefficients
coef(lasso_cv, s = lasso_lambda)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s=0.002005915
## (Intercept) -2.53431876
## methane 0.23553516
## nitrous_oxide -0.12388400
## population 0.07984216
## primary_energy_consumption 0.84053375
#compare regularization models
regularization_results <- tibble(
model = c("Ridge regression", "Lasso regression"),
test_mse = c(ridge_mse, lasso_mse),
test_rmse = sqrt(test_mse)
)
kable(regularization_results, digits = 5, caption = "Ridge and lasso test error comparison")
Ridge and lasso test error comparison
| Ridge regression |
0.05863 |
0.24214 |
| Lasso regression |
0.08058 |
0.28387 |
#Neural Network Prediction
#prepare clean neural network data
nn_data <- emissions_model %>%
select(
co2,
methane,
nitrous_oxide,
population,
primary_energy_consumption
) %>%
mutate(across(everything(), as.numeric)) %>%
drop_na()
#log transform variables
nn_data <- nn_data %>%
mutate(across(everything(), ~log1p(.x)))
#remove infinite values
nn_data <- nn_data %>%
filter(if_all(everything(), is.finite))
#normalize function
normalize <- function(x){
(x - min(x)) / (max(x) - min(x))
}
#normalize all variables
nn_data_norm <- as.data.frame(nn_data %>%
mutate(across(everything(), normalize)))
#create train/test split
set.seed(712)
nn_train_index <- sample(
1:nrow(nn_data_norm),
size = floor(0.7 * nrow(nn_data_norm))
)
nn_train <- nn_data_norm[nn_train_index, ]
nn_test <- nn_data_norm[-nn_train_index, ]
#fit simpler neural network model
set.seed(712)
nn_fit <- neuralnet::neuralnet(
co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
data = nn_train,
hidden = c(3, 3),
linear.output = TRUE
)
#plot neural network
plot(nn_fit)
#create exact predictor matrix for prediction
nn_test_x <- nn_test[, c(
"methane",
"nitrous_oxide",
"population",
"primary_energy_consumption"
)]
#generate predictions
nn_pred <- neuralnet::compute(nn_fit, nn_test_x)$net.result
#make predictions numeric
nn_pred <- as.numeric(nn_pred)
#calculate test mse and rmse
nn_mse <- mean((nn_test$co2 - nn_pred)^2)
nn_rmse <- sqrt(nn_mse)
nn_mse
## [1] 0.001521429
nn_rmse
## [1] 0.0390055
#compare predictive models
final_model_comparison <- tibble(
model = c("Ridge regression", "Lasso regression", "Neural network"),
test_mse = c(ridge_mse, lasso_mse, nn_mse),
test_rmse = c(sqrt(ridge_mse), sqrt(lasso_mse), nn_rmse)
)
kable(final_model_comparison, digits = 5, caption = "Final predictive model comparison")
Final predictive model comparison
| Ridge regression |
0.05863 |
0.24214 |
| Lasso regression |
0.08058 |
0.28387 |
| Neural network |
0.00152 |
0.03901 |