#load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(broom)
library(splines)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10
library(leaps)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## 
## The following object is masked from 'package:dplyr':
## 
##     compute
#set global chunk options
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width = 8,
  fig.height = 5
)

#set seed for reproducibility
set.seed(712)
#load owid co2 data directly from github
emissions_raw <- read_csv("https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv") %>%
  clean_names()

#view structure
glimpse(emissions_raw)
## Rows: 50,411
## Columns: 79
## $ country                                   <chr> "Afghanistan", "Afghanistan"…
## $ year                                      <dbl> 1750, 1751, 1752, 1753, 1754…
## $ iso_code                                  <chr> "AFG", "AFG", "AFG", "AFG", …
## $ population                                <dbl> 2802560, NA, NA, NA, NA, NA,…
## $ gdp                                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cement_co2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cement_co2_per_capita                     <dbl> 0, NA, NA, NA, NA, NA, NA, N…
## $ co2                                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_growth_abs                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_growth_prct                           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc                         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_growth_abs              <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_growth_prct             <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_capita              <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_gdp                 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_unit_energy         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_capita                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_gdp                               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_unit_energy                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ coal_co2                                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ coal_co2_per_capita                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2                           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2_per_capita                <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2_per_gdp                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_cement_co2                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cumulative_co2                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_co2_including_luc              <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_coal_co2                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_flaring_co2                    <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_gas_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_luc_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_oil_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_other_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ energy_per_capita                         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ energy_per_gdp                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ flaring_co2                               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ flaring_co2_per_capita                    <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ gas_co2                                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ gas_co2_per_capita                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ ghg_excluding_lucf_per_capita             <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ ghg_per_capita                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ land_use_change_co2                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ land_use_change_co2_per_capita            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ methane                                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ methane_per_capita                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ nitrous_oxide                             <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ nitrous_oxide_per_capita                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ oil_co2                                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ oil_co2_per_capita                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ other_co2_per_capita                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ other_industry_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ primary_energy_consumption                <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cement_co2                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_co2                          <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_co2_including_luc            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_coal_co2                     <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_cement_co2        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_co2               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_co2_including_luc <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_coal_co2          <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_flaring_co2       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_gas_co2           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_luc_co2           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_oil_co2           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_other_co2         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_flaring_co2                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_gas_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_luc_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_oil_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_other_co2                    <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_of_temperature_change_from_ghg      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_ch4               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_co2               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_ghg               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_n2o               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ total_ghg                                 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ total_ghg_excluding_lucf                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ trade_co2                                 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ trade_co2_share                           <dbl> NA, NA, NA, NA, NA, NA, NA, …
#remove aggregated owid region codes and keep country-level iso codes
emissions_country <- emissions_raw %>%
  filter(!is.na(iso_code)) %>%
  filter(!str_detect(iso_code, "^OWID_"))

#create latest year snapshot
latest_year <- max(emissions_country$year, na.rm = TRUE)

emissions_latest <- emissions_country %>%
  filter(year == latest_year)

#select variables used throughout the project
emissions_model <- emissions_latest %>%
  select(
    country,
    iso_code,
    year,
    co2,
    co2_per_capita,
    methane,
    nitrous_oxide,
    population,
    gdp,
    primary_energy_consumption,
    coal_co2,
    oil_co2,
    gas_co2
  ) %>%
  drop_na(co2, co2_per_capita, methane, nitrous_oxide, population)

#check data dimensions
dim(emissions_model)
## [1] 199  13

#Descriptive Statistics

#summary statistics for key variables
summary_table <- emissions_model %>%
  summarize(
    n_countries = n(),
    mean_co2 = mean(co2, na.rm = TRUE),
    median_co2 = median(co2, na.rm = TRUE),
    sd_co2 = sd(co2, na.rm = TRUE),
    mean_co2_per_capita = mean(co2_per_capita, na.rm = TRUE),
    median_co2_per_capita = median(co2_per_capita, na.rm = TRUE),
    mean_methane = mean(methane, na.rm = TRUE),
    mean_nitrous_oxide = mean(nitrous_oxide, na.rm = TRUE)
  )

kable(summary_table, digits = 3, caption = "Summary statistics for latest-year country emissions data")
Summary statistics for latest-year country emissions data
n_countries mean_co2 median_co2 sd_co2 mean_co2_per_capita median_co2_per_capita mean_methane mean_nitrous_oxide
199 187.847 12.855 973.097 4.347 2.959 47.733 14.753
#top 10 countries by total co2
top_total <- emissions_latest %>%
  filter(!is.na(co2)) %>%
  arrange(desc(co2)) %>%
  slice_head(n = 10) %>%
  select(country, co2, co2_per_capita, population)

kable(top_total, digits = 3, caption = "Top 10 countries by total CO2 emissions")
Top 10 countries by total CO2 emissions
country co2 co2_per_capita population
China 12289.037 8.658 1419321284
United States 4904.120 14.197 345426566
India 3193.478 2.201 1450935785
Russia 1780.524 12.295 144820421
Japan 961.867 7.772 123753041
Indonesia 812.220 2.865 283487933
Iran 792.631 8.656 91567737
Saudi Arabia 692.133 20.379 33962751
South Korea 583.679 11.286 51717587
Germany 572.319 6.769 84552234
#plot top total emitters
top_total %>%
  ggplot(aes(x = reorder(country, co2), y = co2)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries by Total CO2 Emissions",
    x = "Country",
    y = "CO2 emissions"
  )

#top 10 countries by co2 per capita
top_per_capita <- emissions_latest %>%
  filter(!is.na(co2_per_capita), population > 1000000) %>%
  arrange(desc(co2_per_capita)) %>%
  slice_head(n = 10) %>%
  select(country, co2_per_capita, co2, population)

kable(top_per_capita, digits = 3, caption = "Top 10 countries by CO2 emissions per capita")
Top 10 countries by CO2 emissions per capita
country co2_per_capita co2 population
Qatar 41.271 125.812 3048429
Kuwait 26.248 129.519 4934508
Bahrain 24.270 39.003 1607059
Trinidad and Tobago 22.932 34.576 1507778
Saudi Arabia 20.379 692.133 33962751
United Arab Emirates 20.131 221.988 11027135
Oman 15.651 82.662 5281542
Australia 14.477 386.732 26713206
United States 14.197 4904.120 345426566
Kazakhstan 13.937 287.006 20592574
#plot top per capita emitters
top_per_capita %>%
  ggplot(aes(x = reorder(country, co2_per_capita), y = co2_per_capita)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries by CO2 Emissions Per Capita",
    x = "Country",
    y = "CO2 emissions per capita"
  )

#Hypothesis Testing/ Multiple Testing

#select countries for pairwise comparison
test_data <- emissions_country %>%
  filter(country %in% c("Australia", "Brazil", "China", "India", "United States")) %>%
  select(country, year, co2) %>%
  drop_na(co2)

#example two sample t-test
australia_brazil_test <- t.test(
  co2 ~ country,
  data = test_data %>% filter(country %in% c("Australia", "Brazil"))
)

australia_brazil_test
## 
##  Welch Two Sample t-test
## 
## data:  co2 by country
## t = -2.3982, df = 292.39, p-value = 0.0171
## alternative hypothesis: true difference in means between group Australia and group Brazil is not equal to 0
## 95 percent confidence interval:
##  -63.382939  -6.243336
## sample estimates:
## mean in group Australia    mean in group Brazil 
##                72.87153               107.68467
#create pairwise t-tests across selected countries
country_pairs <- combn(unique(test_data$country), 2, simplify = FALSE)

pairwise_tests <- map_dfr(country_pairs, function(pair){
  temp <- test_data %>%
    filter(country %in% pair)

  test_result <- t.test(co2 ~ country, data = temp)

  tibble(
    country_1 = pair[1],
    country_2 = pair[2],
    p_value = test_result$p.value
  )
})

#adjust p-values using bonferroni and benjamini-hochberg
pairwise_tests <- pairwise_tests %>%
  mutate(
    p_bonferroni = p.adjust(p_value, method = "bonferroni"),
    p_bh = p.adjust(p_value, method = "BH")
  )

kable(pairwise_tests, digits = 5, caption = "Pairwise t-tests with multiple testing adjustment")
Pairwise t-tests with multiple testing adjustment
country_1 country_2 p_value p_bonferroni p_bh
Australia Brazil 0.01710 0.17103 0.01900
Australia China 0.00000 0.00000 0.00000
Australia India 0.00000 0.00000 0.00000
Australia United States 0.00000 0.00000 0.00000
Brazil China 0.00000 0.00000 0.00000
Brazil India 0.00000 0.00000 0.00000
Brazil United States 0.00000 0.00000 0.00000
China India 0.00000 0.00000 0.00000
China United States 0.17373 1.00000 0.17373
India United States 0.00000 0.00000 0.00000

#Spline Regression and Loess Smoothing

#filter china time series
china_data <- emissions_country %>%
  filter(country == "China") %>%
  select(year, co2) %>%
  drop_na(co2)

#loess plot for china
ggplot(china_data, aes(x = year, y = co2)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(
    title = "LOESS Trend of CO2 Emissions in China",
    x = "Year",
    y = "CO2 emissions"
  )

#natural cubic spline model for china
china_spline <- lm(co2 ~ ns(year, df = 5), data = china_data)

#add predictions
china_data <- china_data %>%
  mutate(spline_pred = predict(china_spline, newdata = china_data))

#plot spline fit
ggplot(china_data, aes(x = year, y = co2)) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = spline_pred), linewidth = 1) +
  labs(
    title = "Natural Cubic Spline Model for China CO2 Emissions",
    x = "Year",
    y = "CO2 emissions"
  )

summary(china_spline)
## 
## Call:
## lm(formula = co2 ~ ns(year, df = 5), data = china_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1059.55   -95.02     2.71    41.07  1570.51 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          39.56     151.85   0.261 0.794935    
## ns(year, df = 5)1   111.52     194.47   0.573 0.567479    
## ns(year, df = 5)2   841.75     246.81   3.410 0.000903 ***
## ns(year, df = 5)3  3513.35     191.76  18.322  < 2e-16 ***
## ns(year, df = 5)4  9792.00     385.02  25.433  < 2e-16 ***
## ns(year, df = 5)5 12999.46     167.46  77.627  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 379.4 on 112 degrees of freedom
## Multiple R-squared:  0.989,  Adjusted R-squared:  0.9885 
## F-statistic:  2005 on 5 and 112 DF,  p-value: < 2.2e-16
#filter belgium time series
belgium_data <- emissions_country %>%
  filter(country == "Belgium") %>%
  select(year, co2) %>%
  drop_na(co2)

#loess plot for belgium
ggplot(belgium_data, aes(x = year, y = co2)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(
    title = "LOESS Trend of CO2 Emissions in Belgium",
    x = "Year",
    y = "CO2 emissions"
  )

#Principal Component Analysis

#prepare pca variables
pca_data <- emissions_model %>%
  select(
    country,
    co2,
    co2_per_capita,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption,
    coal_co2,
    oil_co2,
    gas_co2
  ) %>%
  drop_na()

#store country names
pca_countries <- pca_data$country

#scale numeric variables
pca_numeric <- pca_data %>%
  select(-country) %>%
  scale()

#run pca
pca_fit <- prcomp(pca_numeric, center = TRUE, scale. = TRUE)

#pca summary
summary(pca_fit)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.5935 1.0895 0.81470 0.53703 0.26742 0.22192 0.11273
## Proportion of Variance 0.7473 0.1319 0.07375 0.03205 0.00795 0.00547 0.00141
## Cumulative Proportion  0.7473 0.8792 0.95298 0.98503 0.99297 0.99844 0.99986
##                            PC8      PC9
## Standard deviation     0.03560 0.004683
## Proportion of Variance 0.00014 0.000000
## Cumulative Proportion  1.00000 1.000000
#scree plot
fviz_eig(pca_fit, addlabels = TRUE) +
  labs(title = "PCA Scree Plot")

#pca biplot

fviz_pca_biplot(
  pca_fit,
  repel = TRUE,
  label = "var"
) +
  labs(title = "PCA Biplot of Country Emissions Variables")

#create pca scores table
pca_scores <- as_tibble(pca_fit$x) %>%
  mutate(country = pca_countries)

#show countries with highest pc1
pca_scores %>%
  arrange(desc(PC1)) %>%
  select(country, PC1, PC2) %>%
  slice_head(n = 10) %>%
  kable(digits = 3, caption = "Countries with highest PC1 scores")
Countries with highest PC1 scores
country PC1 PC2
China 17.686 -1.522
United States 9.474 4.160
India 7.106 -2.847
Russia 2.980 1.872
Brazil 2.001 -1.178
Indonesia 1.273 -0.938
Iran 0.770 0.924
Japan 0.545 0.383
Saudi Arabia 0.430 2.270
Canada 0.380 1.198

#K-Means Clustering

#use same scaled pca numeric data for clustering
set.seed(712)

#elbow method
fviz_nbclust(pca_numeric, kmeans, method = "wss") +
  labs(title = "Elbow Method for Choosing Number of Clusters")

#fit k-means with 3 clusters
set.seed(712)
kmeans_3 <- kmeans(pca_numeric, centers = 3, nstart = 25)

#add clusters to pca scores
cluster_results <- pca_scores %>%
  mutate(cluster = factor(kmeans_3$cluster))

#plot clusters using first two principal components
ggplot(cluster_results, aes(x = PC1, y = PC2, color = cluster, label = country)) +
  geom_point(size = 3) +
  labs(
    title = "K-Means Clustering of Countries Using PCA Scores",
    x = "PC1",
    y = "PC2",
    color = "Cluster"
  )

#combine clusters with original country data
cluster_summary_data <- pca_data %>%
  mutate(cluster = factor(kmeans_3$cluster))

#summarize clusters
cluster_summary <- cluster_summary_data %>%
  group_by(cluster) %>%
  summarize(
    n_countries = n(),
    mean_co2 = mean(co2, na.rm = TRUE),
    mean_co2_per_capita = mean(co2_per_capita, na.rm = TRUE),
    mean_methane = mean(methane, na.rm = TRUE),
    mean_population = mean(population, na.rm = TRUE),
    .groups = "drop"
  )

kable(cluster_summary, digits = 3, caption = "Cluster summary statistics")
Cluster summary statistics
cluster n_countries mean_co2 mean_co2_per_capita mean_methane mean_population
1 1 12289.037 8.658 1590.674 1419321284
2 2 4048.799 8.199 838.821 898181176
3 73 214.753 6.972 62.322 43836576
#show countries in each cluster
cluster_summary_data %>%
  select(country, cluster, co2, co2_per_capita, methane, nitrous_oxide) %>%
  arrange(cluster, desc(co2)) %>%
  kable(digits = 3, caption = "Countries by K-means cluster")
Countries by K-means cluster
country cluster co2 co2_per_capita methane nitrous_oxide
China 1 12289.037 8.658 1590.674 475.534
United States 2 4904.120 14.197 818.348 239.599
India 2 3193.478 2.201 859.294 310.004
Russia 3 1780.524 12.295 474.072 85.242
Japan 3 961.867 7.772 28.820 17.964
Indonesia 3 812.220 2.865 412.009 70.983
Iran 3 792.631 8.656 170.996 32.055
Saudi Arabia 3 692.133 20.379 117.014 13.056
South Korea 3 583.679 11.286 39.670 13.689
Germany 3 572.319 6.769 56.136 32.159
Canada 3 533.340 13.420 115.338 44.735
Turkey 3 513.035 5.865 111.797 40.169
Brazil 3 483.012 2.278 579.990 203.971
Mexico 3 460.988 3.523 151.802 39.548
South Africa 3 439.831 6.872 79.290 20.294
Australia 3 386.732 14.477 135.639 67.779
Vietnam 3 370.931 3.673 94.982 30.310
United Kingdom 3 312.906 4.526 43.812 28.062
Italy 3 301.930 5.088 40.757 15.218
Malaysia 3 290.224 8.162 28.313 8.866
Kazakhstan 3 287.006 13.937 54.973 11.967
Poland 3 272.862 7.080 44.502 26.926
Thailand 3 267.760 3.736 95.815 20.060
France 3 264.156 3.969 62.809 32.581
Taiwan 3 262.345 11.301 15.465 5.727
Egypt 3 258.368 2.217 49.838 22.805
Iraq 3 233.635 5.074 156.213 7.054
United Arab Emirates 3 221.988 20.131 37.574 2.635
Spain 3 220.341 4.599 44.595 18.848
Algeria 3 198.203 4.234 69.212 8.114
Pakistan 3 179.800 0.716 228.614 71.433
Philippines 3 174.878 1.510 78.930 18.405
Argentina 3 171.059 3.743 114.124 52.294
Ukraine 3 142.498 3.764 52.380 20.267
Uzbekistan 3 139.234 3.829 52.573 15.741
Kuwait 3 129.519 26.248 39.526 1.225
Qatar 3 125.812 41.271 22.491 0.914
Venezuela 3 116.042 4.085 47.634 11.111
Netherlands 3 114.785 6.297 19.054 8.976
Bangladesh 3 108.318 0.624 95.567 32.007
Colombia 3 92.659 1.752 91.299 23.444
Belgium 3 85.456 7.280 11.920 8.379
Oman 3 82.662 15.651 32.877 1.417
Turkmenistan 3 81.006 10.809 23.489 7.195
Chile 3 78.726 3.983 18.641 7.305
Czechia 3 75.623 7.044 12.622 6.691
Peru 3 70.247 2.053 31.677 9.523
Morocco 3 69.062 1.814 30.335 8.667
Romania 3 68.551 3.605 21.646 10.187
Austria 3 56.368 6.180 9.310 3.561
Belarus 3 55.818 6.163 18.577 15.353
Singapore 3 53.919 9.245 5.801 1.337
Greece 3 53.361 5.311 9.912 3.958
Israel 3 52.637 5.607 8.964 2.557
Hungary 3 40.017 4.136 9.494 5.427
Azerbaijan 3 39.830 3.853 12.679 2.977
Sweden 3 38.097 3.592 7.925 6.066
Norway 3 37.183 6.668 8.397 4.177
Portugal 3 35.539 3.409 10.832 2.789
Hong Kong 3 33.324 4.494 5.148 0.442
Ireland 3 33.311 6.339 17.634 8.668
New Zealand 3 32.479 6.229 34.596 13.252
Switzerland 3 32.072 3.595 6.025 2.191
Bulgaria 3 31.622 4.679 8.962 4.010
Finland 3 29.775 5.301 5.699 6.373
Slovakia 3 29.091 5.283 5.804 3.272
Denmark 3 28.369 4.746 10.216 4.691
Croatia 3 18.465 4.765 3.160 2.124
Slovenia 3 12.751 6.018 2.344 0.751
Lithuania 3 12.542 4.387 2.618 4.993
Estonia 3 8.307 6.105 1.330 1.703
Cyprus 3 7.299 5.374 2.370 0.361
Luxembourg 3 7.040 10.460 0.535 0.244
North Macedonia 3 6.620 3.631 1.528 0.458
Latvia 3 6.462 3.452 2.285 2.185
Iceland 3 3.803 9.667 0.557 0.346

#Best Subset Selection

#prepare safer data for best subset selection
subset_data <- emissions_model %>%
  select(
    co2,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na()

#log transform variables
subset_data_log <- subset_data %>%
  mutate(across(everything(), ~log1p(.x)))

#remove rows with infinite values
subset_data_log <- subset_data_log %>%
  filter(if_all(everything(), is.finite))

#remove columns with zero variance or invalid standard deviation
subset_data_log <- subset_data_log %>%
  select(where(~!is.na(sd(.x, na.rm = TRUE)) && sd(.x, na.rm = TRUE) > 0))

#create train/test split
set.seed(712)
train_index <- sample(
  1:nrow(subset_data_log),
  size = floor(0.7 * nrow(subset_data_log))
)

train_data_subset <- subset_data_log[train_index, ]
test_data_subset <- subset_data_log[-train_index, ]

#fit best subset selection
subset_fit <- regsubsets(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = train_data_subset,
  nvmax = 4,
  method = "exhaustive"
)

#model summary
subset_summary <- summary(subset_fit)

#plot bic
plot(
  subset_summary$bic,
  type = "b",
  xlab = "Number of predictors",
  ylab = "BIC",
  main = "Best Subset Selection Using BIC"
)

#best model by bic
best_size_bic <- which.min(subset_summary$bic)

#show best model coefficients
coef(subset_fit, best_size_bic)
##                (Intercept)                    methane 
##                 -1.4621262                  0.2119569 
## primary_energy_consumption 
##                  0.8495087
#best model by bic
best_size_bic <- which.min(subset_summary$bic)

#coefficients for best bic model
coef(subset_fit, best_size_bic)
##                (Intercept)                    methane 
##                 -1.4621262                  0.2119569 
## primary_energy_consumption 
##                  0.8495087

Ridge and Lasso Regression

#prepare data for ridge and lasso regression
glmnet_data <- emissions_model %>%
  select(
    co2,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na()

#log transform variables
glmnet_data <- glmnet_data %>%
  mutate(across(everything(), ~log1p(.x)))

#remove infinite values
glmnet_data <- glmnet_data %>%
  filter(if_all(everything(), is.finite))

#check dimensions
dim(glmnet_data)
## [1] 79  5
#create train/test split
set.seed(712)

train_index <- sample(
  1:nrow(glmnet_data),
  size = floor(0.7 * nrow(glmnet_data))
)

train_glmnet <- glmnet_data[train_index, ]
test_glmnet <- glmnet_data[-train_index, ]

#create x and y matrices
x_train <- model.matrix(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = train_glmnet
)[, -1]

y_train <- train_glmnet$co2

x_test <- model.matrix(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = test_glmnet
)[, -1]

y_test <- test_glmnet$co2

#check that y is not constant
summary(y_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.569   3.620   4.694   4.673   5.593   9.417
sd(y_train)
## [1] 1.531922
#ridge regression
set.seed(712)

ridge_cv <- cv.glmnet(
  x_train,
  y_train,
  alpha = 0
)

#best lambda
ridge_lambda <- ridge_cv$lambda.min

ridge_lambda
## [1] 0.1482516
#ridge predictions
ridge_pred <- predict(
  ridge_cv,
  s = ridge_lambda,
  newx = x_test
)

#ridge mse
ridge_mse <- mean((y_test - ridge_pred)^2)

ridge_mse
## [1] 0.05863233
#lasso regression alpha equals 1
set.seed(712)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)

#best lambda
lasso_lambda <- lasso_cv$lambda.min
lasso_lambda
## [1] 0.002005915
#lasso predictions
lasso_pred <- predict(lasso_cv, s = lasso_lambda, newx = x_test)

#lasso test mse
lasso_mse <- mean((y_test - lasso_pred)^2)
lasso_mse
## [1] 0.08058013
#lasso coefficients
coef(lasso_cv, s = lasso_lambda)
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                            s=0.002005915
## (Intercept)                  -2.53431876
## methane                       0.23553516
## nitrous_oxide                -0.12388400
## population                    0.07984216
## primary_energy_consumption    0.84053375
#compare regularization models
regularization_results <- tibble(
  model = c("Ridge regression", "Lasso regression"),
  test_mse = c(ridge_mse, lasso_mse),
  test_rmse = sqrt(test_mse)
)

kable(regularization_results, digits = 5, caption = "Ridge and lasso test error comparison")
Ridge and lasso test error comparison
model test_mse test_rmse
Ridge regression 0.05863 0.24214
Lasso regression 0.08058 0.28387

#Neural Network Prediction

#prepare clean neural network data
nn_data <- emissions_model %>%
  select(
    co2,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na()

#log transform variables
nn_data <- nn_data %>%
  mutate(across(everything(), ~log1p(.x)))

#remove infinite values
nn_data <- nn_data %>%
  filter(if_all(everything(), is.finite))

#normalize function
normalize <- function(x){
  (x - min(x)) / (max(x) - min(x))
}

#normalize all variables
nn_data_norm <- as.data.frame(nn_data %>%
  mutate(across(everything(), normalize)))

#create train/test split
set.seed(712)
nn_train_index <- sample(
  1:nrow(nn_data_norm),
  size = floor(0.7 * nrow(nn_data_norm))
)

nn_train <- nn_data_norm[nn_train_index, ]
nn_test <- nn_data_norm[-nn_train_index, ]

#fit simpler neural network model
set.seed(712)

nn_fit <- neuralnet::neuralnet(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = nn_train,
  hidden = c(3, 3),
  linear.output = TRUE
)

#plot neural network
plot(nn_fit)

#create exact predictor matrix for prediction
nn_test_x <- nn_test[, c(
  "methane",
  "nitrous_oxide",
  "population",
  "primary_energy_consumption"
)]

#generate predictions
nn_pred <- neuralnet::compute(nn_fit, nn_test_x)$net.result

#make predictions numeric
nn_pred <- as.numeric(nn_pred)

#calculate test mse and rmse
nn_mse <- mean((nn_test$co2 - nn_pred)^2)
nn_rmse <- sqrt(nn_mse)

nn_mse
## [1] 0.001521429
nn_rmse
## [1] 0.0390055
#compare predictive models
final_model_comparison <- tibble(
  model = c("Ridge regression", "Lasso regression", "Neural network"),
  test_mse = c(ridge_mse, lasso_mse, nn_mse),
  test_rmse = c(sqrt(ridge_mse), sqrt(lasso_mse), nn_rmse)
)

kable(final_model_comparison, digits = 5, caption = "Final predictive model comparison")
Final predictive model comparison
model test_mse test_rmse
Ridge regression 0.05863 0.24214
Lasso regression 0.08058 0.28387
Neural network 0.00152 0.03901