#load libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(broom)
library(splines)
library(glmnet)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10

library(leaps)
library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(neuralnet)

## 
## Attaching package: 'neuralnet'
## 
## The following object is masked from 'package:dplyr':
## 
##     compute

#set global chunk options
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width = 8,
  fig.height = 5
)

#set seed for reproducibility
set.seed(712)

#load owid co2 data directly from github
emissions_raw <- read_csv("https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv") %>%
  clean_names()

#view structure
glimpse(emissions_raw)

## Rows: 50,411
## Columns: 79
## $ country                                   <chr> "Afghanistan", "Afghanistan"…
## $ year                                      <dbl> 1750, 1751, 1752, 1753, 1754…
## $ iso_code                                  <chr> "AFG", "AFG", "AFG", "AFG", …
## $ population                                <dbl> 2802560, NA, NA, NA, NA, NA,…
## $ gdp                                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cement_co2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cement_co2_per_capita                     <dbl> 0, NA, NA, NA, NA, NA, NA, N…
## $ co2                                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_growth_abs                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_growth_prct                           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc                         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_growth_abs              <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_growth_prct             <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_capita              <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_gdp                 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_including_luc_per_unit_energy         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_capita                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_gdp                               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ co2_per_unit_energy                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ coal_co2                                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ coal_co2_per_capita                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2                           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2_per_capita                <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ consumption_co2_per_gdp                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_cement_co2                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ cumulative_co2                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_co2_including_luc              <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_coal_co2                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_flaring_co2                    <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_gas_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_luc_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_oil_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ cumulative_other_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ energy_per_capita                         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ energy_per_gdp                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ flaring_co2                               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ flaring_co2_per_capita                    <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ gas_co2                                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ gas_co2_per_capita                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ ghg_excluding_lucf_per_capita             <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ ghg_per_capita                            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ land_use_change_co2                       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ land_use_change_co2_per_capita            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ methane                                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ methane_per_capita                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ nitrous_oxide                             <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ nitrous_oxide_per_capita                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ oil_co2                                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ oil_co2_per_capita                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ other_co2_per_capita                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ other_industry_co2                        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ primary_energy_consumption                <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cement_co2                   <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_co2                          <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_co2_including_luc            <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_coal_co2                     <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_cement_co2        <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_co2               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_co2_including_luc <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_coal_co2          <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_flaring_co2       <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_gas_co2           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_luc_co2           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_oil_co2           <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_cumulative_other_co2         <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_flaring_co2                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_gas_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_luc_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_oil_co2                      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_global_other_co2                    <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ share_of_temperature_change_from_ghg      <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_ch4               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_co2               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_ghg               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ temperature_change_from_n2o               <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ total_ghg                                 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ total_ghg_excluding_lucf                  <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ trade_co2                                 <dbl> NA, NA, NA, NA, NA, NA, NA, …
## $ trade_co2_share                           <dbl> NA, NA, NA, NA, NA, NA, NA, …

#remove aggregated owid region codes and keep country-level iso codes
emissions_country <- emissions_raw %>%
  filter(!is.na(iso_code)) %>%
  filter(!str_detect(iso_code, "^OWID_"))

#create latest year snapshot
latest_year <- max(emissions_country$year, na.rm = TRUE)

emissions_latest <- emissions_country %>%
  filter(year == latest_year)

#select variables used throughout the project
emissions_model <- emissions_latest %>%
  select(
    country,
    iso_code,
    year,
    co2,
    co2_per_capita,
    methane,
    nitrous_oxide,
    population,
    gdp,
    primary_energy_consumption,
    coal_co2,
    oil_co2,
    gas_co2
  ) %>%
  drop_na(co2, co2_per_capita, methane, nitrous_oxide, population)

#check data dimensions
dim(emissions_model)

## [1] 199  13

#Descriptive Statistics

#summary statistics for key variables
summary_table <- emissions_model %>%
  summarize(
    n_countries = n(),
    mean_co2 = mean(co2, na.rm = TRUE),
    median_co2 = median(co2, na.rm = TRUE),
    sd_co2 = sd(co2, na.rm = TRUE),
    mean_co2_per_capita = mean(co2_per_capita, na.rm = TRUE),
    median_co2_per_capita = median(co2_per_capita, na.rm = TRUE),
    mean_methane = mean(methane, na.rm = TRUE),
    mean_nitrous_oxide = mean(nitrous_oxide, na.rm = TRUE)
  )

kable(summary_table, digits = 3, caption = "Summary statistics for latest-year country emissions data")

Summary statistics for latest-year country emissions data
n_countries	mean_co2	median_co2	sd_co2	mean_co2_per_capita	median_co2_per_capita	mean_methane	mean_nitrous_oxide
199	187.847	12.855	973.097	4.347	2.959	47.733	14.753

#top 10 countries by total co2
top_total <- emissions_latest %>%
  filter(!is.na(co2)) %>%
  arrange(desc(co2)) %>%
  slice_head(n = 10) %>%
  select(country, co2, co2_per_capita, population)

kable(top_total, digits = 3, caption = "Top 10 countries by total CO2 emissions")

Top 10 countries by total CO2 emissions
country	co2	co2_per_capita	population
China	12289.037	8.658	1419321284
United States	4904.120	14.197	345426566
India	3193.478	2.201	1450935785
Russia	1780.524	12.295	144820421
Japan	961.867	7.772	123753041
Indonesia	812.220	2.865	283487933
Iran	792.631	8.656	91567737
Saudi Arabia	692.133	20.379	33962751
South Korea	583.679	11.286	51717587
Germany	572.319	6.769	84552234

#plot top total emitters
top_total %>%
  ggplot(aes(x = reorder(country, co2), y = co2)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries by Total CO2 Emissions",
    x = "Country",
    y = "CO2 emissions"
  )

#top 10 countries by co2 per capita
top_per_capita <- emissions_latest %>%
  filter(!is.na(co2_per_capita), population > 1000000) %>%
  arrange(desc(co2_per_capita)) %>%
  slice_head(n = 10) %>%
  select(country, co2_per_capita, co2, population)

kable(top_per_capita, digits = 3, caption = "Top 10 countries by CO2 emissions per capita")

Top 10 countries by CO2 emissions per capita
country	co2_per_capita	co2	population
Qatar	41.271	125.812	3048429
Kuwait	26.248	129.519	4934508
Bahrain	24.270	39.003	1607059
Trinidad and Tobago	22.932	34.576	1507778
Saudi Arabia	20.379	692.133	33962751
United Arab Emirates	20.131	221.988	11027135
Oman	15.651	82.662	5281542
Australia	14.477	386.732	26713206
United States	14.197	4904.120	345426566
Kazakhstan	13.937	287.006	20592574

#plot top per capita emitters
top_per_capita %>%
  ggplot(aes(x = reorder(country, co2_per_capita), y = co2_per_capita)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries by CO2 Emissions Per Capita",
    x = "Country",
    y = "CO2 emissions per capita"
  )

#Hypothesis Testing/ Multiple Testing

#select countries for pairwise comparison
test_data <- emissions_country %>%
  filter(country %in% c("Australia", "Brazil", "China", "India", "United States")) %>%
  select(country, year, co2) %>%
  drop_na(co2)

#example two sample t-test
australia_brazil_test <- t.test(
  co2 ~ country,
  data = test_data %>% filter(country %in% c("Australia", "Brazil"))
)

australia_brazil_test

## 
##  Welch Two Sample t-test
## 
## data:  co2 by country
## t = -2.3982, df = 292.39, p-value = 0.0171
## alternative hypothesis: true difference in means between group Australia and group Brazil is not equal to 0
## 95 percent confidence interval:
##  -63.382939  -6.243336
## sample estimates:
## mean in group Australia    mean in group Brazil 
##                72.87153               107.68467

#create pairwise t-tests across selected countries
country_pairs <- combn(unique(test_data$country), 2, simplify = FALSE)

pairwise_tests <- map_dfr(country_pairs, function(pair){
  temp <- test_data %>%
    filter(country %in% pair)

  test_result <- t.test(co2 ~ country, data = temp)

  tibble(
    country_1 = pair[1],
    country_2 = pair[2],
    p_value = test_result$p.value
  )
})

#adjust p-values using bonferroni and benjamini-hochberg
pairwise_tests <- pairwise_tests %>%
  mutate(
    p_bonferroni = p.adjust(p_value, method = "bonferroni"),
    p_bh = p.adjust(p_value, method = "BH")
  )

kable(pairwise_tests, digits = 5, caption = "Pairwise t-tests with multiple testing adjustment")

Pairwise t-tests with multiple testing adjustment
country_1	country_2	p_value	p_bonferroni	p_bh
Australia	Brazil	0.01710	0.17103	0.01900
Australia	China	0.00000	0.00000	0.00000
Australia	India	0.00000	0.00000	0.00000
Australia	United States	0.00000	0.00000	0.00000
Brazil	China	0.00000	0.00000	0.00000
Brazil	India	0.00000	0.00000	0.00000
Brazil	United States	0.00000	0.00000	0.00000
China	India	0.00000	0.00000	0.00000
China	United States	0.17373	1.00000	0.17373
India	United States	0.00000	0.00000	0.00000

#Spline Regression and Loess Smoothing

#filter china time series
china_data <- emissions_country %>%
  filter(country == "China") %>%
  select(year, co2) %>%
  drop_na(co2)

#loess plot for china
ggplot(china_data, aes(x = year, y = co2)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(
    title = "LOESS Trend of CO2 Emissions in China",
    x = "Year",
    y = "CO2 emissions"
  )

#natural cubic spline model for china
china_spline <- lm(co2 ~ ns(year, df = 5), data = china_data)

#add predictions
china_data <- china_data %>%
  mutate(spline_pred = predict(china_spline, newdata = china_data))

#plot spline fit
ggplot(china_data, aes(x = year, y = co2)) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = spline_pred), linewidth = 1) +
  labs(
    title = "Natural Cubic Spline Model for China CO2 Emissions",
    x = "Year",
    y = "CO2 emissions"
  )

summary(china_spline)

## 
## Call:
## lm(formula = co2 ~ ns(year, df = 5), data = china_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1059.55   -95.02     2.71    41.07  1570.51 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          39.56     151.85   0.261 0.794935    
## ns(year, df = 5)1   111.52     194.47   0.573 0.567479    
## ns(year, df = 5)2   841.75     246.81   3.410 0.000903 ***
## ns(year, df = 5)3  3513.35     191.76  18.322  < 2e-16 ***
## ns(year, df = 5)4  9792.00     385.02  25.433  < 2e-16 ***
## ns(year, df = 5)5 12999.46     167.46  77.627  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 379.4 on 112 degrees of freedom
## Multiple R-squared:  0.989,  Adjusted R-squared:  0.9885 
## F-statistic:  2005 on 5 and 112 DF,  p-value: < 2.2e-16

#filter belgium time series
belgium_data <- emissions_country %>%
  filter(country == "Belgium") %>%
  select(year, co2) %>%
  drop_na(co2)

#loess plot for belgium
ggplot(belgium_data, aes(x = year, y = co2)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(
    title = "LOESS Trend of CO2 Emissions in Belgium",
    x = "Year",
    y = "CO2 emissions"
  )

#Principal Component Analysis

#prepare pca variables
pca_data <- emissions_model %>%
  select(
    country,
    co2,
    co2_per_capita,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption,
    coal_co2,
    oil_co2,
    gas_co2
  ) %>%
  drop_na()

#store country names
pca_countries <- pca_data$country

#scale numeric variables
pca_numeric <- pca_data %>%
  select(-country) %>%
  scale()

#run pca
pca_fit <- prcomp(pca_numeric, center = TRUE, scale. = TRUE)

#pca summary
summary(pca_fit)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.5935 1.0895 0.81470 0.53703 0.26742 0.22192 0.11273
## Proportion of Variance 0.7473 0.1319 0.07375 0.03205 0.00795 0.00547 0.00141
## Cumulative Proportion  0.7473 0.8792 0.95298 0.98503 0.99297 0.99844 0.99986
##                            PC8      PC9
## Standard deviation     0.03560 0.004683
## Proportion of Variance 0.00014 0.000000
## Cumulative Proportion  1.00000 1.000000

#scree plot
fviz_eig(pca_fit, addlabels = TRUE) +
  labs(title = "PCA Scree Plot")

#pca biplot

fviz_pca_biplot(
  pca_fit,
  repel = TRUE,
  label = "var"
) +
  labs(title = "PCA Biplot of Country Emissions Variables")

#create pca scores table
pca_scores <- as_tibble(pca_fit$x) %>%
  mutate(country = pca_countries)

#show countries with highest pc1
pca_scores %>%
  arrange(desc(PC1)) %>%
  select(country, PC1, PC2) %>%
  slice_head(n = 10) %>%
  kable(digits = 3, caption = "Countries with highest PC1 scores")

Countries with highest PC1 scores
country	PC1	PC2
China	17.686	-1.522
United States	9.474	4.160
India	7.106	-2.847
Russia	2.980	1.872
Brazil	2.001	-1.178
Indonesia	1.273	-0.938
Iran	0.770	0.924
Japan	0.545	0.383
Saudi Arabia	0.430	2.270
Canada	0.380	1.198

#K-Means Clustering

#use same scaled pca numeric data for clustering
set.seed(712)

#elbow method
fviz_nbclust(pca_numeric, kmeans, method = "wss") +
  labs(title = "Elbow Method for Choosing Number of Clusters")

#fit k-means with 3 clusters
set.seed(712)
kmeans_3 <- kmeans(pca_numeric, centers = 3, nstart = 25)

#add clusters to pca scores
cluster_results <- pca_scores %>%
  mutate(cluster = factor(kmeans_3$cluster))

#plot clusters using first two principal components
ggplot(cluster_results, aes(x = PC1, y = PC2, color = cluster, label = country)) +
  geom_point(size = 3) +
  labs(
    title = "K-Means Clustering of Countries Using PCA Scores",
    x = "PC1",
    y = "PC2",
    color = "Cluster"
  )

#combine clusters with original country data
cluster_summary_data <- pca_data %>%
  mutate(cluster = factor(kmeans_3$cluster))

#summarize clusters
cluster_summary <- cluster_summary_data %>%
  group_by(cluster) %>%
  summarize(
    n_countries = n(),
    mean_co2 = mean(co2, na.rm = TRUE),
    mean_co2_per_capita = mean(co2_per_capita, na.rm = TRUE),
    mean_methane = mean(methane, na.rm = TRUE),
    mean_population = mean(population, na.rm = TRUE),
    .groups = "drop"
  )

kable(cluster_summary, digits = 3, caption = "Cluster summary statistics")

Cluster summary statistics
cluster	n_countries	mean_co2	mean_co2_per_capita	mean_methane	mean_population
1	1	12289.037	8.658	1590.674	1419321284
2	2	4048.799	8.199	838.821	898181176
3	73	214.753	6.972	62.322	43836576

#show countries in each cluster
cluster_summary_data %>%
  select(country, cluster, co2, co2_per_capita, methane, nitrous_oxide) %>%
  arrange(cluster, desc(co2)) %>%
  kable(digits = 3, caption = "Countries by K-means cluster")

Countries by K-means cluster
country	cluster	co2	co2_per_capita	methane	nitrous_oxide
China	1	12289.037	8.658	1590.674	475.534
United States	2	4904.120	14.197	818.348	239.599
India	2	3193.478	2.201	859.294	310.004
Russia	3	1780.524	12.295	474.072	85.242
Japan	3	961.867	7.772	28.820	17.964
Indonesia	3	812.220	2.865	412.009	70.983
Iran	3	792.631	8.656	170.996	32.055
Saudi Arabia	3	692.133	20.379	117.014	13.056
South Korea	3	583.679	11.286	39.670	13.689
Germany	3	572.319	6.769	56.136	32.159
Canada	3	533.340	13.420	115.338	44.735
Turkey	3	513.035	5.865	111.797	40.169
Brazil	3	483.012	2.278	579.990	203.971
Mexico	3	460.988	3.523	151.802	39.548
South Africa	3	439.831	6.872	79.290	20.294
Australia	3	386.732	14.477	135.639	67.779
Vietnam	3	370.931	3.673	94.982	30.310
United Kingdom	3	312.906	4.526	43.812	28.062
Italy	3	301.930	5.088	40.757	15.218
Malaysia	3	290.224	8.162	28.313	8.866
Kazakhstan	3	287.006	13.937	54.973	11.967
Poland	3	272.862	7.080	44.502	26.926
Thailand	3	267.760	3.736	95.815	20.060
France	3	264.156	3.969	62.809	32.581
Taiwan	3	262.345	11.301	15.465	5.727
Egypt	3	258.368	2.217	49.838	22.805
Iraq	3	233.635	5.074	156.213	7.054
United Arab Emirates	3	221.988	20.131	37.574	2.635
Spain	3	220.341	4.599	44.595	18.848
Algeria	3	198.203	4.234	69.212	8.114
Pakistan	3	179.800	0.716	228.614	71.433
Philippines	3	174.878	1.510	78.930	18.405
Argentina	3	171.059	3.743	114.124	52.294
Ukraine	3	142.498	3.764	52.380	20.267
Uzbekistan	3	139.234	3.829	52.573	15.741
Kuwait	3	129.519	26.248	39.526	1.225
Qatar	3	125.812	41.271	22.491	0.914
Venezuela	3	116.042	4.085	47.634	11.111
Netherlands	3	114.785	6.297	19.054	8.976
Bangladesh	3	108.318	0.624	95.567	32.007
Colombia	3	92.659	1.752	91.299	23.444
Belgium	3	85.456	7.280	11.920	8.379
Oman	3	82.662	15.651	32.877	1.417
Turkmenistan	3	81.006	10.809	23.489	7.195
Chile	3	78.726	3.983	18.641	7.305
Czechia	3	75.623	7.044	12.622	6.691
Peru	3	70.247	2.053	31.677	9.523
Morocco	3	69.062	1.814	30.335	8.667
Romania	3	68.551	3.605	21.646	10.187
Austria	3	56.368	6.180	9.310	3.561
Belarus	3	55.818	6.163	18.577	15.353
Singapore	3	53.919	9.245	5.801	1.337
Greece	3	53.361	5.311	9.912	3.958
Israel	3	52.637	5.607	8.964	2.557
Hungary	3	40.017	4.136	9.494	5.427
Azerbaijan	3	39.830	3.853	12.679	2.977
Sweden	3	38.097	3.592	7.925	6.066
Norway	3	37.183	6.668	8.397	4.177
Portugal	3	35.539	3.409	10.832	2.789
Hong Kong	3	33.324	4.494	5.148	0.442
Ireland	3	33.311	6.339	17.634	8.668
New Zealand	3	32.479	6.229	34.596	13.252
Switzerland	3	32.072	3.595	6.025	2.191
Bulgaria	3	31.622	4.679	8.962	4.010
Finland	3	29.775	5.301	5.699	6.373
Slovakia	3	29.091	5.283	5.804	3.272
Denmark	3	28.369	4.746	10.216	4.691
Croatia	3	18.465	4.765	3.160	2.124
Slovenia	3	12.751	6.018	2.344	0.751
Lithuania	3	12.542	4.387	2.618	4.993
Estonia	3	8.307	6.105	1.330	1.703
Cyprus	3	7.299	5.374	2.370	0.361
Luxembourg	3	7.040	10.460	0.535	0.244
North Macedonia	3	6.620	3.631	1.528	0.458
Latvia	3	6.462	3.452	2.285	2.185
Iceland	3	3.803	9.667	0.557	0.346

#Best Subset Selection

#prepare safer data for best subset selection
subset_data <- emissions_model %>%
  select(
    co2,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na()

#log transform variables
subset_data_log <- subset_data %>%
  mutate(across(everything(), ~log1p(.x)))

#remove rows with infinite values
subset_data_log <- subset_data_log %>%
  filter(if_all(everything(), is.finite))

#remove columns with zero variance or invalid standard deviation
subset_data_log <- subset_data_log %>%
  select(where(~!is.na(sd(.x, na.rm = TRUE)) && sd(.x, na.rm = TRUE) > 0))

#create train/test split
set.seed(712)
train_index <- sample(
  1:nrow(subset_data_log),
  size = floor(0.7 * nrow(subset_data_log))
)

train_data_subset <- subset_data_log[train_index, ]
test_data_subset <- subset_data_log[-train_index, ]

#fit best subset selection
subset_fit <- regsubsets(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = train_data_subset,
  nvmax = 4,
  method = "exhaustive"
)

#model summary
subset_summary <- summary(subset_fit)

#plot bic
plot(
  subset_summary$bic,
  type = "b",
  xlab = "Number of predictors",
  ylab = "BIC",
  main = "Best Subset Selection Using BIC"
)

#best model by bic
best_size_bic <- which.min(subset_summary$bic)

#show best model coefficients
coef(subset_fit, best_size_bic)

##                (Intercept)                    methane 
##                 -1.4621262                  0.2119569 
## primary_energy_consumption 
##                  0.8495087

#best model by bic
best_size_bic <- which.min(subset_summary$bic)

#coefficients for best bic model
coef(subset_fit, best_size_bic)

##                (Intercept)                    methane 
##                 -1.4621262                  0.2119569 
## primary_energy_consumption 
##                  0.8495087

Ridge and Lasso Regression

#prepare data for ridge and lasso regression
glmnet_data <- emissions_model %>%
  select(
    co2,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na()

#log transform variables
glmnet_data <- glmnet_data %>%
  mutate(across(everything(), ~log1p(.x)))

#remove infinite values
glmnet_data <- glmnet_data %>%
  filter(if_all(everything(), is.finite))

#check dimensions
dim(glmnet_data)

## [1] 79  5

#create train/test split
set.seed(712)

train_index <- sample(
  1:nrow(glmnet_data),
  size = floor(0.7 * nrow(glmnet_data))
)

train_glmnet <- glmnet_data[train_index, ]
test_glmnet <- glmnet_data[-train_index, ]

#create x and y matrices
x_train <- model.matrix(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = train_glmnet
)[, -1]

y_train <- train_glmnet$co2

x_test <- model.matrix(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = test_glmnet
)[, -1]

y_test <- test_glmnet$co2

#check that y is not constant
summary(y_train)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.569   3.620   4.694   4.673   5.593   9.417

sd(y_train)

## [1] 1.531922

#ridge regression
set.seed(712)

ridge_cv <- cv.glmnet(
  x_train,
  y_train,
  alpha = 0
)

#best lambda
ridge_lambda <- ridge_cv$lambda.min

ridge_lambda

## [1] 0.1482516

#ridge predictions
ridge_pred <- predict(
  ridge_cv,
  s = ridge_lambda,
  newx = x_test
)

#ridge mse
ridge_mse <- mean((y_test - ridge_pred)^2)

ridge_mse

## [1] 0.05863233

#lasso regression alpha equals 1
set.seed(712)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)

#best lambda
lasso_lambda <- lasso_cv$lambda.min
lasso_lambda

## [1] 0.002005915

#lasso predictions
lasso_pred <- predict(lasso_cv, s = lasso_lambda, newx = x_test)

#lasso test mse
lasso_mse <- mean((y_test - lasso_pred)^2)
lasso_mse

## [1] 0.08058013

#lasso coefficients
coef(lasso_cv, s = lasso_lambda)

## 5 x 1 sparse Matrix of class "dgCMatrix"
##                            s=0.002005915
## (Intercept)                  -2.53431876
## methane                       0.23553516
## nitrous_oxide                -0.12388400
## population                    0.07984216
## primary_energy_consumption    0.84053375

#compare regularization models
regularization_results <- tibble(
  model = c("Ridge regression", "Lasso regression"),
  test_mse = c(ridge_mse, lasso_mse),
  test_rmse = sqrt(test_mse)
)

kable(regularization_results, digits = 5, caption = "Ridge and lasso test error comparison")

Ridge and lasso test error comparison
model	test_mse	test_rmse
Ridge regression	0.05863	0.24214
Lasso regression	0.08058	0.28387

#Neural Network Prediction

#prepare clean neural network data
nn_data <- emissions_model %>%
  select(
    co2,
    methane,
    nitrous_oxide,
    population,
    primary_energy_consumption
  ) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na()

#log transform variables
nn_data <- nn_data %>%
  mutate(across(everything(), ~log1p(.x)))

#remove infinite values
nn_data <- nn_data %>%
  filter(if_all(everything(), is.finite))

#normalize function
normalize <- function(x){
  (x - min(x)) / (max(x) - min(x))
}

#normalize all variables
nn_data_norm <- as.data.frame(nn_data %>%
  mutate(across(everything(), normalize)))

#create train/test split
set.seed(712)
nn_train_index <- sample(
  1:nrow(nn_data_norm),
  size = floor(0.7 * nrow(nn_data_norm))
)

nn_train <- nn_data_norm[nn_train_index, ]
nn_test <- nn_data_norm[-nn_train_index, ]

#fit simpler neural network model
set.seed(712)

nn_fit <- neuralnet::neuralnet(
  co2 ~ methane + nitrous_oxide + population + primary_energy_consumption,
  data = nn_train,
  hidden = c(3, 3),
  linear.output = TRUE
)

#plot neural network
plot(nn_fit)

#create exact predictor matrix for prediction
nn_test_x <- nn_test[, c(
  "methane",
  "nitrous_oxide",
  "population",
  "primary_energy_consumption"
)]

#generate predictions
nn_pred <- neuralnet::compute(nn_fit, nn_test_x)$net.result

#make predictions numeric
nn_pred <- as.numeric(nn_pred)

#calculate test mse and rmse
nn_mse <- mean((nn_test$co2 - nn_pred)^2)
nn_rmse <- sqrt(nn_mse)

nn_mse

## [1] 0.001521429

nn_rmse

## [1] 0.0390055

#compare predictive models
final_model_comparison <- tibble(
  model = c("Ridge regression", "Lasso regression", "Neural network"),
  test_mse = c(ridge_mse, lasso_mse, nn_mse),
  test_rmse = c(sqrt(ridge_mse), sqrt(lasso_mse), nn_rmse)
)

kable(final_model_comparison, digits = 5, caption = "Final predictive model comparison")

Final predictive model comparison
model	test_mse	test_rmse
Ridge regression	0.05863	0.24214
Lasso regression	0.08058	0.28387
Neural network	0.00152	0.03901

FinalProject712

Ashley St Aubin

2026-05-08

Ridge and Lasso Regression