Loading Packages

library(ggplot2) 
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2) 
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(dplyr)

Data Loading and Preparation

# Load excel data
greenhouse.df <- read.csv("GGE.csv")
# Remove first three rows containg metadata
greenhouse.df <- greenhouse.df[-c(1:3),]
# Rename the columns
colnames(greenhouse.df) <- c("CountryName","CountryCode","IndicatorName","IncomeGroup","Y2020","Y2021","Y2022","Y2023")

Data Description

# Summarize the dataset
summary(greenhouse.df)
##  CountryName        CountryCode        IndicatorName      IncomeGroup       
##  Length:110         Length:110         Length:110         Length:110        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      Y2020              Y2021              Y2022              Y2023         
##  Min.   :    0.01   Min.   :    0.01   Min.   :    0.01   Min.   :    0.01  
##  1st Qu.:    9.33   1st Qu.:    8.87   1st Qu.:    8.73   1st Qu.:    8.72  
##  Median :   44.27   Median :   46.47   Median :   44.99   Median :   43.72  
##  Mean   : 1784.44   Mean   : 1864.94   Mean   : 1875.85   Mean   : 1919.47  
##  3rd Qu.:  399.29   3rd Qu.:  427.02   3rd Qu.:  413.71   3rd Qu.:  383.97  
##  Max.   :33904.52   Max.   :35504.53   Max.   :35892.84   Max.   :37161.64  
##  NA's   :4          NA's   :4          NA's   :4          NA's   :4
# Display first 6 rows
head(greenhouse.df, 6)
# Display last 10 rows
tail(greenhouse.df, 10)

Data Preprocessing

# Drop the 'IndicatorName' column
greenhouse.df <- greenhouse.df[, !(names(greenhouse.df) == "IndicatorName")]
# Remove rows where 'IncomeGroup' is blank
greenhouse.df <- greenhouse.df[greenhouse.df$IncomeGroup != "", ]
# Get unique values from the 'IncomeGroup' column
unique(greenhouse.df$IncomeGroup)
## [1] "High income"         "Low income"          "Lower middle income"
## [4] "Upper middle income"
# Replace values of 'IncomeGroup' with values to calculate correlation matrix
greenhouse.df$IncomeGroup <- recode(greenhouse.df$IncomeGroup, "Low income" = 1, "Lower middle income" = 2, "Upper middle income" = 3, "High income" = 4)

Detect Missing Values

print("Missing values identified:")
## [1] "Missing values identified:"
colSums(is.na(greenhouse.df))
## CountryName CountryCode IncomeGroup       Y2020       Y2021       Y2022 
##           0           0           0           4           4           4 
##       Y2023 
##           4

Handle Missing Values

Impute missing values with the calculated median for the “High income” group

# For Y2023
median_high_income <- median(greenhouse.df$Y2023[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2023[is.na(greenhouse.df$Y2023)] <- median_high_income

# For Y2022
median_high_income <- median(greenhouse.df$Y2022[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2022[is.na(greenhouse.df$Y2022)] <- median_high_income

# For Y2021
median_high_income <- median(greenhouse.df$Y2021[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2021[is.na(greenhouse.df$Y2021)] <- median_high_income

# For Y2020
median_high_income <- median(greenhouse.df$Y2020[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2020[is.na(greenhouse.df$Y2020)] <- median_high_income

# Check if missing points no longer present
print("Missing values identified:")
## [1] "Missing values identified:"
colSums(is.na(greenhouse.df))
## CountryName CountryCode IncomeGroup       Y2020       Y2021       Y2022 
##           0           0           0           0           0           0 
##       Y2023 
##           0

Detect Outliers

Visualize Outliers

par(mfrow = c(1, 4))
# Using Boxplot
boxplot(greenhouse.df$Y2023, main = "For 2023", col = "lightblue", las = 2)
boxplot(greenhouse.df$Y2022, main = "For 2022", col = "lightblue", las = 2)
boxplot(greenhouse.df$Y2021, main = "For 2021", col = "lightblue", las = 2)
boxplot(greenhouse.df$Y2020, main = "For 2020", col = "lightblue", las = 2)

# Using Log-transformed Boxplot
boxplot(log(ceiling(greenhouse.df$Y2023)), main = "For 2023", col = "lightblue", las = 1.5)
boxplot(log(ceiling(greenhouse.df$Y2022)), main = "For 2022", col = "lightblue", las = 1.5)
boxplot(log(ceiling(greenhouse.df$Y2021)), main = "For 2021", col = "lightblue", las = 1.5)
boxplot(log(ceiling(greenhouse.df$Y2020)), main = "For 2020", col = "lightblue", las = 1.5)

Detect outliers using IQR since boxplot is skewed.

# For 2023
Q1 <- quantile(greenhouse.df$Y2023, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2023, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2023 < lower_bound | greenhouse.df$Y2023 > upper_bound
# Outliers detected
greenhouse.df$Y2023[outliers]
##  [1]   267.8232   365.6846   571.8398   281.3805  1300.1689   747.6780
##  [7] 15943.9866   223.9666   681.8103   256.7921   335.9680   285.3839
## [13]   385.5201   379.3186  1200.1998  4133.5544
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
# For 2022
Q1 <- quantile(greenhouse.df$Y2022, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2022, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2022 < lower_bound | greenhouse.df$Y2022 > upper_bound
# Outliers detected
greenhouse.df$Y2022[outliers]
##  [1]   267.6329   374.7613   569.0103   278.4889  1298.4894   745.2451
##  [7] 15159.6420   213.1061   761.9835   263.2161   333.3439   304.8863
## [13]   416.1122   406.5195  1152.7260  3897.2090
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
# For 2021
Q1 <- quantile(greenhouse.df$Y2021, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2021, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2021 < lower_bound | greenhouse.df$Y2021 > upper_bound
# Outliers detected
greenhouse.df$Y2021[outliers]
##  [1]   256.9649   367.6140   580.7378   269.4111  1294.5115   728.2679
##  [7] 15175.6191   208.0763   783.4886   253.9457   332.3060   308.5954
## [13]   429.4873   419.6342  1077.0778  3679.8618
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
# For 2020
Q1 <- quantile(greenhouse.df$Y2020, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2020, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2020 < lower_bound | greenhouse.df$Y2020 > upper_bound
# Outliers detected
greenhouse.df$Y2020[outliers]
##  [1]   249.3237   347.3124   584.5715   251.5593  1215.9991   711.9313
##  [7] 14497.8987   196.5034   749.7997   241.1320   306.7631   290.3876
## [13]   169.9928   398.3513   399.6036  1050.3392  3433.6190
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]

Correlation Matrix

correlation_matrix <- cor(greenhouse.df[,c("IncomeGroup","Y2020","Y2021","Y2022","Y2023")])
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
correlation_matrix
##              IncomeGroup        Y2020       Y2021       Y2022       Y2023
## IncomeGroup  1.000000000 -0.009385267 -0.01023816 -0.01238826 -0.01485911
## Y2020       -0.009385267  1.000000000  0.99997931  0.99980175  0.99969584
## Y2021       -0.010238163  0.999979312  1.00000000  0.99989020  0.99979469
## Y2022       -0.012388260  0.999801751  0.99989020  1.00000000  0.99994920
## Y2023       -0.014859112  0.999695839  0.99979469  0.99994920  1.00000000
correlation_melted <- melt(correlation_matrix)
ggplot(correlation_melted, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(midpoint = 0, low = "blue", high = "red", mid = "orange") +
  theme_minimal() +
  labs(title = "Correlation Heatmap", x = "Variables", y = "Variables")

Questions

What patterns did you observe?

From the boxplots, we observe that the distributions for 2020, 2021, 2022, and 2023 are left-skewed. This suggests that a majority of countries have relatively high emissions.

Did you detect any outliers or missing values? If yes, what steps did you take to handle them?

Yes, both missing values and outliers were detected in the dataset.

Handling Missing Values:

  1. All missing values were found within the “High income” group.
  2. These missing values were replaced with the median of the “High income” group to maintain consistency within the category.

Handling Outliers:

  1. Outliers were identified in the 2020–2023 columns.
  2. The outliers were examined along with their corresponding country and other yearly emissions.
  3. Since no significant abnormalities were detected, they were retained to preserve the meaning of the data.

Justify any modifications you made to the dataset.

  1. Metadata Removal: The first three rows contained metadata, which was removed to ensure proper preprocessing and analysis.
  2. Pruning Missing Values: Missing values were replaced with the median to maintain data integrity and prevent skewing the analysis.
  3. Dropping Redundant Column: The ‘IndicatorName’ column was removed since it contained only a single repeated value and did not contribute meaningful information.
  4. Categorizing Income Groups: The ‘IncomeGroup’ column was converted into numerical categories (1 = Low income, 2 = Lower middle income, 3 = Upper middle income, 4 = High income) to enable correlation analysis and statistical modeling, such as regression.

Exploratory Data Analysis

Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across different countries.

par(mfrow = c(1, 2))

# Convert data to long format
emissions_long <- greenhouse.df %>%
  pivot_longer(cols = c(Y2020, Y2021, Y2022, Y2023), names_to = "Year", values_to = "Emissions")

# Calculate the top 10 countries based on average emissions across both years
top_countries <- emissions_long %>%
  group_by(CountryName) %>%
  summarise(Total_Emissions = sum(Emissions, na.rm = TRUE)) %>%
  top_n(10, Total_Emissions) %>%
  pull(CountryName)

# Filter dataset to include only the top 10 countries
filtered_data <- emissions_long %>%
  filter(CountryName %in% top_countries)

# Create the grouped bar chart
ggplot(filtered_data, aes(x = reorder(CountryName, -Emissions), y = Emissions, fill = Year)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Greenhouse Gas Emissions - Top 10 Countries",
       x = "Country",
       y = "Total Emissions (Mt CO2e)",
       fill = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

# Calculate the bottom 10 countries based on average emissions across both years
bottom_countries <- emissions_long %>%
  group_by(CountryName) %>%
  summarise(Total_Emissions = sum(Emissions, na.rm = TRUE)) %>%
  arrange(Total_Emissions) %>%   # Arrange in ascending order
  slice(1:10) %>%              # Select the first 10 rows (lowest emissions)
  pull(CountryName)

# Filter dataset to include only the bottom 10 countries
filtered_data <- emissions_long %>%
  filter(CountryName %in% bottom_countries)

# Create the grouped bar chart
ggplot(filtered_data, aes(x = reorder(CountryName, -Emissions), y = Emissions, fill = Year)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Greenhouse Gas Emissions - Bottom 10 Countries",
       x = "Country",
       y = "Total Emissions (Mt CO2e)",
       fill = "Year") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

Investigate whether there is a significant relationship between IncomeGroup and total emissions.

Visualization

The graph below highlights a significant relationship between IncomeGroup and Total Emissions. Countries in the Upper Middle-Income category exhibit the highest total emissions, while both Lower Middle-Income and High-Income countries show considerably lower emissions. Meanwhile, Low-Income countries have the lowest emissions overall.

# Calculate the total emissions by IncomeGroup and Year
total_emissions <- emissions_long %>%
  group_by(IncomeGroup, Year) %>%
  summarise(Total_Emissions = sum(Emissions, na.rm = TRUE))
## `summarise()` has grouped output by 'IncomeGroup'. You can override using the
## `.groups` argument.
# Plot the bar chart
ggplot(total_emissions, aes(x = factor(IncomeGroup), y = Total_Emissions, fill = Year)) +
  geom_bar(stat = "identity", position = "dodge") +  # Position dodge for side-by-side bars
  labs(title = "Total Greenhouse Gas Emissions by Income Group and Year",
       x = "Income Group",
       y = "Total Emissions (Mt CO2e)",
       fill = "Year") +
  theme_minimal()

Statistical Testing

Since the p-value is 0.00955, it suggest that IncomeGroup has a significant effect on total emissions.

# Apply ANOVA to check the relationship between IncomeGroup and Emissions
anova_results <- aov(Emissions ~ factor(IncomeGroup), data = emissions_long)
# Summary of ANOVA results
summary(anova_results)
##                      Df    Sum Sq  Mean Sq F value  Pr(>F)   
## factor(IncomeGroup)   3  30566326 10188775   3.871 0.00955 **
## Residuals           356 937030512  2632108                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Summary

Briefly summarize the key steps you performed.

Data Cleaning & Preprocessing:

  1. Removed metadata rows.
  2. Dropped unnecessary columns like IndicatorName.
  3. Converted IncomeGroup into numerical categories for analysis.
  4. Handled missing values by imputing with the median.
  5. Addressed outliers using z-score but retained significant outliers like China’s emissions.

Exploratory Data Analysis (EDA):

  1. Visualized total greenhouse gas emissions across different countries over the years.
  2. Identified patterns in emissions.

Statistical Analysis:

  1. Conducted ANOVA to examine the relationship between IncomeGroup and total emissions.
  2. Used visualizations to support the findings.

Present your main findings and insights briefly.

Emissions Distribution Across Countries:

  1. Upper middle-income countries have the highest total greenhouse gas emissions.
  2. Low-income countries contribute the least to total emissions.
  3. China stands out with the most and significantly higher emissions compared to other countries.

Relationship Between Income Group and Emissions:

  1. ANOVA analysis confirmed a statistically significant relationship between income group and total emissions (p-value = 0.00955).
  2. Upper-middle and lower-middle-income countries show a rising trend in greenhouse gas emissions.
  3. In contrast, emissions for low-income and high-income countries have remained relatively stable over time.

Road Ahead

Predictive Modelling

To prepare for predictive modeling, perform the following:

  1. Partition the dataset into training (70%) and validation (30%) sets.
  2. Use set.seed() before partitioning to ensure reproducibility.
set.seed(935008)
trainIndex <- sample(1:nrow(greenhouse.df), 0.7 * nrow(greenhouse.df)) # Partitioning
train.df <- greenhouse.df[trainIndex, ]
valid.df <- greenhouse.df[-trainIndex, ]
dim(train.df) # Training data dimension
## [1] 62  7
dim(valid.df) # Test data dimension
## [1] 28  7