library(ggplot2)
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(dplyr)
# Load excel data
greenhouse.df <- read.csv("GGE.csv")
# Remove first three rows containg metadata
greenhouse.df <- greenhouse.df[-c(1:3),]
# Rename the columns
colnames(greenhouse.df) <- c("CountryName","CountryCode","IndicatorName","IncomeGroup","Y2020","Y2021","Y2022","Y2023")
# Summarize the dataset
summary(greenhouse.df)
## CountryName CountryCode IndicatorName IncomeGroup
## Length:110 Length:110 Length:110 Length:110
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Y2020 Y2021 Y2022 Y2023
## Min. : 0.01 Min. : 0.01 Min. : 0.01 Min. : 0.01
## 1st Qu.: 9.33 1st Qu.: 8.87 1st Qu.: 8.73 1st Qu.: 8.72
## Median : 44.27 Median : 46.47 Median : 44.99 Median : 43.72
## Mean : 1784.44 Mean : 1864.94 Mean : 1875.85 Mean : 1919.47
## 3rd Qu.: 399.29 3rd Qu.: 427.02 3rd Qu.: 413.71 3rd Qu.: 383.97
## Max. :33904.52 Max. :35504.53 Max. :35892.84 Max. :37161.64
## NA's :4 NA's :4 NA's :4 NA's :4
# Display first 6 rows
head(greenhouse.df, 6)
# Display last 10 rows
tail(greenhouse.df, 10)
# Drop the 'IndicatorName' column
greenhouse.df <- greenhouse.df[, !(names(greenhouse.df) == "IndicatorName")]
# Remove rows where 'IncomeGroup' is blank
greenhouse.df <- greenhouse.df[greenhouse.df$IncomeGroup != "", ]
# Get unique values from the 'IncomeGroup' column
unique(greenhouse.df$IncomeGroup)
## [1] "High income" "Low income" "Lower middle income"
## [4] "Upper middle income"
# Replace values of 'IncomeGroup' with values to calculate correlation matrix
greenhouse.df$IncomeGroup <- recode(greenhouse.df$IncomeGroup, "Low income" = 1, "Lower middle income" = 2, "Upper middle income" = 3, "High income" = 4)
print("Missing values identified:")
## [1] "Missing values identified:"
colSums(is.na(greenhouse.df))
## CountryName CountryCode IncomeGroup Y2020 Y2021 Y2022
## 0 0 0 4 4 4
## Y2023
## 4
Impute missing values with the calculated median for the “High income” group
# For Y2023
median_high_income <- median(greenhouse.df$Y2023[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2023[is.na(greenhouse.df$Y2023)] <- median_high_income
# For Y2022
median_high_income <- median(greenhouse.df$Y2022[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2022[is.na(greenhouse.df$Y2022)] <- median_high_income
# For Y2021
median_high_income <- median(greenhouse.df$Y2021[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2021[is.na(greenhouse.df$Y2021)] <- median_high_income
# For Y2020
median_high_income <- median(greenhouse.df$Y2020[greenhouse.df$IncomeGroup == 4], na.rm = TRUE)
greenhouse.df$Y2020[is.na(greenhouse.df$Y2020)] <- median_high_income
# Check if missing points no longer present
print("Missing values identified:")
## [1] "Missing values identified:"
colSums(is.na(greenhouse.df))
## CountryName CountryCode IncomeGroup Y2020 Y2021 Y2022
## 0 0 0 0 0 0
## Y2023
## 0
Visualize Outliers
par(mfrow = c(1, 4))
# Using Boxplot
boxplot(greenhouse.df$Y2023, main = "For 2023", col = "lightblue", las = 2)
boxplot(greenhouse.df$Y2022, main = "For 2022", col = "lightblue", las = 2)
boxplot(greenhouse.df$Y2021, main = "For 2021", col = "lightblue", las = 2)
boxplot(greenhouse.df$Y2020, main = "For 2020", col = "lightblue", las = 2)
# Using Log-transformed Boxplot
boxplot(log(ceiling(greenhouse.df$Y2023)), main = "For 2023", col = "lightblue", las = 1.5)
boxplot(log(ceiling(greenhouse.df$Y2022)), main = "For 2022", col = "lightblue", las = 1.5)
boxplot(log(ceiling(greenhouse.df$Y2021)), main = "For 2021", col = "lightblue", las = 1.5)
boxplot(log(ceiling(greenhouse.df$Y2020)), main = "For 2020", col = "lightblue", las = 1.5)
Detect outliers using IQR since boxplot is skewed.
# For 2023
Q1 <- quantile(greenhouse.df$Y2023, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2023, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2023 < lower_bound | greenhouse.df$Y2023 > upper_bound
# Outliers detected
greenhouse.df$Y2023[outliers]
## [1] 267.8232 365.6846 571.8398 281.3805 1300.1689 747.6780
## [7] 15943.9866 223.9666 681.8103 256.7921 335.9680 285.3839
## [13] 385.5201 379.3186 1200.1998 4133.5544
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
# For 2022
Q1 <- quantile(greenhouse.df$Y2022, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2022, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2022 < lower_bound | greenhouse.df$Y2022 > upper_bound
# Outliers detected
greenhouse.df$Y2022[outliers]
## [1] 267.6329 374.7613 569.0103 278.4889 1298.4894 745.2451
## [7] 15159.6420 213.1061 761.9835 263.2161 333.3439 304.8863
## [13] 416.1122 406.5195 1152.7260 3897.2090
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
# For 2021
Q1 <- quantile(greenhouse.df$Y2021, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2021, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2021 < lower_bound | greenhouse.df$Y2021 > upper_bound
# Outliers detected
greenhouse.df$Y2021[outliers]
## [1] 256.9649 367.6140 580.7378 269.4111 1294.5115 728.2679
## [7] 15175.6191 208.0763 783.4886 253.9457 332.3060 308.5954
## [13] 429.4873 419.6342 1077.0778 3679.8618
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
# For 2020
Q1 <- quantile(greenhouse.df$Y2020, 0.25, na.rm = TRUE)
Q3 <- quantile(greenhouse.df$Y2020, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- greenhouse.df$Y2020 < lower_bound | greenhouse.df$Y2020 > upper_bound
# Outliers detected
greenhouse.df$Y2020[outliers]
## [1] 249.3237 347.3124 584.5715 251.5593 1215.9991 711.9313
## [7] 14497.8987 196.5034 749.7997 241.1320 306.7631 290.3876
## [13] 169.9928 398.3513 399.6036 1050.3392 3433.6190
# Compare outliers with the corresponding years' emissions to check for abnormality
greenhouse.df[outliers, c("CountryName", "Y2020", "Y2021", "Y2022", "Y2023")]
correlation_matrix <- cor(greenhouse.df[,c("IncomeGroup","Y2020","Y2021","Y2022","Y2023")])
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
correlation_matrix
## IncomeGroup Y2020 Y2021 Y2022 Y2023
## IncomeGroup 1.000000000 -0.009385267 -0.01023816 -0.01238826 -0.01485911
## Y2020 -0.009385267 1.000000000 0.99997931 0.99980175 0.99969584
## Y2021 -0.010238163 0.999979312 1.00000000 0.99989020 0.99979469
## Y2022 -0.012388260 0.999801751 0.99989020 1.00000000 0.99994920
## Y2023 -0.014859112 0.999695839 0.99979469 0.99994920 1.00000000
correlation_melted <- melt(correlation_matrix)
ggplot(correlation_melted, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(midpoint = 0, low = "blue", high = "red", mid = "orange") +
theme_minimal() +
labs(title = "Correlation Heatmap", x = "Variables", y = "Variables")
From the boxplots, we observe that the distributions for 2020, 2021, 2022, and 2023 are left-skewed. This suggests that a majority of countries have relatively high emissions.
Yes, both missing values and outliers were detected in the dataset.
par(mfrow = c(1, 2))
# Convert data to long format
emissions_long <- greenhouse.df %>%
pivot_longer(cols = c(Y2020, Y2021, Y2022, Y2023), names_to = "Year", values_to = "Emissions")
# Calculate the top 10 countries based on average emissions across both years
top_countries <- emissions_long %>%
group_by(CountryName) %>%
summarise(Total_Emissions = sum(Emissions, na.rm = TRUE)) %>%
top_n(10, Total_Emissions) %>%
pull(CountryName)
# Filter dataset to include only the top 10 countries
filtered_data <- emissions_long %>%
filter(CountryName %in% top_countries)
# Create the grouped bar chart
ggplot(filtered_data, aes(x = reorder(CountryName, -Emissions), y = Emissions, fill = Year)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Greenhouse Gas Emissions - Top 10 Countries",
x = "Country",
y = "Total Emissions (Mt CO2e)",
fill = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
# Calculate the bottom 10 countries based on average emissions across both years
bottom_countries <- emissions_long %>%
group_by(CountryName) %>%
summarise(Total_Emissions = sum(Emissions, na.rm = TRUE)) %>%
arrange(Total_Emissions) %>% # Arrange in ascending order
slice(1:10) %>% # Select the first 10 rows (lowest emissions)
pull(CountryName)
# Filter dataset to include only the bottom 10 countries
filtered_data <- emissions_long %>%
filter(CountryName %in% bottom_countries)
# Create the grouped bar chart
ggplot(filtered_data, aes(x = reorder(CountryName, -Emissions), y = Emissions, fill = Year)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Greenhouse Gas Emissions - Bottom 10 Countries",
x = "Country",
y = "Total Emissions (Mt CO2e)",
fill = "Year") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
The graph below highlights a significant relationship between IncomeGroup and Total Emissions. Countries in the Upper Middle-Income category exhibit the highest total emissions, while both Lower Middle-Income and High-Income countries show considerably lower emissions. Meanwhile, Low-Income countries have the lowest emissions overall.
# Calculate the total emissions by IncomeGroup and Year
total_emissions <- emissions_long %>%
group_by(IncomeGroup, Year) %>%
summarise(Total_Emissions = sum(Emissions, na.rm = TRUE))
## `summarise()` has grouped output by 'IncomeGroup'. You can override using the
## `.groups` argument.
# Plot the bar chart
ggplot(total_emissions, aes(x = factor(IncomeGroup), y = Total_Emissions, fill = Year)) +
geom_bar(stat = "identity", position = "dodge") + # Position dodge for side-by-side bars
labs(title = "Total Greenhouse Gas Emissions by Income Group and Year",
x = "Income Group",
y = "Total Emissions (Mt CO2e)",
fill = "Year") +
theme_minimal()
Since the p-value is 0.00955, it suggest that IncomeGroup has a significant effect on total emissions.
# Apply ANOVA to check the relationship between IncomeGroup and Emissions
anova_results <- aov(Emissions ~ factor(IncomeGroup), data = emissions_long)
# Summary of ANOVA results
summary(anova_results)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(IncomeGroup) 3 30566326 10188775 3.871 0.00955 **
## Residuals 356 937030512 2632108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
set.seed(935008)
trainIndex <- sample(1:nrow(greenhouse.df), 0.7 * nrow(greenhouse.df)) # Partitioning
train.df <- greenhouse.df[trainIndex, ]
valid.df <- greenhouse.df[-trainIndex, ]
dim(train.df) # Training data dimension
## [1] 62 7
dim(valid.df) # Test data dimension
## [1] 28 7