library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(reshape2)
library(VIM)
df<- read.csv("GGE.csv")
view(df)
str(df)
## 'data.frame': 113 obs. of 8 variables:
## $ Data.Source : chr "Last Updated Date" "" "Country Name" "Aruba" ...
## $ World.Development.Indicators : chr "1/28/2025" "" "Country Code" "ABW" ...
## $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
## $ X : chr "" "" "IncomeGroup" "High income" ...
## $ X.1 : num NA NA 2020 0.482 1421.775 ...
## $ X.2 : num NA NA 2021 0.531 1443.811 ...
## $ X.3 : num NA NA 2022 0.534 1443.936 ...
## $ X.4 : num NA NA 2023 0.561 1447.72 ...
dim(df)
## [1] 113 8
summary(df)
## Data.Source World.Development.Indicators
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. X
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## X.1 X.2 X.3 X.4
## Min. : 0.01 Min. : 0.01 Min. : 0.01 Min. : 0.01
## 1st Qu.: 9.33 1st Qu.: 9.12 1st Qu.: 9.18 1st Qu.: 9.26
## Median : 44.98 Median : 46.73 Median : 46.41 Median : 43.98
## Mean : 1786.64 Mean : 1866.39 Mean : 1877.21 Mean : 1920.44
## 3rd Qu.: 492.09 3rd Qu.: 505.11 3rd Qu.: 492.56 3rd Qu.: 478.68
## Max. :33904.52 Max. :35504.53 Max. :35892.84 Max. :37161.64
## NA's :6 NA's :6 NA's :6 NA's :6
head(df)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
df[1:10, ]
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## 7 Africa Western and Central AFW
## 8 Angola AGO
## 9 Albania ALB
## 10 Andorra AND
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 7 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 8 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 9 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
## 7 866.4966 886.6322 893.4701 906.0481
## 8 Lower middle income 61.6801 64.4090 67.2108 67.7008
## 9 Upper middle income 7.9674 8.3953 7.8120 7.6737
## 10 High income NA NA NA NA
numeric_cols <- c("X.1", "X.2", "X.3", "X.4")
df[numeric_cols] <- lapply(df[numeric_cols], as.numeric)
boxplot(df[, numeric_cols], main = "Boxplot for Outlier Detection", col = "lightblue")

detect_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
return(which(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR)))
}
outliers <- lapply(df[numeric_cols], detect_outliers)
outliers
## $X.1
## [1] 3 5 11 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109 111
## [20] 113
##
## $X.2
## [1] 3 5 11 33 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109
## [20] 111 113
##
## $X.3
## [1] 3 5 11 33 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109
## [20] 111 113
##
## $X.4
## [1] 3 5 11 33 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109
## [20] 110 111 113
colSums(is.na(df))
## Data.Source
## 0
## World.Development.Indicators
## 0
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 0
## X
## 0
## X.1
## 6
## X.2
## 6
## X.3
## 6
## X.4
## 6
aggr(df, col=c('seagreen','orange'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Missing data","Pattern"))

##
## Variables sorted by number of missings:
## Variable Count
## X.1 0.05309735
## X.2 0.05309735
## X.3 0.05309735
## X.4 0.05309735
## Data.Source 0.00000000
## World.Development.Indicators 0.00000000
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. 0.00000000
## X 0.00000000
cor_matrix <- cor(df[, numeric_cols], use = "complete.obs")
ggcorrplot(cor_matrix, method = "circle", type = "lower", lab = TRUE)

df[numeric_cols] <- lapply(df[numeric_cols], function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE)
return(x)
})
cap_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
df_clean <- df[!is.na(as.numeric(df$X.1)), ]
rownames(df_clean) <- NULL
numeric_cols <- names(df_clean)[sapply(df_clean, is.numeric)]
df_clean[numeric_cols] <- lapply(df_clean[numeric_cols], cap_outliers)
colnames(df) <- df[3, ]
df <- df[-3, ]
df <- df[-c(1:2), ]
df <- df[, -3]
df <- df[df[, 3] != "" & !is.na(df[, 3]), ]
df <- na.omit(df)
view(df)
summary(df)
## Country Name Country Code IncomeGroup 2020
## Length:90 Length:90 Length:90 Min. : 0.009
## Class :character Class :character Class :character 1st Qu.: 6.102
## Mode :character Mode :character Mode :character Median : 29.973
## Mean : 301.593
## 3rd Qu.: 70.239
## Max. :14497.899
## 2021 2022 2023
## Min. : 0.009 Min. : 0.009 Min. : 0.009
## 1st Qu.: 6.495 1st Qu.: 6.526 1st Qu.: 6.547
## Median : 32.587 Median : 33.095 Median : 33.320
## Mean : 316.058 Mean : 319.096 Mean : 328.919
## 3rd Qu.: 71.956 3rd Qu.: 72.532 3rd Qu.: 72.008
## Max. :15175.619 Max. :15159.642 Max. :15943.987
head(df)
## Country Name Country Code IncomeGroup 2020 2021
## 4 Aruba ABW High income 0.4822 0.5312
## 6 Afghanistan AFG Low income 26.6463 27.6431
## 8 Angola AGO Lower middle income 61.6801 64.4090
## 9 Albania ALB Upper middle income 7.9674 8.3953
## 10 Andorra AND High income 44.9823 46.7315
## 12 United Arab Emirates ARE High income 249.3237 256.9649
## 2022 2023
## 4 0.5336 0.5615
## 6 28.6141 29.4601
## 8 67.2108 67.7008
## 9 7.8120 7.6737
## 10 46.4124 43.9811
## 12 267.6329 267.8232
df[1:10, ]
## Country Name Country Code IncomeGroup 2020 2021
## 4 Aruba ABW High income 0.4822 0.5312
## 6 Afghanistan AFG Low income 26.6463 27.6431
## 8 Angola AGO Lower middle income 61.6801 64.4090
## 9 Albania ALB Upper middle income 7.9674 8.3953
## 10 Andorra AND High income 44.9823 46.7315
## 12 United Arab Emirates ARE High income 249.3237 256.9649
## 13 Argentina ARG Upper middle income 347.3124 367.6140
## 14 Armenia ARM Upper middle income 9.9936 10.6714
## 15 American Samoa ASM High income 0.0085 0.0085
## 16 Antigua and Barbuda ATG High income 0.3413 0.3710
## 2022 2023
## 4 0.5336 0.5615
## 6 28.6141 29.4601
## 8 67.2108 67.7008
## 9 7.8120 7.6737
## 10 46.4124 43.9811
## 12 267.6329 267.8232
## 13 374.7613 365.6846
## 14 10.3645 10.8363
## 15 0.0085 0.0085
## 16 0.3723 0.3886
str(df)
## 'data.frame': 90 obs. of 7 variables:
## $ Country Name: chr "Aruba" "Afghanistan" "Angola" "Albania" ...
## $ Country Code: chr "ABW" "AFG" "AGO" "ALB" ...
## $ IncomeGroup : chr "High income" "Low income" "Lower middle income" "Upper middle income" ...
## $ 2020 : num 0.482 26.646 61.68 7.967 44.982 ...
## $ 2021 : num 0.531 27.643 64.409 8.395 46.731 ...
## $ 2022 : num 0.534 28.614 67.211 7.812 46.412 ...
## $ 2023 : num 0.561 29.46 67.701 7.674 43.981 ...
numeric_cols <- c("2020", "2021", "2022", "2023")
if (all(numeric_cols %in% colnames(df))) {
df_filtered <- df
df_filtered[numeric_cols] <- lapply(df_filtered[numeric_cols], function(x) ifelse(x > 1500, NA, x))
df_melted <- melt(df_filtered, measure.vars = numeric_cols, na.rm = TRUE)
ggplot(df_melted, aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
labs(title = "Boxplot for Outlier Detection by Year", x = "Year", y = "Values") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
} else {
print("One or more specified columns do not exist in the data frame.")
}

aggr(df, col=c('seagreen','orange'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Missing data","Pattern"))

##
## Variables sorted by number of missings:
## Variable Count
## Country Name 0
## Country Code 0
## IncomeGroup 0
## 2020 0
## 2021 0
## 2022 0
## 2023 0
numeric_cols <- c("2020", "2021", "2022", "2023")
df[numeric_cols] <- lapply(df[numeric_cols], as.numeric)
cor_matrix <- cor(df[, numeric_cols], use = "complete.obs")
ggcorrplot(cor_matrix,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("red", "white", "darkblue"),
title = "Correlation Matrix",
ggtheme = theme_minimal())

df$IncomeGroup <- as.factor(df$IncomeGroup)
ggplot(df, aes(x = reorder(`Country Name`, -`2023`), y = `2023`, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Greenhouse Gas Emissions (2023)",
x = "Country", y = "Emissions (Mt CO₂e)") +
theme_minimal()

top_20_countries <- df %>%
arrange(desc(`2023`)) %>%
head(20)
ggplot(top_20_countries, aes(x = reorder(`Country Name`, -`2023`), y = `2023`, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Greenhouse Gas Emissions (2023)",
x = "Country", y = "Emissions (Mt CO₂e)") +
theme_minimal()

ggplot(df, aes(x = IncomeGroup, y = `2023`, fill = IncomeGroup)) +
geom_boxplot() +
labs(title = "Greenhouse Gas Emissions by Income Group (2023)",
x = "Income Group", y = "Emissions (Mt CO₂e)") +
theme_minimal()

df_filtered <- df %>% filter(`2023` < 1000)
ggplot(df_filtered, aes(x = IncomeGroup, y = `2023`, fill = IncomeGroup)) +
geom_boxplot() +
labs(title = "Greenhouse Gas Emissions by Income Group (2023)",
x = "Income Group", y = "Emissions (Mt CO₂e)") +
theme_minimal()

anova_result <- aov(`2023` ~ IncomeGroup, data = df)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## IncomeGroup 3 8509572 2836524 0.943 0.423
## Residuals 86 258569760 3006625
lm_model <- lm(`2023` ~ IncomeGroup, data = df)
summary(lm_model)
##
## Call:
## lm(formula = `2023` ~ IncomeGroup, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -823.7 -282.8 -112.0 -30.1 15120.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.90 277.66 0.407 0.685
## IncomeGroupLow income -77.24 641.22 -0.120 0.904
## IncomeGroupLower middle income 170.81 494.09 0.346 0.730
## IncomeGroupUpper middle income 710.95 449.85 1.580 0.118
##
## Residual standard error: 1734 on 86 degrees of freedom
## Multiple R-squared: 0.03186, Adjusted R-squared: -0.001911
## F-statistic: 0.9434 on 3 and 86 DF, p-value: 0.4234
set.seed(2505)
train.rows <- sample(rownames(df), dim(df)[1]*0.7)
train.data <- df[train.rows, ]
valid.rows <- setdiff(rownames(df), train.rows)
valid.data <- df[valid.rows, ]
valid.data
## Country Name Country Code IncomeGroup 2020 2021
## 8 Angola AGO Lower middle income 61.6801 64.4090
## 10 Andorra AND High income 44.9823 46.7315
## 12 United Arab Emirates ARE High income 249.3237 256.9649
## 14 Armenia ARM Upper middle income 9.9936 10.6714
## 16 Antigua and Barbuda ATG High income 0.3413 0.3710
## 18 Austria AUT High income 77.2734 80.7242
## 19 Azerbaijan AZE Upper middle income 54.7245 56.9862
## 22 Benin BEN Lower middle income 17.6528 16.0186
## 27 Bahamas, The BHS High income 1.7910 1.9488
## 32 Bolivia BOL Lower middle income 48.0161 52.1085
## 36 Bhutan BTN Lower middle income 3.0652 3.1870
## 41 Switzerland CHE High income 44.9823 46.7315
## 43 Chile CHL High income 124.0909 125.2908
## 45 Cote d'Ivoire CIV Lower middle income 29.5006 31.5286
## 46 Cameroon CMR Lower middle income 38.7752 38.8883
## 54 Cuba CUB Upper middle income 39.5182 38.8494
## 59 Germany DEU High income 749.7997 783.4886
## 61 Dominica DMA Upper middle income 0.1347 0.1423
## 62 Denmark DNK High income 43.5573 44.9410
## 71 Egypt, Arab Rep. EGY Lower middle income 306.7631 332.3060
## 83 Micronesia, Fed. Sts. FSM Lower middle income 0.0473 0.0479
## 87 Ghana GHA Lower middle income 45.3548 46.9903
## 88 Gibraltar GIB High income 0.6622 0.6696
## 89 Guinea GIN Lower middle income 26.3328 26.9913
## 91 Guinea-Bissau GNB Low income 2.8916 2.9301
## 98 Guyana GUY High income 7.3657 7.3580
## 101 Honduras HND Lower middle income 20.3316 21.4947
## 113 India IND Lower middle income 3433.6190 3679.8618
## 2022 2023
## 8 67.2108 67.7008
## 10 46.4124 43.9811
## 12 267.6329 267.8232
## 14 10.3645 10.8363
## 16 0.3723 0.3886
## 18 75.4085 72.9215
## 19 59.0263 62.5503
## 22 16.6150 16.6995
## 27 1.9575 2.0503
## 32 53.5025 55.1857
## 36 3.2176 3.2549
## 41 43.5627 43.4464
## 43 125.3880 121.4631
## 45 32.1687 32.1840
## 46 39.4892 39.3772
## 54 38.2368 39.4003
## 59 761.9835 681.8103
## 61 0.1426 0.1469
## 62 43.2261 41.8315
## 71 333.3439 335.9680
## 83 0.0487 0.0492
## 87 48.2049 48.2659
## 88 0.6984 0.7124
## 89 27.9610 28.6347
## 91 2.9991 2.9975
## 98 8.2852 8.1910
## 101 22.1065 22.9201
## 113 3897.2090 4133.5544