library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
## With the help of visuals and R functions:
df<- read.csv("GGE.csv")
view(df)
str(df)
## 'data.frame': 113 obs. of 8 variables:
## $ Data.Source : chr "Last Updated Date" "" "Country Name" "Aruba" ...
## $ World.Development.Indicators : chr "1/28/2025" "" "Country Code" "ABW" ...
## $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
## $ X : chr "" "" "IncomeGroup" "High income" ...
## $ X.1 : num NA NA 2020 0.482 1421.775 ...
## $ X.2 : num NA NA 2021 0.531 1443.811 ...
## $ X.3 : num NA NA 2022 0.534 1443.936 ...
## $ X.4 : num NA NA 2023 0.561 1447.72 ...
dim(df)
## [1] 113 8
##Question 1 (20 points) with Solution:
## 1. Summarize the dataset.
summary(df)
## Data.Source World.Development.Indicators
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. X
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## X.1 X.2 X.3 X.4
## Min. : 0.01 Min. : 0.01 Min. : 0.01 Min. : 0.01
## 1st Qu.: 9.33 1st Qu.: 9.12 1st Qu.: 9.18 1st Qu.: 9.26
## Median : 44.98 Median : 46.73 Median : 46.41 Median : 43.98
## Mean : 1786.64 Mean : 1866.39 Mean : 1877.21 Mean : 1920.44
## 3rd Qu.: 492.09 3rd Qu.: 505.11 3rd Qu.: 492.56 3rd Qu.: 478.68
## Max. :33904.52 Max. :35504.53 Max. :35892.84 Max. :37161.64
## NA's :6 NA's :6 NA's :6 NA's :6
## 2. Display the first 6 and last 10 rows.
head(df)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
df[1:10, ]
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## 7 Africa Western and Central AFW
## 8 Angola AGO
## 9 Albania ALB
## 10 Andorra AND
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 7 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 8 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 9 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
## 7 866.4966 886.6322 893.4701 906.0481
## 8 Lower middle income 61.6801 64.4090 67.2108 67.7008
## 9 Upper middle income 7.9674 8.3953 7.8120 7.6737
## 10 High income NA NA NA NA
## 3. Identify any outliers in the dataset.
# Convert columns to numeric if necessary
numeric_cols <- c("X.1", "X.2", "X.3", "X.4")
df[numeric_cols] <- lapply(df[numeric_cols], as.numeric)
# Boxplot to visualize outliers
boxplot(df[, numeric_cols], main = "Boxplot for Outlier Detection", col = "lightblue")

# Identify outliers using IQR
detect_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
return(which(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR)))
}
outliers <- lapply(df[numeric_cols], detect_outliers)
outliers
## $X.1
## [1] 3 5 11 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109 111
## [20] 113
##
## $X.2
## [1] 3 5 11 33 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109
## [20] 111 113
##
## $X.3
## [1] 3 5 11 33 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109
## [20] 111 113
##
## $X.4
## [1] 3 5 11 33 44 65 66 67 68 69 72 77 78 99 102 106 107 108 109
## [20] 110 111 113
##4. Detect missing data points.
# Count missing values
colSums(is.na(df))
## Data.Source
## 0
## World.Development.Indicators
## 0
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 0
## X
## 0
## X.1
## 6
## X.2
## 6
## X.3
## 6
## X.4
## 6
# Visualize missing data
aggr(df, col=c('seagreen','orange'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Missing data","Pattern"))

##
## Variables sorted by number of missings:
## Variable Count
## X.1 0.05309735
## X.2 0.05309735
## X.3 0.05309735
## X.4 0.05309735
## Data.Source 0.00000000
## World.Development.Indicators 0.00000000
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. 0.00000000
## X 0.00000000
##5. Explore relationships between variables through initial pattern analysis (e.g., correlation analysis).
# Compute correlation matrix for numeric columns
cor_matrix <- cor(df[, numeric_cols], use = "complete.obs")
# Visualize correlation
ggcorrplot(cor_matrix, method = "circle", type = "lower", lab = TRUE)

##6. Preprocess the data by handling missing values and outliers appropriately.
# Handling missing values by replacing with the median
df[numeric_cols] <- lapply(df[numeric_cols], function(x) {
x[is.na(x)] <- median(x, na.rm = TRUE)
return(x)
})
# Handling outliers by capping at 1.5*IQR
cap_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
# Identify where the actual numeric data starts
df_clean <- df[!is.na(as.numeric(df$X.1)), ] # Keeps only rows where numeric values start
# Reset row indices
rownames(df_clean) <- NULL
# Select only numeric columns
numeric_cols <- names(df_clean)[sapply(df_clean, is.numeric)]
# Apply outlier capping only to numeric columns
df_clean[numeric_cols] <- lapply(df_clean[numeric_cols], cap_outliers)
##7. Ensure the dataset is clean and ready for further analysis.
# Set the third row as the column names
colnames(df) <- df[3, ]
# Remove the third row after setting it as the header
df <- df[-3, ]
# Deleting the first 2 rows since it has unnecessary data
df <- df[-c(1:2), ]
# Deleting the 3rd column since it has similar data
df <- df[, -3]
# Removing rows where column 3 has blank values
df <- df[df[, 3] != "" & !is.na(df[, 3]), ]
df <- na.omit(df)
# Check the structure of the cleaned data
str(df)
## 'data.frame': 90 obs. of 7 variables:
## $ Country Name: chr "Aruba" "Afghanistan" "Angola" "Albania" ...
## $ Country Code: chr "ABW" "AFG" "AGO" "ALB" ...
## $ IncomeGroup : chr "High income" "Low income" "Lower middle income" "Upper middle income" ...
## $ 2020 : num 0.482 26.646 61.68 7.967 44.982 ...
## $ 2021 : num 0.531 27.643 64.409 8.395 46.731 ...
## $ 2022 : num 0.534 28.614 67.211 7.812 46.412 ...
## $ 2023 : num 0.561 29.46 67.701 7.674 43.981 ...
###After performing all these steps, I got a clean data.
view(df)
# Question 2 (15 points):
# • Explain your findings from Question 1.
# • What patterns did you observe?
# • Did you detect any outliers or missing values? If yes, what steps did you take to handle them?
# • Justify any modifications you made to the dataset.
#Solution:
#The dataset contains country-level greenhouse gas emissions across multiple years (2020-2023).
#The column names were corrected to reflect actual years instead of placeholder names like "X.1", "X.2", etc.
#Income groups were also present, which could be useful for categorizing emission levels.
#Outliers were detected using the IQR method.
#Certain countries had significantly high or low emission values compared to others.
#Outliers were capped at 1.5 × IQR to prevent extreme values from skewing analysis.
#Missing values were present in numerical columns (6 missing per year).
#These were replaced with the median to maintain dataset consistency without distorting overall trends.
#Metadata removal: The first few rows contained non-relevant text and were dropped.
#Renaming columns: The third row was used as column headers to improve readability.
#Outlier capping: Prevents extreme values from dominating analysis while preserving variability.
#Handling missing values: Replacing with the median ensures that data remains representative without artificially inflating or deflating values.
#Unnecessary columns removed: The third column was redundant and was dropped to avoid redundancy.
##Question 3 (20 points):
# • Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across different countries.
# • Investigate whether there is a significant relationship between IncomeGroup and total emissions.
# • Use at least one set of appropriate visualizations to support your analysis.
# • Apply statistical methods or models if necessary (e.g., correlation tests, regression analysis)
#Solution:
# Convert IncomeGroup to factor
df$IncomeGroup <- as.factor(df$IncomeGroup)
# Bar plot for total emissions by country (latest year: 2023)
ggplot(df, aes(x = reorder(`Country Name`, -`2023`), y = `2023`, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Greenhouse Gas Emissions (2023)",
x = "Country", y = "Emissions (Mt CO₂e)") +
theme_minimal()

# Boxplot for emissions by income group (2023)
ggplot(df, aes(x = IncomeGroup, y = `2023`, fill = IncomeGroup)) +
geom_boxplot() +
labs(title = "Greenhouse Gas Emissions by Income Group (2023)",
x = "Income Group", y = "Emissions (Mt CO₂e)") +
theme_minimal()

# One-way ANOVA to test if IncomeGroup affects emissions
anova_result <- aov(`2023` ~ IncomeGroup, data = df)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## IncomeGroup 3 8509572 2836524 0.943 0.423
## Residuals 86 258569760 3006625
# Linear regression model: Income Group as predictor for emissions
lm_model <- lm(`2023` ~ IncomeGroup, data = df)
summary(lm_model)
##
## Call:
## lm(formula = `2023` ~ IncomeGroup, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -823.7 -282.8 -112.0 -30.1 15120.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.90 277.66 0.407 0.685
## IncomeGroupLow income -77.24 641.22 -0.120 0.904
## IncomeGroupLower middle income 170.81 494.09 0.346 0.730
## IncomeGroupUpper middle income 710.95 449.85 1.580 0.118
##
## Residual standard error: 1734 on 86 degrees of freedom
## Multiple R-squared: 0.03186, Adjusted R-squared: -0.001911
## F-statistic: 0.9434 on 3 and 86 DF, p-value: 0.4234
#Since p > 0.05, we fail to reject the null hypothesis,
#meaning income group does not appear to have a strong effect on total emissions in this dataset.
# Question 4 (10 points): Summary and Conclusion
# • Briefly summarize the key steps you performed.
##1. Data Cleaning & Preprocessing:
# Removed irrelevant metadata rows.
# Set correct column headers for clarity.
# Handled 6 missing values per year using median imputation.
# Identified and capped outliers using the IQR method to prevent skewed analysis.
# Removed redundant columns for a streamlined dataset.
##2. Modifications & Justifications:
# Metadata & Redundant Column Removal: Improved data structure.
# Column Renaming: Replaced placeholders with actual years for clarity.
# Outlier Handling: Prevented distortion while preserving trends.
# Missing Value Treatment: Used median to ensure data consistency.
# • Present your main findings and insights briefly.
# The dataset provides valuable insights into global greenhouse gas emissions over the past four years.
# Emissions generally increased across most countries, with some variations.
# Income group classification does not directly correlate with emission levels, but middle-income countries show high variability.
# Outliers and missing data were handled effectively to ensure accurate analysis.
# Question 5 (New) (10 points):
#To prepare for predictive modeling, perform the following:
# 1. Partition the dataset into training (70%) and validation (30%) sets.
# 2. Use set.seed() before partitioning to ensure reproducibility.
# use set.seed() to get the same partitions when re-running the R code.
set.seed(2505)
train.rows <- sample(rownames(df), dim(df)[1]*0.7)
# collect all the columns with training row ID into training set:
train.data <- df[train.rows, ]
# assign row IDs that are not already in the training set, into validation
valid.rows <- setdiff(rownames(df), train.rows)
valid.data <- df[valid.rows, ]
valid.data
## Country Name Country Code IncomeGroup 2020 2021
## 8 Angola AGO Lower middle income 61.6801 64.4090
## 10 Andorra AND High income 44.9823 46.7315
## 12 United Arab Emirates ARE High income 249.3237 256.9649
## 14 Armenia ARM Upper middle income 9.9936 10.6714
## 16 Antigua and Barbuda ATG High income 0.3413 0.3710
## 18 Austria AUT High income 77.2734 80.7242
## 19 Azerbaijan AZE Upper middle income 54.7245 56.9862
## 22 Benin BEN Lower middle income 17.6528 16.0186
## 27 Bahamas, The BHS High income 1.7910 1.9488
## 32 Bolivia BOL Lower middle income 48.0161 52.1085
## 36 Bhutan BTN Lower middle income 3.0652 3.1870
## 41 Switzerland CHE High income 44.9823 46.7315
## 43 Chile CHL High income 124.0909 125.2908
## 45 Cote d'Ivoire CIV Lower middle income 29.5006 31.5286
## 46 Cameroon CMR Lower middle income 38.7752 38.8883
## 54 Cuba CUB Upper middle income 39.5182 38.8494
## 59 Germany DEU High income 749.7997 783.4886
## 61 Dominica DMA Upper middle income 0.1347 0.1423
## 62 Denmark DNK High income 43.5573 44.9410
## 71 Egypt, Arab Rep. EGY Lower middle income 306.7631 332.3060
## 83 Micronesia, Fed. Sts. FSM Lower middle income 0.0473 0.0479
## 87 Ghana GHA Lower middle income 45.3548 46.9903
## 88 Gibraltar GIB High income 0.6622 0.6696
## 89 Guinea GIN Lower middle income 26.3328 26.9913
## 91 Guinea-Bissau GNB Low income 2.8916 2.9301
## 98 Guyana GUY High income 7.3657 7.3580
## 101 Honduras HND Lower middle income 20.3316 21.4947
## 113 India IND Lower middle income 3433.6190 3679.8618
## 2022 2023
## 8 67.2108 67.7008
## 10 46.4124 43.9811
## 12 267.6329 267.8232
## 14 10.3645 10.8363
## 16 0.3723 0.3886
## 18 75.4085 72.9215
## 19 59.0263 62.5503
## 22 16.6150 16.6995
## 27 1.9575 2.0503
## 32 53.5025 55.1857
## 36 3.2176 3.2549
## 41 43.5627 43.4464
## 43 125.3880 121.4631
## 45 32.1687 32.1840
## 46 39.4892 39.3772
## 54 38.2368 39.4003
## 59 761.9835 681.8103
## 61 0.1426 0.1469
## 62 43.2261 41.8315
## 71 333.3439 335.9680
## 83 0.0487 0.0492
## 87 48.2049 48.2659
## 88 0.6984 0.7124
## 89 27.9610 28.6347
## 91 2.9991 2.9975
## 98 8.2852 8.1910
## 101 22.1065 22.9201
## 113 3897.2090 4133.5544
##The link for the conversation in AI is being shared here:
# https://chatgpt.com/share/67bc8080-5f4c-8010-9f45-c472c8abf2f7