library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
gge.df<- read.csv("GGE.csv")
str(gge.df)
## 'data.frame': 113 obs. of 8 variables:
## $ Data.Source : chr "Last Updated Date" "" "Country Name" "Aruba" ...
## $ World.Development.Indicators : chr "1/28/2025" "" "Country Code" "ABW" ...
## $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
## $ X : chr "" "" "IncomeGroup" "High income" ...
## $ X.1 : num NA NA 2020 0.482 1421.775 ...
## $ X.2 : num NA NA 2021 0.531 1443.811 ...
## $ X.3 : num NA NA 2022 0.534 1443.936 ...
## $ X.4 : num NA NA 2023 0.561 1447.72 ...
head(gge.df)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
##With the help of visuals and R functions:
##1. Summarize the dataset.
summary(gge.df)
## Data.Source World.Development.Indicators
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. X
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## X.1 X.2 X.3 X.4
## Min. : 0.01 Min. : 0.01 Min. : 0.01 Min. : 0.01
## 1st Qu.: 9.33 1st Qu.: 9.12 1st Qu.: 9.18 1st Qu.: 9.26
## Median : 44.98 Median : 46.73 Median : 46.41 Median : 43.98
## Mean : 1786.64 Mean : 1866.39 Mean : 1877.21 Mean : 1920.44
## 3rd Qu.: 492.09 3rd Qu.: 505.11 3rd Qu.: 492.56 3rd Qu.: 478.68
## Max. :33904.52 Max. :35504.53 Max. :35892.84 Max. :37161.64
## NA's :6 NA's :6 NA's :6 NA's :6
str(gge.df)
## 'data.frame': 113 obs. of 8 variables:
## $ Data.Source : chr "Last Updated Date" "" "Country Name" "Aruba" ...
## $ World.Development.Indicators : chr "1/28/2025" "" "Country Code" "ABW" ...
## $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
## $ X : chr "" "" "IncomeGroup" "High income" ...
## $ X.1 : num NA NA 2020 0.482 1421.775 ...
## $ X.2 : num NA NA 2021 0.531 1443.811 ...
## $ X.3 : num NA NA 2022 0.534 1443.936 ...
## $ X.4 : num NA NA 2023 0.561 1447.72 ...
##2. Display the first 6 and last 10 rows.
head(gge.df)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
tail(gge.df, 10)
## Data.Source World.Development.Indicators
## 104 Haiti HTI
## 105 Hungary HUN
## 106 IBRD only IBD
## 107 IDA & IBRD total IBT
## 108 IDA total IDA
## 109 IDA blend IDB
## 110 Indonesia IDN
## 111 IDA only IDX
## 112 Isle of Man IMN
## 113 India IND
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 104 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 105 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 106 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 107 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 108 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 109 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 110 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 111 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 112 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 113 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 104 Lower middle income 13.5011 13.4188 13.5126 13.6570
## 105 High income 65.8413 67.1101 64.2695 60.9268
## 106 30773.4680 32278.6559 32645.8080 33886.6317
## 107 33904.5157 35504.5258 35892.8384 37161.6368
## 108 3131.0477 3225.8699 3247.0304 3275.0051
## 109 1328.2953 1365.0507 1354.6398 1359.2719
## 110 Upper middle income 1050.3392 1077.0778 1152.7260 1200.1998
## 111 1802.7524 1860.8192 1892.3906 1915.7332
## 112 High income NA NA NA NA
## 113 Lower middle income 3433.6190 3679.8618 3897.2090 4133.5544
##3. Identify any outliers in the dataset.
# Visualize outliers using boxplot for each relevant numeric column
boxplot(gge.df[, c("X.1", "X.2", "X.3", "X.4")], main="Boxplot for Emission Data", col="lightblue")

# Find outliers using boxplot stats
boxplot.stats(gge.df$X.1)$out
## [1] 2020.000 1421.775 2855.401 14497.899 17384.746 11519.574 20045.569
## [8] 1741.846 8030.991 2567.021 3388.279 2205.838 17104.667 1239.158
## [15] 30773.468 33904.516 3131.048 1328.295 1802.752 3433.619
boxplot.stats(gge.df$X.2)$out
## [1] 2021.000 1443.811 2975.145 1294.512 15175.619 18086.111 12110.700
## [8] 20778.963 1817.263 8470.215 2706.661 3577.018 2242.381 17893.354
## [15] 1274.220 32278.656 35504.526 3225.870 1365.051 1860.819 3679.862
boxplot.stats(gge.df$X.3)$out
## [1] 2022.000 1443.936 3070.508 1298.489 15159.642 18164.730 12490.736
## [8] 20726.915 1731.899 8346.704 2628.341 3482.311 2226.298 17853.672
## [15] 1301.592 32645.808 35892.838 3247.030 1354.640 1892.391 3897.209
boxplot.stats(gge.df$X.4)$out
## [1] 2023.000 1447.720 3110.291 1300.169 15943.987 19088.032 12885.504
## [8] 21578.194 1738.032 8114.897 2432.461 3221.795 2254.267 17492.897
## [15] 1321.242 33886.632 37161.637 3275.005 1359.272 1200.200 1915.733
## [22] 4133.554
##4. Detect missing data points.
# Count missing values in the dataset
colSums(is.na(gge.df))
## Data.Source
## 0
## World.Development.Indicators
## 0
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 0
## X
## 0
## X.1
## 6
## X.2
## 6
## X.3
## 6
## X.4
## 6
# Alternatively, view the rows with missing data
missing_data <- gge.df[!complete.cases(gge.df), ]
head(missing_data)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 10 Andorra AND
## 42 Channel Islands CHI
## 55 Curacao CUW
## 112 Isle of Man IMN
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 42 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 55 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 112 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 10 High income NA NA NA NA
## 42 High income NA NA NA NA
## 55 High income NA NA NA NA
## 112 High income NA NA NA NA
##5. Explore relationships between variables through initial pattern analysis (e.g., correlation analysis)
# Compute the correlation matrix
correlation_matrix <- cor(gge.df[, c("X.1", "X.2", "X.3", "X.4")], use = "complete.obs")
# Display the correlation matrix
print(correlation_matrix)
## X.1 X.2 X.3 X.4
## X.1 1.0000000 0.9999866 0.9999200 0.9996466
## X.2 0.9999866 1.0000000 0.9999545 0.9996598
## X.3 0.9999200 0.9999545 1.0000000 0.9997844
## X.4 0.9996466 0.9996598 0.9997844 1.0000000
# Visualize correlations using a heatmap or correlation plot
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
corrplot(correlation_matrix, method="circle", type="upper", order="hclust")

##6. Preprocess the data by handling missing values and outliers appropriately.
# Remove rows with missing values
gge.df_clean <- na.omit(gge.df)
# Alternatively, impute missing values with the mean for each column (example)
gge.df$X.1[is.na(gge.df$X.1)] <- mean(gge.df$X.1, na.rm = TRUE)
# Remove outliers using a basic threshold method (example)
gge.df <- gge.df[!gge.df$X.1 %in% boxplot.stats(gge.df$X.1)$out, ]
##7. Ensure the dataset is clean and ready for further analysis.
# Remove rows where 'X.1' column has NA values
gge.df_clean <- gge.df[!is.na(gge.df$X.1), ]
# Remove rows where all values are blank (empty strings)
gge.df_clean <- gge.df[!apply(gge.df == "", 1, all), ]
# Replace empty strings with NA
gge.df[gge.df == ""] <- NA
# Remove rows with any NA values
gge.df_clean <- na.omit(gge.df)
view(gge.df)
###Question 2 (15 points):
## Explain your findings from Question 1.
## What patterns did you observe?
# The dataset contains both character (e.g., Data.Source) and numeric (X.1 to X.4) columns.
# Metadata rows (e.g., “Last Updated Date”, “Country Name”) do not contain useful numerical data.
# Missing values (NA) are present in numerical columns, mostly in metadata rows.
# Outliers were identified in emissions data, with extreme values skewing the analysis.
##Did you detect any outliers or missing values? If yes, what steps did you take to handle them?
# Missing Values: Removed metadata rows with NA using na.omit().
# Outliers: Identified using boxplot.stats() and removed to prevent skewed analysis.
# Data Cleaning: Replaced empty strings with NA and eliminated incomplete rows.
##Justify any modifications you made to the dataset
# Removed metadata and incomplete rows to retain only relevant emissions data.
# Excluded extreme outliers for more accurate trend analysis.
# Ensured a clean, structured dataset for reliable insights.
###Question 3 (20 points):
# Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across
#different countries.
# Calculate summary statistics for emissions data across the years
summary(gge.df[, c("X.1", "X.2", "X.3", "X.4")])
## X.1 X.2 X.3
## Min. : 0.0085 Min. : 0.0085 Min. : 0.0085
## 1st Qu.: 7.6666 1st Qu.: 6.8354 1st Qu.: 6.9962
## Median : 39.5182 Median : 33.6445 Median : 34.0210
## Mean : 313.6153 Mean : 225.7976 Mean : 226.1181
## 3rd Qu.: 250.4415 3rd Qu.: 125.2908 3rd Qu.: 125.3880
## Max. :2020.0000 Max. :2021.0000 Max. :2022.0000
## NA's :6 NA's :6
## X.4
## Min. : 0.0085
## 1st Qu.: 6.9821
## Median : 34.4568
## Mean : 224.8155
## 3rd Qu.: 121.4631
## Max. :2023.0000
## NA's :6
# Bar plot for emissions in 2020 across different countries
library(ggplot2)
ggplot(gge.df, aes(x = reorder(Data.Source, X.1), y = X.1)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Greenhouse Gas Emissions in 2020 by Country (Mt CO2e)",
x = "Country", y = "Emissions (Mt CO2e)") +
theme_minimal()

# Investigate whether there is a significant relationship between IncomeGroup and total
#emissions.
# Boxplot for emissions in 2020 categorized by Income Group
ggplot(gge.df, aes(x = X, y = X.1)) +
geom_boxplot() +
labs(title = "Emissions by Income Group (2020)",
x = "Income Group", y = "Emissions (Mt CO2e)") +
theme_minimal()

# ANOVA for emissions in 2020 across Income Groups
anova_result <- aov(X.1 ~ X, data = gge.df)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## X 4 4327383 1081846 6.597 0.000115 ***
## Residuals 84 13775176 163990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 10 observations deleted due to missingness
# Use at least one set of appropriate visualizations to support your analysis.
# Linear regression model for emissions in 2020 based on Income Group
lm_result <- lm(X.1 ~ X, data = gge.df)
summary(lm_result)
##
## Call:
## lm(formula = X.1 ~ X, data = gge.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -294.25 -228.42 -87.84 -5.16 1492.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 294.26 64.85 4.538 1.88e-05 ***
## XIncomeGroup 1725.74 410.12 4.208 6.42e-05 ***
## XLow income -260.27 149.75 -1.738 0.0859 .
## XLower middle income -241.81 117.69 -2.055 0.0430 *
## XUpper middle income -141.87 106.47 -1.333 0.1863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 405 on 84 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.239, Adjusted R-squared: 0.2028
## F-statistic: 6.597 on 4 and 84 DF, p-value: 0.0001148
##Question 4 (10 points): Summary and Conclusion
## Briefly summarize the key steps you performed.
#Key Steps Performed
# Data Preparation: Analyzed greenhouse gas emissions (excluding LULUCF) across countries (2020–2023) to understand distribution by country.
# Visualizations: Used bar and box plots to compare emissions by country and income group.
# Statistical Analysis:
# ANOVA Test: Confirmed significant differences in emissions between income groups (p = 0.000115).
# Linear Regression: Showed income group influences emissions, with significant effects for “Lower middle income” and “Upper middle income” groups.
## Present your main findings and insights briefly.
#Key Findings:
# Emissions Distribution: Greenhouse gas emissions (excluding LULUCF) were analyzed across countries from 2020–2023.
# Statistical Analysis:
# ANOVA Test: Showed significant differences in emissions between income groups (p = 0.000115).
# Regression Analysis: Confirmed income group influences emissions, particularly for “Lower middle income” countries. However, “Upper middle income” countries showed no significant difference from high-income ones.
# Explained Variation: Income group accounts for about 24% of emissions variance, implying other key factors like industrial activity and energy consumption also play a role.
#Conclusion:
#Higher-income countries tend to have higher emissions, but income group alone does not fully explain the differences. A broader analysis incorporating other economic and industrial factors is needed for a complete picture.
#Question 5 (New) (10 points):
#To prepare for predictive modeling, perform the following:
#1. Partition the dataset into training (70%) and validation (30%) sets.
#2. Use set.seed() before partitioning to ensure reproducibility
# Step 1: Set seed for reproducibility
set.seed(123) # You can choose any number as a seed value
# Step 2: Partition the dataset into 70% training and 30% validation sets
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
trainIndex <- createDataPartition(gge.df$X, p = 0.7, list = FALSE)
## Warning in createDataPartition(gge.df$X, p = 0.7, list = FALSE): Some classes
## have a single record ( IncomeGroup ) and these will be selected for the sample
trainSet <- gge.df[trainIndex, ]
validationSet <- gge.df[-trainIndex, ]
# Check the number of rows in the training and validation sets
nrow(trainSet) # Should be 70% of the total number of rows
## [1] 72
nrow(validationSet) # Should be 30% of the total number of rows
## [1] 27
##conversation link
### https://chatgpt.com/share/67bb9ce4-ef34-8008-aa2f-7890019ab76c