library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

gge.df<- read.csv("GGE.csv")

str(gge.df)
## 'data.frame':    113 obs. of  8 variables:
##  $ Data.Source                                              : chr  "Last Updated Date" "" "Country Name" "Aruba" ...
##  $ World.Development.Indicators                             : chr  "1/28/2025" "" "Country Code" "ABW" ...
##  $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr  "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
##  $ X                                                        : chr  "" "" "IncomeGroup" "High income" ...
##  $ X.1                                                      : num  NA NA 2020 0.482 1421.775 ...
##  $ X.2                                                      : num  NA NA 2021 0.531 1443.811 ...
##  $ X.3                                                      : num  NA NA 2022 0.534 1443.936 ...
##  $ X.4                                                      : num  NA NA 2023 0.561 1447.72 ...
head(gge.df)
##                   Data.Source World.Development.Indicators
## 1           Last Updated Date                    1/28/2025
## 2                                                         
## 3                Country Name                 Country Code
## 4                       Aruba                          ABW
## 5 Africa Eastern and Southern                          AFE
## 6                 Afghanistan                          AFG
##              Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                     
## 3                                                       Indicator Name
## 4            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##             X       X.1       X.2       X.3       X.4
## 1                    NA        NA        NA        NA
## 2                    NA        NA        NA        NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income    0.4822    0.5312    0.5336    0.5615
## 5             1421.7752 1443.8112 1443.9357 1447.7204
## 6  Low income   26.6463   27.6431   28.6141   29.4601
##With the help of visuals and R functions:
##1. Summarize the dataset.
summary(gge.df)
##  Data.Source        World.Development.Indicators
##  Length:113         Length:113                  
##  Class :character   Class :character            
##  Mode  :character   Mode  :character            
##                                                 
##                                                 
##                                                 
##                                                 
##  Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.      X            
##  Length:113                                                Length:113        
##  Class :character                                          Class :character  
##  Mode  :character                                          Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##       X.1                X.2                X.3                X.4          
##  Min.   :    0.01   Min.   :    0.01   Min.   :    0.01   Min.   :    0.01  
##  1st Qu.:    9.33   1st Qu.:    9.12   1st Qu.:    9.18   1st Qu.:    9.26  
##  Median :   44.98   Median :   46.73   Median :   46.41   Median :   43.98  
##  Mean   : 1786.64   Mean   : 1866.39   Mean   : 1877.21   Mean   : 1920.44  
##  3rd Qu.:  492.09   3rd Qu.:  505.11   3rd Qu.:  492.56   3rd Qu.:  478.68  
##  Max.   :33904.52   Max.   :35504.53   Max.   :35892.84   Max.   :37161.64  
##  NA's   :6          NA's   :6          NA's   :6          NA's   :6
str(gge.df)
## 'data.frame':    113 obs. of  8 variables:
##  $ Data.Source                                              : chr  "Last Updated Date" "" "Country Name" "Aruba" ...
##  $ World.Development.Indicators                             : chr  "1/28/2025" "" "Country Code" "ABW" ...
##  $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr  "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
##  $ X                                                        : chr  "" "" "IncomeGroup" "High income" ...
##  $ X.1                                                      : num  NA NA 2020 0.482 1421.775 ...
##  $ X.2                                                      : num  NA NA 2021 0.531 1443.811 ...
##  $ X.3                                                      : num  NA NA 2022 0.534 1443.936 ...
##  $ X.4                                                      : num  NA NA 2023 0.561 1447.72 ...
##2. Display the first 6 and last 10 rows.
head(gge.df)
##                   Data.Source World.Development.Indicators
## 1           Last Updated Date                    1/28/2025
## 2                                                         
## 3                Country Name                 Country Code
## 4                       Aruba                          ABW
## 5 Africa Eastern and Southern                          AFE
## 6                 Afghanistan                          AFG
##              Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                     
## 3                                                       Indicator Name
## 4            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##             X       X.1       X.2       X.3       X.4
## 1                    NA        NA        NA        NA
## 2                    NA        NA        NA        NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income    0.4822    0.5312    0.5336    0.5615
## 5             1421.7752 1443.8112 1443.9357 1447.7204
## 6  Low income   26.6463   27.6431   28.6141   29.4601
tail(gge.df, 10)
##          Data.Source World.Development.Indicators
## 104            Haiti                          HTI
## 105          Hungary                          HUN
## 106        IBRD only                          IBD
## 107 IDA & IBRD total                          IBT
## 108        IDA total                          IDA
## 109        IDA blend                          IDB
## 110        Indonesia                          IDN
## 111         IDA only                          IDX
## 112      Isle of Man                          IMN
## 113            India                          IND
##     Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 104 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 105 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 106 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 107 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 108 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 109 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 110 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 111 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 112 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 113 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##                       X        X.1        X.2        X.3        X.4
## 104 Lower middle income    13.5011    13.4188    13.5126    13.6570
## 105         High income    65.8413    67.1101    64.2695    60.9268
## 106                     30773.4680 32278.6559 32645.8080 33886.6317
## 107                     33904.5157 35504.5258 35892.8384 37161.6368
## 108                      3131.0477  3225.8699  3247.0304  3275.0051
## 109                      1328.2953  1365.0507  1354.6398  1359.2719
## 110 Upper middle income  1050.3392  1077.0778  1152.7260  1200.1998
## 111                      1802.7524  1860.8192  1892.3906  1915.7332
## 112         High income         NA         NA         NA         NA
## 113 Lower middle income  3433.6190  3679.8618  3897.2090  4133.5544
##3. Identify any outliers in the dataset.
# Visualize outliers using boxplot for each relevant numeric column
boxplot(gge.df[, c("X.1", "X.2", "X.3", "X.4")], main="Boxplot for Emission Data", col="lightblue")

# Find outliers using boxplot stats
boxplot.stats(gge.df$X.1)$out
##  [1]  2020.000  1421.775  2855.401 14497.899 17384.746 11519.574 20045.569
##  [8]  1741.846  8030.991  2567.021  3388.279  2205.838 17104.667  1239.158
## [15] 30773.468 33904.516  3131.048  1328.295  1802.752  3433.619
boxplot.stats(gge.df$X.2)$out
##  [1]  2021.000  1443.811  2975.145  1294.512 15175.619 18086.111 12110.700
##  [8] 20778.963  1817.263  8470.215  2706.661  3577.018  2242.381 17893.354
## [15]  1274.220 32278.656 35504.526  3225.870  1365.051  1860.819  3679.862
boxplot.stats(gge.df$X.3)$out
##  [1]  2022.000  1443.936  3070.508  1298.489 15159.642 18164.730 12490.736
##  [8] 20726.915  1731.899  8346.704  2628.341  3482.311  2226.298 17853.672
## [15]  1301.592 32645.808 35892.838  3247.030  1354.640  1892.391  3897.209
boxplot.stats(gge.df$X.4)$out
##  [1]  2023.000  1447.720  3110.291  1300.169 15943.987 19088.032 12885.504
##  [8] 21578.194  1738.032  8114.897  2432.461  3221.795  2254.267 17492.897
## [15]  1321.242 33886.632 37161.637  3275.005  1359.272  1200.200  1915.733
## [22]  4133.554
##4. Detect missing data points.
# Count missing values in the dataset
colSums(is.na(gge.df))
##                                               Data.Source 
##                                                         0 
##                              World.Development.Indicators 
##                                                         0 
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. 
##                                                         0 
##                                                         X 
##                                                         0 
##                                                       X.1 
##                                                         6 
##                                                       X.2 
##                                                         6 
##                                                       X.3 
##                                                         6 
##                                                       X.4 
##                                                         6
# Alternatively, view the rows with missing data
missing_data <- gge.df[!complete.cases(gge.df), ]
head(missing_data)
##           Data.Source World.Development.Indicators
## 1   Last Updated Date                    1/28/2025
## 2                                                 
## 10            Andorra                          AND
## 42    Channel Islands                          CHI
## 55            Curacao                          CUW
## 112       Isle of Man                          IMN
##                Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1   Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                       
## 10             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 42             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 55             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 112            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##               X X.1 X.2 X.3 X.4
## 1                NA  NA  NA  NA
## 2                NA  NA  NA  NA
## 10  High income  NA  NA  NA  NA
## 42  High income  NA  NA  NA  NA
## 55  High income  NA  NA  NA  NA
## 112 High income  NA  NA  NA  NA
##5. Explore relationships between variables through initial pattern analysis (e.g., correlation analysis)
# Compute the correlation matrix
correlation_matrix <- cor(gge.df[, c("X.1", "X.2", "X.3", "X.4")], use = "complete.obs")

# Display the correlation matrix
print(correlation_matrix)
##           X.1       X.2       X.3       X.4
## X.1 1.0000000 0.9999866 0.9999200 0.9996466
## X.2 0.9999866 1.0000000 0.9999545 0.9996598
## X.3 0.9999200 0.9999545 1.0000000 0.9997844
## X.4 0.9996466 0.9996598 0.9997844 1.0000000
# Visualize correlations using a heatmap or correlation plot
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
corrplot(correlation_matrix, method="circle", type="upper", order="hclust")

##6. Preprocess the data by handling missing values and outliers appropriately.
# Remove rows with missing values
gge.df_clean <- na.omit(gge.df)

# Alternatively, impute missing values with the mean for each column (example)
gge.df$X.1[is.na(gge.df$X.1)] <- mean(gge.df$X.1, na.rm = TRUE)
# Remove outliers using a basic threshold method (example)
gge.df <- gge.df[!gge.df$X.1 %in% boxplot.stats(gge.df$X.1)$out, ]

##7. Ensure the dataset is clean and ready for further analysis.
# Remove rows where 'X.1' column has NA values
gge.df_clean <- gge.df[!is.na(gge.df$X.1), ]

# Remove rows where all values are blank (empty strings)
gge.df_clean <- gge.df[!apply(gge.df == "", 1, all), ]

# Replace empty strings with NA
gge.df[gge.df == ""] <- NA

# Remove rows with any NA values
gge.df_clean <- na.omit(gge.df)

view(gge.df)
###Question 2 (15 points):

## Explain your findings from Question 1.

## What patterns did you observe?
#   The dataset contains both character (e.g., Data.Source) and numeric (X.1 to X.4) columns.
#   Metadata rows (e.g., “Last Updated Date”, “Country Name”) do not contain useful numerical data.
#   Missing values (NA) are present in numerical columns, mostly in metadata rows.
#   Outliers were identified in emissions data, with extreme values skewing the analysis.

##Did you detect any outliers or missing values? If yes, what steps did you take to handle them?
#   Missing Values: Removed metadata rows with NA using na.omit().
#   Outliers: Identified using boxplot.stats() and removed to prevent skewed analysis.
#   Data Cleaning: Replaced empty strings with NA and eliminated incomplete rows.

##Justify any modifications you made to the dataset
#   Removed metadata and incomplete rows to retain only relevant emissions data.
#   Excluded extreme outliers for more accurate trend analysis.
#   Ensured a clean, structured dataset for reliable insights.
###Question 3 (20 points):
# Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across
#different countries.
# Calculate summary statistics for emissions data across the years
summary(gge.df[, c("X.1", "X.2", "X.3", "X.4")])
##       X.1                 X.2                 X.3           
##  Min.   :   0.0085   Min.   :   0.0085   Min.   :   0.0085  
##  1st Qu.:   7.6666   1st Qu.:   6.8354   1st Qu.:   6.9962  
##  Median :  39.5182   Median :  33.6445   Median :  34.0210  
##  Mean   : 313.6153   Mean   : 225.7976   Mean   : 226.1181  
##  3rd Qu.: 250.4415   3rd Qu.: 125.2908   3rd Qu.: 125.3880  
##  Max.   :2020.0000   Max.   :2021.0000   Max.   :2022.0000  
##                      NA's   :6           NA's   :6          
##       X.4           
##  Min.   :   0.0085  
##  1st Qu.:   6.9821  
##  Median :  34.4568  
##  Mean   : 224.8155  
##  3rd Qu.: 121.4631  
##  Max.   :2023.0000  
##  NA's   :6
# Bar plot for emissions in 2020 across different countries
library(ggplot2)
ggplot(gge.df, aes(x = reorder(Data.Source, X.1), y = X.1)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Total Greenhouse Gas Emissions in 2020 by Country (Mt CO2e)",
       x = "Country", y = "Emissions (Mt CO2e)") +
  theme_minimal()

# Investigate whether there is a significant relationship between IncomeGroup and total
#emissions.
# Boxplot for emissions in 2020 categorized by Income Group
ggplot(gge.df, aes(x = X, y = X.1)) +
  geom_boxplot() +
  labs(title = "Emissions by Income Group (2020)",
       x = "Income Group", y = "Emissions (Mt CO2e)") +
  theme_minimal()

# ANOVA for emissions in 2020 across Income Groups
anova_result <- aov(X.1 ~ X, data = gge.df)
summary(anova_result)
##             Df   Sum Sq Mean Sq F value   Pr(>F)    
## X            4  4327383 1081846   6.597 0.000115 ***
## Residuals   84 13775176  163990                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 10 observations deleted due to missingness
# Use at least one set of appropriate visualizations to support your analysis.
# Linear regression model for emissions in 2020 based on Income Group
lm_result <- lm(X.1 ~ X, data = gge.df)
summary(lm_result)
## 
## Call:
## lm(formula = X.1 ~ X, data = gge.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -294.25 -228.42  -87.84   -5.16 1492.39 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            294.26      64.85   4.538 1.88e-05 ***
## XIncomeGroup          1725.74     410.12   4.208 6.42e-05 ***
## XLow income           -260.27     149.75  -1.738   0.0859 .  
## XLower middle income  -241.81     117.69  -2.055   0.0430 *  
## XUpper middle income  -141.87     106.47  -1.333   0.1863    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 405 on 84 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.239,  Adjusted R-squared:  0.2028 
## F-statistic: 6.597 on 4 and 84 DF,  p-value: 0.0001148
##Question 4 (10 points): Summary and Conclusion

## Briefly summarize the key steps you performed.
#Key Steps Performed
#   Data Preparation: Analyzed greenhouse gas emissions (excluding LULUCF) across countries (2020–2023) to understand distribution by country.
#   Visualizations: Used bar and box plots to compare emissions by country and income group.
#   Statistical Analysis:
#   ANOVA Test: Confirmed significant differences in emissions between income groups (p = 0.000115).
#   Linear Regression: Showed income group influences emissions, with significant effects for “Lower middle income” and “Upper middle income” groups.

## Present your main findings and insights briefly.
#Key Findings:
#   Emissions Distribution: Greenhouse gas emissions (excluding LULUCF) were analyzed across countries from 2020–2023.
#   Statistical Analysis:
#   ANOVA Test: Showed significant differences in emissions between income groups (p = 0.000115).
#   Regression Analysis: Confirmed income group influences emissions, particularly for “Lower middle income” countries. However, “Upper middle income” countries showed no significant difference from high-income ones.
#   Explained Variation: Income group accounts for about 24% of emissions variance, implying other key factors like industrial activity and energy consumption also play a role.

#Conclusion:
#Higher-income countries tend to have higher emissions, but income group alone does not fully explain the differences. A broader analysis incorporating other economic and industrial factors is needed for a complete picture.
#Question 5 (New) (10 points):
#To prepare for predictive modeling, perform the following:
#1. Partition the dataset into training (70%) and validation (30%) sets.
#2. Use set.seed() before partitioning to ensure reproducibility

# Step 1: Set seed for reproducibility
set.seed(123)  # You can choose any number as a seed value

# Step 2: Partition the dataset into 70% training and 30% validation sets
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
trainIndex <- createDataPartition(gge.df$X, p = 0.7, list = FALSE)
## Warning in createDataPartition(gge.df$X, p = 0.7, list = FALSE): Some classes
## have a single record ( IncomeGroup ) and these will be selected for the sample
trainSet <- gge.df[trainIndex, ]
validationSet <- gge.df[-trainIndex, ]

# Check the number of rows in the training and validation sets
nrow(trainSet)  # Should be 70% of the total number of rows
## [1] 72
nrow(validationSet)  # Should be 30% of the total number of rows
## [1] 27
##conversation link
### https://chatgpt.com/share/67bb9ce4-ef34-8008-aa2f-7890019ab76c