library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
##  With the help of visuals and R functions:
df<- read.csv("GGE.csv")
view(df)
str(df)
## 'data.frame':    113 obs. of  8 variables:
##  $ Data.Source                                              : chr  "Last Updated Date" "" "Country Name" "Aruba" ...
##  $ World.Development.Indicators                             : chr  "1/28/2025" "" "Country Code" "ABW" ...
##  $ Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.: chr  "Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5" "" "Indicator Name" "Total greenhouse gas emissions excluding LULUCF (Mt CO2e)" ...
##  $ X                                                        : chr  "" "" "IncomeGroup" "High income" ...
##  $ X.1                                                      : num  NA NA 2020 0.482 1421.775 ...
##  $ X.2                                                      : num  NA NA 2021 0.531 1443.811 ...
##  $ X.3                                                      : num  NA NA 2022 0.534 1443.936 ...
##  $ X.4                                                      : num  NA NA 2023 0.561 1447.72 ...
dim(df)
## [1] 113   8
##Question 1 (20 points) with Solution:

##  1. Summarize the dataset.
summary(df)
##  Data.Source        World.Development.Indicators
##  Length:113         Length:113                  
##  Class :character   Class :character            
##  Mode  :character   Mode  :character            
##                                                 
##                                                 
##                                                 
##                                                 
##  Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.      X            
##  Length:113                                                Length:113        
##  Class :character                                          Class :character  
##  Mode  :character                                          Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##       X.1                X.2                X.3                X.4          
##  Min.   :    0.01   Min.   :    0.01   Min.   :    0.01   Min.   :    0.01  
##  1st Qu.:    9.33   1st Qu.:    9.12   1st Qu.:    9.18   1st Qu.:    9.26  
##  Median :   44.98   Median :   46.73   Median :   46.41   Median :   43.98  
##  Mean   : 1786.64   Mean   : 1866.39   Mean   : 1877.21   Mean   : 1920.44  
##  3rd Qu.:  492.09   3rd Qu.:  505.11   3rd Qu.:  492.56   3rd Qu.:  478.68  
##  Max.   :33904.52   Max.   :35504.53   Max.   :35892.84   Max.   :37161.64  
##  NA's   :6          NA's   :6          NA's   :6          NA's   :6
##  2. Display the first 6 and last 10 rows.
head(df)
##                   Data.Source World.Development.Indicators
## 1           Last Updated Date                    1/28/2025
## 2                                                         
## 3                Country Name                 Country Code
## 4                       Aruba                          ABW
## 5 Africa Eastern and Southern                          AFE
## 6                 Afghanistan                          AFG
##              Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                     
## 3                                                       Indicator Name
## 4            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##             X       X.1       X.2       X.3       X.4
## 1                    NA        NA        NA        NA
## 2                    NA        NA        NA        NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income    0.4822    0.5312    0.5336    0.5615
## 5             1421.7752 1443.8112 1443.9357 1447.7204
## 6  Low income   26.6463   27.6431   28.6141   29.4601
df[1:10, ]
##                    Data.Source World.Development.Indicators
## 1            Last Updated Date                    1/28/2025
## 2                                                          
## 3                 Country Name                 Country Code
## 4                        Aruba                          ABW
## 5  Africa Eastern and Southern                          AFE
## 6                  Afghanistan                          AFG
## 7   Africa Western and Central                          AFW
## 8                       Angola                          AGO
## 9                      Albania                          ALB
## 10                     Andorra                          AND
##               Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1  Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                      
## 3                                                        Indicator Name
## 4             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 7             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 8             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 9             Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 10            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##                      X       X.1       X.2       X.3       X.4
## 1                             NA        NA        NA        NA
## 2                             NA        NA        NA        NA
## 3          IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4          High income    0.4822    0.5312    0.5336    0.5615
## 5                      1421.7752 1443.8112 1443.9357 1447.7204
## 6           Low income   26.6463   27.6431   28.6141   29.4601
## 7                       866.4966  886.6322  893.4701  906.0481
## 8  Lower middle income   61.6801   64.4090   67.2108   67.7008
## 9  Upper middle income    7.9674    8.3953    7.8120    7.6737
## 10         High income        NA        NA        NA        NA
##  3. Identify any outliers in the dataset.
# Convert columns to numeric if necessary
numeric_cols <- c("X.1", "X.2", "X.3", "X.4")
df[numeric_cols] <- lapply(df[numeric_cols], as.numeric)

# Boxplot to visualize outliers
boxplot(df[, numeric_cols], main = "Boxplot for Outlier Detection", col = "lightblue")

# Identify outliers using IQR
detect_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  return(which(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR)))
}

outliers <- lapply(df[numeric_cols], detect_outliers)
outliers
## $X.1
##  [1]   3   5  11  44  65  66  67  68  69  72  77  78  99 102 106 107 108 109 111
## [20] 113
## 
## $X.2
##  [1]   3   5  11  33  44  65  66  67  68  69  72  77  78  99 102 106 107 108 109
## [20] 111 113
## 
## $X.3
##  [1]   3   5  11  33  44  65  66  67  68  69  72  77  78  99 102 106 107 108 109
## [20] 111 113
## 
## $X.4
##  [1]   3   5  11  33  44  65  66  67  68  69  72  77  78  99 102 106 107 108 109
## [20] 110 111 113
##4. Detect missing data points.
# Count missing values
colSums(is.na(df))
##                                               Data.Source 
##                                                         0 
##                              World.Development.Indicators 
##                                                         0 
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. 
##                                                         0 
##                                                         X 
##                                                         0 
##                                                       X.1 
##                                                         6 
##                                                       X.2 
##                                                         6 
##                                                       X.3 
##                                                         6 
##                                                       X.4 
##                                                         6
# Visualize missing data
aggr(df, col=c('seagreen','orange'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##                                                   Variable      Count
##                                                        X.1 0.05309735
##                                                        X.2 0.05309735
##                                                        X.3 0.05309735
##                                                        X.4 0.05309735
##                                                Data.Source 0.00000000
##                               World.Development.Indicators 0.00000000
##  Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. 0.00000000
##                                                          X 0.00000000
##5. Explore relationships between variables through initial pattern analysis (e.g., correlation analysis).
# Compute correlation matrix for numeric columns
cor_matrix <- cor(df[, numeric_cols], use = "complete.obs")

# Visualize correlation
ggcorrplot(cor_matrix, method = "circle", type = "lower", lab = TRUE)

##6. Preprocess the data by handling missing values and outliers appropriately.
# Handling missing values by replacing with the median
df[numeric_cols] <- lapply(df[numeric_cols], function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  return(x)
})

# Handling outliers by capping at 1.5*IQR
cap_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  x[x < lower_bound] <- lower_bound
  x[x > upper_bound] <- upper_bound
  return(x)
}

# Identify where the actual numeric data starts
df_clean <- df[!is.na(as.numeric(df$X.1)), ]  # Keeps only rows where numeric values start

# Reset row indices
rownames(df_clean) <- NULL

# Select only numeric columns
numeric_cols <- names(df_clean)[sapply(df_clean, is.numeric)]

# Apply outlier capping only to numeric columns
df_clean[numeric_cols] <- lapply(df_clean[numeric_cols], cap_outliers)


##7. Ensure the dataset is clean and ready for further analysis.
# Set the third row as the column names
colnames(df) <- df[3, ]

# Remove the third row after setting it as the header
df <- df[-3, ]

# Deleting the first 2 rows since it has unnecessary data
df <- df[-c(1:2), ]

# Deleting the 3rd column since it has similar data
df <- df[, -3]

# Removing rows where column 3 has blank values
df <- df[df[, 3] != "" & !is.na(df[, 3]), ]

df <- na.omit(df)

# Check the structure of the cleaned data
str(df)
## 'data.frame':    90 obs. of  7 variables:
##  $ Country Name: chr  "Aruba" "Afghanistan" "Angola" "Albania" ...
##  $ Country Code: chr  "ABW" "AFG" "AGO" "ALB" ...
##  $ IncomeGroup : chr  "High income" "Low income" "Lower middle income" "Upper middle income" ...
##  $ 2020        : num  0.482 26.646 61.68 7.967 44.982 ...
##  $ 2021        : num  0.531 27.643 64.409 8.395 46.731 ...
##  $ 2022        : num  0.534 28.614 67.211 7.812 46.412 ...
##  $ 2023        : num  0.561 29.46 67.701 7.674 43.981 ...
###After performing all these steps, I got a clean data.
view(df)
#  Question 2 (15 points):

#  • Explain your findings from Question 1.
#  • What patterns did you observe?
#  • Did you detect any outliers or missing values? If yes, what steps did you take to handle them?
#  • Justify any modifications you made to the dataset.

#Solution:

#The dataset contains country-level greenhouse gas emissions across multiple years (2020-2023).

#The column names were corrected to reflect actual years instead of placeholder names like "X.1", "X.2", etc.
#Income groups were also present, which could be useful for categorizing emission levels.

#Outliers were detected using the IQR method.

#Certain countries had significantly high or low emission values compared to others.
#Outliers were capped at 1.5 × IQR to prevent extreme values from skewing analysis.
#Missing values were present in numerical columns (6 missing per year).

#These were replaced with the median to maintain dataset consistency without distorting overall trends.

#Metadata removal: The first few rows contained non-relevant text and were dropped.
#Renaming columns: The third row was used as column headers to improve readability.
#Outlier capping: Prevents extreme values from dominating analysis while preserving variability.
#Handling missing values: Replacing with the median ensures that data remains representative without artificially inflating or deflating values.
#Unnecessary columns removed: The third column was redundant and was dropped to avoid redundancy.
##Question 3 (20 points):
#  • Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across different countries.
#  • Investigate whether there is a significant relationship between IncomeGroup and total emissions.
#  • Use at least one set of appropriate visualizations to support your analysis.
#  • Apply statistical methods or models if necessary (e.g., correlation tests, regression analysis)

#Solution:

# Convert IncomeGroup to factor
df$IncomeGroup <- as.factor(df$IncomeGroup)

# Bar plot for total emissions by country (latest year: 2023)
ggplot(df, aes(x = reorder(`Country Name`, -`2023`), y = `2023`, fill = IncomeGroup)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Total Greenhouse Gas Emissions (2023)", 
       x = "Country", y = "Emissions (Mt CO₂e)") +
  theme_minimal()

# Boxplot for emissions by income group (2023)
ggplot(df, aes(x = IncomeGroup, y = `2023`, fill = IncomeGroup)) +
  geom_boxplot() +
  labs(title = "Greenhouse Gas Emissions by Income Group (2023)", 
       x = "Income Group", y = "Emissions (Mt CO₂e)") +
  theme_minimal()

# One-way ANOVA to test if IncomeGroup affects emissions
anova_result <- aov(`2023` ~ IncomeGroup, data = df)
summary(anova_result)
##             Df    Sum Sq Mean Sq F value Pr(>F)
## IncomeGroup  3   8509572 2836524   0.943  0.423
## Residuals   86 258569760 3006625
# Linear regression model: Income Group as predictor for emissions
lm_model <- lm(`2023` ~ IncomeGroup, data = df)
summary(lm_model)
## 
## Call:
## lm(formula = `2023` ~ IncomeGroup, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -823.7  -282.8  -112.0   -30.1 15120.1 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                      112.90     277.66   0.407    0.685
## IncomeGroupLow income            -77.24     641.22  -0.120    0.904
## IncomeGroupLower middle income   170.81     494.09   0.346    0.730
## IncomeGroupUpper middle income   710.95     449.85   1.580    0.118
## 
## Residual standard error: 1734 on 86 degrees of freedom
## Multiple R-squared:  0.03186,    Adjusted R-squared:  -0.001911 
## F-statistic: 0.9434 on 3 and 86 DF,  p-value: 0.4234
#Since p > 0.05, we fail to reject the null hypothesis, 
#meaning income group does not appear to have a strong effect on total emissions in this dataset.
#   Question 4 (10 points): Summary and Conclusion

#  • Briefly summarize the key steps you performed.

##1. Data Cleaning & Preprocessing:

#   Removed irrelevant metadata rows.
#   Set correct column headers for clarity.
#   Handled 6 missing values per year using median imputation.
#   Identified and capped outliers using the IQR method to prevent skewed analysis.
#   Removed redundant columns for a streamlined dataset.

##2. Modifications & Justifications:
#   Metadata & Redundant Column Removal: Improved data structure.
#   Column Renaming: Replaced placeholders with actual years for clarity.
#   Outlier Handling: Prevented distortion while preserving trends.
#   Missing Value Treatment: Used median to ensure data consistency.


#  • Present your main findings and insights briefly.

#   The dataset provides valuable insights into global greenhouse gas emissions over the past four years.
#   Emissions generally increased across most countries, with some variations.
#   Income group classification does not directly correlate with emission levels, but middle-income countries show high variability.
#   Outliers and missing data were handled effectively to ensure accurate analysis.
#   Question 5 (New) (10 points):
  
#To prepare for predictive modeling, perform the following:
# 1. Partition the dataset into training (70%) and validation (30%) sets.
# 2. Use set.seed() before partitioning to ensure reproducibility.
# use set.seed() to get the same partitions when re-running the R code.
set.seed(2505)
train.rows <- sample(rownames(df), dim(df)[1]*0.7) 
# collect all the columns with training row ID into training set: 
train.data <- df[train.rows, ] 
# assign row IDs that are not already in the training set, into validation 
valid.rows <- setdiff(rownames(df), train.rows) 
valid.data <- df[valid.rows, ]
valid.data
##              Country Name Country Code         IncomeGroup      2020      2021
## 8                  Angola          AGO Lower middle income   61.6801   64.4090
## 10                Andorra          AND         High income   44.9823   46.7315
## 12   United Arab Emirates          ARE         High income  249.3237  256.9649
## 14                Armenia          ARM Upper middle income    9.9936   10.6714
## 16    Antigua and Barbuda          ATG         High income    0.3413    0.3710
## 18                Austria          AUT         High income   77.2734   80.7242
## 19             Azerbaijan          AZE Upper middle income   54.7245   56.9862
## 22                  Benin          BEN Lower middle income   17.6528   16.0186
## 27           Bahamas, The          BHS         High income    1.7910    1.9488
## 32                Bolivia          BOL Lower middle income   48.0161   52.1085
## 36                 Bhutan          BTN Lower middle income    3.0652    3.1870
## 41            Switzerland          CHE         High income   44.9823   46.7315
## 43                  Chile          CHL         High income  124.0909  125.2908
## 45          Cote d'Ivoire          CIV Lower middle income   29.5006   31.5286
## 46               Cameroon          CMR Lower middle income   38.7752   38.8883
## 54                   Cuba          CUB Upper middle income   39.5182   38.8494
## 59                Germany          DEU         High income  749.7997  783.4886
## 61               Dominica          DMA Upper middle income    0.1347    0.1423
## 62                Denmark          DNK         High income   43.5573   44.9410
## 71       Egypt, Arab Rep.          EGY Lower middle income  306.7631  332.3060
## 83  Micronesia, Fed. Sts.          FSM Lower middle income    0.0473    0.0479
## 87                  Ghana          GHA Lower middle income   45.3548   46.9903
## 88              Gibraltar          GIB         High income    0.6622    0.6696
## 89                 Guinea          GIN Lower middle income   26.3328   26.9913
## 91          Guinea-Bissau          GNB          Low income    2.8916    2.9301
## 98                 Guyana          GUY         High income    7.3657    7.3580
## 101              Honduras          HND Lower middle income   20.3316   21.4947
## 113                 India          IND Lower middle income 3433.6190 3679.8618
##          2022      2023
## 8     67.2108   67.7008
## 10    46.4124   43.9811
## 12   267.6329  267.8232
## 14    10.3645   10.8363
## 16     0.3723    0.3886
## 18    75.4085   72.9215
## 19    59.0263   62.5503
## 22    16.6150   16.6995
## 27     1.9575    2.0503
## 32    53.5025   55.1857
## 36     3.2176    3.2549
## 41    43.5627   43.4464
## 43   125.3880  121.4631
## 45    32.1687   32.1840
## 46    39.4892   39.3772
## 54    38.2368   39.4003
## 59   761.9835  681.8103
## 61     0.1426    0.1469
## 62    43.2261   41.8315
## 71   333.3439  335.9680
## 83     0.0487    0.0492
## 87    48.2049   48.2659
## 88     0.6984    0.7124
## 89    27.9610   28.6347
## 91     2.9991    2.9975
## 98     8.2852    8.1910
## 101   22.1065   22.9201
## 113 3897.2090 4133.5544
##The link for the conversation in AI is being shared here:
# https://chatgpt.com/share/67bc8080-5f4c-8010-9f45-c472c8abf2f7