library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.2
library(ggthemes)
library("e1071")
## Warning: package 'e1071' was built under R version 4.4.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Loading DATA SET

#Question 1 (20 points):
###Loading DATA SET

df <- read.csv("GGE.csv")
head(df)
##                   Data.Source World.Development.Indicators
## 1           Last Updated Date                    1/28/2025
## 2                                                         
## 3                Country Name                 Country Code
## 4                       Aruba                          ABW
## 5 Africa Eastern and Southern                          AFE
## 6                 Afghanistan                          AFG
##              Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                     
## 3                                                       Indicator Name
## 4            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##             X       X.1       X.2       X.3       X.4
## 1                    NA        NA        NA        NA
## 2                    NA        NA        NA        NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income    0.4822    0.5312    0.5336    0.5615
## 5             1421.7752 1443.8112 1443.9357 1447.7204
## 6  Low income   26.6463   27.6431   28.6141   29.4601

###Summarize the dataset

summary(df)
##  Data.Source        World.Development.Indicators
##  Length:113         Length:113                  
##  Class :character   Class :character            
##  Mode  :character   Mode  :character            
##                                                 
##                                                 
##                                                 
##                                                 
##  Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.      X            
##  Length:113                                                Length:113        
##  Class :character                                          Class :character  
##  Mode  :character                                          Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##       X.1                X.2                X.3                X.4          
##  Min.   :    0.01   Min.   :    0.01   Min.   :    0.01   Min.   :    0.01  
##  1st Qu.:    9.33   1st Qu.:    9.12   1st Qu.:    9.18   1st Qu.:    9.26  
##  Median :   44.98   Median :   46.73   Median :   46.41   Median :   43.98  
##  Mean   : 1786.64   Mean   : 1866.39   Mean   : 1877.21   Mean   : 1920.44  
##  3rd Qu.:  492.09   3rd Qu.:  505.11   3rd Qu.:  492.56   3rd Qu.:  478.68  
##  Max.   :33904.52   Max.   :35504.53   Max.   :35892.84   Max.   :37161.64  
##  NA's   :6          NA's   :6          NA's   :6          NA's   :6

###Display the first 6 and last 10 rows.

head(df,6)
##                   Data.Source World.Development.Indicators
## 1           Last Updated Date                    1/28/2025
## 2                                                         
## 3                Country Name                 Country Code
## 4                       Aruba                          ABW
## 5 Africa Eastern and Southern                          AFE
## 6                 Afghanistan                          AFG
##              Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2                                                                     
## 3                                                       Indicator Name
## 4            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6            Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##             X       X.1       X.2       X.3       X.4
## 1                    NA        NA        NA        NA
## 2                    NA        NA        NA        NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income    0.4822    0.5312    0.5336    0.5615
## 5             1421.7752 1443.8112 1443.9357 1447.7204
## 6  Low income   26.6463   27.6431   28.6141   29.4601
tail(df,10)
##          Data.Source World.Development.Indicators
## 104            Haiti                          HTI
## 105          Hungary                          HUN
## 106        IBRD only                          IBD
## 107 IDA & IBRD total                          IBT
## 108        IDA total                          IDA
## 109        IDA blend                          IDB
## 110        Indonesia                          IDN
## 111         IDA only                          IDX
## 112      Isle of Man                          IMN
## 113            India                          IND
##     Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 104 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 105 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 106 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 107 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 108 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 109 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 110 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 111 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 112 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 113 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##                       X        X.1        X.2        X.3        X.4
## 104 Lower middle income    13.5011    13.4188    13.5126    13.6570
## 105         High income    65.8413    67.1101    64.2695    60.9268
## 106                     30773.4680 32278.6559 32645.8080 33886.6317
## 107                     33904.5157 35504.5258 35892.8384 37161.6368
## 108                      3131.0477  3225.8699  3247.0304  3275.0051
## 109                      1328.2953  1365.0507  1354.6398  1359.2719
## 110 Upper middle income  1050.3392  1077.0778  1152.7260  1200.1998
## 111                      1802.7524  1860.8192  1892.3906  1915.7332
## 112         High income         NA         NA         NA         NA
## 113 Lower middle income  3433.6190  3679.8618  3897.2090  4133.5544

###Removing Unnesscary Space

df <- df[-c(1:3),]
head(df)
##                   Data.Source World.Development.Indicators
## 4                       Aruba                          ABW
## 5 Africa Eastern and Southern                          AFE
## 6                 Afghanistan                          AFG
## 7  Africa Western and Central                          AFW
## 8                      Angola                          AGO
## 9                     Albania                          ALB
##   Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.                   X
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)         High income
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)                    
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)          Low income
## 7 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)                    
## 8 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) Lower middle income
## 9 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) Upper middle income
##         X.1       X.2       X.3       X.4
## 4    0.4822    0.5312    0.5336    0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6   26.6463   27.6431   28.6141   29.4601
## 7  866.4966  886.6322  893.4701  906.0481
## 8   61.6801   64.4090   67.2108   67.7008
## 9    7.9674    8.3953    7.8120    7.6737

###Renaming Coloums for Better Analysing & View AfterMath Structure

colnames(df)[1:8]<- c("Countries","Country Code","Indicator 
                      Group","Income Group","2020","2021",
                      "2022","2023")
colnames(df)
## [1] "Countries"                              
## [2] "Country Code"                           
## [3] "Indicator \n                      Group"
## [4] "Income Group"                           
## [5] "2020"                                   
## [6] "2021"                                   
## [7] "2022"                                   
## [8] "2023"

###Detect missing data points ###BOX Plot(Console OutPut) to Identify NA’s

##Log Function for better Visualization of Box Plot

log_data <- log(df[, c("2020", "2021", "2022", "2023")])
log_data<- log_data %>%
  pivot_longer(cols = c("2020", "2021", "2022", "2023"), 
               names_to = "Year", values_to = "Emissions")

##BOX Plot(Console OutPut) to Identify NA’s

ggplot(log_data, aes(x = Year, y = Emissions)) + 
  geom_boxplot(outlier.colour = "red", color = "black") +  # Adding outliers and color
  ggtitle("Boxplot of Greenhouse Gas Emissions (2020-2023)") +
  xlab("Year") + 
  ylab(" Greenhouse Gas Emissions ") +
  theme_economist() +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ylim(-5, 20)  
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

###or

df_na <- colSums(is.na(df))
df_na[df_na > 0]
## 2020 2021 2022 2023 
##    4    4    4    4

##NA Values Detected(In Console)

##Filling NA Values ###what to use to replace Null Values, MEAN OR MEDIAN & WHY ? ###“Mean” When your data is roughly normally distributed. ###“Median” When your data is skewed or contains outliers. ### FINDING DISTRIBUTION OF DATA(FOR FILLING NA) -

ggplot(df, aes(x = `2020`)) + 
  geom_density() +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).

###Unable To Showcase Distribution hence using Log Function,To Fill NA Values

###Using ggplot and Log To Identify Year - ‘2021’ Data Distribution

ggplot(df, aes(x = log(`2021`))) + 
  geom_density()+ggtitle('Data Distribution of Year 2021')+
  xlab('Data Distribution of Year 2021') +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).

###Using ggplot and Log To Identify Year - ‘2022’ Data Distribution

ggplot(df, aes(x = log(`2022`))) + 
  geom_density()+ggtitle('Data Distribution of Year 2022')+
  xlab('Data Distribution of Year 2022') +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).

###Using ggplot and Log To Identify Year - ‘2023’ Data Distribution

ggplot(df, aes(x = log(`2023`))) + 
  geom_density()+ggtitle('Data Distribution of Year 2023')+
  xlab('Data Distribution of Year 2023') +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).

##RESULT - Highly Right Skewed Data , Hence MEDIAN To Fill NA

###filling Nulls Values with Median , as we have a highly skewed data

df$`2020`[is.na(df$`2020`)] <- median(df$`2020`, na.rm = TRUE)
df$`2021`[is.na(df$`2021`)] <- median(df$`2021`, na.rm = TRUE)
df$`2022`[is.na(df$`2022`)] <- median(df$`2022`, na.rm = TRUE)
df$`2023`[is.na(df$`2023`)] <- median(df$`2023`, na.rm = TRUE)

###Recheck NA

df_na <- colSums(is.na(df))
df_na[df_na > 0]
## named numeric(0)

RESULT - NA’s Filled with Median

###Identify any outliers in the dataset. ###Using Box Plot to identify Outliers

log_data <- log(df[, c("2020", "2021", "2022", "2023")])

log_data<- log_data %>%
  pivot_longer(cols = c("2020", "2021", "2022", "2023"), 
               names_to = "Year", values_to = "Emissions")
 
ggplot(log_data, aes(x = Year, y = Emissions)) + 
  geom_boxplot(outlier.colour = "red", color = "black") +  # Adding outliers and color
  ggtitle("Finding Outliers") +
  xlab("Year") + 
  ylab("Greenhouse Gas Emissions") +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ylim(-5, 20) 

##RESULT - Outliers Observed In Every Year

###Explore relationships between variables through initial pattern analysis ##Cor Matrix

df_selected_cor = df[,c("2020","2021","2022","2023")]
cor_set = cor(df_selected_cor)
print(cor_set)
##           2020      2021      2022      2023
## 2020 1.0000000 0.9999878 0.9999217 0.9996506
## 2021 0.9999878 1.0000000 0.9999547 0.9996613
## 2022 0.9999217 0.9999547 1.0000000 0.9997853
## 2023 0.9996506 0.9996613 0.9997853 1.0000000

###correlation plot

ggpairs(cor_set) + 
  theme_economist() + 
  theme(panel.grid = element_blank()) + 
  theme(axis.text.x = element_text(size = 8)) 

##RESULT - High Cor Visible Across Years

###Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across ###different countries. As we have 4 years of data (2020,2021,2022,2023) ###We will take avg of 4 years towards each country using Mean Function

df_avg <- df
df_avg$Average_Emissions <- rowMeans(df[, c("2020", "2021", "2022", "2023")], na.rm = TRUE)

##Ploting Green House Gas Emission(Mean) by Country

ggplot(df_avg, aes(x = reorder(Countries, -Average_Emissions), y = Average_Emissions)) + 
  geom_bar(stat = "identity") +
  xlab("Countries") +
  ylab("Megatonnes of Carbon Dioxide") +
  ggtitle("AVG Greenhouse Gas Emissions for 2020-2023") +
  theme_economist()+
  theme(axis.text.x = element_text(angle = 90, size = 6))

##Top 10 & Least 10 Emitting Countries(Calculations)

df_sorted <- df_avg[order(df_avg$Average_Emissions, decreasing = TRUE), ]
top_10_countries <- head(df_sorted, 10)
least_10_countries <- tail(df_sorted, 10)
print(top_10_countries)
##                                       Countries Country Code
## 107                            IDA & IBRD total          IBT
## 106                                   IBRD only          IBD
## 67                          East Asia & Pacific          EAS
## 65  East Asia & Pacific (excluding high income)          EAP
## 99                                  High income          HIC
## 44                                        China          CHN
## 66                   Early-demographic dividend          EAR
## 69                        Europe & Central Asia          ECS
## 113                                       India          IND
## 77                               European Union          EUU
##                       Indicator \n                      Group
## 107 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 106 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 67  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 65  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 99  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 44  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 66  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 69  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 113 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 77  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##            Income Group      2020      2021      2022      2023
## 107                     33904.516 35504.526 35892.838 37161.637
## 106                     30773.468 32278.656 32645.808 33886.632
## 67                      20045.569 20778.963 20726.915 21578.194
## 65                      17384.746 18086.111 18164.730 19088.032
## 99                      17104.667 17893.354 17853.672 17492.897
## 44  Upper middle income 14497.899 15175.619 15159.642 15943.987
## 66                      11519.574 12110.700 12490.736 12885.504
## 69                       8030.991  8470.215  8346.704  8114.897
## 113 Lower middle income  3433.619  3679.862  3897.209  4133.554
## 77                       3388.279  3577.018  3482.311  3221.795
##     Average_Emissions
## 107         35615.879
## 106         32396.141
## 67          20782.410
## 65          18180.905
## 99          17586.147
## 44          15194.287
## 66          12251.628
## 69           8240.702
## 113          3786.061
## 77           3417.351
print(least_10_countries)
##                Countries Country Code
## 4                  Aruba          ABW
## 16   Antigua and Barbuda          ATG
## 56        Cayman Islands          CYM
## 31               Bermuda          BMU
## 94               Grenada          GRD
## 61              Dominica          DMA
## 82         Faroe Islands          FRO
## 83 Micronesia, Fed. Sts.          FSM
## 97                  Guam          GUM
## 15        American Samoa          ASM
##                      Indicator \n                      Group
## 4  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 16 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 56 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 31 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 94 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 61 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 82 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 83 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 97 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 15 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##           Income Group   2020   2021   2022   2023 Average_Emissions
## 4          High income 0.4822 0.5312 0.5336 0.5615          0.527125
## 16         High income 0.3413 0.3710 0.3723 0.3886          0.368300
## 56         High income 0.3358 0.3692 0.3710 0.3901          0.366525
## 31         High income 0.3441 0.3596 0.3606 0.3788          0.360775
## 94 Upper middle income 0.1772 0.1910 0.1923 0.2004          0.190225
## 61 Upper middle income 0.1347 0.1423 0.1426 0.1469          0.141625
## 82         High income 0.0507 0.0510 0.0506 0.0507          0.050750
## 83 Lower middle income 0.0473 0.0479 0.0487 0.0492          0.048275
## 97         High income 0.0151 0.0151 0.0152 0.0154          0.015200
## 15         High income 0.0085 0.0085 0.0085 0.0085          0.008500

##Plot of Top 10 Emitting Countries

ggplot(top_10_countries, aes(x = reorder(Countries, -Average_Emissions), y = Average_Emissions)) + 
  geom_bar(stat = "identity") + xlab("Countries or Regions") +ylab("Megatonnes of Carbon Dioxide") +
  ggtitle("Top 10 Emitting Countries or Regions (2020-2023)") +
  theme_economist() +theme(axis.text.x = element_text(angle = 90, size = 7))

##Plot of Least 10 Emitting Countries

ggplot(least_10_countries, aes(x = reorder(Countries , -Average_Emissions), y = Average_Emissions)) + 
  geom_bar(stat = "identity") +  xlab("Countries") +ylab("Megatonnes of Carbon Dioxide") +
  ggtitle("Least 10 Emitting Countries or Regions (2020-2023)") +
  theme_economist() +theme(axis.text.x = element_text(angle = 90, size = 7))

#Question Investigating whether there is a relationship between IncomeGroup and total emissions.

##Using Bar Plot to observe relationship

df_avg$`Income Group` <- factor(df_avg$`Income Group`)
df_avg <- na.omit(df_avg)

df_avg$IncomeGroup <- factor(df_avg$`Income Group`, levels = c("Low income", "Medium income", "High income"))
df_avg <- na.omit(df_avg)

ggplot(df_avg, aes(x = IncomeGroup, y = Average_Emissions, fill = IncomeGroup)) +
  geom_bar(stat = "identity") +
  xlab("Income Group") +
  ylab("Average Emissions(Mega Ton)") +
  ggtitle("Greenhouse Gas Emissions by Income Group") +
  theme_economist() +
  theme(axis.text.x = element_text(angle = 360, hjust = 1))

##Using Density Plot to observe relationship

ggplot(df_avg, aes(x = Average_Emissions, fill = IncomeGroup)) +
  geom_density(alpha = 0.6) +  # Smooth density curve
  labs(title = "Density Plot of  Emissions by Income Group", 
       x = "Emissions in Mega Ton", 
       y = "Density of Countries") + 
  theme_economist()

##Using Interactive Scatter Plot to observe relationship

log_data2 <- log(df_avg[, c("2020", "2021", "2022", "2023","Average_Emissions")]) 
df_avg <- cbind(df_avg, `Log Avg Emissions` = log_data2$Average_Emissions)
head(df_avg)
##               Countries Country Code
## 4                 Aruba          ABW
## 6           Afghanistan          AFG
## 10              Andorra          AND
## 12 United Arab Emirates          ARE
## 15       American Samoa          ASM
## 16  Antigua and Barbuda          ATG
##                      Indicator \n                      Group Income Group
## 4  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)  High income
## 6  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)   Low income
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)  High income
## 12 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)  High income
## 15 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)  High income
## 16 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)  High income
##        2020      2021      2022     2023 Average_Emissions IncomeGroup
## 4    0.4822   0.53120   0.53360   0.5615          0.527125 High income
## 6   26.6463  27.64310  28.61410  29.4601         28.090900  Low income
## 10  44.2698  46.46975  44.98755  43.7173         44.861100 High income
## 12 249.3237 256.96490 267.63290 267.8232        260.436175 High income
## 15   0.0085   0.00850   0.00850   0.0085          0.008500 High income
## 16   0.3413   0.37100   0.37230   0.3886          0.368300 High income
##    Log Avg Emissions
## 4         -0.6403176
## 6          3.3354457
## 10         3.8035710
## 12         5.5623578
## 15        -4.7676891
## 16        -0.9988575
ggplot_EmissionLevels <- ggplot(df_avg, aes(x = Countries, y = `Log Avg Emissions`, colour = `Income Group`)) + 
   geom_jitter() + 
   xlab("Countries") + 
   ylab("Emission Levels") +
   theme_economist() +  
   theme(axis.text.x = element_blank())

interactive_plot <- ggplotly(ggplot_EmissionLevels, tooltip = c( "color", "Countries"))
interactive_plot

##RESULT - High Relation Observed Between High Come and High Emission across 3 Plots

###Partitioning the dataset into training (70%) and validation (30%) sets. ###Use set.seed() before partitioning to ensure reproducibility

set.seed(007)
n <- nrow(df_avg)
train_index <- sample(1:n, size = 0.7 * n)
train_set <- df[train_index, ]
validation_set <- df[-train_index, ]

###view Train Set

head(train_set)
##                      Countries Country Code
## 45               Cote d'Ivoire          CIV
## 22                       Benin          BEN
## 34                    Barbados          BRB
## 31                     Bermuda          BMU
## 42             Channel Islands          CHI
## 5  Africa Eastern and Southern          AFE
##                      Indicator \n                      Group
## 45 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 22 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 34 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 31 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 42 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##           Income Group      2020       2021       2022      2023
## 45 Lower middle income   29.5006   31.52860   32.16870   32.1840
## 22 Lower middle income   17.6528   16.01860   16.61500   16.6995
## 34         High income    0.8647    0.94940    0.95180    0.9890
## 31         High income    0.3441    0.35960    0.36060    0.3788
## 42         High income   44.2698   46.46975   44.98755   43.7173
## 5                      1421.7752 1443.81120 1443.93570 1447.7204

###View Validation Set

head(validation_set)
##               Countries Country Code
## 4                 Aruba          ABW
## 8                Angola          AGO
## 10              Andorra          AND
## 12 United Arab Emirates          ARE
## 13            Argentina          ARG
## 16  Antigua and Barbuda          ATG
##                      Indicator \n                      Group
## 4  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 8  Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 12 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 13 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 16 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
##           Income Group     2020      2021      2022     2023
## 4          High income   0.4822   0.53120   0.53360   0.5615
## 8  Lower middle income  61.6801  64.40900  67.21080  67.7008
## 10         High income  44.2698  46.46975  44.98755  43.7173
## 12         High income 249.3237 256.96490 267.63290 267.8232
## 13 Upper middle income 347.3124 367.61400 374.76130 365.6846
## 16         High income   0.3413   0.37100   0.37230   0.3886

#Summary -

###1.Loaded and cleaned the dataset, filling missing values with the median due to skewed data. ###2.Conducted exploratory data analysis (EDA) using density and box plots to understand emissions distributions and outliers. ###3.Performed correlation analysis between emission years and visualized relationships with ggpairs() and heatmaps. ###4.Calculated and visualized average emissions by country, highlighting top and least emitters. ###5.Investigated emissions by income group using bar plots and density plots. ###6.Applied log transformation, created interactive visualizations, and split the data into training and validation sets for further analysis

#Key Insights and Findings ###1.Emissions data exhibited a strong right skew, which was normalized using log transformation. ###2.A strong correlation was observed between emissions across the years 2020-2023, visualized via a heatmap and ggpairs. ###3.The top 10 and least 10 emitting countries were identified, highlighting significant contributors to global emissions. ###4.A positive relationship between income group and emissions was found, with higher-income countries emitting more greenhouse gases. ###5.Log transformation improved the normality of the emissions data, making it more suitable for analysis. ###6.Interactive plots were created to allow dynamic exploration of emissions data. ###7.The dataset was split into training and validation sets for future analysis or modeling.

##Thanks