library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.2
library(ggthemes)
library("e1071")
## Warning: package 'e1071' was built under R version 4.4.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Loading DATA SET
#Question 1 (20 points):
###Loading DATA SET
df <- read.csv("GGE.csv")
head(df)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
###Summarize the dataset
summary(df)
## Data.Source World.Development.Indicators
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. X
## Length:113 Length:113
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## X.1 X.2 X.3 X.4
## Min. : 0.01 Min. : 0.01 Min. : 0.01 Min. : 0.01
## 1st Qu.: 9.33 1st Qu.: 9.12 1st Qu.: 9.18 1st Qu.: 9.26
## Median : 44.98 Median : 46.73 Median : 46.41 Median : 43.98
## Mean : 1786.64 Mean : 1866.39 Mean : 1877.21 Mean : 1920.44
## 3rd Qu.: 492.09 3rd Qu.: 505.11 3rd Qu.: 492.56 3rd Qu.: 478.68
## Max. :33904.52 Max. :35504.53 Max. :35892.84 Max. :37161.64
## NA's :6 NA's :6 NA's :6 NA's :6
###Display the first 6 and last 10 rows.
head(df,6)
## Data.Source World.Development.Indicators
## 1 Last Updated Date 1/28/2025
## 2
## 3 Country Name Country Code
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 1 Data from: https://data.worldbank.org/indicator/EN.GHG.ALL.MT.CE.AR5
## 2
## 3 Indicator Name
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 IncomeGroup 2020.0000 2021.0000 2022.0000 2023.0000
## 4 High income 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 Low income 26.6463 27.6431 28.6141 29.4601
tail(df,10)
## Data.Source World.Development.Indicators
## 104 Haiti HTI
## 105 Hungary HUN
## 106 IBRD only IBD
## 107 IDA & IBRD total IBT
## 108 IDA total IDA
## 109 IDA blend IDB
## 110 Indonesia IDN
## 111 IDA only IDX
## 112 Isle of Man IMN
## 113 India IND
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e.
## 104 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 105 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 106 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 107 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 108 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 109 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 110 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 111 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 112 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 113 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## X X.1 X.2 X.3 X.4
## 104 Lower middle income 13.5011 13.4188 13.5126 13.6570
## 105 High income 65.8413 67.1101 64.2695 60.9268
## 106 30773.4680 32278.6559 32645.8080 33886.6317
## 107 33904.5157 35504.5258 35892.8384 37161.6368
## 108 3131.0477 3225.8699 3247.0304 3275.0051
## 109 1328.2953 1365.0507 1354.6398 1359.2719
## 110 Upper middle income 1050.3392 1077.0778 1152.7260 1200.1998
## 111 1802.7524 1860.8192 1892.3906 1915.7332
## 112 High income NA NA NA NA
## 113 Lower middle income 3433.6190 3679.8618 3897.2090 4133.5544
###Removing Unnesscary Space
df <- df[-c(1:3),]
head(df)
## Data.Source World.Development.Indicators
## 4 Aruba ABW
## 5 Africa Eastern and Southern AFE
## 6 Afghanistan AFG
## 7 Africa Western and Central AFW
## 8 Angola AGO
## 9 Albania ALB
## Total.greenhouse.gas.emissions.excluding.LULUCF..Mt.CO2e. X
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) High income
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) Low income
## 7 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 8 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) Lower middle income
## 9 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) Upper middle income
## X.1 X.2 X.3 X.4
## 4 0.4822 0.5312 0.5336 0.5615
## 5 1421.7752 1443.8112 1443.9357 1447.7204
## 6 26.6463 27.6431 28.6141 29.4601
## 7 866.4966 886.6322 893.4701 906.0481
## 8 61.6801 64.4090 67.2108 67.7008
## 9 7.9674 8.3953 7.8120 7.6737
###Renaming Coloums for Better Analysing & View AfterMath Structure
colnames(df)[1:8]<- c("Countries","Country Code","Indicator
Group","Income Group","2020","2021",
"2022","2023")
colnames(df)
## [1] "Countries"
## [2] "Country Code"
## [3] "Indicator \n Group"
## [4] "Income Group"
## [5] "2020"
## [6] "2021"
## [7] "2022"
## [8] "2023"
###Detect missing data points ###BOX Plot(Console OutPut) to Identify NA’s
##Log Function for better Visualization of Box Plot
log_data <- log(df[, c("2020", "2021", "2022", "2023")])
log_data<- log_data %>%
pivot_longer(cols = c("2020", "2021", "2022", "2023"),
names_to = "Year", values_to = "Emissions")
##BOX Plot(Console OutPut) to Identify NA’s
ggplot(log_data, aes(x = Year, y = Emissions)) +
geom_boxplot(outlier.colour = "red", color = "black") + # Adding outliers and color
ggtitle("Boxplot of Greenhouse Gas Emissions (2020-2023)") +
xlab("Year") +
ylab(" Greenhouse Gas Emissions ") +
theme_economist() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ylim(-5, 20)
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
###or
df_na <- colSums(is.na(df))
df_na[df_na > 0]
## 2020 2021 2022 2023
## 4 4 4 4
##NA Values Detected(In Console)
##Filling NA Values ###what to use to replace Null Values, MEAN OR MEDIAN & WHY ? ###“Mean” When your data is roughly normally distributed. ###“Median” When your data is skewed or contains outliers. ### FINDING DISTRIBUTION OF DATA(FOR FILLING NA) -
ggplot(df, aes(x = `2020`)) +
geom_density() +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
###Unable To Showcase Distribution hence using Log Function,To Fill NA
Values
###Using ggplot and Log To Identify Year - ‘2021’ Data Distribution
ggplot(df, aes(x = log(`2021`))) +
geom_density()+ggtitle('Data Distribution of Year 2021')+
xlab('Data Distribution of Year 2021') +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
###Using ggplot and Log To Identify Year - ‘2022’ Data Distribution
ggplot(df, aes(x = log(`2022`))) +
geom_density()+ggtitle('Data Distribution of Year 2022')+
xlab('Data Distribution of Year 2022') +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
###Using ggplot and Log To Identify Year - ‘2023’ Data Distribution
ggplot(df, aes(x = log(`2023`))) +
geom_density()+ggtitle('Data Distribution of Year 2023')+
xlab('Data Distribution of Year 2023') +theme_economist()
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
##RESULT - Highly Right Skewed Data , Hence MEDIAN To Fill NA
###filling Nulls Values with Median , as we have a highly skewed data
df$`2020`[is.na(df$`2020`)] <- median(df$`2020`, na.rm = TRUE)
df$`2021`[is.na(df$`2021`)] <- median(df$`2021`, na.rm = TRUE)
df$`2022`[is.na(df$`2022`)] <- median(df$`2022`, na.rm = TRUE)
df$`2023`[is.na(df$`2023`)] <- median(df$`2023`, na.rm = TRUE)
###Recheck NA
df_na <- colSums(is.na(df))
df_na[df_na > 0]
## named numeric(0)
###Identify any outliers in the dataset. ###Using Box Plot to identify Outliers
log_data <- log(df[, c("2020", "2021", "2022", "2023")])
log_data<- log_data %>%
pivot_longer(cols = c("2020", "2021", "2022", "2023"),
names_to = "Year", values_to = "Emissions")
ggplot(log_data, aes(x = Year, y = Emissions)) +
geom_boxplot(outlier.colour = "red", color = "black") + # Adding outliers and color
ggtitle("Finding Outliers") +
xlab("Year") +
ylab("Greenhouse Gas Emissions") +
theme_economist() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ylim(-5, 20)
##RESULT - Outliers Observed In Every Year
###Explore relationships between variables through initial pattern analysis ##Cor Matrix
df_selected_cor = df[,c("2020","2021","2022","2023")]
cor_set = cor(df_selected_cor)
print(cor_set)
## 2020 2021 2022 2023
## 2020 1.0000000 0.9999878 0.9999217 0.9996506
## 2021 0.9999878 1.0000000 0.9999547 0.9996613
## 2022 0.9999217 0.9999547 1.0000000 0.9997853
## 2023 0.9996506 0.9996613 0.9997853 1.0000000
###correlation plot
ggpairs(cor_set) +
theme_economist() +
theme(panel.grid = element_blank()) +
theme(axis.text.x = element_text(size = 8))
##RESULT - High Cor Visible Across Years
###Analyze the total greenhouse gas emissions excluding LULUCF (Mt CO2e) across ###different countries. As we have 4 years of data (2020,2021,2022,2023) ###We will take avg of 4 years towards each country using Mean Function
df_avg <- df
df_avg$Average_Emissions <- rowMeans(df[, c("2020", "2021", "2022", "2023")], na.rm = TRUE)
##Ploting Green House Gas Emission(Mean) by Country
ggplot(df_avg, aes(x = reorder(Countries, -Average_Emissions), y = Average_Emissions)) +
geom_bar(stat = "identity") +
xlab("Countries") +
ylab("Megatonnes of Carbon Dioxide") +
ggtitle("AVG Greenhouse Gas Emissions for 2020-2023") +
theme_economist()+
theme(axis.text.x = element_text(angle = 90, size = 6))
##Top 10 & Least 10 Emitting Countries(Calculations)
df_sorted <- df_avg[order(df_avg$Average_Emissions, decreasing = TRUE), ]
top_10_countries <- head(df_sorted, 10)
least_10_countries <- tail(df_sorted, 10)
print(top_10_countries)
## Countries Country Code
## 107 IDA & IBRD total IBT
## 106 IBRD only IBD
## 67 East Asia & Pacific EAS
## 65 East Asia & Pacific (excluding high income) EAP
## 99 High income HIC
## 44 China CHN
## 66 Early-demographic dividend EAR
## 69 Europe & Central Asia ECS
## 113 India IND
## 77 European Union EUU
## Indicator \n Group
## 107 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 106 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 67 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 65 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 99 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 44 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 66 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 69 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 113 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 77 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## Income Group 2020 2021 2022 2023
## 107 33904.516 35504.526 35892.838 37161.637
## 106 30773.468 32278.656 32645.808 33886.632
## 67 20045.569 20778.963 20726.915 21578.194
## 65 17384.746 18086.111 18164.730 19088.032
## 99 17104.667 17893.354 17853.672 17492.897
## 44 Upper middle income 14497.899 15175.619 15159.642 15943.987
## 66 11519.574 12110.700 12490.736 12885.504
## 69 8030.991 8470.215 8346.704 8114.897
## 113 Lower middle income 3433.619 3679.862 3897.209 4133.554
## 77 3388.279 3577.018 3482.311 3221.795
## Average_Emissions
## 107 35615.879
## 106 32396.141
## 67 20782.410
## 65 18180.905
## 99 17586.147
## 44 15194.287
## 66 12251.628
## 69 8240.702
## 113 3786.061
## 77 3417.351
print(least_10_countries)
## Countries Country Code
## 4 Aruba ABW
## 16 Antigua and Barbuda ATG
## 56 Cayman Islands CYM
## 31 Bermuda BMU
## 94 Grenada GRD
## 61 Dominica DMA
## 82 Faroe Islands FRO
## 83 Micronesia, Fed. Sts. FSM
## 97 Guam GUM
## 15 American Samoa ASM
## Indicator \n Group
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 16 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 56 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 31 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 94 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 61 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 82 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 83 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 97 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 15 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## Income Group 2020 2021 2022 2023 Average_Emissions
## 4 High income 0.4822 0.5312 0.5336 0.5615 0.527125
## 16 High income 0.3413 0.3710 0.3723 0.3886 0.368300
## 56 High income 0.3358 0.3692 0.3710 0.3901 0.366525
## 31 High income 0.3441 0.3596 0.3606 0.3788 0.360775
## 94 Upper middle income 0.1772 0.1910 0.1923 0.2004 0.190225
## 61 Upper middle income 0.1347 0.1423 0.1426 0.1469 0.141625
## 82 High income 0.0507 0.0510 0.0506 0.0507 0.050750
## 83 Lower middle income 0.0473 0.0479 0.0487 0.0492 0.048275
## 97 High income 0.0151 0.0151 0.0152 0.0154 0.015200
## 15 High income 0.0085 0.0085 0.0085 0.0085 0.008500
##Plot of Top 10 Emitting Countries
ggplot(top_10_countries, aes(x = reorder(Countries, -Average_Emissions), y = Average_Emissions)) +
geom_bar(stat = "identity") + xlab("Countries or Regions") +ylab("Megatonnes of Carbon Dioxide") +
ggtitle("Top 10 Emitting Countries or Regions (2020-2023)") +
theme_economist() +theme(axis.text.x = element_text(angle = 90, size = 7))
##Plot of Least 10 Emitting Countries
ggplot(least_10_countries, aes(x = reorder(Countries , -Average_Emissions), y = Average_Emissions)) +
geom_bar(stat = "identity") + xlab("Countries") +ylab("Megatonnes of Carbon Dioxide") +
ggtitle("Least 10 Emitting Countries or Regions (2020-2023)") +
theme_economist() +theme(axis.text.x = element_text(angle = 90, size = 7))
#Question Investigating whether there is a relationship between IncomeGroup and total emissions.
##Using Bar Plot to observe relationship
df_avg$`Income Group` <- factor(df_avg$`Income Group`)
df_avg <- na.omit(df_avg)
df_avg$IncomeGroup <- factor(df_avg$`Income Group`, levels = c("Low income", "Medium income", "High income"))
df_avg <- na.omit(df_avg)
ggplot(df_avg, aes(x = IncomeGroup, y = Average_Emissions, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
xlab("Income Group") +
ylab("Average Emissions(Mega Ton)") +
ggtitle("Greenhouse Gas Emissions by Income Group") +
theme_economist() +
theme(axis.text.x = element_text(angle = 360, hjust = 1))
##Using Density Plot to observe relationship
ggplot(df_avg, aes(x = Average_Emissions, fill = IncomeGroup)) +
geom_density(alpha = 0.6) + # Smooth density curve
labs(title = "Density Plot of Emissions by Income Group",
x = "Emissions in Mega Ton",
y = "Density of Countries") +
theme_economist()
##Using Interactive Scatter Plot to observe relationship
log_data2 <- log(df_avg[, c("2020", "2021", "2022", "2023","Average_Emissions")])
df_avg <- cbind(df_avg, `Log Avg Emissions` = log_data2$Average_Emissions)
head(df_avg)
## Countries Country Code
## 4 Aruba ABW
## 6 Afghanistan AFG
## 10 Andorra AND
## 12 United Arab Emirates ARE
## 15 American Samoa ASM
## 16 Antigua and Barbuda ATG
## Indicator \n Group Income Group
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) High income
## 6 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) Low income
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) High income
## 12 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) High income
## 15 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) High income
## 16 Total greenhouse gas emissions excluding LULUCF (Mt CO2e) High income
## 2020 2021 2022 2023 Average_Emissions IncomeGroup
## 4 0.4822 0.53120 0.53360 0.5615 0.527125 High income
## 6 26.6463 27.64310 28.61410 29.4601 28.090900 Low income
## 10 44.2698 46.46975 44.98755 43.7173 44.861100 High income
## 12 249.3237 256.96490 267.63290 267.8232 260.436175 High income
## 15 0.0085 0.00850 0.00850 0.0085 0.008500 High income
## 16 0.3413 0.37100 0.37230 0.3886 0.368300 High income
## Log Avg Emissions
## 4 -0.6403176
## 6 3.3354457
## 10 3.8035710
## 12 5.5623578
## 15 -4.7676891
## 16 -0.9988575
ggplot_EmissionLevels <- ggplot(df_avg, aes(x = Countries, y = `Log Avg Emissions`, colour = `Income Group`)) +
geom_jitter() +
xlab("Countries") +
ylab("Emission Levels") +
theme_economist() +
theme(axis.text.x = element_blank())
interactive_plot <- ggplotly(ggplot_EmissionLevels, tooltip = c( "color", "Countries"))
interactive_plot
##RESULT - High Relation Observed Between High Come and High Emission across 3 Plots
###Partitioning the dataset into training (70%) and validation (30%) sets. ###Use set.seed() before partitioning to ensure reproducibility
set.seed(007)
n <- nrow(df_avg)
train_index <- sample(1:n, size = 0.7 * n)
train_set <- df[train_index, ]
validation_set <- df[-train_index, ]
###view Train Set
head(train_set)
## Countries Country Code
## 45 Cote d'Ivoire CIV
## 22 Benin BEN
## 34 Barbados BRB
## 31 Bermuda BMU
## 42 Channel Islands CHI
## 5 Africa Eastern and Southern AFE
## Indicator \n Group
## 45 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 22 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 34 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 31 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 42 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 5 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## Income Group 2020 2021 2022 2023
## 45 Lower middle income 29.5006 31.52860 32.16870 32.1840
## 22 Lower middle income 17.6528 16.01860 16.61500 16.6995
## 34 High income 0.8647 0.94940 0.95180 0.9890
## 31 High income 0.3441 0.35960 0.36060 0.3788
## 42 High income 44.2698 46.46975 44.98755 43.7173
## 5 1421.7752 1443.81120 1443.93570 1447.7204
###View Validation Set
head(validation_set)
## Countries Country Code
## 4 Aruba ABW
## 8 Angola AGO
## 10 Andorra AND
## 12 United Arab Emirates ARE
## 13 Argentina ARG
## 16 Antigua and Barbuda ATG
## Indicator \n Group
## 4 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 8 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 10 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 12 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 13 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## 16 Total greenhouse gas emissions excluding LULUCF (Mt CO2e)
## Income Group 2020 2021 2022 2023
## 4 High income 0.4822 0.53120 0.53360 0.5615
## 8 Lower middle income 61.6801 64.40900 67.21080 67.7008
## 10 High income 44.2698 46.46975 44.98755 43.7173
## 12 High income 249.3237 256.96490 267.63290 267.8232
## 13 Upper middle income 347.3124 367.61400 374.76130 365.6846
## 16 High income 0.3413 0.37100 0.37230 0.3886
#Summary -
###1.Loaded and cleaned the dataset, filling missing values with the median due to skewed data. ###2.Conducted exploratory data analysis (EDA) using density and box plots to understand emissions distributions and outliers. ###3.Performed correlation analysis between emission years and visualized relationships with ggpairs() and heatmaps. ###4.Calculated and visualized average emissions by country, highlighting top and least emitters. ###5.Investigated emissions by income group using bar plots and density plots. ###6.Applied log transformation, created interactive visualizations, and split the data into training and validation sets for further analysis
#Key Insights and Findings ###1.Emissions data exhibited a strong right skew, which was normalized using log transformation. ###2.A strong correlation was observed between emissions across the years 2020-2023, visualized via a heatmap and ggpairs. ###3.The top 10 and least 10 emitting countries were identified, highlighting significant contributors to global emissions. ###4.A positive relationship between income group and emissions was found, with higher-income countries emitting more greenhouse gases. ###5.Log transformation improved the normality of the emissions data, making it more suitable for analysis. ###6.Interactive plots were created to allow dynamic exploration of emissions data. ###7.The dataset was split into training and validation sets for future analysis or modeling.
##Thanks