# Install the necessary packages only if they are not installed
if (!require(readxl)) install.packages("readxl")
## Loading required package: readxl
if (!require(openxlsx)) install.packages("openxlsx")
## Loading required package: openxlsx
library(readxl)
# Read data from a specific sheet
co2_data <- read_excel("/Users/ashutoshverma/Downloads/EDGAR_2024_GHG_booklet_2024_fossilCO2only.xlsx", sheet = "fossil_CO2_totals_by_country")
head(co2_data)
## # A tibble: 6 × 57
## Substance `EDGAR Country Code` Country `1970` `1971` `1972` `1973` `1974`
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CO2 ABW Aruba 2.52e-2 2.88e-2 3.95e-2 4.43e-2 4.35e-2
## 2 CO2 AFG Afghan… 1.73e+0 1.73e+0 1.69e+0 1.73e+0 2.19e+0
## 3 CO2 AGO Angola 8.93e+0 8.52e+0 1.04e+1 1.13e+1 1.18e+1
## 4 CO2 AIA Anguil… 2.18e-3 2.18e-3 2.27e-3 2.12e-3 2.36e-3
## 5 CO2 AIR Intern… 1.70e+2 1.70e+2 1.80e+2 1.87e+2 1.80e+2
## 6 CO2 ALB Albania 4.84e+0 4.84e+0 5.52e+0 4.95e+0 5.33e+0
## # ℹ 49 more variables: `1975` <dbl>, `1976` <dbl>, `1977` <dbl>, `1978` <dbl>,
## # `1979` <dbl>, `1980` <dbl>, `1981` <dbl>, `1982` <dbl>, `1983` <dbl>,
## # `1984` <dbl>, `1985` <dbl>, `1986` <dbl>, `1987` <dbl>, `1988` <dbl>,
## # `1989` <dbl>, `1990` <dbl>, `1991` <dbl>, `1992` <dbl>, `1993` <dbl>,
## # `1994` <dbl>, `1995` <dbl>, `1996` <dbl>, `1997` <dbl>, `1998` <dbl>,
## # `1999` <dbl>, `2000` <dbl>, `2001` <dbl>, `2002` <dbl>, `2003` <dbl>,
## # `2004` <dbl>, `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, …
# Summary of the data
summary(co2_data)
## Substance EDGAR Country Code Country 1970
## Length:214 Length:214 Length:214 Min. : 0.001
## Class :character Class :character Class :character 1st Qu.: 0.286
## Mode :character Mode :character Mode :character Median : 3.387
## Mean : 165.188
## 3rd Qu.: 28.496
## Max. :15751.858
## NA's :2
## 1971 1972 1973
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.287 1st Qu.: 0.318 1st Qu.: 0.343
## Median : 3.597 Median : 3.848 Median : 4.354
## Mean : 164.594 Mean : 172.812 Mean : 182.994
## 3rd Qu.: 28.027 3rd Qu.: 30.037 3rd Qu.: 32.365
## Max. :15683.390 Max. :16481.436 Max. :17464.384
## NA's :2 NA's :2 NA's :2
## 1974 1975 1976
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.366 1st Qu.: 0.386 1st Qu.: 0.352
## Median : 4.417 Median : 4.938 Median : 5.336
## Mean : 182.107 Mean : 181.002 Mean : 191.553
## 3rd Qu.: 31.981 3rd Qu.: 34.741 3rd Qu.: 37.028
## Max. :17400.276 Max. :17328.558 Max. :18317.005
## NA's :2 NA's :2 NA's :2
## 1977 1978 1979
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.395 1st Qu.: 0.402 1st Qu.: 0.452
## Median : 4.913 Median : 5.286 Median : 5.109
## Mean : 196.689 Mean : 203.219 Mean : 209.009
## 3rd Qu.: 38.141 3rd Qu.: 38.766 3rd Qu.: 40.365
## Max. :18871.709 Max. :19491.221 Max. :20031.856
## NA's :2 NA's :2 NA's :2
## 1980 1981 1982
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.499 1st Qu.: 0.449 1st Qu.: 0.462
## Median : 5.531 Median : 5.360 Median : 5.565
## Mean : 206.272 Mean : 201.932 Mean : 199.146
## 3rd Qu.: 40.590 3rd Qu.: 38.795 3rd Qu.: 38.415
## Max. :19782.504 Max. :19411.719 Max. :19164.985
## NA's :2 NA's :2 NA's :2
## 1983 1984 1985
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.444 1st Qu.: 0.472 1st Qu.: 0.497
## Median : 5.561 Median : 5.354 Median : 5.249
## Mean : 200.415 Mean : 206.534 Mean : 209.093
## 3rd Qu.: 41.475 3rd Qu.: 44.216 3rd Qu.: 45.824
## Max. :19310.263 Max. :19942.492 Max. :20206.553
## NA's :2 NA's :2 NA's :2
## 1986 1987 1988
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.529 1st Qu.: 0.610 1st Qu.: 0.577
## Median : 5.707 Median : 4.962 Median : 5.008
## Mean : 212.515 Mean : 218.938 Mean : 226.071
## 3rd Qu.: 47.094 3rd Qu.: 49.848 3rd Qu.: 51.709
## Max. :20565.531 Max. :21224.635 Max. :21984.325
## NA's :2 NA's :2 NA's :2
## 1989 1990 1991
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.684 1st Qu.: 0.673 1st Qu.: 0.680
## Median : 6.072 Median : 6.072 Median : 5.051
## Mean : 229.746 Mean : 231.936 Mean : 232.909
## 3rd Qu.: 54.573 3rd Qu.: 56.149 3rd Qu.: 54.903
## Max. :22370.541 Max. :22680.400 Max. :22809.523
## NA's :2 NA's :2 NA's :2
## 1992 1993 1994
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.569 1st Qu.: 0.684 1st Qu.: 0.752
## Median : 5.499 Median : 5.403 Median : 5.657
## Mean : 231.290 Mean : 231.858 Mean : 233.981
## 3rd Qu.: 51.856 3rd Qu.: 51.550 3rd Qu.: 55.658
## Max. :22709.731 Max. :22802.877 Max. :23033.275
## NA's :2 NA's :2 NA's :2
## 1995 1996 1997
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.806 1st Qu.: 0.915 1st Qu.: 0.856
## Median : 5.742 Median : 6.164 Median : 6.180
## Mean : 240.319 Mean : 245.192 Mean : 249.128
## 3rd Qu.: 59.168 3rd Qu.: 61.432 3rd Qu.: 60.125
## Max. :23679.337 Max. :24145.443 Max. :24598.147
## NA's :2 NA's :2 NA's :2
## 1998 1999 2000
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.928 1st Qu.: 0.955 1st Qu.: 0.921
## Median : 6.443 Median : 6.880 Median : 6.748
## Mean : 250.582 Mean : 252.004 Mean : 259.501
## 3rd Qu.: 59.507 3rd Qu.: 57.583 3rd Qu.: 56.769
## Max. :24756.301 Max. :24939.597 Max. :25725.440
## NA's :2 NA's :2 NA's :2
## 2001 2002 2003
## Min. : 0.001 Min. : 0.001 Min. : 0.001
## 1st Qu.: 0.980 1st Qu.: 1.043 1st Qu.: 0.986
## Median : 6.864 Median : 7.347 Median : 7.311
## Mean : 262.634 Mean : 266.375 Mean : 278.390
## 3rd Qu.: 58.359 3rd Qu.: 59.816 3rd Qu.: 62.122
## Max. :26031.167 Max. :26433.290 Max. :27656.821
## NA's :2 NA's :2 NA's :2
## 2004 2005 2006 2007
## Min. : 0.002 Min. : 0.002 Min. : 0.002 Min. : 0.00
## 1st Qu.: 0.973 1st Qu.: 0.984 1st Qu.: 1.133 1st Qu.: 1.29
## Median : 7.807 Median : 7.546 Median : 8.340 Median : 8.48
## Mean : 290.609 Mean : 300.840 Mean : 310.548 Mean : 321.39
## 3rd Qu.: 61.633 3rd Qu.: 61.089 3rd Qu.: 62.619 3rd Qu.: 63.40
## Max. :28948.746 Max. :30044.543 Max. :31065.095 Max. :32232.51
## NA's :2 NA's :2 NA's :2 NA's :2
## 2008 2009 2010 2011
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.34 1st Qu.: 1.13 1st Qu.: 1.63 1st Qu.: 1.42
## Median : 8.42 Median : 7.98 Median : 8.24 Median : 9.03
## Mean : 322.84 Mean : 317.67 Mean : 336.89 Mean : 346.40
## 3rd Qu.: 63.44 3rd Qu.: 62.75 3rd Qu.: 65.91 3rd Qu.: 65.59
## Max. :32431.25 Max. :32020.68 Max. :34003.64 Max. :35059.57
## NA's :2 NA's :2 NA's :2 NA's :2
## 2012 2013 2014 2015
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.67 1st Qu.: 1.77 1st Qu.: 1.69 1st Qu.: 1.60
## Median : 9.84 Median : 9.75 Median : 9.93 Median : 10.58
## Mean : 350.45 Mean : 356.65 Mean : 357.93 Mean : 357.01
## 3rd Qu.: 68.48 3rd Qu.: 68.67 3rd Qu.: 65.95 3rd Qu.: 67.55
## Max. :35521.71 Max. :36221.14 Max. :36427.77 Max. :36300.47
## NA's :2 NA's :2 NA's :2 NA's :2
## 2016 2017 2018 2019
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.64 1st Qu.: 1.80 1st Qu.: 1.96 1st Qu.: 1.97
## Median : 10.71 Median : 10.88 Median : 12.21 Median : 12.53
## Mean : 358.22 Mean : 364.22 Mean : 372.63 Mean : 372.84
## 3rd Qu.: 67.69 3rd Qu.: 68.81 3rd Qu.: 66.47 3rd Qu.: 69.36
## Max. :36423.66 Max. :37047.47 Max. :37974.55 Max. :38066.43
## NA's :2 NA's :2 NA's :2 NA's :2
## 2020 2021 2022 2023
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2.02 1st Qu.: 2.15 1st Qu.: 2.28 1st Qu.: 2.38
## Median : 11.49 Median : 12.19 Median : 12.47 Median : 12.05
## Mean : 353.54 Mean : 373.00 Mean : 373.82 Mean : 380.00
## 3rd Qu.: 66.92 3rd Qu.: 65.51 3rd Qu.: 63.80 3rd Qu.: 64.70
## Max. :36154.31 Max. :38121.01 Max. :38246.62 Max. :39023.94
## NA's :2 NA's :2 NA's :2 NA's :2
Group countries based on how their CO₂ emissions have changed over time. Cluster countries using k means but before that i have to find optimal value of k. To do that i have used two methods :
Elbow Method The Elbow Method calculates the total within-cluster sum of squares (WSS) for different values of 𝑘The goal is to find the “elbow point” in the WSS plot, where adding more clusters no longer significantly reduces the WSS.
# Select only numeric columns
co2_data <- co2_data[, sapply(co2_data, is.numeric)]
# Remove rows with NA, NaN, or Inf values
co2_data <- co2_data[complete.cases(co2_data) & !apply(co2_data, 1, function(x) any(is.infinite(x))), ]
# Install and load dplyr
if (!require(dplyr)) install.packages("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dplyr)
# Select only numeric columns for clustering
co2_numeric <- co2_data %>% select(where(is.numeric))
# Standardize data
co2_numeric <- scale(co2_numeric)
if (!require(factoextra)) install.packages("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load library
library(factoextra)
# Determine the optimal number of clusters using the Elbow Method
fviz_nbclust(co2_numeric, kmeans, method = "wss") +
labs(title = "Elbow Method for Optimal K")
fviz_nbclust(co2_numeric, kmeans, method = "silhouette") +
labs(title = "Silhouette Method for Optimal K")
Based on results of Elbow method and Silhouette method we can see optimal value for 𝑘is 2. With k=2 lets try to get kmeans clustering
# Set the optimal number of clusters (replace X with the chosen number)
optimal_k <- 2 # For example, if elbow shows 4 clusters, set X = 4
# Perform K-Means Clustering
set.seed(123) # For reproducibility
kmeans_result <- kmeans(co2_numeric, centers = optimal_k, nstart = 25)
# Add cluster labels to the original data
co2_numeric$Cluster <- kmeans_result$cluster
## Warning in co2_numeric$Cluster <- kmeans_result$cluster: Coercing LHS to a list
head(co2_data) # View data with cluster labels
## # A tibble: 6 × 54
## `1970` `1971` `1972` `1973` `1974` `1975` `1976` `1977` `1978`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0252 0.0288 0.0395 4.43e-2 4.35e-2 5.74e-2 5.64e-2 6.71e-2 7.19e-2
## 2 1.73 1.73 1.69 1.73e+0 2.19e+0 2.03e+0 1.89e+0 2.28e+0 1.93e+0
## 3 8.93 8.52 10.4 1.13e+1 1.18e+1 1.09e+1 7.29e+0 1.20e+1 1.42e+1
## 4 0.00218 0.00218 0.00227 2.12e-3 2.36e-3 2.59e-3 2.44e-3 2.55e-3 2.91e-3
## 5 170. 170. 180. 1.87e+2 1.80e+2 1.75e+2 1.75e+2 1.91e+2 1.98e+2
## 6 4.84 4.84 5.52 4.95e+0 5.33e+0 5.42e+0 5.80e+0 6.20e+0 6.82e+0
## # ℹ 45 more variables: `1979` <dbl>, `1980` <dbl>, `1981` <dbl>, `1982` <dbl>,
## # `1983` <dbl>, `1984` <dbl>, `1985` <dbl>, `1986` <dbl>, `1987` <dbl>,
## # `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, `1991` <dbl>, `1992` <dbl>,
## # `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, `1996` <dbl>, `1997` <dbl>,
## # `1998` <dbl>, `1999` <dbl>, `2000` <dbl>, `2001` <dbl>, `2002` <dbl>,
## # `2003` <dbl>, `2004` <dbl>, `2005` <dbl>, `2006` <dbl>, `2007` <dbl>,
## # `2008` <dbl>, `2009` <dbl>, `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, …
Visualize the Clusters Use a dimensionality reduction method like PCA to visualize high-dimensional clustering.
# Visualize the clusters
# Check the filtered data
str(co2_numeric)
## List of 11449
## $ : num -0.143
## $ : num -0.141
## $ : num -0.135
## $ : num -0.143
## $ : num 0.00408
## $ : num -0.139
## $ : num -0.13
## $ : num -0.126
## $ : num -0.0671
## $ : num -0.134
## $ : num -0.143
## $ : num -0.00416
## $ : num -0.0971
## $ : num -0.122
## $ : num -0.143
## $ : num -0.0224
## $ : num -0.143
## $ : num -0.143
## $ : num -0.14
## $ : num -0.0831
## $ : num -0.14
## $ : num -0.142
## $ : num -0.133
## $ : num -0.0909
## $ : num -0.143
## $ : num -0.143
## $ : num -0.14
## $ : num -0.0504
## $ : num -0.143
## $ : num -0.141
## $ : num -0.143
## $ : num -0.143
## $ : num -0.143
## $ : num 0.167
## $ : num -0.107
## $ : num -0.118
## $ : num 0.644
## $ : num -0.141
## $ : num -0.142
## $ : num -0.14
## $ : num -0.141
## $ : num -0.143
## $ : num -0.118
## $ : num -0.143
## $ : num -0.143
## $ : num -0.142
## $ : num -0.124
## $ : num -0.143
## $ : num -0.141
## $ : num 0.02
## $ : num 0.795
## $ : num -0.142
## $ : num -0.143
## $ : num -0.0893
## $ : num -0.14
## $ : num -0.126
## $ : num -0.139
## $ : num -0.121
## $ : num -0.143
## $ : num -0.143
## $ : num -0.0181
## $ : num -0.124
## $ : num -0.142
## $ : num -0.107
## $ : num -0.142
## $ : num -0.143
## $ : num 0.263
## $ : num -0.143
## $ : num -0.142
## $ : num 0.44
## $ : num -0.122
## $ : num -0.141
## $ : num -0.143
## $ : num -0.142
## $ : num -0.143
## $ : num -0.143
## $ : num -0.143
## $ : num -0.143
## $ : num -0.118
## $ : num -0.143
## $ : num -0.143
## $ : num -0.141
## $ : num -0.143
## $ : num -0.142
## $ : num -0.135
## $ : num -0.142
## $ : num -0.124
## $ : num -0.142
## $ : num -0.0856
## $ : num -0.116
## $ : num 0.0422
## $ : num -0.125
## $ : num -0.0734
## $ : num -0.123
## $ : num -0.141
## $ : num -0.13
## $ : num 0.13
## $ : num -0.136
## $ : num -0.141
## [list output truncated]
# Keep only numeric columns (remove non-numeric)
co2_numeric <- co2_data[sapply(co2_data, is.numeric)]
fviz_cluster(kmeans_result, data = co2_numeric, geom = "point", ellipse.type = "convex") +
labs(title = "Clustering of Countries Based on CO₂ Emissions")