Clustering_project

# Install the necessary packages only if they are not installed
if (!require(readxl)) install.packages("readxl")

## Loading required package: readxl

if (!require(openxlsx)) install.packages("openxlsx")

## Loading required package: openxlsx

library(readxl)      
# Read data from a specific sheet 
co2_data <- read_excel("/Users/ashutoshverma/Downloads/EDGAR_2024_GHG_booklet_2024_fossilCO2only.xlsx", sheet = "fossil_CO2_totals_by_country")
head(co2_data)

## # A tibble: 6 × 57
##   Substance `EDGAR Country Code` Country  `1970`  `1971`  `1972`  `1973`  `1974`
##   <chr>     <chr>                <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 CO2       ABW                  Aruba   2.52e-2 2.88e-2 3.95e-2 4.43e-2 4.35e-2
## 2 CO2       AFG                  Afghan… 1.73e+0 1.73e+0 1.69e+0 1.73e+0 2.19e+0
## 3 CO2       AGO                  Angola  8.93e+0 8.52e+0 1.04e+1 1.13e+1 1.18e+1
## 4 CO2       AIA                  Anguil… 2.18e-3 2.18e-3 2.27e-3 2.12e-3 2.36e-3
## 5 CO2       AIR                  Intern… 1.70e+2 1.70e+2 1.80e+2 1.87e+2 1.80e+2
## 6 CO2       ALB                  Albania 4.84e+0 4.84e+0 5.52e+0 4.95e+0 5.33e+0
## # ℹ 49 more variables: `1975` <dbl>, `1976` <dbl>, `1977` <dbl>, `1978` <dbl>,
## #   `1979` <dbl>, `1980` <dbl>, `1981` <dbl>, `1982` <dbl>, `1983` <dbl>,
## #   `1984` <dbl>, `1985` <dbl>, `1986` <dbl>, `1987` <dbl>, `1988` <dbl>,
## #   `1989` <dbl>, `1990` <dbl>, `1991` <dbl>, `1992` <dbl>, `1993` <dbl>,
## #   `1994` <dbl>, `1995` <dbl>, `1996` <dbl>, `1997` <dbl>, `1998` <dbl>,
## #   `1999` <dbl>, `2000` <dbl>, `2001` <dbl>, `2002` <dbl>, `2003` <dbl>,
## #   `2004` <dbl>, `2005` <dbl>, `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, …

# Summary of the data
summary(co2_data)

##   Substance         EDGAR Country Code   Country               1970          
##  Length:214         Length:214         Length:214         Min.   :    0.001  
##  Class :character   Class :character   Class :character   1st Qu.:    0.286  
##  Mode  :character   Mode  :character   Mode  :character   Median :    3.387  
##                                                           Mean   :  165.188  
##                                                           3rd Qu.:   28.496  
##                                                           Max.   :15751.858  
##                                                           NA's   :2          
##       1971                1972                1973          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.287   1st Qu.:    0.318   1st Qu.:    0.343  
##  Median :    3.597   Median :    3.848   Median :    4.354  
##  Mean   :  164.594   Mean   :  172.812   Mean   :  182.994  
##  3rd Qu.:   28.027   3rd Qu.:   30.037   3rd Qu.:   32.365  
##  Max.   :15683.390   Max.   :16481.436   Max.   :17464.384  
##  NA's   :2           NA's   :2           NA's   :2          
##       1974                1975                1976          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.366   1st Qu.:    0.386   1st Qu.:    0.352  
##  Median :    4.417   Median :    4.938   Median :    5.336  
##  Mean   :  182.107   Mean   :  181.002   Mean   :  191.553  
##  3rd Qu.:   31.981   3rd Qu.:   34.741   3rd Qu.:   37.028  
##  Max.   :17400.276   Max.   :17328.558   Max.   :18317.005  
##  NA's   :2           NA's   :2           NA's   :2          
##       1977                1978                1979          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.395   1st Qu.:    0.402   1st Qu.:    0.452  
##  Median :    4.913   Median :    5.286   Median :    5.109  
##  Mean   :  196.689   Mean   :  203.219   Mean   :  209.009  
##  3rd Qu.:   38.141   3rd Qu.:   38.766   3rd Qu.:   40.365  
##  Max.   :18871.709   Max.   :19491.221   Max.   :20031.856  
##  NA's   :2           NA's   :2           NA's   :2          
##       1980                1981                1982          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.499   1st Qu.:    0.449   1st Qu.:    0.462  
##  Median :    5.531   Median :    5.360   Median :    5.565  
##  Mean   :  206.272   Mean   :  201.932   Mean   :  199.146  
##  3rd Qu.:   40.590   3rd Qu.:   38.795   3rd Qu.:   38.415  
##  Max.   :19782.504   Max.   :19411.719   Max.   :19164.985  
##  NA's   :2           NA's   :2           NA's   :2          
##       1983                1984                1985          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.444   1st Qu.:    0.472   1st Qu.:    0.497  
##  Median :    5.561   Median :    5.354   Median :    5.249  
##  Mean   :  200.415   Mean   :  206.534   Mean   :  209.093  
##  3rd Qu.:   41.475   3rd Qu.:   44.216   3rd Qu.:   45.824  
##  Max.   :19310.263   Max.   :19942.492   Max.   :20206.553  
##  NA's   :2           NA's   :2           NA's   :2          
##       1986                1987                1988          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.529   1st Qu.:    0.610   1st Qu.:    0.577  
##  Median :    5.707   Median :    4.962   Median :    5.008  
##  Mean   :  212.515   Mean   :  218.938   Mean   :  226.071  
##  3rd Qu.:   47.094   3rd Qu.:   49.848   3rd Qu.:   51.709  
##  Max.   :20565.531   Max.   :21224.635   Max.   :21984.325  
##  NA's   :2           NA's   :2           NA's   :2          
##       1989                1990                1991          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.684   1st Qu.:    0.673   1st Qu.:    0.680  
##  Median :    6.072   Median :    6.072   Median :    5.051  
##  Mean   :  229.746   Mean   :  231.936   Mean   :  232.909  
##  3rd Qu.:   54.573   3rd Qu.:   56.149   3rd Qu.:   54.903  
##  Max.   :22370.541   Max.   :22680.400   Max.   :22809.523  
##  NA's   :2           NA's   :2           NA's   :2          
##       1992                1993                1994          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.569   1st Qu.:    0.684   1st Qu.:    0.752  
##  Median :    5.499   Median :    5.403   Median :    5.657  
##  Mean   :  231.290   Mean   :  231.858   Mean   :  233.981  
##  3rd Qu.:   51.856   3rd Qu.:   51.550   3rd Qu.:   55.658  
##  Max.   :22709.731   Max.   :22802.877   Max.   :23033.275  
##  NA's   :2           NA's   :2           NA's   :2          
##       1995                1996                1997          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.806   1st Qu.:    0.915   1st Qu.:    0.856  
##  Median :    5.742   Median :    6.164   Median :    6.180  
##  Mean   :  240.319   Mean   :  245.192   Mean   :  249.128  
##  3rd Qu.:   59.168   3rd Qu.:   61.432   3rd Qu.:   60.125  
##  Max.   :23679.337   Max.   :24145.443   Max.   :24598.147  
##  NA's   :2           NA's   :2           NA's   :2          
##       1998                1999                2000          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.928   1st Qu.:    0.955   1st Qu.:    0.921  
##  Median :    6.443   Median :    6.880   Median :    6.748  
##  Mean   :  250.582   Mean   :  252.004   Mean   :  259.501  
##  3rd Qu.:   59.507   3rd Qu.:   57.583   3rd Qu.:   56.769  
##  Max.   :24756.301   Max.   :24939.597   Max.   :25725.440  
##  NA's   :2           NA's   :2           NA's   :2          
##       2001                2002                2003          
##  Min.   :    0.001   Min.   :    0.001   Min.   :    0.001  
##  1st Qu.:    0.980   1st Qu.:    1.043   1st Qu.:    0.986  
##  Median :    6.864   Median :    7.347   Median :    7.311  
##  Mean   :  262.634   Mean   :  266.375   Mean   :  278.390  
##  3rd Qu.:   58.359   3rd Qu.:   59.816   3rd Qu.:   62.122  
##  Max.   :26031.167   Max.   :26433.290   Max.   :27656.821  
##  NA's   :2           NA's   :2           NA's   :2          
##       2004                2005                2006                2007         
##  Min.   :    0.002   Min.   :    0.002   Min.   :    0.002   Min.   :    0.00  
##  1st Qu.:    0.973   1st Qu.:    0.984   1st Qu.:    1.133   1st Qu.:    1.29  
##  Median :    7.807   Median :    7.546   Median :    8.340   Median :    8.48  
##  Mean   :  290.609   Mean   :  300.840   Mean   :  310.548   Mean   :  321.39  
##  3rd Qu.:   61.633   3rd Qu.:   61.089   3rd Qu.:   62.619   3rd Qu.:   63.40  
##  Max.   :28948.746   Max.   :30044.543   Max.   :31065.095   Max.   :32232.51  
##  NA's   :2           NA's   :2           NA's   :2           NA's   :2         
##       2008               2009               2010               2011         
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
##  1st Qu.:    1.34   1st Qu.:    1.13   1st Qu.:    1.63   1st Qu.:    1.42  
##  Median :    8.42   Median :    7.98   Median :    8.24   Median :    9.03  
##  Mean   :  322.84   Mean   :  317.67   Mean   :  336.89   Mean   :  346.40  
##  3rd Qu.:   63.44   3rd Qu.:   62.75   3rd Qu.:   65.91   3rd Qu.:   65.59  
##  Max.   :32431.25   Max.   :32020.68   Max.   :34003.64   Max.   :35059.57  
##  NA's   :2          NA's   :2          NA's   :2          NA's   :2         
##       2012               2013               2014               2015         
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
##  1st Qu.:    1.67   1st Qu.:    1.77   1st Qu.:    1.69   1st Qu.:    1.60  
##  Median :    9.84   Median :    9.75   Median :    9.93   Median :   10.58  
##  Mean   :  350.45   Mean   :  356.65   Mean   :  357.93   Mean   :  357.01  
##  3rd Qu.:   68.48   3rd Qu.:   68.67   3rd Qu.:   65.95   3rd Qu.:   67.55  
##  Max.   :35521.71   Max.   :36221.14   Max.   :36427.77   Max.   :36300.47  
##  NA's   :2          NA's   :2          NA's   :2          NA's   :2         
##       2016               2017               2018               2019         
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
##  1st Qu.:    1.64   1st Qu.:    1.80   1st Qu.:    1.96   1st Qu.:    1.97  
##  Median :   10.71   Median :   10.88   Median :   12.21   Median :   12.53  
##  Mean   :  358.22   Mean   :  364.22   Mean   :  372.63   Mean   :  372.84  
##  3rd Qu.:   67.69   3rd Qu.:   68.81   3rd Qu.:   66.47   3rd Qu.:   69.36  
##  Max.   :36423.66   Max.   :37047.47   Max.   :37974.55   Max.   :38066.43  
##  NA's   :2          NA's   :2          NA's   :2          NA's   :2         
##       2020               2021               2022               2023         
##  Min.   :    0.00   Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
##  1st Qu.:    2.02   1st Qu.:    2.15   1st Qu.:    2.28   1st Qu.:    2.38  
##  Median :   11.49   Median :   12.19   Median :   12.47   Median :   12.05  
##  Mean   :  353.54   Mean   :  373.00   Mean   :  373.82   Mean   :  380.00  
##  3rd Qu.:   66.92   3rd Qu.:   65.51   3rd Qu.:   63.80   3rd Qu.:   64.70  
##  Max.   :36154.31   Max.   :38121.01   Max.   :38246.62   Max.   :39023.94  
##  NA's   :2          NA's   :2          NA's   :2          NA's   :2

Group countries based on how their CO₂ emissions have changed over time. Cluster countries using k means but before that i have to find optimal value of k. To do that i have used two methods :

Elbow Method The Elbow Method calculates the total within-cluster sum of squares (WSS) for different values of 𝑘The goal is to find the “elbow point” in the WSS plot, where adding more clusters no longer significantly reduces the WSS.
1. Silhouette Method The Silhouette Method measures how similar a data point is to its own cluster compared to other clusters. It calculates the silhouette coefficient for each 𝑘and the optimal 𝑘is where the average silhouette width is maximized.

# Select only numeric columns
co2_data <- co2_data[, sapply(co2_data, is.numeric)]

# Remove rows with NA, NaN, or Inf values
co2_data <- co2_data[complete.cases(co2_data) & !apply(co2_data, 1, function(x) any(is.infinite(x))), ]

# Install and load dplyr
if (!require(dplyr)) install.packages("dplyr")

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(dplyr)


# Select only numeric columns for clustering
co2_numeric <- co2_data %>% select(where(is.numeric))

# Standardize data
co2_numeric <- scale(co2_numeric)

if (!require(factoextra)) install.packages("factoextra")

## Loading required package: factoextra

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Load library
library(factoextra)

# Determine the optimal number of clusters using the Elbow Method
fviz_nbclust(co2_numeric, kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal K")

fviz_nbclust(co2_numeric, kmeans, method = "silhouette") + 
  labs(title = "Silhouette Method for Optimal K")

Based on results of Elbow method and Silhouette method we can see optimal value for 𝑘is 2. With k=2 lets try to get kmeans clustering

# Set the optimal number of clusters (replace X with the chosen number)
optimal_k <- 2 # For example, if elbow shows 4 clusters, set X = 4

# Perform K-Means Clustering
set.seed(123)  # For reproducibility
kmeans_result <- kmeans(co2_numeric, centers = optimal_k, nstart = 25)

# Add cluster labels to the original data
co2_numeric$Cluster <- kmeans_result$cluster

## Warning in co2_numeric$Cluster <- kmeans_result$cluster: Coercing LHS to a list

head(co2_data)  # View data with cluster labels

## # A tibble: 6 × 54
##      `1970`    `1971`    `1972`   `1973`  `1974`  `1975`  `1976`  `1977`  `1978`
##       <dbl>     <dbl>     <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1   0.0252    0.0288    0.0395   4.43e-2 4.35e-2 5.74e-2 5.64e-2 6.71e-2 7.19e-2
## 2   1.73      1.73      1.69     1.73e+0 2.19e+0 2.03e+0 1.89e+0 2.28e+0 1.93e+0
## 3   8.93      8.52     10.4      1.13e+1 1.18e+1 1.09e+1 7.29e+0 1.20e+1 1.42e+1
## 4   0.00218   0.00218   0.00227  2.12e-3 2.36e-3 2.59e-3 2.44e-3 2.55e-3 2.91e-3
## 5 170.      170.      180.       1.87e+2 1.80e+2 1.75e+2 1.75e+2 1.91e+2 1.98e+2
## 6   4.84      4.84      5.52     4.95e+0 5.33e+0 5.42e+0 5.80e+0 6.20e+0 6.82e+0
## # ℹ 45 more variables: `1979` <dbl>, `1980` <dbl>, `1981` <dbl>, `1982` <dbl>,
## #   `1983` <dbl>, `1984` <dbl>, `1985` <dbl>, `1986` <dbl>, `1987` <dbl>,
## #   `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, `1991` <dbl>, `1992` <dbl>,
## #   `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, `1996` <dbl>, `1997` <dbl>,
## #   `1998` <dbl>, `1999` <dbl>, `2000` <dbl>, `2001` <dbl>, `2002` <dbl>,
## #   `2003` <dbl>, `2004` <dbl>, `2005` <dbl>, `2006` <dbl>, `2007` <dbl>,
## #   `2008` <dbl>, `2009` <dbl>, `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, …

Visualize the Clusters Use a dimensionality reduction method like PCA to visualize high-dimensional clustering.

# Visualize the clusters

# Check the filtered data
str(co2_numeric)

## List of 11449
##  $        : num -0.143
##  $        : num -0.141
##  $        : num -0.135
##  $        : num -0.143
##  $        : num 0.00408
##  $        : num -0.139
##  $        : num -0.13
##  $        : num -0.126
##  $        : num -0.0671
##  $        : num -0.134
##  $        : num -0.143
##  $        : num -0.00416
##  $        : num -0.0971
##  $        : num -0.122
##  $        : num -0.143
##  $        : num -0.0224
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.14
##  $        : num -0.0831
##  $        : num -0.14
##  $        : num -0.142
##  $        : num -0.133
##  $        : num -0.0909
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.14
##  $        : num -0.0504
##  $        : num -0.143
##  $        : num -0.141
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.143
##  $        : num 0.167
##  $        : num -0.107
##  $        : num -0.118
##  $        : num 0.644
##  $        : num -0.141
##  $        : num -0.142
##  $        : num -0.14
##  $        : num -0.141
##  $        : num -0.143
##  $        : num -0.118
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.142
##  $        : num -0.124
##  $        : num -0.143
##  $        : num -0.141
##  $        : num 0.02
##  $        : num 0.795
##  $        : num -0.142
##  $        : num -0.143
##  $        : num -0.0893
##  $        : num -0.14
##  $        : num -0.126
##  $        : num -0.139
##  $        : num -0.121
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.0181
##  $        : num -0.124
##  $        : num -0.142
##  $        : num -0.107
##  $        : num -0.142
##  $        : num -0.143
##  $        : num 0.263
##  $        : num -0.143
##  $        : num -0.142
##  $        : num 0.44
##  $        : num -0.122
##  $        : num -0.141
##  $        : num -0.143
##  $        : num -0.142
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.118
##  $        : num -0.143
##  $        : num -0.143
##  $        : num -0.141
##  $        : num -0.143
##  $        : num -0.142
##  $        : num -0.135
##  $        : num -0.142
##  $        : num -0.124
##  $        : num -0.142
##  $        : num -0.0856
##  $        : num -0.116
##  $        : num 0.0422
##  $        : num -0.125
##  $        : num -0.0734
##  $        : num -0.123
##  $        : num -0.141
##  $        : num -0.13
##  $        : num 0.13
##  $        : num -0.136
##  $        : num -0.141
##   [list output truncated]

# Keep only numeric columns (remove non-numeric)
co2_numeric <- co2_data[sapply(co2_data, is.numeric)]

fviz_cluster(kmeans_result, data = co2_numeric, geom = "point", ellipse.type = "convex") +
  labs(title = "Clustering of Countries Based on CO₂ Emissions")

Clustering_project

Ashutosh Kumar Verma

2024-11-24