assignment

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(dplyr)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)

# Load the data set from a CSV file; file.choose() prompts the user to pick
# the file, so the path is not recorded in the script.
df <- read.csv(file.choose())

# Preview the first rows, then report the dimensions (rows, columns).
head(df)

dim(df)

## [1] 368  16

# Compact structure of df: each column's name, type, and first few values.
str(df)

## 'data.frame':    368 obs. of  16 variables:
##  $ Country.Name  : chr  "Philippines" "Singapore" "Thailand" "Vietnam" ...
##  $ Country.Code  : int  566 576 578 582 514 826 556 565 853 813 ...
##  $ Indicator.Name: chr  "External Trade, Imports of Goods and Services, Percent Change" "External Trade, Imports of Goods and Services, Percent Change" "External Trade, Imports of Goods and Services, Percent Change" "External Trade, Imports of Goods and Services, Percent Change" ...
##  $ Indicator.Code: chr  "TM_R_PC_PP_PT" "TM_R_PC_PP_PT" "TM_R_PC_PP_PT" "TM_R_PC_PP_PT" ...
##  $ Attribute     : chr  "Value" "Value" "Value" "Value" ...
##  $ X2012         : logi  NA NA NA NA NA NA ...
##  $ X2013         : num  0.201 5.391 1.579 19.266 -15.213 ...
##  $ X2014         : num  14.91 5.33 -5.28 14.09 -4.95 ...
##  $ X2015         : num  13.2268 7.0446 0.8064 20.7797 -0.0847 ...
##  $ X2016         : num  9.003 0.826 -3.182 11.365 8.053 ...
##  $ X2017         : num  2.79 1.33 3.47 8.21 5.41 ...
##  $ X2018         : num  5.24 3.92 3.54 12.04 -1.91 ...
##  $ X2019         : num  5.73 4.328 3.644 13.4 -0.226 ...
##  $ X2020         : num  5.5 4.45 3.98 13.05 -4.8 ...
##  $ X2021         : num  6.075 4.85 3.84 13.015 0.862 ...
##  $ Base.Year     : logi  NA NA NA NA NA NA ...

# Structural overview of df with one line per column (name, type, values).
glimpse(df)

## Rows: 368
## Columns: 16
## $ Country.Name   <chr> "Philippines", "Singapore", "Thailand", "Vietnam", "Bhu…
## $ Country.Code   <int> 566, 576, 578, 582, 514, 826, 556, 565, 853, 813, 948, …
## $ Indicator.Name <chr> "External Trade, Imports of Goods and Services, Percent…
## $ Indicator.Code <chr> "TM_R_PC_PP_PT", "TM_R_PC_PP_PT", "TM_R_PC_PP_PT", "TM_…
## $ Attribute      <chr> "Value", "Value", "Value", "Value", "Value", "Value", "…
## $ X2012          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ X2013          <dbl> 0.2007986, 5.3905451, 1.5787047, 19.2660172, -15.212636…
## $ X2014          <dbl> 14.90558288, 5.33087164, -5.27975893, 14.08729043, -4.9…
## $ X2015          <dbl> 13.22679081, 7.04458317, 0.80638186, 20.77972716, -0.08…
## $ X2016          <dbl> 9.00266555, 0.82555128, -3.18169957, 11.36508352, 8.053…
## $ X2017          <dbl> 2.78733404, 1.33036113, 3.47418836, 8.20726502, 5.40929…
## $ X2018          <dbl> 5.24187536, 3.92104291, 3.53673055, 12.03958872, -1.913…
## $ X2019          <dbl> 5.73007461, 4.32835289, 3.64435402, 13.39977739, -0.225…
## $ X2020          <dbl> 5.50213376, 4.44790596, 3.97933357, 13.05089222, -4.796…
## $ X2021          <dbl> 6.0747274, 4.8495605, 3.8404765, 13.0151510, 0.8616156,…
## $ Base.Year      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns, NA counts for the logical columns.
summary(df)

##  Country.Name        Country.Code    Indicator.Name     Indicator.Code    
##  Length:368         Min.   : 158.0   Length:368         Length:368        
##  Class :character   1st Qu.: 522.0   Class :character   Class :character  
##  Mode  :character   Median : 547.0   Mode  :character   Mode  :character  
##                     Mean   : 831.9                                        
##                     3rd Qu.: 819.0                                        
##                     Max.   :9502.0                                        
##   Attribute          X2012             X2013                X2014           
##  Length:368         Mode:logical   Min.   :-5.076e+10   Min.   :-4.261e+10  
##  Class :character   NA's:368       1st Qu.: 0.000e+00   1st Qu.: 1.000e+00  
##  Mode  :character                  Median : 4.000e+00   Median : 5.000e+00  
##                                    Mean   : 1.389e+11   Mean   : 1.456e+11  
##                                    3rd Qu.: 2.200e+01   3rd Qu.: 2.800e+01  
##                                    Max.   : 2.305e+13   Max.   : 2.398e+13  
##      X2015                X2016                X2017           
##  Min.   :-5.798e+10   Min.   :-4.385e+10   Min.   :-5.959e+10  
##  1st Qu.: 0.000e+00   1st Qu.:-1.000e+00   1st Qu.: 0.000e+00  
##  Median : 3.000e+00   Median : 3.000e+00   Median : 3.000e+00  
##  Mean   : 1.459e+11   Mean   : 1.531e+11   Mean   : 1.645e+11  
##  3rd Qu.: 2.700e+01   3rd Qu.: 1.500e+01   3rd Qu.: 1.200e+01  
##  Max.   : 2.382e+13   Max.   : 2.503e+13   Max.   : 2.705e+13  
##      X2018                X2019                X2020           
##  Min.   :-7.386e+10   Min.   :-8.136e+10   Min.   :-8.996e+10  
##  1st Qu.: 0.000e+00   1st Qu.: 1.000e+00   1st Qu.: 0.000e+00  
##  Median : 3.000e+00   Median : 4.000e+00   Median : 4.000e+00  
##  Mean   : 1.765e+11   Mean   : 1.902e+11   Mean   : 2.052e+11  
##  3rd Qu.: 1.300e+01   3rd Qu.: 1.400e+01   3rd Qu.: 1.400e+01  
##  Max.   : 2.906e+13   Max.   : 3.134e+13   Max.   : 3.383e+13  
##      X2021            Base.Year     
##  Min.   :-9.606e+10   Mode:logical  
##  1st Qu.: 1.000e+00   NA's:368      
##  Median : 4.000e+00                 
##  Mean   : 2.210e+11                 
##  3rd Qu.: 1.400e+01                 
##  Max.   : 3.640e+13

# Count the missing values in each column of df.
colSums(is.na(df))

##   Country.Name   Country.Code Indicator.Name Indicator.Code      Attribute 
##              0              0              0              0              0 
##          X2012          X2013          X2014          X2015          X2016 
##            368              0              0              0              0 
##          X2017          X2018          X2019          X2020          X2021 
##              0              0              0              0              0 
##      Base.Year 
##            368

# Drop columns that are missing in every row (X2012 and Base.Year in this
# data set) before removing incomplete rows. Calling na.omit() on the full
# frame would discard every row, because those columns are NA everywhere.
all_na_cols <- vapply(df, function(column) all(is.na(column)), logical(1))
df_clean <- na.omit(df[, !all_na_cols, drop = FALSE])

# Histograms of the yearly values, one plot per year from 2014 through 2020,
# each drawn in its own fill colour.
# NOTE(review): summary(df) reports values as large as ~1e13, so df appears to
# hold more than percent-change figures -- confirm the x-axis label fits.
hist_years <- 2014:2020
hist_fills <- c(
  "lightblue", "Green", "blue", "red", "yellow", "brown", "purple"
)

for (idx in seq_along(hist_years)) {
  hist_year <- hist_years[idx]
  hist(
    df[[paste0("X", hist_year)]],
    main = paste("Histogram of", hist_year),
    xlab = "Percent Change in Import",
    col = hist_fills[idx]
  )
}

# Boxplots of the yearly values grouped by country, one plot per year from
# 2014 through 2021. las = 2 draws the axis labels perpendicular to the axes
# so the country names do not overlap.
for (box_year in 2014:2021) {
  boxplot(
    as.formula(paste0("X", box_year, " ~ Country.Name")),
    data = df,
    main = paste("Imports Percent Change in", box_year, "by Country"),
    xlab = "Country",
    ylab = "Percent Change",
    las = 2
  )
}

# Pairwise correlations between the yearly columns X2013 .. X2021, computed
# only on rows that have no missing value in any of those columns.
cor_year_cols <- paste0("X", 2013:2021)
cor(df[, cor_year_cols], use = "complete.obs")

##           X2013     X2014     X2015     X2016     X2017     X2018     X2019
## X2013 1.0000000 0.9995585 0.9981268 0.9989754 0.9989554 0.9983884 0.9977487
## X2014 0.9995585 1.0000000 0.9994809 0.9998176 0.9997890 0.9995335 0.9991746
## X2015 0.9981268 0.9994809 1.0000000 0.9998065 0.9997754 0.9998578 0.9997996
## X2016 0.9989754 0.9998176 0.9998065 1.0000000 0.9999746 0.9998624 0.9996359
## X2017 0.9989554 0.9997890 0.9997754 0.9999746 1.0000000 0.9999238 0.9997284
## X2018 0.9983884 0.9995335 0.9998578 0.9998624 0.9999238 1.0000000 0.9999396
## X2019 0.9977487 0.9991746 0.9997996 0.9996359 0.9997284 0.9999396 1.0000000
## X2020 0.9968542 0.9986124 0.9995968 0.9992294 0.9993528 0.9997200 0.9999194
## X2021 0.9959052 0.9979592 0.9992603 0.9987176 0.9988695 0.9993783 0.9997044
##           X2020     X2021
## X2013 0.9968542 0.9959052
## X2014 0.9986124 0.9979592
## X2015 0.9995968 0.9992603
## X2016 0.9992294 0.9987176
## X2017 0.9993528 0.9988695
## X2018 0.9997200 0.9993783
## X2019 0.9999194 0.9997044
## X2020 1.0000000 0.9999317
## X2021 0.9999317 1.0000000

# Build the clustering input from the yearly value columns X2013 .. X2021.
# The columns are picked by name rather than by position, so the selection
# stays correct if df gains columns (for example a cluster label added on a
# previous run) or if its column order changes.
cluster_feature_cols <- paste0("X", 2013:2021)
df_cluster <- df[, cluster_feature_cols, drop = FALSE]

# Remove rows with a missing value in any of the selected years.
df_cluster <- na.omit(df_cluster)

# Centre and scale each column so that years with larger magnitudes do not
# dominate the distance computation; the result is a numeric matrix.
df_scaled <- scale(df_cluster)




# Elbow plot: total within-cluster sum of squares across candidate values of
# k. k-means starts from random centres, so the seed is fixed before the
# call to make the curve reproducible, and nstart = 25 is forwarded to
# kmeans() so each k is fitted the same way as the final model.
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss", nstart = 25) +
  labs(
    title = "Elbow Method for Optimal K",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  )

# Apply K-Means with 3 clusters and 25 random starts; the fixed seed makes
# the random initialisation repeatable.
set.seed(123)
kmeans_result <- kmeans(df_scaled, centers = 3, nstart = 25)

# Attach the cluster labels to df. df_cluster can have fewer rows than df
# (rows with missing values are dropped by na.omit), so the labels are
# aligned on row names instead of assigned positionally; rows that were not
# clustered get NA.
clustered_rows <- match(rownames(df_cluster), rownames(df))
df$Cluster <- NA_integer_
df$Cluster[clustered_rows] <- kmeans_result$cluster

head(df[, c("Country.Name", "Cluster")])

# Plot the k-means result in two dimensions (fviz_cluster projects the data
# with PCA when it has more than two columns), drawing observations as
# points with a convex hull around each cluster.
fviz_cluster(
  kmeans_result,
  data = df_scaled,
  geom = "point",
  ellipse.type = "convex",
  main = "K-Means Clustering (k = 3)",
  ggtheme = theme_minimal()
) +
  labs(x = "Principal Component 1", y = "Principal Component 2")

# Mean of each yearly column within each cluster. Only X2013 .. X2021 are
# summarised: X2012 is NA in every row of this data set, so its cluster mean
# would always be NA.
summary_year_cols <- paste0("X", 2013:2021)
aggregate(
  df[, summary_year_cols],
  by = list(Cluster = df$Cluster),
  FUN = mean
)

assignment

Raj Patel

2024-09-12