library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
df=read.csv(file.choose())
head(df)
dim(df)
## [1] 368 16
str(df)
## 'data.frame': 368 obs. of 16 variables:
## $ Country.Name : chr "Philippines" "Singapore" "Thailand" "Vietnam" ...
## $ Country.Code : int 566 576 578 582 514 826 556 565 853 813 ...
## $ Indicator.Name: chr "External Trade, Imports of Goods and Services, Percent Change" "External Trade, Imports of Goods and Services, Percent Change" "External Trade, Imports of Goods and Services, Percent Change" "External Trade, Imports of Goods and Services, Percent Change" ...
## $ Indicator.Code: chr "TM_R_PC_PP_PT" "TM_R_PC_PP_PT" "TM_R_PC_PP_PT" "TM_R_PC_PP_PT" ...
## $ Attribute : chr "Value" "Value" "Value" "Value" ...
## $ X2012 : logi NA NA NA NA NA NA ...
## $ X2013 : num 0.201 5.391 1.579 19.266 -15.213 ...
## $ X2014 : num 14.91 5.33 -5.28 14.09 -4.95 ...
## $ X2015 : num 13.2268 7.0446 0.8064 20.7797 -0.0847 ...
## $ X2016 : num 9.003 0.826 -3.182 11.365 8.053 ...
## $ X2017 : num 2.79 1.33 3.47 8.21 5.41 ...
## $ X2018 : num 5.24 3.92 3.54 12.04 -1.91 ...
## $ X2019 : num 5.73 4.328 3.644 13.4 -0.226 ...
## $ X2020 : num 5.5 4.45 3.98 13.05 -4.8 ...
## $ X2021 : num 6.075 4.85 3.84 13.015 0.862 ...
## $ Base.Year : logi NA NA NA NA NA NA ...
glimpse(df)
## Rows: 368
## Columns: 16
## $ Country.Name <chr> "Philippines", "Singapore", "Thailand", "Vietnam", "Bhu…
## $ Country.Code <int> 566, 576, 578, 582, 514, 826, 556, 565, 853, 813, 948, …
## $ Indicator.Name <chr> "External Trade, Imports of Goods and Services, Percent…
## $ Indicator.Code <chr> "TM_R_PC_PP_PT", "TM_R_PC_PP_PT", "TM_R_PC_PP_PT", "TM_…
## $ Attribute <chr> "Value", "Value", "Value", "Value", "Value", "Value", "…
## $ X2012 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ X2013 <dbl> 0.2007986, 5.3905451, 1.5787047, 19.2660172, -15.212636…
## $ X2014 <dbl> 14.90558288, 5.33087164, -5.27975893, 14.08729043, -4.9…
## $ X2015 <dbl> 13.22679081, 7.04458317, 0.80638186, 20.77972716, -0.08…
## $ X2016 <dbl> 9.00266555, 0.82555128, -3.18169957, 11.36508352, 8.053…
## $ X2017 <dbl> 2.78733404, 1.33036113, 3.47418836, 8.20726502, 5.40929…
## $ X2018 <dbl> 5.24187536, 3.92104291, 3.53673055, 12.03958872, -1.913…
## $ X2019 <dbl> 5.73007461, 4.32835289, 3.64435402, 13.39977739, -0.225…
## $ X2020 <dbl> 5.50213376, 4.44790596, 3.97933357, 13.05089222, -4.796…
## $ X2021 <dbl> 6.0747274, 4.8495605, 3.8404765, 13.0151510, 0.8616156,…
## $ Base.Year <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
summary(df)
## Country.Name Country.Code Indicator.Name Indicator.Code
## Length:368 Min. : 158.0 Length:368 Length:368
## Class :character 1st Qu.: 522.0 Class :character Class :character
## Mode :character Median : 547.0 Mode :character Mode :character
## Mean : 831.9
## 3rd Qu.: 819.0
## Max. :9502.0
## Attribute X2012 X2013 X2014
## Length:368 Mode:logical Min. :-5.076e+10 Min. :-4.261e+10
## Class :character NA's:368 1st Qu.: 0.000e+00 1st Qu.: 1.000e+00
## Mode :character Median : 4.000e+00 Median : 5.000e+00
## Mean : 1.389e+11 Mean : 1.456e+11
## 3rd Qu.: 2.200e+01 3rd Qu.: 2.800e+01
## Max. : 2.305e+13 Max. : 2.398e+13
## X2015 X2016 X2017
## Min. :-5.798e+10 Min. :-4.385e+10 Min. :-5.959e+10
## 1st Qu.: 0.000e+00 1st Qu.:-1.000e+00 1st Qu.: 0.000e+00
## Median : 3.000e+00 Median : 3.000e+00 Median : 3.000e+00
## Mean : 1.459e+11 Mean : 1.531e+11 Mean : 1.645e+11
## 3rd Qu.: 2.700e+01 3rd Qu.: 1.500e+01 3rd Qu.: 1.200e+01
## Max. : 2.382e+13 Max. : 2.503e+13 Max. : 2.705e+13
## X2018 X2019 X2020
## Min. :-7.386e+10 Min. :-8.136e+10 Min. :-8.996e+10
## 1st Qu.: 0.000e+00 1st Qu.: 1.000e+00 1st Qu.: 0.000e+00
## Median : 3.000e+00 Median : 4.000e+00 Median : 4.000e+00
## Mean : 1.765e+11 Mean : 1.902e+11 Mean : 2.052e+11
## 3rd Qu.: 1.300e+01 3rd Qu.: 1.400e+01 3rd Qu.: 1.400e+01
## Max. : 2.906e+13 Max. : 3.134e+13 Max. : 3.383e+13
## X2021 Base.Year
## Min. :-9.606e+10 Mode:logical
## 1st Qu.: 1.000e+00 NA's:368
## Median : 4.000e+00
## Mean : 2.210e+11
## 3rd Qu.: 1.400e+01
## Max. : 3.640e+13
colSums(is.na(df))
## Country.Name Country.Code Indicator.Name Indicator.Code Attribute
## 0 0 0 0 0
## X2012 X2013 X2014 X2015 X2016
## 368 0 0 0 0
## X2017 X2018 X2019 X2020 X2021
## 0 0 0 0 0
## Base.Year
## 368
df_clean <- na.omit(df)
# Histograms of Yearly Data
hist(df$X2014, main = "Histogram of 2014", xlab = "Percent Change in Import", col = "lightblue")

hist(df$X2015, main = "Histogram of 2015", xlab = "Percent Change in Import", col = "Green")

hist(df$X2016, main = "Histogram of 2016", xlab = "Percent Change in Import", col = "blue")

hist(df$X2017, main = "Histogram of 2017", xlab = "Percent Change in Import", col = "red")

hist(df$X2018, main = "Histogram of 2018", xlab = "Percent Change in Import", col = "yellow")

hist(df$X2019, main = "Histogram of 2019", xlab = "Percent Change in Import", col = "brown")

hist(df$X2020, main = "Histogram of 2020", xlab = "Percent Change in Import", col = "purple")

# Boxplot for 2014 comparing countries
boxplot(X2014 ~ Country.Name, data = df, main = "Imports Percent Change in 2014 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2015 ~ Country.Name, data = df, main = "Imports Percent Change in 2015 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2016 ~ Country.Name, data = df, main = "Imports Percent Change in 2016 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2017 ~ Country.Name, data = df, main = "Imports Percent Change in 2017 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2018 ~ Country.Name, data = df, main = "Imports Percent Change in 2018 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2019 ~ Country.Name, data = df, main = "Imports Percent Change in 2019 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2020 ~ Country.Name, data = df, main = "Imports Percent Change in 2020 by Country", xlab = "Country", ylab = "Percent Change", las=2)

boxplot(X2021 ~ Country.Name, data = df, main = "Imports Percent Change in 2021 by Country", xlab = "Country", ylab = "Percent Change", las=2)

# Correlation matrix of yearly data
cor(df[, c("X2013", "X2014", "X2015", "X2016", "X2017", "X2018", "X2019", "X2020", "X2021")], use="complete.obs")
## X2013 X2014 X2015 X2016 X2017 X2018 X2019
## X2013 1.0000000 0.9995585 0.9981268 0.9989754 0.9989554 0.9983884 0.9977487
## X2014 0.9995585 1.0000000 0.9994809 0.9998176 0.9997890 0.9995335 0.9991746
## X2015 0.9981268 0.9994809 1.0000000 0.9998065 0.9997754 0.9998578 0.9997996
## X2016 0.9989754 0.9998176 0.9998065 1.0000000 0.9999746 0.9998624 0.9996359
## X2017 0.9989554 0.9997890 0.9997754 0.9999746 1.0000000 0.9999238 0.9997284
## X2018 0.9983884 0.9995335 0.9998578 0.9998624 0.9999238 1.0000000 0.9999396
## X2019 0.9977487 0.9991746 0.9997996 0.9996359 0.9997284 0.9999396 1.0000000
## X2020 0.9968542 0.9986124 0.9995968 0.9992294 0.9993528 0.9997200 0.9999194
## X2021 0.9959052 0.9979592 0.9992603 0.9987176 0.9988695 0.9993783 0.9997044
## X2020 X2021
## X2013 0.9968542 0.9959052
## X2014 0.9986124 0.9979592
## X2015 0.9995968 0.9992603
## X2016 0.9992294 0.9987176
## X2017 0.9993528 0.9988695
## X2018 0.9997200 0.9993783
## X2019 0.9999194 0.9997044
## X2020 1.0000000 0.9999317
## X2021 0.9999317 1.0000000
df_cluster <- df[-c(1:6)]
df_cluster <- df_cluster[-10]
df_cluster <- na.omit(df_cluster) # This removes any rows with missing data
df_scaled <- scale(df_cluster)
fviz_nbclust(df_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method for Optimal K", x = "Number of Clusters (k)", y = "Total Within-Cluster Sum of Squares")

# Apply K-Means
set.seed(123)
kmeans_result <- kmeans(df_scaled, centers = 3, nstart = 25)
df$Cluster <- kmeans_result$cluster
head(df[, c("Country.Name", "Cluster")])
# Visualize the clusters using PCA
fviz_cluster(kmeans_result, data = df_scaled, geom = "point", ellipse.type = "convex",
ggtheme = theme_minimal(), main = "K-Means Clustering (k = 3)") +
labs(x = "Principal Component 1", y = "Principal Component 2")

# Compute the mean of each year within each cluster
aggregate(df[, c("X2012", "X2013", "X2014", "X2015", "X2016", "X2017", "X2018", "X2019", "X2020", "X2021")],
by = list(Cluster = df$Cluster), FUN = mean)