library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
country = read_csv("Country-data.csv",show_col_types = FALSE)
glimpse(country)
## Rows: 167
## Columns: 10
## $ country <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua and…
## $ child_mort <dbl> 90.2, 16.6, 27.3, 119.0, 10.3, 14.5, 18.1, 4.8, 4.3, 39.2, …
## $ exports <dbl> 10.0, 28.0, 38.4, 62.3, 45.5, 18.9, 20.8, 19.8, 51.3, 54.3,…
## $ health <dbl> 7.58, 6.55, 4.17, 2.85, 6.03, 8.10, 4.40, 8.73, 11.00, 5.88…
## $ imports <dbl> 44.9, 48.6, 31.4, 42.9, 58.9, 16.0, 45.3, 20.9, 47.8, 20.7,…
## $ income <dbl> 1610, 9930, 12900, 5900, 19100, 18700, 6700, 41400, 43200, …
## $ inflation <dbl> 9.440, 4.490, 16.100, 22.400, 1.440, 20.900, 7.770, 1.160, …
## $ life_expec <dbl> 56.2, 76.3, 76.5, 60.1, 76.8, 75.8, 73.3, 82.0, 80.5, 69.1,…
## $ total_fer <dbl> 5.82, 1.65, 2.89, 6.16, 2.13, 2.37, 1.69, 1.93, 1.44, 1.92,…
## $ gdpp <dbl> 553, 4090, 4460, 3530, 12200, 10300, 3220, 51900, 46900, 58…
country_pc = country %>%
select(-country) %>%
prcomp(scale=TRUE)
country_pc$x %>%
as_tibble() %>%
select(PC1, PC2) %>%
ggplot(aes(x= PC1, y=PC2)) +
geom_point()

pr.var <- country_pc$sdev^2
pve <- pr.var[1:9]/sum(pr.var)
pve
## [1] 0.459517398 0.171816257 0.130042589 0.110531618 0.073402114 0.024842347
## [7] 0.012604304 0.009812817 0.007430556
str(country)
## spc_tbl_ [167 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ country : chr [1:167] "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ child_mort: num [1:167] 90.2 16.6 27.3 119 10.3 14.5 18.1 4.8 4.3 39.2 ...
## $ exports : num [1:167] 10 28 38.4 62.3 45.5 18.9 20.8 19.8 51.3 54.3 ...
## $ health : num [1:167] 7.58 6.55 4.17 2.85 6.03 8.1 4.4 8.73 11 5.88 ...
## $ imports : num [1:167] 44.9 48.6 31.4 42.9 58.9 16 45.3 20.9 47.8 20.7 ...
## $ income : num [1:167] 1610 9930 12900 5900 19100 18700 6700 41400 43200 16000 ...
## $ inflation : num [1:167] 9.44 4.49 16.1 22.4 1.44 20.9 7.77 1.16 0.873 13.8 ...
## $ life_expec: num [1:167] 56.2 76.3 76.5 60.1 76.8 75.8 73.3 82 80.5 69.1 ...
## $ total_fer : num [1:167] 5.82 1.65 2.89 6.16 2.13 2.37 1.69 1.93 1.44 1.92 ...
## $ gdpp : num [1:167] 553 4090 4460 3530 12200 10300 3220 51900 46900 5840 ...
## - attr(*, "spec")=
## .. cols(
## .. country = col_character(),
## .. child_mort = col_double(),
## .. exports = col_double(),
## .. health = col_double(),
## .. imports = col_double(),
## .. income = col_double(),
## .. inflation = col_double(),
## .. life_expec = col_double(),
## .. total_fer = col_double(),
## .. gdpp = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(country)
## country child_mort exports health
## Length:167 Min. : 2.60 Min. : 0.109 Min. : 1.810
## Class :character 1st Qu.: 8.25 1st Qu.: 23.800 1st Qu.: 4.920
## Mode :character Median : 19.30 Median : 35.000 Median : 6.320
## Mean : 38.27 Mean : 41.109 Mean : 6.816
## 3rd Qu.: 62.10 3rd Qu.: 51.350 3rd Qu.: 8.600
## Max. :208.00 Max. :200.000 Max. :17.900
## imports income inflation life_expec
## Min. : 0.0659 Min. : 609 Min. : -4.210 Min. :32.10
## 1st Qu.: 30.2000 1st Qu.: 3355 1st Qu.: 1.810 1st Qu.:65.30
## Median : 43.3000 Median : 9960 Median : 5.390 Median :73.10
## Mean : 46.8902 Mean : 17145 Mean : 7.782 Mean :70.56
## 3rd Qu.: 58.7500 3rd Qu.: 22800 3rd Qu.: 10.750 3rd Qu.:76.80
## Max. :174.0000 Max. :125000 Max. :104.000 Max. :82.80
## total_fer gdpp
## Min. :1.150 Min. : 231
## 1st Qu.:1.795 1st Qu.: 1330
## Median :2.410 Median : 4660
## Mean :2.948 Mean : 12964
## 3rd Qu.:3.880 3rd Qu.: 14050
## Max. :7.490 Max. :105000
PC=1:9
data=data.frame(PC, pve)
data
## PC pve
## 1 1 0.459517398
## 2 2 0.171816257
## 3 3 0.130042589
## 4 4 0.110531618
## 5 5 0.073402114
## 6 6 0.024842347
## 7 7 0.012604304
## 8 8 0.009812817
## 9 9 0.007430556
qplot(c(1:9), pve) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot") +
ylim(0, 1)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.

## The elbow is around 2 or 6.
ggplot(data=data, aes(x=PC, y=cumsum(pve)))+
geom_hline(aes(yintercept=0.9),lty=2,color="purple",linewidth=1, alpha=0.5)+
geom_line(color="navy")+
geom_point(color="red",cex=2)+
labs(title="Cumulative Proportion of Variance Explained",
x="Principal Component",
y="cumulative pve")+
scale_x_continuous(breaks = 1:10)

country_s = scale(country_pc$x[,-1])
head(country_s, 4)
## PC2 PC3 PC4 PC5 PC6 PC7
## [1,] -0.07666441 0.6618017 -1.0048642 0.1941908 0.5368244 -1.1337419
## [2,] 0.47155756 0.3073327 1.1606076 -0.2142677 -0.1783359 -0.7368396
## [3,] 0.36493899 -1.1257111 0.8677773 -0.1919395 0.8469840 0.2581661
## [4,] -1.35942210 -1.4054456 -0.8392989 0.3351314 1.1554616 1.3049409
## PC8 PC9
## [1,] -1.3925325 -0.05454721
## [2,] 0.7415722 0.66819312
## [3,] 0.6178427 0.32399279
## [4,] 1.1943309 -0.35214412
fviz_nbclust(country_s, kmeans, method="gap_stat")

## The optimal number is 1.
km_mod = kmeans(country_s, centers=3)
fviz_cluster(km_mod, data = country_s)

pam_mod = pam(country_s, 3)
fviz_cluster(pam_mod, data = country_s)

## Almost all of blue and green got zoned differently.
country_pam = country %>% mutate(cluster=factor(pam_mod$cluster))
boxplot(gdpp~cluster,data=country_pam, horizontal=T)

boxplot(life_expec~cluster,data=country_pam, horizontal=T)

median(country_pam$life_expec)
## [1] 73.1
row.names(country_s) <- country$country
country_dend = country_s %>% dist(method="euclidean") %>%
hclust(method="ward.D2")
library(ggdendro)
ggdendrogram(country_dend, rotate = TRUE)+
labs(x="", y="")

library(ape)
colors = c("#1f77b4", "#ff7f0e", "#2ca02c")
clus3 = cutree(country_dend, 3)
plot(as.phylo(country_dend), type="fan", cex=0.5, no.margin=TRUE, tip.color = colors[clus3])

library(ape)
country_dend = country_s %>% dist(method="euclidean") %>%
hclust(method="ward.D2")
plot(as.phylo(country_dend), type = "cladogram", cex = 0.5,
label.offset = 0.5)
