library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
country = read_csv("Country-data.csv",show_col_types = FALSE)
glimpse(country)
## Rows: 167
## Columns: 10
## $ country    <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua and…
## $ child_mort <dbl> 90.2, 16.6, 27.3, 119.0, 10.3, 14.5, 18.1, 4.8, 4.3, 39.2, …
## $ exports    <dbl> 10.0, 28.0, 38.4, 62.3, 45.5, 18.9, 20.8, 19.8, 51.3, 54.3,…
## $ health     <dbl> 7.58, 6.55, 4.17, 2.85, 6.03, 8.10, 4.40, 8.73, 11.00, 5.88…
## $ imports    <dbl> 44.9, 48.6, 31.4, 42.9, 58.9, 16.0, 45.3, 20.9, 47.8, 20.7,…
## $ income     <dbl> 1610, 9930, 12900, 5900, 19100, 18700, 6700, 41400, 43200, …
## $ inflation  <dbl> 9.440, 4.490, 16.100, 22.400, 1.440, 20.900, 7.770, 1.160, …
## $ life_expec <dbl> 56.2, 76.3, 76.5, 60.1, 76.8, 75.8, 73.3, 82.0, 80.5, 69.1,…
## $ total_fer  <dbl> 5.82, 1.65, 2.89, 6.16, 2.13, 2.37, 1.69, 1.93, 1.44, 1.92,…
## $ gdpp       <dbl> 553, 4090, 4460, 3530, 12200, 10300, 3220, 51900, 46900, 58…
country_pc = country %>% 
  select(-country) %>% 
  prcomp(scale=TRUE)

country_pc$x %>% 
  as_tibble() %>%
  select(PC1, PC2) %>% 
  ggplot(aes(x= PC1, y=PC2)) +
  geom_point()

pr.var <- country_pc$sdev^2
pve <- pr.var[1:9]/sum(pr.var)
pve
## [1] 0.459517398 0.171816257 0.130042589 0.110531618 0.073402114 0.024842347
## [7] 0.012604304 0.009812817 0.007430556
str(country)
## spc_tbl_ [167 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ country   : chr [1:167] "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ child_mort: num [1:167] 90.2 16.6 27.3 119 10.3 14.5 18.1 4.8 4.3 39.2 ...
##  $ exports   : num [1:167] 10 28 38.4 62.3 45.5 18.9 20.8 19.8 51.3 54.3 ...
##  $ health    : num [1:167] 7.58 6.55 4.17 2.85 6.03 8.1 4.4 8.73 11 5.88 ...
##  $ imports   : num [1:167] 44.9 48.6 31.4 42.9 58.9 16 45.3 20.9 47.8 20.7 ...
##  $ income    : num [1:167] 1610 9930 12900 5900 19100 18700 6700 41400 43200 16000 ...
##  $ inflation : num [1:167] 9.44 4.49 16.1 22.4 1.44 20.9 7.77 1.16 0.873 13.8 ...
##  $ life_expec: num [1:167] 56.2 76.3 76.5 60.1 76.8 75.8 73.3 82 80.5 69.1 ...
##  $ total_fer : num [1:167] 5.82 1.65 2.89 6.16 2.13 2.37 1.69 1.93 1.44 1.92 ...
##  $ gdpp      : num [1:167] 553 4090 4460 3530 12200 10300 3220 51900 46900 5840 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   country = col_character(),
##   ..   child_mort = col_double(),
##   ..   exports = col_double(),
##   ..   health = col_double(),
##   ..   imports = col_double(),
##   ..   income = col_double(),
##   ..   inflation = col_double(),
##   ..   life_expec = col_double(),
##   ..   total_fer = col_double(),
##   ..   gdpp = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(country)
##    country            child_mort        exports            health      
##  Length:167         Min.   :  2.60   Min.   :  0.109   Min.   : 1.810  
##  Class :character   1st Qu.:  8.25   1st Qu.: 23.800   1st Qu.: 4.920  
##  Mode  :character   Median : 19.30   Median : 35.000   Median : 6.320  
##                     Mean   : 38.27   Mean   : 41.109   Mean   : 6.816  
##                     3rd Qu.: 62.10   3rd Qu.: 51.350   3rd Qu.: 8.600  
##                     Max.   :208.00   Max.   :200.000   Max.   :17.900  
##     imports             income         inflation         life_expec   
##  Min.   :  0.0659   Min.   :   609   Min.   : -4.210   Min.   :32.10  
##  1st Qu.: 30.2000   1st Qu.:  3355   1st Qu.:  1.810   1st Qu.:65.30  
##  Median : 43.3000   Median :  9960   Median :  5.390   Median :73.10  
##  Mean   : 46.8902   Mean   : 17145   Mean   :  7.782   Mean   :70.56  
##  3rd Qu.: 58.7500   3rd Qu.: 22800   3rd Qu.: 10.750   3rd Qu.:76.80  
##  Max.   :174.0000   Max.   :125000   Max.   :104.000   Max.   :82.80  
##    total_fer          gdpp       
##  Min.   :1.150   Min.   :   231  
##  1st Qu.:1.795   1st Qu.:  1330  
##  Median :2.410   Median :  4660  
##  Mean   :2.948   Mean   : 12964  
##  3rd Qu.:3.880   3rd Qu.: 14050  
##  Max.   :7.490   Max.   :105000
PC=1:9
data=data.frame(PC, pve)
data
##   PC         pve
## 1  1 0.459517398
## 2  2 0.171816257
## 3  3 0.130042589
## 4  4 0.110531618
## 5  5 0.073402114
## 6  6 0.024842347
## 7  7 0.012604304
## 8  8 0.009812817
## 9  9 0.007430556
qplot(c(1:9), pve) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.

## The elbow is around 2 or 6.
ggplot(data=data, aes(x=PC, y=cumsum(pve)))+
  geom_hline(aes(yintercept=0.9),lty=2,color="purple",linewidth=1, alpha=0.5)+
  geom_line(color="navy")+
  geom_point(color="red",cex=2)+
  labs(title="Cumulative Proportion of Variance Explained", 
       x="Principal Component",
       y="cumulative pve")+
  scale_x_continuous(breaks = 1:10)

country_s = scale(country_pc$x[,-1])
head(country_s, 4)
##              PC2        PC3        PC4        PC5        PC6        PC7
## [1,] -0.07666441  0.6618017 -1.0048642  0.1941908  0.5368244 -1.1337419
## [2,]  0.47155756  0.3073327  1.1606076 -0.2142677 -0.1783359 -0.7368396
## [3,]  0.36493899 -1.1257111  0.8677773 -0.1919395  0.8469840  0.2581661
## [4,] -1.35942210 -1.4054456 -0.8392989  0.3351314  1.1554616  1.3049409
##             PC8         PC9
## [1,] -1.3925325 -0.05454721
## [2,]  0.7415722  0.66819312
## [3,]  0.6178427  0.32399279
## [4,]  1.1943309 -0.35214412
fviz_nbclust(country_s, kmeans, method="gap_stat")

## The optimal number is 1.
km_mod = kmeans(country_s, centers=3)

fviz_cluster(km_mod, data = country_s)

pam_mod = pam(country_s, 3)
fviz_cluster(pam_mod, data = country_s)

## Almost all of blue and green got zoned differently.
country_pam = country %>% mutate(cluster=factor(pam_mod$cluster))

boxplot(gdpp~cluster,data=country_pam, horizontal=T)

boxplot(life_expec~cluster,data=country_pam, horizontal=T)

median(country_pam$life_expec)
## [1] 73.1
row.names(country_s) <- country$country
country_dend = country_s %>% dist(method="euclidean") %>% 
  hclust(method="ward.D2")

library(ggdendro)
ggdendrogram(country_dend, rotate = TRUE)+
  labs(x="", y="")

library(ape)
colors = c("#1f77b4", "#ff7f0e", "#2ca02c")
clus3 = cutree(country_dend, 3)
plot(as.phylo(country_dend), type="fan", cex=0.5, no.margin=TRUE, tip.color = colors[clus3])

library(ape)
country_dend = country_s %>% dist(method="euclidean") %>% 
  hclust(method="ward.D2")

plot(as.phylo(country_dend), type = "cladogram", cex = 0.5, 
     label.offset = 0.5)