PCA
# Standardize the data (zero mean, unit variance).
data_scaled <- scale(data[,2:10], center = TRUE, scale = TRUE)
# Scaling removes the bias a model would otherwise have towards features with larger magnitudes.
row.names(data_scaled) <- data$country
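# Quick sanity check: after standardizing, every column should have mean ~0 and
# standard deviation 1.
round(colMeans(data_scaled), 10)
apply(data_scaled, 2, sd)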
data.pca <- prcomp(data_scaled)
print(data.pca)
## Standard deviations (1, .., p=9):
## [1] 2.0336314 1.2435217 1.0818425 0.9973889 0.8127847 0.4728437 0.3368067
## [8] 0.2971790 0.2586020
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5
## child_mort -0.4195194 -0.192883937 0.02954353 -0.370653262 0.16896968
## exports 0.2838970 -0.613163494 -0.14476069 -0.003091019 -0.05761584
## health 0.1508378 0.243086779 0.59663237 -0.461897497 -0.51800037
## imports 0.1614824 -0.671820644 0.29992674 0.071907461 -0.25537642
## income 0.3984411 -0.022535530 -0.30154750 -0.392159039 0.24714960
## inflation -0.1931729 0.008404473 -0.64251951 -0.150441762 -0.71486910
## life_expec 0.4258394 0.222706743 -0.11391854 0.203797235 -0.10821980
## total_fer -0.4037290 -0.155233106 -0.01954925 -0.378303645 0.13526221
## gdpp 0.3926448 0.046022396 -0.12297749 -0.531994575 0.18016662
## PC6 PC7 PC8 PC9
## child_mort -0.200628153 0.07948854 0.68274306 0.32754180
## exports 0.059332832 0.70730269 0.01419742 -0.12308207
## health -0.007276456 0.24983051 -0.07249683 0.11308797
## imports 0.030031537 -0.59218953 0.02894642 0.09903717
## income -0.160346990 -0.09556237 -0.35262369 0.61298247
## inflation -0.066285372 -0.10463252 0.01153775 -0.02523614
## life_expec 0.601126516 -0.01848639 0.50466425 0.29403981
## total_fer 0.750688748 -0.02882643 -0.29335267 -0.02633585
## gdpp -0.016778761 -0.24299776 0.24969636 -0.62564572
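# The eigenvalue table below matches factoextra's get_eigenvalue(); presumably:
get_eigenvalue(data.pca)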
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.13565658 45.9517398 45.95174
## Dim.2 1.54634631 17.1816257 63.13337
## Dim.3 1.17038330 13.0042589 76.13762
## Dim.4 0.99478456 11.0531618 87.19079
## Dim.5 0.66061903 7.3402114 94.53100
## Dim.6 0.22358112 2.4842347 97.01523
## Dim.7 0.11343874 1.2604304 98.27566
## Dim.8 0.08831536 0.9812817 99.25694
## Dim.9 0.06687501 0.7430556 100.00000
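# Small sketch: because the inputs are standardized, the PCA eigenvalues equal
# the eigenvalues of the correlation matrix.
eigen(cor(data[,2:10]))$values # should match data.pca$sdev^2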
# Percentage of variance/inertia.
fviz_screeplot(data.pca, addlabels = TRUE, ylim = c(0, 100))

fviz_pca_var(data.pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE
)

# Contributions of variables to PC1
pc1 <- fviz_contrib(data.pca, choice = "var", axes = 1, color = "darkblue", top = 4, ggtheme = theme_minimal())
# Contributions of variables to PC2
pc2 <- fviz_contrib(data.pca, choice = "var", axes = 2, color = "darkblue", top = 4, ggtheme = theme_minimal())
grid.arrange(pc1, pc2, ncol=2)

fvz_biplt1 <- fviz_pca_biplot(data.pca, repel = TRUE,
col.var = "magenta", # Variables color
col.ind = "#696969", # Individuals color
addEllipses = TRUE
)
fvz_biplt1

fvz_ind1 <- fviz_pca_ind(data.pca,
label = "none", # hide individual labels
habillage = as.factor(data$state5), # color by groups
palette = "RdBl",
#palette = c("cyan", "pink", "green"),
addEllipses = TRUE # Concentration ellipses
)
fvz_ind1

fvz_ellips1 <- fviz_ellipses(data.pca, habillage = as.factor(data$state5), ellipse.type = "confidence", geom = "point", palette = "lancet")
fvz_ellips1

data.pca$variance <- data.pca$sdev ^2
sum(data.pca$variance[1:2])/sum(data.pca$variance)
## [1] 0.6313337
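# The component summary below is standard prcomp output, presumably printed with:
summary(data.pca)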
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0336 1.2435 1.0818 0.9974 0.8128 0.47284 0.3368
## Proportion of Variance 0.4595 0.1718 0.1300 0.1105 0.0734 0.02484 0.0126
## Cumulative Proportion 0.4595 0.6313 0.7614 0.8719 0.9453 0.97015 0.9828
## PC8 PC9
## Standard deviation 0.29718 0.25860
## Proportion of Variance 0.00981 0.00743
## Cumulative Proportion 0.99257 1.00000
gg2 <- ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=state2)) +
geom_point(alpha=0.3, size=3) +
geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )
print(gg2)

gg3 <- ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=state3)) +
geom_point(alpha=0.3, size=3) +
geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )
print(gg3)

gg5 <- ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=state5)) +
geom_point(alpha=0.3, size=3) +
geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )
print(gg5)

Data.pca <- PCA(data[,2:10])
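# Note: FactoMineR's PCA() standardizes the variables by default (scale.unit = TRUE)
# and draws the individuals/variables factor maps as a side effect.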


fviz_screeplot(Data.pca, addlabels = TRUE, ylim = c(0, 100))

# Estimate the number of significant components in Principal Component Analysis.
# ncp: the best number of dimensions to keep (the minimum, or the first local
# minimum, of the error criterion), together with the mean error for each
# dimension tested.
estim_ncp(data[,2:10], ncp.min=0, ncp.max=NULL, scale=TRUE, method="GCV") # FactoMineR
## $ncp
## [1] 6
##
## $criterion
## [1] 1.0000000 0.6965358 0.6281441 0.5602032 0.4383914 0.2961051 0.2908959
## [8] 0.3828931 0.6683697
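# The dimension descriptions below match FactoMineR's dimdesc(); presumably:
dimdesc(Data.pca)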
## $Dim.1
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## life_expec 0.8660003 1.550359e-51
## income 0.8102823 3.895056e-40
## gdpp 0.7984948 3.329819e-38
## exports 0.5773418 3.155347e-16
## imports 0.3283958 1.472843e-05
## health 0.3067485 5.532100e-05
## inflation -0.3928425 1.511302e-07
## total_fer -0.8210359 5.086278e-42
## child_mort -0.8531479 1.701142e-48
##
## $Dim.2
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## imports 0.8354236 9.508003e-45
## exports 0.7624821 5.113174e-33
## child_mort 0.2398554 1.796042e-03
## total_fer 0.1930357 1.244078e-02
## life_expec -0.2769407 2.910838e-04
## health -0.3022837 7.177825e-05
##
## $Dim.3
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## inflation 0.6951049 1.990102e-25
## income 0.3262269 1.689449e-05
## exports 0.1566083 4.326810e-02
## imports -0.3244735 1.886204e-05
## health -0.6454623 4.677421e-21
# Optimal representation of the variables
# Visualization of correlation between pairs of variables
# by the cosine of their angle.
# Control variable colors using their contributions
fviz_pca_var(Data.pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

# Contributions of variables to PC1
PC_1 <- fviz_contrib(Data.pca, choice = "var", axes = 1, color = "darkblue", top = 4, ggtheme = theme_minimal())
# Contributions of variables to PC2
PC_2 <- fviz_contrib(Data.pca, choice = "var", axes = 2, color = "darkblue", top = 4, ggtheme = theme_minimal())
library(gridExtra)
grid.arrange(PC_1, PC_2, ncol=2)

grid.arrange(pc1, pc2, PC_1, PC_2, nrow=2, ncol=2)

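# The eigenvalue table below is FactoMineR's, presumably printed with:
Data.pca$eig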
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 4.13565658 45.9517398 45.95174
## comp 2 1.54634631 17.1816257 63.13337
## comp 3 1.17038330 13.0042589 76.13762
## comp 4 0.99478456 11.0531618 87.19079
## comp 5 0.66061903 7.3402114 94.53100
## comp 6 0.22358112 2.4842347 97.01523
## comp 7 0.11343874 1.2604304 98.27566
## comp 8 0.08831536 0.9812817 99.25694
## comp 9 0.06687501 0.7430556 100.00000
# Cumulative percentage of variance explained up to component 2:
Data.pca$eig[2,3]
## [1] 63.13337
fvz_biplt2 <- fviz_pca_biplot(Data.pca, repel = TRUE,
col.var = "magenta", # Variables color
col.ind = "#696969", # Individuals color
addEllipses = TRUE,
title = "FactoMineR"
)
# Visualize
# Use habillage to specify groups for coloring
fvz_ind2 <- fviz_pca_ind(data.pca,
label = "none", # hide individual labels
habillage = as.factor(data$Continent), # color by groups
palette = "RdBl",
#palette = c("cyan", "pink", "green"),
addEllipses = TRUE # Concentration ellipses
)
fvz_ellips2 <- fviz_ellipses(data.pca, habillage= as.factor(data$Continent), ellipse.type= "confidence", geom = "point", palette = "lancet")
grid.arrange(fvz_biplt1, fvz_biplt2, ncol=2)

grid.arrange(fvz_ind1, fvz_ind2, ncol=2)

grid.arrange(fvz_ellips1, fvz_ellips2, ncol=2)

fvz_indTwo <- fviz_pca_ind(data.pca,
label = "none", # hide individual labels
habillage = as.factor(data$state2), # color by groups
palette = "RdBl",
addEllipses = TRUE # Concentration ellipses
)
fvz_indThree <- fviz_pca_ind(data.pca,
label = "none", # hide individual labels
habillage = as.factor(data$state3), # color by groups
palette = "RdBl",
addEllipses = TRUE # Concentration ellipses
)


# Hopkins statistic: a value close to 1 (well above 0.5) indicates that the
# dataset is significantly clusterable.
# VAT (Visual Assessment of cluster Tendency): detects clustering tendency
# visually as square, dark (or colored) blocks along the diagonal of the
# ordered dissimilarity image (ODI).
get_clust_tendency(data_scaled, n = nrow(data_scaled) - 1, graph = TRUE)
## $hopkins_stat
## [1] 0.844379
##
## $plot

# get_dist(): Computes a distance matrix between the rows of a data matrix.
# Compared to the standard dist() function, it supports correlation-based distance measures including "pearson", "kendall" and "spearman" methods.
distance <- get_dist(data_scaled)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
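# A small sketch using the correlation-based option mentioned above: observations
# with similar profiles (rather than similar magnitudes) appear close together.
distance_pearson <- get_dist(data_scaled, method = "pearson")
fviz_dist(distance_pearson, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))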

# If order = TRUE, the ordered dissimilarity image (ODI) is shown; order = FALSE keeps the raw row order.
fviz_dist(distance, show_labels=TRUE, order = FALSE, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# sigma=0.01
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
kpar=list(sigma=0.01),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2.5,
xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 0,01")

ggplot(data, aes(rotated(kpc)[,1], rotated(kpc)[,2], colour=state5)) +
geom_point(alpha = 0.3, size=3) +
geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

# kernel function
kernelf(kpc)
## Anova RBF kernel function.
## Hyperparameter : sigma = 0.01 degree = 1
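# Per the kernlab documentation, the ANOVA RBF kernel is
# k(x, y) = (sum_i exp(-sigma * (x_i - y_i)^2))^degree;
# with degree = 1 it is a sum of per-coordinate Gaussian similarities.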
#print the principal component vectors
pcv(kpc)
## [,1] [,2]
## [1,] 0.412274463 0.015715322
## [2,] -0.063983558 -0.228656647
## [3,] 0.032931783 -0.178673862
## [4,] 0.419019002 0.630415518
## [5,] -0.142435210 0.058622863
## [6,] -0.021662646 -0.678817319
## [7,] 0.008546485 -0.216781010
## [8,] -0.339511902 -0.731424362
## [9,] -0.420178167 -0.265169735
## [10,] 0.019379966 -0.154427205
## [11,] -0.179017598 -0.255091335
## [12,] -0.234074759 0.242520866
## [13,] 0.149371613 -0.366372238
## [14,] -0.151853540 -0.190696711
## [15,] -0.082792178 0.214474213
## [16,] -0.439209268 0.292273215
## [17,] -0.023569824 0.270865332
## [18,] 0.385788133 0.140824453
## [19,] 0.025334136 0.300556926
## [20,] 0.110500737 -0.057588336
## [21,] -0.143566991 -0.386442502
## [22,] 0.128258397 0.153790070
## [23,] -0.036760947 -0.813986144
## [24,] -0.345862995 0.071172831
## [25,] -0.125075318 0.011980898
## [26,] 0.442969502 -0.011367387
## [27,] 0.405681201 -0.196472167
## [28,] 0.088870985 0.338068479
## [29,] 0.402293797 0.011181295
## [30,] -0.366469354 -0.650295532
## [31,] 0.025816993 0.135617618
## [32,] 0.556375882 0.121715202
## [33,] 0.506960556 0.457872114
## [34,] -0.141395256 -0.416678864
## [35,] -0.016994380 -0.452683013
## [36,] -0.028685470 -0.673342106
## [37,] 0.301213033 0.122623440
## [38,] 0.449426876 0.364138213
## [39,] 0.251693109 0.838990491
## [40,] -0.139697648 -0.533295845
## [41,] 0.375726448 0.439501609
## [42,] -0.163491706 -0.327382387
## [43,] -0.304470121 0.021843623
## [44,] -0.283535188 0.181371817
## [45,] -0.426044933 -0.313051416
## [46,] 0.027414255 -0.344686610
## [47,] -0.008517421 -0.414045272
## [48,] 0.112642118 -0.317491092
## [49,] -0.013951469 -0.226113668
## [50,] 0.192368934 0.919621684
## [51,] 0.342017800 -0.231954614
## [52,] -0.227026878 0.419540097
## [53,] 0.033732319 0.414942472
## [54,] -0.349604225 -0.398140733
## [55,] -0.325581398 -0.709549445
## [56,] 0.200730036 0.122150291
## [57,] 0.318108807 0.065977718
## [58,] -0.050389167 -0.217065569
## [59,] -0.380525802 -0.481041804
## [60,] 0.291164943 0.127526593
## [61,] -0.258186989 -0.674553789
## [62,] -0.020114742 -0.167740094
## [63,] 0.091168918 -0.249699144
## [64,] 0.422764967 0.254467818
## [65,] 0.403276112 -0.064184803
## [66,] 0.052828087 0.529001924
## [67,] 0.575463036 0.577098270
## [68,] -0.251726751 0.520154481
## [69,] -0.352580072 -0.228467059
## [70,] 0.184593711 -0.212323789
## [71,] 0.124011653 -0.282079183
## [72,] -0.013510085 -0.505908988
## [73,] 0.142575099 -0.132170007
## [74,] -0.504782337 0.729751736
## [75,] -0.211688643 -0.396178089
## [76,] -0.312677529 -0.666641538
## [77,] -0.006592903 -0.094046995
## [78,] -0.330692072 -0.899712362
## [79,] -0.020275138 0.204167927
## [80,] 0.034809783 -0.093876331
## [81,] 0.268258511 -0.078054741
## [82,] 0.179895593 0.131338950
## [83,] -0.346328724 0.093713535
## [84,] 0.052059857 0.504012081
## [85,] 0.219430318 0.195228917
## [86,] -0.162536598 0.067684028
## [87,] -0.163453392 -0.092920527
## [88,] 0.261368868 0.756438606
## [89,] 0.258562109 0.390283664
## [90,] -0.117805560 0.163543051
## [91,] -0.193292022 0.296453130
## [92,] -0.882773313 1.934106818
## [93,] -0.102206884 -0.037363801
## [94,] 0.304863352 0.116872318
## [95,] 0.421584409 0.055170250
## [96,] -0.167129619 0.649128998
## [97,] -0.150501785 0.406639139
## [98,] 0.485292359 0.185398037
## [99,] -0.485233158 1.797408455
## [100,] 0.281422423 0.519831604
## [101,] -0.122249789 0.166797410
## [102,] 0.057762344 0.025789925
## [103,] -0.074685580 0.053803643
## [104,] 0.121357012 0.194048262
## [105,] -0.144343534 -0.102743531
## [106,] 0.033447146 -0.116005700
## [107,] 0.419903417 0.316655552
## [108,] 0.241130362 -0.574728036
## [109,] 0.157452890 0.371894849
## [110,] 0.173720800 -0.304659622
## [111,] -0.473105137 0.077919144
## [112,] -0.263991740 -0.604788453
## [113,] 0.491024628 0.340004993
## [114,] 0.573503199 -0.056879240
## [115,] -0.514488699 -0.490657100
## [116,] -0.161541318 0.213341440
## [117,] 0.327524632 -0.185318581
## [118,] -0.157588582 0.443980301
## [119,] -0.015065344 0.135385658
## [120,] -0.004218263 -0.415260250
## [121,] 0.110864215 -0.046084090
## [122,] -0.172549898 -0.253773241
## [123,] -0.260346120 -0.561306227
## [124,] -0.541063357 0.051981541
## [125,] -0.083005419 -0.244854969
## [126,] -0.033132283 -0.401711450
## [127,] 0.234785806 -0.407633181
## [128,] 0.083455855 -0.018175953
## [129,] -0.126603282 -0.054060520
## [130,] 0.276313197 0.017240462
## [131,] -0.122179186 -0.348120032
## [132,] -0.208076061 1.176879327
## [133,] 0.468322090 -0.143481766
## [134,] -0.736661691 2.397642793
## [135,] -0.279062854 0.428824590
## [136,] -0.316849007 0.090064204
## [137,] 0.121808000 0.494321414
## [138,] 0.165811753 -0.238735942
## [139,] -0.270503514 -0.151090577
## [140,] -0.291629933 -0.674352970
## [141,] 0.065406378 -0.379304128
## [142,] -0.003813910 -0.004372423
## [143,] 0.319203487 -0.304806783
## [144,] -0.024588846 -0.045651653
## [145,] -0.398762627 -0.330964457
## [146,] -0.567074583 -0.112251147
## [147,] 0.172041283 -0.014918011
## [148,] 0.360919073 -0.100097498
## [149,] -0.126574519 0.335427863
## [150,] 0.318024916 -0.464756331
## [151,] 0.294296565 0.343353222
## [152,] 0.107065761 -0.027518949
## [153,] -0.083503255 0.067108838
## [154,] -0.066090756 -0.538654998
## [155,] 0.074525004 0.499668797
## [156,] 0.400843344 -0.165878653
## [157,] -0.046731451 -0.045777967
## [158,] -0.340303257 0.498010231
## [159,] -0.298434718 -0.580351409
## [160,] -0.379645282 -1.098340987
## [161,] -0.095208920 -0.553393132
## [162,] 0.111213163 -0.262555109
## [163,] 0.122932380 0.236068239
## [164,] 0.049542809 -0.470367231
## [165,] -0.068668454 0.550674383
## [166,] 0.261144035 -0.059561097
## [167,] 0.406177784 0.157973479
#The corresponding eigenvalues
eig(kpc)
## Comp.1 Comp.2
## 0.07595030 0.02694576
# Sweep sigma over several orders of magnitude; the same plot is drawn for each
# value so the effect of the kernel width on the embedding can be compared.
sigmas <- c(1e-17, 1e-16, 2e-16, 3.18e-16, 3.4e-16, 3.5e-16, 3.8e-16, 3.9e-16,
            4e-16, 0.05, 0.62, 0.63, 13.9, 14, 229, 230, 799, 800, 1019, 1020)
for (s in sigmas) {
  kpc <- kpca(~., data = as.data.frame(data_scaled), kernel = "anovadot",
              kpar = list(sigma = s), features = 2)
  plot(rotated(kpc), col = as.factor(data[,13]), pch = 19, cex = 2,
       xlab = "1st Principal Component", ylab = "2nd Principal Component")
  title(main = paste("sigma =", format(s)))
}

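# Choose the number of clusters on three representations of the data: the scaled
# features, the prcomp scores, and the data matrix stored in the FactoMineR call.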
fviz_nbclust(data_scaled, kmeans, method = "wss") +
labs(subtitle = "Elbow method")

fviz_nbclust(data.pca$x, kmeans, method = "wss") +
labs(subtitle = "Elbow method")

fviz_nbclust(Data.pca$call$X, kmeans, method = "wss") +
labs(subtitle = "Elbow method")

fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")

fviz_nbclust(data.pca$x, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")

fviz_nbclust(Data.pca$call$X, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")

fviz_nbclust(data_scaled, kmeans, nstart = 25, method = "gap_stat", nboot = 100) +
labs(subtitle = "Gap statistic method")

fviz_nbclust(data.pca$x, kmeans, nstart = 25, method = "gap_stat", nboot = 100) +
labs(subtitle = "Gap statistic method")

fviz_nbclust(Data.pca$call$X, kmeans, nstart = 25, method = "gap_stat", nboot = 100) +
labs(subtitle = "Gap statistic method")

nbclust <- NbClust(data = data.pca$x[,1:2], distance = "euclidean",
min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 10 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
# Best number of clusters proposed by each index and the corresponding index value.
nbclust[["Best.nc"]]
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 4.0000 8.000 4.0000 8.0000 4.000 4.0 3.00
## Value_Index 5.4023 163.687 34.9958 -2.6812 140.899 69965.5 63599.88
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 4.0000 4.0000 4.0000 2.0000 4.0000 2.0000 2.0000
## Value_Index 66.7049 3.6352 -0.1958 0.1939 0.8576 0.4155 0.5646
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.0000 2.0000 3.000 3.0000 4.0000 1 2.0000
## Value_Index 58.6142 0.7627 0.404 128.2709 0.5551 NA 0.5596
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 7.0000 0 4.0000 0 10.0000
## Value_Index 0.0609 0 1.7671 0 0.3292
# Values of indices for each partition of the dataset obtained with a number of clusters between min.nc and max.nc.
nbclust[["All.index"]]
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 0.5943 146.2577 60.9097 -3.0684 183.1325 235439.4 96301.452 500.0038
## 3 0.8326 129.7878 80.2574 -5.6293 278.0841 300008.7 32701.575 365.1930
## 4 5.4023 154.6719 45.2616 -3.3188 418.9831 229400.4 18267.604 245.1990
## 5 1.1848 158.5524 28.8335 -2.9719 493.9816 228757.7 10512.596 191.9098
## 6 0.3998 154.2266 35.8669 -3.3451 548.0993 238232.9 6087.265 162.9137
## 7 3.4154 162.1184 25.4688 -2.7480 618.3376 212929.5 3255.076 133.2326
## 8 2.1200 163.6870 18.6780 -2.6812 664.6517 210754.6 3658.801 114.9369
## 9 3.5525 161.3646 17.8869 -2.9312 702.9458 212078.0 3113.913 102.8545
## 10 0.2564 160.6375 14.4383 -3.0611 738.4794 211642.6 2012.351 92.3946
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 1.8703 1.8864 0.1939 1.0274 0.4155 0.5646 58.6142 0.7627 0.3452
## 3 3.1405 2.5828 0.2359 0.9941 0.4135 0.9413 2.3085 0.0555 0.4040
## 4 6.7757 3.8467 0.2798 0.8576 0.3925 0.9242 7.3021 0.0810 0.3950
## 5 8.0621 4.9149 0.3006 0.8986 0.3750 1.2377 -13.6378 -0.1889 0.3829
## 6 9.4530 5.7896 0.2738 0.9047 0.3599 1.1424 -5.8582 -0.1207 0.3610
## 7 12.4982 7.0794 0.3033 0.8662 0.3741 1.0764 -2.5540 -0.0684 0.3420
## 8 14.3635 8.2063 0.2823 0.8790 0.3573 1.4066 -12.7182 -0.2759 0.3251
## 9 16.0482 9.1704 0.2909 0.8839 0.3418 2.3211 -7.3993 -0.2846 0.3100
## 10 17.7871 10.2085 0.2753 0.8815 0.3307 1.6387 -13.6419 -0.3543 0.2967
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 250.0019 0.4409 -0.3387 0.5596 0.0111 0.0012 1.8268 1.4394 1.8465
## 3 121.7310 0.5520 0.6371 0.5669 0.0301 0.0018 2.3442 1.3130 1.1804
## 4 61.2997 0.5551 1.2868 0.7487 0.0096 0.0023 1.7671 1.0848 0.4765
## 5 38.3820 0.4970 0.9456 1.1149 0.0214 0.0024 1.9239 0.9524 0.5154
## 6 27.1523 0.4596 0.4442 1.4258 0.0528 0.0024 2.1364 0.8842 0.4554
## 7 19.0332 0.4455 0.8828 1.6007 0.0609 0.0025 1.9880 0.7972 0.4855
## 8 14.3671 0.4131 0.7258 1.9252 0.0464 0.0026 2.5968 0.7318 0.4470
## 9 11.4283 0.3876 0.4617 2.2281 0.0467 0.0027 2.9505 0.6960 0.4063
## 10 9.2395 0.3750 0.4029 2.3958 0.0510 0.0027 2.8636 0.6557 0.3292
# Critical values of some indices for each partition obtained with a number of clusters between min.nc and max.nc.
nbclust[["All.CriticalValues"]]
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.4230 103.6700 0.4679
## 3 -0.0987 -411.9228 0.9462
## 4 0.4075 129.4084 0.9222
## 5 0.3683 121.7865 1.0000
## 6 0.2521 139.4290 1.0000
## 7 0.2234 125.1193 1.0000
## 8 0.1671 219.2765 1.0000
## 9 -0.7431 -30.4948 1.0000
## 10 -0.0307 -1175.3080 1.0000
best_n_of_clusters <- nbclust[["Best.nc"]]
best_N_of_clusters <- as.integer(best_n_of_clusters[1,])
hist(best_N_of_clusters,
main = "Clusters proposed by indexes",
xlab = "Number of clusters",
border = "Magenta",
col = "Blue",
xlim = c (0, 10),
ylim = c (0, 20),
breaks = 20)
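# The same majority vote, read off the index table instead of the histogram:
sort(table(best_N_of_clusters), decreasing = TRUE)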

# k-means clustering
clusters_2 <- kmeans(x = data.pca$x[,1:2], centers = 2, iter.max = 30, nstart = 50)
clusters_3 <- kmeans(x = data.pca$x[,1:2], centers = 3, iter.max = 30, nstart = 50)
clusters_5 <- kmeans(x = data.pca$x[,1:2], centers = 5, iter.max = 30, nstart = 50)
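# Quick fit diagnostics: total within-cluster sum of squares per solution, and
# the share of total variance explained by the 5-cluster partition.
c(k2 = clusters_2$tot.withinss, k3 = clusters_3$tot.withinss, k5 = clusters_5$tot.withinss)
clusters_5$betweenss / clusters_5$totss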
par(mfrow=c(1,2))
plot(data.pca$x[,1:2], col=as.factor(data$state2), pch=20, cex=2.5, xlab="1st PC",ylab="2nd PC")
title(main = "original")
plot(data.pca$x[,1:2], col=clusters_2$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_2$centers[,1], clusters_2$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=clusters_2$cluster)) +
geom_point(alpha=0.3, size=3) +
scale_colour_gradientn(colours=rainbow(2)) +
geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme_dark() +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

plot(data.pca$x[,1:2], col=as.factor(data$state3), pch=20, cex=2.5, xlab="1st PC",ylab="2nd PC")
title(main = "original")
plot(data.pca$x[,1:2], col=clusters_3$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_3$centers[,1], clusters_3$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=clusters_3$cluster)) +
geom_point(alpha=0.3, size=3) +
scale_colour_gradientn(colours=rainbow(3)) +
geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme_dark() +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

plot(data.pca$x[,1:2], col=as.factor(data$state5), pch=20, cex=2.5, xlab="1st PC",ylab="2nd PC")
title(main = "original")
plot(data.pca$x[,1:2], col=clusters_5$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_5$centers[,1], clusters_5$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=clusters_5$cluster)) +
geom_point(alpha=0.3, size=3) +
scale_colour_gradientn(colours=rainbow(5)) +
geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme_dark() +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

# k-medoids clustering (PAM)
Clusters_2 <- pam(x = data.pca$x[,1:2], k = 2, nstart = 50)
Clusters_3 <- pam(x = data.pca$x[,1:2], k = 3, nstart = 50)
Clusters_5 <- pam(x = data.pca$x[,1:2], k = 5, nstart = 50)
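# Average silhouette width of each PAM solution (larger is better):
c(k2 = Clusters_2$silinfo$avg.width,
  k3 = Clusters_3$silinfo$avg.width,
  k5 = Clusters_5$silinfo$avg.width)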
par(mfrow=c(1,2))
plot(data.pca$x[,1:2], col=clusters_2$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_2$centers[,1], clusters_2$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")
plot(data.pca$x[,1:2], col=Clusters_2$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(Clusters_2$medoids[,1], Clusters_2$medoids[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-medoids")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=Clusters_2$cluster)) +
geom_point(alpha=0.3, size=3) +
scale_colour_gradientn(colours=rainbow(2)) +
geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme_dark() +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

plot(data.pca$x[,1:2], col=clusters_3$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_3$centers[,1], clusters_3$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")
plot(data.pca$x[,1:2], col=Clusters_3$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(Clusters_3$medoids[,1], Clusters_3$medoids[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-medoids")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=Clusters_3$cluster)) +
geom_point(alpha=0.3, size=3) +
scale_colour_gradientn(colours=rainbow(3)) +
geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme_dark() +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

plot(data.pca$x[,1:2], col=clusters_5$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_5$centers[,1], clusters_5$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")
plot(data.pca$x[,1:2], col=Clusters_5$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(Clusters_5$medoids[,1], Clusters_5$medoids[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-medoids")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=Clusters_5$cluster)) +
geom_point(alpha=0.3, size=3) +
scale_colour_gradientn(colours=rainbow(5)) +
geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
theme_dark() +
theme(legend.key.size = unit(1.5, 'cm'),
legend.title = element_text(size = 12),
legend.text = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20) )

dist_matrix <- dist(data_scaled, method = "euclidean")
# By default, the COMPLETE LINKAGE method is used.
# Complete linkage method finds similar clusters.
clusters_completeL <- hclust(dist_matrix, method = "complete")
clusters_completeL$labels <- data$country
plot(clusters_completeL,
     xlab = "Agglomerative clustering with 167 initial clusters/countries",
     cex = 1.1)

clusterCut_2 <- cutree(clusters_completeL, k = 2) # calculate final labeling given the number of clusters
table(clusterCut_2) # Number of members in each cluster
## clusterCut_2
## 1 2
## 55 112
clusterCut_3 <- cutree(clusters_completeL, k = 3)
table(clusterCut_3)
## clusterCut_3
## 1 2 3
## 55 109 3
clusterCut_5 <- cutree(clusters_completeL, k = 5)
table(clusterCut_5)
## clusterCut_5
## 1 2 3 4 5
## 54 95 14 3 1
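# The tree can also be cut at a height instead of a fixed k; the threshold below
# is hypothetical and should be read off the dendrogram:
# table(cutree(clusters_completeL, h = 8))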
# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Complete Linkage [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Complete Linkage [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Complete Linkage [5]")

dend_completeL <- as.dendrogram(clusters_completeL)
dend_2 <- color_branches(dend_completeL, k = 2)
plot(dend_2, type = "triangle", center = TRUE, cex = 0.4, main = "Complete Linkage method [2]")

dend_3 <- color_branches(dend_completeL, k = 3)
plot(dend_3, type = "triangle", center = TRUE, cex = 0.3, main = "Complete Linkage method [3]")

dend_5 <- color_branches(dend_completeL, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Complete Linkage method [5]")

fviz_dend(clusters_completeL,
k = 2,
k_colors = "nejm",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "nejm",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.auto",
ggtheme = theme_void() )

fviz_dend(clusters_completeL,
k = 3,
k_colors = "uchicago",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "uchicago",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_with_lgl",
ggtheme = theme_void() )

fviz_dend(clusters_completeL,
k = 5,
k_colors = "uchicago",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "uchicago",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.gem",
ggtheme = theme_void() )

# This time, we will use the SINGLE LINKAGE method:
# it adopts a ‘friends of friends’ clustering strategy.
clusters_singleL <- hclust(dist_matrix, method = "single")
clusters_singleL$labels <- data$country
plot(clusters_singleL, xlab = "Agglomerative clustering with 167 initial clusters/countries", cex = 1.1)

clusterCut_2 <- cutree(clusters_singleL, k = 2) # calculate final labeling given the number of clusters
table(clusterCut_2) # Number of members in each cluster
clusterCut_3 <- cutree(clusters_singleL, k = 3)
table(clusterCut_3)
clusterCut_5 <- cutree(clusters_singleL, k = 5)
table(clusterCut_5)
# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Single Linkage [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Single Linkage [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Single Linkage [5]")

dend_singleL <- as.dendrogram(clusters_singleL)
dend_2 <- color_branches(dend_singleL, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Single Linkage method [2]")

dend_3 <- color_branches(dend_singleL, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Single Linkage method [3]")

dend_5 <- color_branches(dend_singleL, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Single Linkage method [5]")

fviz_dend(clusters_singleL,
k = 2,
k_colors = "nejm",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "nejm",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.auto",
ggtheme = theme_void() )

fviz_dend(clusters_singleL,
k = 3,
k_colors = "uchicago",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "uchicago",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_with_lgl",
ggtheme = theme_void() )

fviz_dend(clusters_singleL,
k = 5,
k_colors = "uchicago",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "uchicago",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.gem",
ggtheme = theme_void() )

# This time, we will use the AVERAGE LINKAGE (UPGMA) method:
clusters_average <- hclust(dist_matrix, method = "average")
clusters_average$labels <- data$country
plot(clusters_average, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

clusterCut_2 <- cutree(clusters_average, k = 2) # calculate final labeling given the number of clusters
table(clusterCut_2) # Number of members in each cluster
## clusterCut_2
## 1 2
## 166 1
clusterCut_3 <- cutree(clusters_average, k = 3)
table(clusterCut_3)
## clusterCut_3
## 1 2 3
## 163 3 1
clusterCut_5 <- cutree(clusters_average, k = 5)
table(clusterCut_5)
## clusterCut_5
## 1 2 3 4 5
## 161 1 3 1 1
# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Average Method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Average Method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Average Method [5]")

dend_average <- as.dendrogram(clusters_average)
dend_2 <- color_branches(dend_average, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Average method [2]")

dend_3 <- color_branches(dend_average, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Average method [3]")

dend_5 <- color_branches(dend_average, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Average method [5]")

fviz_dend(clusters_average,
k = 2,
k_colors = "nejm",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "nejm",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.auto",
ggtheme = theme_void() )

fviz_dend(clusters_average,
k = 3,
k_colors = "uchicago",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "uchicago",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_with_lgl",
ggtheme = theme_void() )

fviz_dend(clusters_average,
k = 5,
k_colors = "uchicago",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "uchicago",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.gem",
ggtheme = theme_void() )

# This time, we will use the MEDIAN distance method:
clusters_median <- hclust(dist_matrix, method = "median")
clusters_median$labels <- data$country
plot(clusters_median, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

clusterCut_2 <- cutree(clusters_median, k = 2) # calculate final labeling given the number of clusters
table(clusterCut_2) # Number of members in each cluster
## clusterCut_2
## 1 2
## 166 1
clusterCut_3 <- cutree(clusters_median, k = 3)
table(clusterCut_3)
## clusterCut_3
## 1 2 3
## 161 5 1
clusterCut_5 <- cutree(clusters_median, k = 5)
table(clusterCut_5)
## clusterCut_5
## 1 2 3 4 5
## 161 3 1 1 1
# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Median method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Median method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Median method [5]")

#--------------------
dend_median <- as.dendrogram(clusters_median)
dend_2 <- color_branches(dend_median, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Median method [2]")

dend_3 <- color_branches(dend_median, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Median method [3]")

dend_5 <- color_branches(dend_median, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Median method [5]")

# This time, we will use the CENTROID method:
clusters_centroid <- hclust(dist_matrix, method = "centroid")
clusters_centroid$labels <- data$country
plot(clusters_centroid, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

clusterCut_2 <- cutree(clusters_centroid, k = 2) # calculate final labeling given the number of clusters
table(clusterCut_2) # Number of members in each cluster
## clusterCut_2
## 1 2
## 166 1
clusterCut_3 <- cutree(clusters_centroid, k = 3)
table(clusterCut_3)
## clusterCut_3
## 1 2 3
## 165 1 1
clusterCut_5 <- cutree(clusters_centroid, k = 5)
table(clusterCut_5)
## clusterCut_5
## 1 2 3 4 5
## 161 3 1 1 1
# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Centroid method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Centroid method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Centroid method [5]")

#--------------------
dend_centroid <- as.dendrogram(clusters_centroid)
dend_2 <- color_branches(dend_centroid, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Centroid method [2]")

dend_3 <- color_branches(dend_centroid, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Centroid method [3]")

dend_5 <- color_branches(dend_centroid, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Centroid method [5]")

# This time, we will use Ward's method:
# Ward's minimum variance method aims at finding compact, spherical clusters.
# "ward.D" does not implement Ward's (1963) clustering criterion,
# whereas option "ward.D2" implements that criterion:
# with the latter, dissimilarities are squared before cluster updating.
clusters_ward <- hclust(dist_matrix, method = "ward.D2")
clusters_ward$labels <- data$country
plot(clusters_ward, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)
rect.hclust(clusters_ward, k = 5, border = 2:5)
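# Sketch: compare the partition above with the non-squared "ward.D" variant via a
# cross-tabulation of the two 5-cluster labelings.
clusters_wardD <- hclust(dist_matrix, method = "ward.D")
table(ward.D = cutree(clusters_wardD, k = 5), ward.D2 = cutree(clusters_ward, k = 5))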

clusterCut_2 <- cutree(clusters_ward, k = 2) # calculate final labeling given the number of clusters
table(clusterCut_2) # Number of members in each cluster
## clusterCut_2
## 1 2
## 133 34
clusterCut_3 <- cutree(clusters_ward, k = 3)
table(clusterCut_3)
## clusterCut_3
## 1 2 3
## 27 106 34
clusterCut_5 <- cutree(clusters_ward, k = 5)
table(clusterCut_5)
## clusterCut_5
## 1 2 3 4 5
## 27 63 43 31 3
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Ward's method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Ward's method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Ward's method [5]")

dend_ward <- as.dendrogram(clusters_ward)
dend_2 <- color_branches(dend_ward, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Ward's method [2]")

dend_3 <- color_branches(dend_ward, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Ward's method [3]")

dend_5 <- color_branches(dend_ward, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Ward's method [5]")

fviz_dend(clusters_ward,
k = 2,
k_colors = "igv",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "igv",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.auto",
ggtheme = theme_void() )

fviz_dend(clusters_ward,
k = 3,
k_colors = "jco",
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = "jco",
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_with_lgl",
ggtheme = theme_void() )

fviz_dend(clusters_ward,
k = 5,
k_colors = c("#5050FFFF", "#7AA6DCFF", "#868686FF", "#CD534CFF", "#EFC000FF"),
color_labels_by_k = TRUE,
lwd = 0.7,
cex = 1.3,
rect = TRUE,
rect_border = c("#5050FFFF", "#7AA6DCFF", "#868686FF", "#CD534CFF", "#EFC000FF"),
rect_fill = TRUE,
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.gem",
ggtheme = theme_void() )
