Exploratory Data Analysis

library(DT)
library(ggplot2)                     
library(GGally) 
library(DataExplorer)                
library(tidyr)
library(factoextra)
library(gridExtra)
library(FactoMineR)
library(kernlab)
library(NbClust)
library(cluster)
library(dendextend)

dt_st1 <- read.csv("Country-data.csv")
dt_st2 <- read.csv("DevelopedCountriesList.csv")
dt_merged <- merge(dt_st1, dt_st2, by = "country")
DATA <- dt_merged[,-11]
DATA$state2 <- as.factor(ifelse(DATA$hdi < 0.8, "i.      Developing", "ii.      Developed"))

DATA$state3 <- as.factor(ifelse(DATA$hdi > 0.8, "iii.      Developed",
                         ifelse(DATA$hdi > 0.5, "ii.      Developing", "i.      Under-Developed")))

DATA$state5 <- as.factor(ifelse(DATA$hdi > 0.80, "v.     self-sufficient",
                         ifelse(DATA$hdi > 0.65, "iv.    ok",
                         ifelse(DATA$hdi > 0.55, "iii.    maybe in need",
                         ifelse(DATA$hdi > 0.45, "ii.    probably need aid", "i.     definately need aid")))))

data <- DATA[,-(11:12)]

str(data)

## 'data.frame':    167 obs. of  13 variables:
##  $ country   : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ child_mort: num  90.2 16.6 27.3 119 10.3 14.5 18.1 4.8 4.3 39.2 ...
##  $ exports   : num  10 28 38.4 62.3 45.5 18.9 20.8 19.8 51.3 54.3 ...
##  $ health    : num  7.58 6.55 4.17 2.85 6.03 8.1 4.4 8.73 11 5.88 ...
##  $ imports   : num  44.9 48.6 31.4 42.9 58.9 16 45.3 20.9 47.8 20.7 ...
##  $ income    : int  1610 9930 12900 5900 19100 18700 6700 41400 43200 16000 ...
##  $ inflation : num  9.44 4.49 16.1 22.4 1.44 20.9 7.77 1.16 0.873 13.8 ...
##  $ life_expec: num  56.2 76.3 76.5 60.1 76.8 75.8 73.3 82 80.5 69.1 ...
##  $ total_fer : num  5.82 1.65 2.89 6.16 2.13 2.37 1.69 1.93 1.44 1.92 ...
##  $ gdpp      : int  553 4090 4460 3530 12200 10300 3220 51900 46900 5840 ...
##  $ state2    : Factor w/ 2 levels "i.      Developing",..: 1 1 1 1 1 2 1 2 2 1 ...
##  $ state3    : Factor w/ 3 levels "i.      Under-Developed",..: 2 2 2 2 2 3 2 3 3 2 ...
##  $ state5    : Factor w/ 5 levels "i.     definately need aid",..: 2 4 4 3 4 5 4 5 5 4 ...

DT::datatable(data)

ggcorr(data, palette = "RdBu", label = TRUE, label_alpha = TRUE)

plot_missing(data, ggtheme = theme_minimal(), title = "Missing Values")

data %>%
  gather(Attributes, value, 2:10) %>%
  ggplot(aes(x=value, fill=Attributes)) +
  geom_histogram(colour="black", show.legend=FALSE) +
  facet_wrap(~Attributes, scales="free_x") +
  labs(x="Values", y="Frequency",
       title="Country Data - Histograms") +
  theme_bw()

data$Continent[   data$country == "Anguilla" |
                  data$country == "Antigua and Barbuda" |
                  data$country == "Aruba" |
                  data$country == "Bahamas" |
                  data$country == "Barbados" |
                  data$country == "Belize" |
                  data$country == "Bermuda" |
                  data$country == "Bonaire" |
                  data$country == "British Virgin Islands" |
                  data$country == "Canada" |
                  data$country == "Cayman Islands" |
                  data$country == "Cliperton Island" |
                  data$country == "Costa Rica" |
                  data$country == "Cuba" |
                  data$country == "Curacao" |
                  data$country == "Dominica" |
                  data$country == "Dominican Republic" |
                  data$country == "El Salvador" | 
                  data$country == "Venezuela" |
                  data$country == "Greenland" |
                  data$country == "Grenada" |
                  data$country == "Guadeloupe" |
                  data$country == "Guatemala" | 
                  data$country == "Haiti" |
                  data$country == "Honduras" |
                  data$country == "Jamaica" |
                  data$country == "Martinique" |
                  data$country == "Mexico" | 
                  data$country == "Montseratt" |
                  data$country == "Nicaragua" |
                  data$country == "Panama" |
                  data$country == "Puerto Rico" |
                  data$country == "Saba" | 
                  data$country == "San Andres and Providencia" |
                  data$country == "Saint Barthelemy" |
                  data$country == "Saint Kitts and Nevis" |
                  data$country == "Saint Lucia" |
                  data$country == "Saint Martin" | 
                  data$country == "Saint Pierre Miquelon" |
                  data$country == "St. Vincent and the Grenadines" |
                  data$country == "Sint Eustatius" |
                  data$country == "Trinidad and Tobago" |
                  data$country == "Turks and Caicos Islands" | 
                  data$country == "United States" |
                  data$country == "US Virgin Islands"
          
                ] <- "North America"

data$Continent[     data$country == "Brazil" |
                    data$country == "Uruguay" |
                    data$country == "Paraguay" |
                    data$country == "Argentina" |
                    data$country == "Chile" |
                    data$country == "Bolivia" |
                    data$country == "Peru" |
                    data$country == "Ecuador" |
                    data$country == "Colombia" |
                    data$country == "Venezuela" |
                    data$country == "Guyana" |
                    data$country == "Suriname" |
                    data$country == "French Guiana" |
                    data$country == "Falkland Islands" |
                    data$country == "South Georgia and the South Sandwitch Islands"
                                                      
              ] <- "South America"

data$Continent[     data$country == "Albania" |
                    data$country == "Andorra" |
                    data$country == "Austria" |
                    data$country == "Belarus" |
                    data$country == "Belgium" |
                    data$country == "Bosnia and Herzegovina" |
                    data$country == "Bulgaria" |
                    data$country == "Croatia" |
                    data$country == "Cyprus" |
                    data$country == "Czech Republic" |
                    data$country == "Denmark" |
                    data$country == "Estonia" |
                    data$country == "Finland" |
                    data$country == "France" |
                    data$country == "Germany" |
                    data$country == "Greece" |
                    data$country == "Hungary" |
                    data$country == "Iceland" |
                    data$country == "Ireland" |
                    data$country == "Italy" |
                    data$country == "Latvia" |
                    data$country == "Liechtestein" |
                    data$country == "Lithuania" |
                    data$country == "Luxembourg" |
                    data$country == "Malta" |
                    data$country == "Moldova" |
                    data$country == "Montenegro" |
                    data$country == "Netherlands" |
                    data$country == "Macedonia, FYR" |
                    data$country == "Norway" |
                    data$country == "Poland" |
                    data$country == "Portugal" |
                    data$country == "Romania" |
                    data$country == "San Marino" |
                    data$country == "Serbia" |
                    data$country == "Slovak Republic" |
                    data$country == "Slovenia" |
                    data$country == "Spain" |
                    data$country == "Sweden" |
                    data$country == "Switzerland" |
                    data$country == "Ukraine" |
                    data$country == "United Kingdom" |
                    data$country == "Vatican City"
                    
                ] <- "Europe"

data$Continent[     data$country == "Egypt" |
                    data$country == "Libya" |
                    data$country == "Tunisia" |
                    data$country == "Algeria" |
                    data$country == "Morocco" |
                    data$country == "Western Sahara" |
                    data$country == "Mauritania" |
                    data$country == "Mali" |
                    data$country == "Senegal" |
                    data$country == "Gambia" |
                    data$country == "Cape Verde" |
                    data$country == "Guinea-Bissau" |
                    data$country == "Guinea" |
                    data$country == "Sierra Leone" |
                    data$country == "Liberia" |
                    data$country == "Cote d'Ivoire" |
                    data$country == "Burkina Faso" |
                    data$country == "Ghana" |
                    data$country == "Togo" |
                    data$country == "Benin" |
                    data$country == "Nigeria" |
                    data$country == "Niger" |
                    data$country == "Chad" |
                    data$country == "Sudan" |
                    data$country == "Eritrea" |
                    data$country == "Djibouti" |
                    data$country == "Ethiopia" |
                    data$country == "Somalia" |
                    data$country == "Kenya" |
                    data$country == "Uganda" |
                    data$country == "Rwanda" |
                    data$country == "Burundi" |
                    data$country == "Congo, Dem. Rep." |
                    data$country == "Congo, Rep." |
                    data$country == "Central African Republic" |
                    data$country == "Cameroon" |
                    data$country == "Equatorial Guinea" |
                    data$country == "Gabon" |
                    data$country == "Angola" |
                    data$country == "Zambia" |
                    data$country == "Tanzania" |
                    data$country == "Malawi" |
                    data$country == "Mozambique" |
                    data$country == "Comoros" |
                    data$country == "Madagascar" |
                    data$country == "Zimbabwe" |
                    data$country == "Botswana" |
                    data$country == "Namibia" |
                    data$country == "South Africa"|
                    data$country == "Lesotho" |
                    data$country == "Eswatini" |
                    data$country == "Mauritius" |
                    data$country == "Seychelles"
                    
                ] <- "Africa"

data$Continent[     data$country == "Turkey" |
                    data$country == "Georgia" |
                    data$country == "Armenia" |
                    data$country == "Azerbaijan" |
                    data$country == "Syria" |
                    data$country == "Iraq" |
                    data$country == "Lebanon" |
                    data$country == "Israel" |
                    data$country == "Palestine" |
                    data$country == "Jordan" |
                    data$country == "Saudi Arabia" |
                    data$country == "Kuwait" |
                    data$country == "Bahrain" |
                    data$country == "Qatar" |
                    data$country == "United Arab Emirates" |
                    data$country == "Oman" |
                    data$country == "Yemen" |
                    data$country == "Iran" |
                    data$country == "Afghanistan" |
                    data$country == "Pakistan" |
                    data$country == "India" |
                    data$country == "Nepal" |
                    data$country == "Bhutan" |
                    data$country == "Bangladesh" |
                    data$country == "Sri Lanka" |
                    data$country == "Maldives" |
                    data$country == "Myanmar" |
                    data$country == "Lao" |
                    data$country == "Vietnam" |
                    data$country == "Cambodia" |
                    data$country == "Thailand" |
                    data$country == "Malaysia" |
                    data$country == "Singapore" |
                    data$country == "Brunei" |
                    data$country == "Philippines" |
                    data$country == "Indonesia" |
                    data$country == "Timor-Leste" |
                    data$country == "Taiwan" |
                    data$country == "Japan" |
                    data$country == "South Korea" |
                    data$country == "North Korea" |
                    data$country == "China" |
                    data$country == "Mongolia" |
                    data$country == "Russia" |
                    data$country == "Kazakhstan" |
                    data$country == "Kyrgyz Republic" |
                    data$country == "Tajikistan" |
                    data$country == "Uzbekistan" |
                    data$country == "Turkmenistan"
                      
              ] <- "Asia"

data$Continent[     data$country == "Australia" |
                    data$country == "Papua New Guinea" |
                    data$country == "New Zealand" |
                    data$country == "Fiji" |
                    data$country == "Solomon Islands" |
                    data$country == "Micronesia, Fed. Sts." |
                    data$country == "Vanuatu" |
                    data$country == "Samoa" |
                    data$country == "Kiribati" |
                    data$country == "Tonga" |
                    data$country == "Marshall Islands" |
                    data$country == "Palau" |
                    data$country == "Tuvalu" |
                    data$country == "Nauru"
                  
              ]  <- "Oceania"

d <- data[,-(11:14)]
ggpairs(d[,-1], aes(colour = data$state2, alpha = 0.2), upper = list(continuous = wrap("cor", size = 2.5))) + #1.75 - 2.5
   theme(axis.text = element_text(size = 4.5))

ggpairs(d[,-1], aes(colour = data$state3, alpha = 0.2), upper = list(continuous = wrap("cor", size = 2.5))) + #1.75 - 2.5
  theme(axis.text = element_text(size = 4.5))

ggpairs(d[,-1], aes(colour = data$state5, alpha = 0.2), upper = list(continuous = wrap("cor", size = 2.5))) + #1.75 - 2.5
  theme(axis.text = element_text(size = 4.5))

histogram <- ggplot(data, aes(Continent)) + geom_bar(alpha=0.7, aes(fill=state5)) +theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
print(histogram)

density_income <- ggplot(data, aes(income, colour=Continent)) + geom_density(alpha=0.25, aes(fill=Continent)) +theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
print(density_income)

boxplot_incomeA <- ggplot(data, aes(x = state5, y = income, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
boxplot_incomeB <- ggplot(data, aes(x = Continent, y = income, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
grid.arrange(boxplot_incomeA, boxplot_incomeB, ncol=2)

density_lfexp <- ggplot(data, aes(life_expec, colour=Continent)) + geom_density(alpha=0.25, aes(fill=Continent)) +theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
print(density_lfexp)

boxplot_lfexpA <- ggplot(data, aes(x = state5, y = life_expec, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
boxplot_lfexpB <- ggplot(data, aes(x = Continent, y = life_expec, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
grid.arrange(boxplot_lfexpA, boxplot_lfexpB, ncol=2)

density_tfert <- ggplot(data, aes(total_fer, colour=Continent)) + geom_density(alpha=0.25, aes(fill=Continent)) +theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
print(density_tfert)

boxplot_tfertA <- ggplot(data, aes(x = state5, y = total_fer, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
boxplot_tfertB <- ggplot(data, aes(x = Continent, y = total_fer, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
grid.arrange(boxplot_tfertA, boxplot_tfertB, ncol=2)

density_chmort <- ggplot(data, aes(child_mort, colour=Continent)) + geom_density(alpha=0.25, aes(fill=Continent)) +theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
print(density_chmort)

boxplot_chmortA <- ggplot(data, aes(x = state5, y = child_mort, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
boxplot_chmortB <- ggplot(data, aes(x = Continent, y = child_mort, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
grid.arrange(boxplot_chmortA, boxplot_chmortB, ncol=2)

density_health <- ggplot(data, aes(health, colour=Continent)) + geom_density(alpha=0.25, aes(fill=Continent)) +theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
print(density_health)

boxplot_healthA <- ggplot(data, aes(x = state5, y = health, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
boxplot_healthB <- ggplot(data, aes(x = Continent, y = health, fill = state5)) + geom_boxplot(notch = TRUE) +stat_summary(fun = "mean", geom = "point", shape = 8,size = 2, color = "darkred") +theme(axis.title.y = element_text(size = 18))
grid.arrange(boxplot_healthA, boxplot_healthB, ncol=2)

# income  ~  gdpp  = [0.9]
c_0.9 <- ggplot(data, aes(income, gdpp, colour=state5)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 16),
            axis.title.x = element_text(size = 24),
            axis.title.y = element_text(size = 24)  )
print(c_0.9)

# child mortality  ~  total fer.   =   [0.8]
c_0.8 <- ggplot(data, aes(child_mort, total_fer, colour=state5)) +
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 16),
            axis.title.x = element_text(size = 24),
            axis.title.y = element_text(size = 24)  )
print(c_0.8)

# exports ~ imports  =  [0.7]
c_0.7 <- ggplot(data, aes(exports, imports, colour=state5)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 16),
            axis.title.x = element_text(size = 24),
            axis.title.y = element_text(size = 24)  )
print(c_0.7)

# life expectancy ~ income  =  [0.6]
c_0.6a <- ggplot(data, aes(life_expec, income, colour=state5)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 16),
            axis.title.x = element_text(size = 24),
            axis.title.y = element_text(size = 24)  )
print(c_0.6a)

# life expectancy ~ GDP per capita  =  [0.6]
c_0.6b <- ggplot(data, aes(life_expec, gdpp, colour=state5)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 16),
            axis.title.x = element_text(size = 24),
            axis.title.y = element_text(size = 24)  )
print(c_0.6b)

PCA

#standardize data
data_scaled <- scale(data[,2:10], center = TRUE, scale = TRUE)
# by scaling we are removing potential bias that the model can have towards features with higher magnitudes.
row.names(data_scaled) <- data$country

data.pca <- prcomp(data_scaled)
print(data.pca)

## Standard deviations (1, .., p=9):
## [1] 2.0336314 1.2435217 1.0818425 0.9973889 0.8127847 0.4728437 0.3368067
## [8] 0.2971790 0.2586020
## 
## Rotation (n x k) = (9 x 9):
##                   PC1          PC2         PC3          PC4         PC5
## child_mort -0.4195194 -0.192883937  0.02954353 -0.370653262  0.16896968
## exports     0.2838970 -0.613163494 -0.14476069 -0.003091019 -0.05761584
## health      0.1508378  0.243086779  0.59663237 -0.461897497 -0.51800037
## imports     0.1614824 -0.671820644  0.29992674  0.071907461 -0.25537642
## income      0.3984411 -0.022535530 -0.30154750 -0.392159039  0.24714960
## inflation  -0.1931729  0.008404473 -0.64251951 -0.150441762 -0.71486910
## life_expec  0.4258394  0.222706743 -0.11391854  0.203797235 -0.10821980
## total_fer  -0.4037290 -0.155233106 -0.01954925 -0.378303645  0.13526221
## gdpp        0.3926448  0.046022396 -0.12297749 -0.531994575  0.18016662
##                     PC6         PC7         PC8         PC9
## child_mort -0.200628153  0.07948854  0.68274306  0.32754180
## exports     0.059332832  0.70730269  0.01419742 -0.12308207
## health     -0.007276456  0.24983051 -0.07249683  0.11308797
## imports     0.030031537 -0.59218953  0.02894642  0.09903717
## income     -0.160346990 -0.09556237 -0.35262369  0.61298247
## inflation  -0.066285372 -0.10463252  0.01153775 -0.02523614
## life_expec  0.601126516 -0.01848639  0.50466425  0.29403981
## total_fer   0.750688748 -0.02882643 -0.29335267 -0.02633585
## gdpp       -0.016778761 -0.24299776  0.24969636 -0.62564572

get_eig(data.pca)

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.13565658       45.9517398                    45.95174
## Dim.2 1.54634631       17.1816257                    63.13337
## Dim.3 1.17038330       13.0042589                    76.13762
## Dim.4 0.99478456       11.0531618                    87.19079
## Dim.5 0.66061903        7.3402114                    94.53100
## Dim.6 0.22358112        2.4842347                    97.01523
## Dim.7 0.11343874        1.2604304                    98.27566
## Dim.8 0.08831536        0.9812817                    99.25694
## Dim.9 0.06687501        0.7430556                   100.00000

# Percentage of variance/inertia.
fviz_screeplot(data.pca, addlabels = TRUE, ylim = c(0, 100))

fviz_pca_var(data.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE  
)

# Contributions of variables to PC1
pc1 <- fviz_contrib(data.pca, choice = "var", axes = 1, color = "darkblue", top = 4, ggtheme = theme_minimal())
# Contributions of variables to PC2
pc2 <- fviz_contrib(data.pca, choice = "var", axes = 2, color = "darkblue", top = 4, ggtheme = theme_minimal())
grid.arrange(pc1, pc2, ncol=2)

fvz_biplt1 <- fviz_pca_biplot(data.pca, repel = TRUE,
                col.var = "magenta", # Variables color
                col.ind = "#696969",  # Individuals color
                addEllipses = TRUE
)
fvz_biplt1

fvz_ind1 <- fviz_pca_ind(data.pca, #iris.pca
             label = "none", # hide individual labels
             habillage = as.factor(data$state5), # color by groups
             palette = "RdBl",
             #palette = c("cyan", "pink", "green"),
             addEllipses = TRUE # Concentration ellipses
)
fvz_ind1

fvz_ellips1 <- fviz_ellipses(data.pca, habillage = as.factor(data$state5),              ellipse.type = "confidence", geom = "point", palette = "lancet")
fvz_ellips1

data.pca$variance <- data.pca$sdev ^2
sum(data.pca$variance[1:2])/sum(data.pca$variance)

## [1] 0.6313337

summary(data.pca)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6    PC7
## Standard deviation     2.0336 1.2435 1.0818 0.9974 0.8128 0.47284 0.3368
## Proportion of Variance 0.4595 0.1718 0.1300 0.1105 0.0734 0.02484 0.0126
## Cumulative Proportion  0.4595 0.6313 0.7614 0.8719 0.9453 0.97015 0.9828
##                            PC8     PC9
## Standard deviation     0.29718 0.25860
## Proportion of Variance 0.00981 0.00743
## Cumulative Proportion  0.99257 1.00000

gg2 <- ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=state2)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )
print(gg2)

gg3 <- ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=state3)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )
print(gg3)

gg5 <- ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=state5)) + 
  geom_point(alpha=0.3, size=3) +
    geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )
print(gg5)

Data.pca <- PCA(data[,2:10])

fviz_screeplot(Data.pca, addlabels = TRUE, ylim = c(0, 100))

#Estimate the number of significant components in Principal Component Analysis.
#ncp = the best number of dimensions to use (find the minimum or the first local minimum)  ?
#  and the mean error for each dimension tested 
estim_ncp(data[,2:10], ncp.min=0, ncp.max=NULL, scale=TRUE, method="GCV")  #FactoMiNeR

## $ncp
## [1] 6
## 
## $criterion
## [1] 1.0000000 0.6965358 0.6281441 0.5602032 0.4383914 0.2961051 0.2908959
## [8] 0.3828931 0.6683697

dimdesc(Data.pca)

## $Dim.1
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##            correlation      p.value
## life_expec   0.8660003 1.550359e-51
## income       0.8102823 3.895056e-40
## gdpp         0.7984948 3.329819e-38
## exports      0.5773418 3.155347e-16
## imports      0.3283958 1.472843e-05
## health       0.3067485 5.532100e-05
## inflation   -0.3928425 1.511302e-07
## total_fer   -0.8210359 5.086278e-42
## child_mort  -0.8531479 1.701142e-48
## 
## $Dim.2
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##            correlation      p.value
## imports      0.8354236 9.508003e-45
## exports      0.7624821 5.113174e-33
## child_mort   0.2398554 1.796042e-03
## total_fer    0.1930357 1.244078e-02
## life_expec  -0.2769407 2.910838e-04
## health      -0.3022837 7.177825e-05
## 
## $Dim.3
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##           correlation      p.value
## inflation   0.6951049 1.990102e-25
## income      0.3262269 1.689449e-05
## exports     0.1566083 4.326810e-02
## imports    -0.3244735 1.886204e-05
## health     -0.6454623 4.677421e-21

# Optimal representation of the variables
# Visualization of correlation between pairs of variables
#                    by the cosine of their angle.
# Control variable colors using their contributions
fviz_pca_var(Data.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE  # Avoid text overlapping
)

# Contributions of variables to PC1
PC_1 <- fviz_contrib(Data.pca, choice = "var", axes = 1, color = "darkblue", top = 4, ggtheme = theme_minimal())
# Contributions of variables to PC2
PC_2 <- fviz_contrib(Data.pca, choice = "var", axes = 2, color = "darkblue", top = 4, ggtheme = theme_minimal())
library(gridExtra)
grid.arrange(PC_1, PC_2, ncol=2)

grid.arrange(pc1, pc2, PC_1, PC_2, nrow=2, ncol=2)

Data.pca$eig

##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1 4.13565658             45.9517398                          45.95174
## comp 2 1.54634631             17.1816257                          63.13337
## comp 3 1.17038330             13.0042589                          76.13762
## comp 4 0.99478456             11.0531618                          87.19079
## comp 5 0.66061903              7.3402114                          94.53100
## comp 6 0.22358112              2.4842347                          97.01523
## comp 7 0.11343874              1.2604304                          98.27566
## comp 8 0.08831536              0.9812817                          99.25694
## comp 9 0.06687501              0.7430556                         100.00000

# cumulative (variance %) ---> component 2. 
Data.pca$eig[2,3]

## [1] 63.13337

fvz_biplt2 <- fviz_pca_biplot(Data.pca, repel = TRUE,
                col.var = "magenta", # Variables color
                col.ind = "#696969",  # Individuals color
                addEllipses = TRUE,
                title = "FactoMineR"
)

# Visualize
# Use habillage to specify groups for coloring
fvz_ind2 <- fviz_pca_ind(data.pca, #iris.pca
             label = "none", # hide individual labels
             habillage = as.factor(data$Continent), # color by groups
             palette = "RdBl",
             #palette = c("cyan", "pink", "green"),
             addEllipses = TRUE # Concentration ellipses
)

fvz_ellips2 <- fviz_ellipses(data.pca, habillage= as.factor(data$Continent),              ellipse.type= "confidence", geom = "point", palette = "lancet")

grid.arrange(fvz_biplt1, fvz_biplt2, ncol=2)

grid.arrange(fvz_ind1, fvz_ind2, ncol=2)

grid.arrange(fvz_ellips1, fvz_ellips2, ncol=2)

fvz_indTwo <- fviz_pca_ind(data.pca, #iris.pca
             label = "none", # hide individual labels
             habillage = as.factor(data$state2), # color by groups
             palette = "RdBl",
             addEllipses = TRUE # Concentration ellipses
)
fvz_indThree <- fviz_pca_ind(data.pca, #iris.pca
             label = "none", # hide individual labels
             habillage = as.factor(data$state3), # color by groups
             palette = "RdBl",
             addEllipses = TRUE # Concentration ellipses
)

# Hopkins statistic: If the value of Hopkins statistic is close to 1 (far above 0.5), then we can conclude that the dataset is significantly clusterable.

# VAT (Visual Assessment of cluster Tendency): The VAT detects the clustering tendency in a visual form by counting the number of square shaped dark (or colored) blocks along the diagonal in a VAT image.
#ordered dissimilarity image (ODI)
get_clust_tendency(data_scaled, n=167-1, graph=TRUE)

## $hopkins_stat
## [1] 0.844379
## 
## $plot

# get_dist(): Computes a distance matrix between the rows of a data matrix. 
#  Compared to the standard dist() function, it supports correlation-based distance measures including "pearson", "kendall" and "spearman" methods.
distance <- get_dist(data_scaled)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

#f TRUE the ordered dissimilarity image (ODI) is shown.
fviz_dist(distance, show_labels=TRUE, order = FALSE, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# sigma=0.01
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.01),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2.5,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 0,01")

ggplot(data, aes(rotated(kpc)[,1], rotated(kpc)[,2], colour=state5)) + 
  geom_point(alpha = 0.3, size=3) + 
  geom_text( label=data$country, nudge_x=0.45, nudge_y=0.1, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

# kernel function
kernelf(kpc)

## Anova RBF kernel function. 
##  Hyperparameter : sigma =  0.01 degree =  1

#print the principal component vectors
pcv(kpc)

##                [,1]         [,2]
##   [1,]  0.412274463  0.015715322
##   [2,] -0.063983558 -0.228656647
##   [3,]  0.032931783 -0.178673862
##   [4,]  0.419019002  0.630415518
##   [5,] -0.142435210  0.058622863
##   [6,] -0.021662646 -0.678817319
##   [7,]  0.008546485 -0.216781010
##   [8,] -0.339511902 -0.731424362
##   [9,] -0.420178167 -0.265169735
##  [10,]  0.019379966 -0.154427205
##  [11,] -0.179017598 -0.255091335
##  [12,] -0.234074759  0.242520866
##  [13,]  0.149371613 -0.366372238
##  [14,] -0.151853540 -0.190696711
##  [15,] -0.082792178  0.214474213
##  [16,] -0.439209268  0.292273215
##  [17,] -0.023569824  0.270865332
##  [18,]  0.385788133  0.140824453
##  [19,]  0.025334136  0.300556926
##  [20,]  0.110500737 -0.057588336
##  [21,] -0.143566991 -0.386442502
##  [22,]  0.128258397  0.153790070
##  [23,] -0.036760947 -0.813986144
##  [24,] -0.345862995  0.071172831
##  [25,] -0.125075318  0.011980898
##  [26,]  0.442969502 -0.011367387
##  [27,]  0.405681201 -0.196472167
##  [28,]  0.088870985  0.338068479
##  [29,]  0.402293797  0.011181295
##  [30,] -0.366469354 -0.650295532
##  [31,]  0.025816993  0.135617618
##  [32,]  0.556375882  0.121715202
##  [33,]  0.506960556  0.457872114
##  [34,] -0.141395256 -0.416678864
##  [35,] -0.016994380 -0.452683013
##  [36,] -0.028685470 -0.673342106
##  [37,]  0.301213033  0.122623440
##  [38,]  0.449426876  0.364138213
##  [39,]  0.251693109  0.838990491
##  [40,] -0.139697648 -0.533295845
##  [41,]  0.375726448  0.439501609
##  [42,] -0.163491706 -0.327382387
##  [43,] -0.304470121  0.021843623
##  [44,] -0.283535188  0.181371817
##  [45,] -0.426044933 -0.313051416
##  [46,]  0.027414255 -0.344686610
##  [47,] -0.008517421 -0.414045272
##  [48,]  0.112642118 -0.317491092
##  [49,] -0.013951469 -0.226113668
##  [50,]  0.192368934  0.919621684
##  [51,]  0.342017800 -0.231954614
##  [52,] -0.227026878  0.419540097
##  [53,]  0.033732319  0.414942472
##  [54,] -0.349604225 -0.398140733
##  [55,] -0.325581398 -0.709549445
##  [56,]  0.200730036  0.122150291
##  [57,]  0.318108807  0.065977718
##  [58,] -0.050389167 -0.217065569
##  [59,] -0.380525802 -0.481041804
##  [60,]  0.291164943  0.127526593
##  [61,] -0.258186989 -0.674553789
##  [62,] -0.020114742 -0.167740094
##  [63,]  0.091168918 -0.249699144
##  [64,]  0.422764967  0.254467818
##  [65,]  0.403276112 -0.064184803
##  [66,]  0.052828087  0.529001924
##  [67,]  0.575463036  0.577098270
##  [68,] -0.251726751  0.520154481
##  [69,] -0.352580072 -0.228467059
##  [70,]  0.184593711 -0.212323789
##  [71,]  0.124011653 -0.282079183
##  [72,] -0.013510085 -0.505908988
##  [73,]  0.142575099 -0.132170007
##  [74,] -0.504782337  0.729751736
##  [75,] -0.211688643 -0.396178089
##  [76,] -0.312677529 -0.666641538
##  [77,] -0.006592903 -0.094046995
##  [78,] -0.330692072 -0.899712362
##  [79,] -0.020275138  0.204167927
##  [80,]  0.034809783 -0.093876331
##  [81,]  0.268258511 -0.078054741
##  [82,]  0.179895593  0.131338950
##  [83,] -0.346328724  0.093713535
##  [84,]  0.052059857  0.504012081
##  [85,]  0.219430318  0.195228917
##  [86,] -0.162536598  0.067684028
##  [87,] -0.163453392 -0.092920527
##  [88,]  0.261368868  0.756438606
##  [89,]  0.258562109  0.390283664
##  [90,] -0.117805560  0.163543051
##  [91,] -0.193292022  0.296453130
##  [92,] -0.882773313  1.934106818
##  [93,] -0.102206884 -0.037363801
##  [94,]  0.304863352  0.116872318
##  [95,]  0.421584409  0.055170250
##  [96,] -0.167129619  0.649128998
##  [97,] -0.150501785  0.406639139
##  [98,]  0.485292359  0.185398037
##  [99,] -0.485233158  1.797408455
## [100,]  0.281422423  0.519831604
## [101,] -0.122249789  0.166797410
## [102,]  0.057762344  0.025789925
## [103,] -0.074685580  0.053803643
## [104,]  0.121357012  0.194048262
## [105,] -0.144343534 -0.102743531
## [106,]  0.033447146 -0.116005700
## [107,]  0.419903417  0.316655552
## [108,]  0.241130362 -0.574728036
## [109,]  0.157452890  0.371894849
## [110,]  0.173720800 -0.304659622
## [111,] -0.473105137  0.077919144
## [112,] -0.263991740 -0.604788453
## [113,]  0.491024628  0.340004993
## [114,]  0.573503199 -0.056879240
## [115,] -0.514488699 -0.490657100
## [116,] -0.161541318  0.213341440
## [117,]  0.327524632 -0.185318581
## [118,] -0.157588582  0.443980301
## [119,] -0.015065344  0.135385658
## [120,] -0.004218263 -0.415260250
## [121,]  0.110864215 -0.046084090
## [122,] -0.172549898 -0.253773241
## [123,] -0.260346120 -0.561306227
## [124,] -0.541063357  0.051981541
## [125,] -0.083005419 -0.244854969
## [126,] -0.033132283 -0.401711450
## [127,]  0.234785806 -0.407633181
## [128,]  0.083455855 -0.018175953
## [129,] -0.126603282 -0.054060520
## [130,]  0.276313197  0.017240462
## [131,] -0.122179186 -0.348120032
## [132,] -0.208076061  1.176879327
## [133,]  0.468322090 -0.143481766
## [134,] -0.736661691  2.397642793
## [135,] -0.279062854  0.428824590
## [136,] -0.316849007  0.090064204
## [137,]  0.121808000  0.494321414
## [138,]  0.165811753 -0.238735942
## [139,] -0.270503514 -0.151090577
## [140,] -0.291629933 -0.674352970
## [141,]  0.065406378 -0.379304128
## [142,] -0.003813910 -0.004372423
## [143,]  0.319203487 -0.304806783
## [144,] -0.024588846 -0.045651653
## [145,] -0.398762627 -0.330964457
## [146,] -0.567074583 -0.112251147
## [147,]  0.172041283 -0.014918011
## [148,]  0.360919073 -0.100097498
## [149,] -0.126574519  0.335427863
## [150,]  0.318024916 -0.464756331
## [151,]  0.294296565  0.343353222
## [152,]  0.107065761 -0.027518949
## [153,] -0.083503255  0.067108838
## [154,] -0.066090756 -0.538654998
## [155,]  0.074525004  0.499668797
## [156,]  0.400843344 -0.165878653
## [157,] -0.046731451 -0.045777967
## [158,] -0.340303257  0.498010231
## [159,] -0.298434718 -0.580351409
## [160,] -0.379645282 -1.098340987
## [161,] -0.095208920 -0.553393132
## [162,]  0.111213163 -0.262555109
## [163,]  0.122932380  0.236068239
## [164,]  0.049542809 -0.470367231
## [165,] -0.068668454  0.550674383
## [166,]  0.261144035 -0.059561097
## [167,]  0.406177784  0.157973479

#The corresponding eigenvalues
eig(kpc)

##     Comp.1     Comp.2 
## 0.07595030 0.02694576

# sigma=0.00000000000000001
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.00000000000000001),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 1 * 10^(-17)")

# sigma=0.0000000000000001
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.0000000000000001),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 1 * 10^(-16)")

# sigma=0.0000000000000002
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.0000000000000002),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 2 * 10^(-16)")

# sigma=0.000000000000000318
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.000000000000000318),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 3,18 * 10^(-16)")

# sigma=0.00000000000000034
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.00000000000000034),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 3,4 * 10^(-16)")

# sigma=0.00000000000000035
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.00000000000000035),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 3,5 * 10^(-16)")

# sigma=0.00000000000000038
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.00000000000000038),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 3,8 * 10^(-16)")

# sigma=0.00000000000000039
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.00000000000000039),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 3,9 * 10^(-16)")

# sigma=0.0000000000000004
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.0000000000000004),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 4 * 10^(-16)")

# sigma=0.05
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.05),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 0,05")

# sigma=0.62
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.62),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 0,62")

# sigma=0.63
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=0.63),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 0,63")

# sigma=13.9
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=13.9),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 13,9")

# sigma=14
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=14),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 14")

# sigma=229
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=229),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 229")

# sigma=230
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=230),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 230")

# sigma=799
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=799),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 799")

# sigma=800
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=800),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 800")

# sigma=1019
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=1019),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 1019")

# sigma=1020
kpc <- kpca(~.,data=as.data.frame(data_scaled),kernel="anovadot",
            kpar=list(sigma=1020),features=2)
plot(rotated(kpc),col=as.factor(data[,13]), pch=19, cex=2,
     xlab="1st Principal Component",ylab="2nd Principal Component")
title(main = "sigma = 1020")

fviz_nbclust(data_scaled, kmeans, method = "wss") + 
  labs(subtitle = "Elbow method")

fviz_nbclust(data.pca$x, kmeans, method = "wss") + 
  labs(subtitle = "Elbow method")

fviz_nbclust(Data.pca$call$X, kmeans, method = "wss") + 
  labs(subtitle = "Elbow method")

fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

fviz_nbclust(data.pca$x, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

fviz_nbclust(Data.pca$call$X, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

fviz_nbclust(data_scaled, kmeans, nstart = 25,  method = "gap_stat", nboot = 100) +
  labs(subtitle = "Gap statistic method")

fviz_nbclust(data.pca$x, kmeans, nstart = 25,  method = "gap_stat", nboot = 100) +
  labs(subtitle = "Gap statistic method")

fviz_nbclust(Data.pca$call$X, kmeans, nstart = 25,  method = "gap_stat", nboot = 100) +
  labs(subtitle = "Gap statistic method")

nbclust <- NbClust(data = data.pca$x[,1:2], distance = "euclidean",
                   min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 10 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 2 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************

# Best number of clusters proposed by each index and the corresponding index value.
nbclust[["Best.nc"]]

##                     KL      CH Hartigan     CCC   Scott Marriot   TrCovW
## Number_clusters 4.0000   8.000   4.0000  8.0000   4.000     4.0     3.00
## Value_Index     5.4023 163.687  34.9958 -2.6812 140.899 69965.5 63599.88
##                  TraceW Friedman   Rubin Cindex     DB Silhouette   Duda
## Number_clusters  4.0000   4.0000  4.0000 2.0000 4.0000     2.0000 2.0000
## Value_Index     66.7049   3.6352 -0.1958 0.1939 0.8576     0.4155 0.5646
##                 PseudoT2  Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000 2.0000     3.000   3.0000     4.0000    1  2.0000
## Value_Index      58.6142 0.7627     0.404 128.2709     0.5551   NA  0.5596
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 7.0000      0  4.0000      0 10.0000
## Value_Index     0.0609      0  1.7671      0  0.3292

# Values of indices for each partition of the dataset obtained with a number of clusters between min.nc and max.nc.
nbclust[["All.index"]]

##        KL       CH Hartigan     CCC    Scott  Marriot    TrCovW   TraceW
## 2  0.5943 146.2577  60.9097 -3.0684 183.1325 235439.4 96301.452 500.0038
## 3  0.8326 129.7878  80.2574 -5.6293 278.0841 300008.7 32701.575 365.1930
## 4  5.4023 154.6719  45.2616 -3.3188 418.9831 229400.4 18267.604 245.1990
## 5  1.1848 158.5524  28.8335 -2.9719 493.9816 228757.7 10512.596 191.9098
## 6  0.3998 154.2266  35.8669 -3.3451 548.0993 238232.9  6087.265 162.9137
## 7  3.4154 162.1184  25.4688 -2.7480 618.3376 212929.5  3255.076 133.2326
## 8  2.1200 163.6870  18.6780 -2.6812 664.6517 210754.6  3658.801 114.9369
## 9  3.5525 161.3646  17.8869 -2.9312 702.9458 212078.0  3113.913 102.8545
## 10 0.2564 160.6375  14.4383 -3.0611 738.4794 211642.6  2012.351  92.3946
##    Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2    1.8703  1.8864 0.1939 1.0274     0.4155 0.5646  58.6142  0.7627    0.3452
## 3    3.1405  2.5828 0.2359 0.9941     0.4135 0.9413   2.3085  0.0555    0.4040
## 4    6.7757  3.8467 0.2798 0.8576     0.3925 0.9242   7.3021  0.0810    0.3950
## 5    8.0621  4.9149 0.3006 0.8986     0.3750 1.2377 -13.6378 -0.1889    0.3829
## 6    9.4530  5.7896 0.2738 0.9047     0.3599 1.1424  -5.8582 -0.1207    0.3610
## 7   12.4982  7.0794 0.3033 0.8662     0.3741 1.0764  -2.5540 -0.0684    0.3420
## 8   14.3635  8.2063 0.2823 0.8790     0.3573 1.4066 -12.7182 -0.2759    0.3251
## 9   16.0482  9.1704 0.2909 0.8839     0.3418 2.3211  -7.3993 -0.2846    0.3100
## 10  17.7871 10.2085 0.2753 0.8815     0.3307 1.6387 -13.6419 -0.3543    0.2967
##        Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  250.0019     0.4409 -0.3387  0.5596 0.0111 0.0012  1.8268 1.4394 1.8465
## 3  121.7310     0.5520  0.6371  0.5669 0.0301 0.0018  2.3442 1.3130 1.1804
## 4   61.2997     0.5551  1.2868  0.7487 0.0096 0.0023  1.7671 1.0848 0.4765
## 5   38.3820     0.4970  0.9456  1.1149 0.0214 0.0024  1.9239 0.9524 0.5154
## 6   27.1523     0.4596  0.4442  1.4258 0.0528 0.0024  2.1364 0.8842 0.4554
## 7   19.0332     0.4455  0.8828  1.6007 0.0609 0.0025  1.9880 0.7972 0.4855
## 8   14.3671     0.4131  0.7258  1.9252 0.0464 0.0026  2.5968 0.7318 0.4470
## 9   11.4283     0.3876  0.4617  2.2281 0.0467 0.0027  2.9505 0.6960 0.4063
## 10   9.2395     0.3750  0.4029  2.3958 0.0510 0.0027  2.8636 0.6557 0.3292

# Critical values of some indices for each partition obtained with a number of clusters between min.nc and max.nc.
nbclust[["All.CriticalValues"]]

##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.4230           103.6700       0.4679
## 3         -0.0987          -411.9228       0.9462
## 4          0.4075           129.4084       0.9222
## 5          0.3683           121.7865       1.0000
## 6          0.2521           139.4290       1.0000
## 7          0.2234           125.1193       1.0000
## 8          0.1671           219.2765       1.0000
## 9         -0.7431           -30.4948       1.0000
## 10        -0.0307         -1175.3080       1.0000

best_n_of_clusters <- nbclust[["Best.nc"]]
best_N_of_clusters <- as.integer(best_n_of_clusters[1,])
hist(best_N_of_clusters,
     main = "Clusters proposed by indexes",
     xlab = "Number of clusters",
     border = "Magenta",
     col = "Blue",
     xlim = c (0, 10),
     ylim = c (0, 20),
     breaks = 20)

# k-means clustering
clusters_2 <- kmeans(x = data.pca$x[,1:2], centers = 2, iter.max = 30, nstart = 50)
clusters_3 <- kmeans(x = data.pca$x[,1:2], centers = 3, iter.max = 30, nstart = 50)
clusters_5 <- kmeans(x = data.pca$x[,1:2], centers = 5, iter.max = 30, nstart = 50)
par(mfrow=c(1,2))

plot(data.pca$x[,1:2], col=as.factor(data$state2), pch=20, cex=2.5, xlab="1st PC",ylab="2nd PC")
title(main = "original")
plot(data.pca$x[,1:2], col=clusters_2$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_2$centers[,1], clusters_2$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=clusters_2$cluster)) + 
  geom_point(alpha=0.3, size=3) +
   scale_colour_gradientn(colours=rainbow(2)) +
    geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
     theme_dark() +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

plot(data.pca$x[,1:2], col=as.factor(data$state3), pch=20, cex=2.5, xlab="1st PC",ylab="2nd PC")
title(main = "original")
plot(data.pca$x[,1:2], col=clusters_3$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_3$centers[,1], clusters_3$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=clusters_3$cluster)) + 
  geom_point(alpha=0.3, size=3) +
   scale_colour_gradientn(colours=rainbow(3)) +
    geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
     theme_dark() +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

plot(data.pca$x[,1:2], col=as.factor(data$state5), pch=20, cex=2.5, xlab="1st PC",ylab="2nd PC")
title(main = "original")
plot(data.pca$x[,1:2], col=clusters_5$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_5$centers[,1], clusters_5$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=clusters_5$cluster)) + 
  geom_point(alpha=0.3, size=3) +
   scale_colour_gradientn(colours=rainbow(5)) +
    geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
     theme_dark() +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

# k-means clustering
Clusters_2 <- pam(x = data.pca$x[,1:2], k = 2, nstart = 50)
Clusters_3 <- pam(x = data.pca$x[,1:2], k = 3, nstart = 50)
Clusters_5 <- pam(x = data.pca$x[,1:2], k = 5, nstart = 50)
par(mfrow=c(1,2))

plot(data.pca$x[,1:2], col=clusters_2$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_2$centers[,1], clusters_2$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")
plot(data.pca$x[,1:2], col=Clusters_2$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(Clusters_2$medoids[,1], Clusters_2$medoids[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-medoids")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=Clusters_2$cluster)) + 
  geom_point(alpha=0.3, size=3) +
   scale_colour_gradientn(colours=rainbow(2)) +
    geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
     theme_dark() +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

plot(data.pca$x[,1:2], col=clusters_3$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_3$centers[,1], clusters_3$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")
plot(data.pca$x[,1:2], col=Clusters_3$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(Clusters_3$medoids[,1], Clusters_3$medoids[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-medoids")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=Clusters_3$cluster)) + 
  geom_point(alpha=0.3, size=3) +
   scale_colour_gradientn(colours=rainbow(3)) +
    geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
     theme_dark() +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

plot(data.pca$x[,1:2], col=clusters_5$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(clusters_5$centers[,1], clusters_5$centers[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-means")
plot(data.pca$x[,1:2], col=Clusters_5$cluster, pch=20, cex=2.5, xlab="X",ylab="Y")
points(Clusters_5$medoids[,1], Clusters_5$medoids[,2], col="darkviolet", pch = 8, lwd=1.5)
title(main = "k-medoids")

ggplot(data, aes(data.pca$x[,1], data.pca$x[,2], colour=Clusters_5$cluster)) + 
  geom_point(alpha=0.3, size=3) +
   scale_colour_gradientn(colours=rainbow(5)) +
    geom_text( label=data$country, nudge_x=0, nudge_y=0, hjust=0.5, vjust=-1, angle=0, size=3.65, check_overlap = TRUE ) +
     theme_dark() +
      theme(legend.key.size = unit(1.5, 'cm'),
            legend.title = element_text(size = 12), 
            legend.text  = element_text(size = 18),
            axis.title.x = element_text(size = 20),
            axis.title.y = element_text(size = 20)  )

dist_matrix <- dist(data_scaled, method = "euclidean")

# By default, the COMPLETE LINKAGE method is used.
# Complete linkage method finds similar clusters.
clusters_completeL <- hclust(dist_matrix, method = "complete")
clusters_completeL$labels <- data$country

plot(clusters_completeL, 
     xlab = "Agglomerative clustering with 167 initial clusters/countries", 
     cex = 1.1, 
     label = data$country)

clusterCut_2 <- cutree(clusters_completeL, k = 2)    # calculate final labeling given the number of clusters
table(clusterCut_2)                    # Number of members in each cluster

## clusterCut_2
##   1   2 
##  55 112

clusterCut_3 <- cutree(clusters_completeL, k = 3)
table(clusterCut_3)

## clusterCut_3
##   1   2   3 
##  55 109   3

clusterCut_5 <- cutree(clusters_completeL, k = 5)
table(clusterCut_5)

## clusterCut_5
##  1  2  3  4  5 
## 54 95 14  3  1

# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Complete Linkage [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Complete Linkage [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Complete Linkage [5]")

dend_completeL <- as.dendrogram(clusters_completeL)

dend_2 <- color_branches(dend_completeL, k = 2)
plot(dend_2, type = "triangle", center = TRUE, cex = 0.4, main = "Complete Linkage method [2]")

dend_3 <- color_branches(dend_completeL, k = 3)
plot(dend_3, type = "triangle", center = TRUE, cex = 0.3, main = "Complete Linkage method [3]")

dend_5 <- color_branches(dend_completeL, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Complete Linkage method [5]")

fviz_dend(clusters_completeL,
          k = 2,
          k_colors = "nejm",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "nejm",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.auto",
          ggtheme = theme_void()  )

fviz_dend(clusters_completeL,
          k = 3,
          k_colors = "uchicago",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "uchicago",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout_with_lgl",
          ggtheme = theme_void()  )

fviz_dend(clusters_completeL,
          k = 5,
          k_colors = "uchicago",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "uchicago",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.gem",
          ggtheme = theme_void()  )

# This time, we will use the SINGLE LINKAGE method:
# it adopts a ‘friends of friends’ clustering strategy.
clusters_singleL <- hclust(dist_matrix, method = "single")
clusters_singleL$labels <- data$country
plot(clusters_singleL, xlab = "Agglomerative clustering with 167 initial clusters/countries", cex = 1.1)

clusterCut_2 <- cutree(clusters_completeL, k = 2)    # calculate final labeling given the number of clusters
table(clusterCut_2)                    # Number of members in each cluster

## clusterCut_2
##   1   2 
##  55 112

clusterCut_3 <- cutree(clusters_completeL, k = 3)
table(clusterCut_3)

## clusterCut_3
##   1   2   3 
##  55 109   3

clusterCut_5 <- cutree(clusters_completeL, k = 5)
table(clusterCut_5)

## clusterCut_5
##  1  2  3  4  5 
## 54 95 14  3  1

# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Single Linkage [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Single Linkage [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Single Linkage [5]")

dend_singleL <- as.dendrogram(clusters_singleL)

dend_2 <- color_branches(dend_singleL, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Single Linkage method [2]")

dend_3 <- color_branches(dend_singleL, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Single Linkage method [3]")

dend_5 <- color_branches(dend_singleL, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Single Linkage method [5]")

fviz_dend(clusters_singleL,
          k = 2,
          k_colors = "nejm",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "nejm",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.auto",
          ggtheme = theme_void()  )

fviz_dend(clusters_singleL,
          k = 3,
          k_colors = "uchicago",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "uchicago",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout_with_lgl",
          ggtheme = theme_void()  )

fviz_dend(clusters_singleL,
          k = 5,
          k_colors = "uchicago",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "uchicago",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.gem",
          ggtheme = theme_void()  )

# This time, we will use the AVERAGE method:
clusters_average <- hclust(dist_matrix, method = "average")
clusters_average$labels <- data$country
plot(clusters_average, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

clusterCut_2 <- cutree(clusters_average, k = 2)    # calculate final labeling given the number of clusters
table(clusterCut_2)                    # Number of members in each cluster

## clusterCut_2
##   1   2 
## 166   1

clusterCut_3 <- cutree(clusters_average, k = 3)
table(clusterCut_3)

## clusterCut_3
##   1   2   3 
## 163   3   1

clusterCut_5 <- cutree(clusters_average, k = 5)
table(clusterCut_5)

## clusterCut_5
##   1   2   3   4   5 
## 161   1   3   1   1

# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Average Method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Average Method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Average Method [5]")

dend_average <- as.dendrogram(clusters_average)

dend_2 <- color_branches(dend_average, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Average method [2]")

dend_3 <- color_branches(dend_average, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Average method [3]")

dend_5 <- color_branches(dend_average, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Average method [5]")

fviz_dend(clusters_average,
          k = 2,
          k_colors = "nejm",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "nejm",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.auto",
          ggtheme = theme_void()  )

fviz_dend(clusters_average,
          k = 3,
          k_colors = "uchicago",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "uchicago",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout_with_lgl",
          ggtheme = theme_void()  )

fviz_dend(clusters_average,
          k = 5,
          k_colors = "uchicago",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "uchicago",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.gem",
          ggtheme = theme_void()  )

# This time, we will use the MEDIAN distance method:
clusters_median <- hclust(dist_matrix, method = "median")
clusters_median$labels <- data$country
plot(clusters_median, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

clusterCut_2 <- cutree(clusters_median, k = 2)    # calculate final labeling given the number of clusters
table(clusterCut_2)                    # Number of members in each cluster

## clusterCut_2
##   1   2 
## 166   1

clusterCut_3 <- cutree(clusters_median, k = 3)
table(clusterCut_3)

## clusterCut_3
##   1   2   3 
## 161   5   1

clusterCut_5 <- cutree(clusters_median, k = 5)
table(clusterCut_5)

## clusterCut_5
##   1   2   3   4   5 
## 161   3   1   1   1

# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Median method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Median method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Median method [5]")

#--------------------

dend_median <- as.dendrogram(clusters_median)

dend_2 <- color_branches(dend_median, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Average method [2]")

dend_3 <- color_branches(dend_median, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Average method [3]")

dend_5 <- color_branches(dend_median, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Average method [5]")

# This time, we will use the centroid method:
clusters_centroid <- hclust(dist_matrix, method = "centroid")
clusters_centroid$labels <- data$country
plot(clusters_centroid, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

clusterCut_2 <- cutree(clusters_centroid, k = 2)    # calculate final labeling given the number of clusters
table(clusterCut_2)                    # Number of members in each cluster

## clusterCut_2
##   1   2 
## 166   1

clusterCut_3 <- cutree(clusters_centroid, k = 3)
table(clusterCut_3)

## clusterCut_3
##   1   2   3 
## 165   1   1

clusterCut_5 <- cutree(clusters_centroid, k = 5)
table(clusterCut_5)

## clusterCut_5
##   1   2   3   4   5 
## 161   3   1   1   1

# plot clustering result using 2d scatterplot
plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Centroid method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Centroid method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Centroid method [5]")

#--------------------

dend_centroid <- as.dendrogram(clusters_centroid)

dend_2 <- color_branches(dend_centroid, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Centroid method [2]")

dend_3 <- color_branches(dend_centroid, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Centroid method [3]")

dend_5 <- color_branches(dend_centroid, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Centroid method [5]")

# This time, we will use Ward's method:
# Ward's minimum variance method aims at finding compact, spherical clusters.
# "ward.D" does not implement Ward's (1963) clustering criterion,
# whereas option "ward.D2" implements that criterion:
# with the latter, dissimilarities are squared before cluster updating.
clusters_ward <- hclust(dist_matrix, method = "ward.D2")
clusters_ward$labels <- data$country
plot(clusters_ward, xlab = "Agglomerative clustering with 167 initial clusters", cex = 1.1)

rect.hclust(clusters_ward, k = 5, border = 2:5)

clusterCut_2 <- cutree(clusters_ward, k = 2)    # calculate final labeling given the number of clusters
table(clusterCut_2)                    # Number of members in each cluster

## clusterCut_2
##   1   2 
## 133  34

clusterCut_3 <- cutree(clusters_ward, k = 3)
table(clusterCut_3)

## clusterCut_3
##   1   2   3 
##  27 106  34

clusterCut_5 <- cutree(clusters_ward, k = 5)
table(clusterCut_5)

## clusterCut_5
##  1  2  3  4  5 
## 27 63 43 31  3

plot(data.pca$x[,1:2], col=clusterCut_2, pch=19, cex=2.5, xlab="X", ylab="Y", main="Ward's method [2]")

plot(data.pca$x[,1:2], col=clusterCut_3, pch=19, cex=2.5, xlab="X", ylab="Y", main="Ward's method [3]")

plot(data.pca$x[,1:2], col=clusterCut_5, pch=19, cex=2.5, xlab="X", ylab="Y", main="Ward's method [5]")

dend_ward <- as.dendrogram(clusters_ward)

dend_2 <- color_branches(dend_ward, k = 2)
plot(dend_2, type = "triangle", center = TRUE, main = "Ward's method [2]")

dend_3 <- color_branches(dend_ward, k = 3)
plot(dend_3, type = "triangle", center = TRUE, main = "Ward's method [3]")

dend_5 <- color_branches(dend_ward, k = 5)
plot(dend_5, type = "triangle", center = TRUE, main = "Ward's method [5]")

fviz_dend(clusters_ward,
          k = 2,
          k_colors = "igv",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "igv",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.auto",
          ggtheme = theme_void()  )

fviz_dend(clusters_ward,
          k = 3,
          k_colors = "jco",
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = "jco",
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout_with_lgl",
          ggtheme = theme_void()  )

fviz_dend(clusters_ward,
          k = 5,
          k_colors = c("#5050FFFF", "#7AA6DCFF", "#868686FF", "#CD534CFF", "#EFC000FF"),
          color_labels_by_k = TRUE,
          lwd = 0.7,
          cex = 1.3,
          rect = TRUE,
          rect_border = c("#5050FFFF", "#7AA6DCFF", "#868686FF", "#CD534CFF", "#EFC000FF"),
          rect_fill = TRUE,
          type = "phylogenic",
          repel = TRUE,
          phylo_layout = "layout.gem",
          ggtheme = theme_void()  )

Exploratory Data Analysis

Dimitrios Kapetanios

2023-04-22

PCA