Clustering de países con el Democracy Index (demo) y el World Happiness Report (happy)

# Scrape the Democracy Index table from Wikipedia and prepare it for analysis.
library(htmltab)
demolink <- "https://en.wikipedia.org/wiki/Democracy_Index"
demopath <- '//*[@id="mw-content-text"]/div/table[2]/tbody'
demo <- htmltab(doc = demolink, which = demopath)

# Democracy data cleaning
library(stringr)
library(magrittr)
# Keep only the text before ">>" in each header, then remove all whitespace.
names(demo) <- str_split(names(demo), ">>", simplify = TRUE)[, 1] %>%
  gsub("\\s", "", .)

# Preparation
# Drop the first column (Rank) before any other positional indexing, so the
# trimming and the numeric conversion below refer to the same columns.
demo <- demo[, -1]

# Strip leading/trailing horizontal and vertical whitespace from every column
# (no blanks), so no column reaches as.numeric() or a later join untrimmed.
demo[] <- lapply(demo, trimws, whitespace = "[\\h\\v]")

# Columns 1, 8, 9 and 10 stay as text; every other column becomes numeric.
text_cols <- c(1, 8, 9, 10)
demo[, -text_cols] <- lapply(demo[, -text_cols], as.numeric)

# Let's see what we have:
str(demo)

## 'data.frame':    167 obs. of  10 variables:
##  $ Country                     : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ Score                       : num  9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
##  $ Electoralprocessandpluralism: num  10 10 9.58 10 10 10 10 9.58 10 9.58 ...
##  $ Functioningofgovernment   : num  9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
##  $ Politicalparticipation     : num  10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
##  $ Politicalculture            : num  10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
##  $ Civilliberties              : num  9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
##  $ Regimetype                  : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
##  $ Region                      : chr  "Western Europe" "Western Europe" "Western Europe" "Asia & Australasia" ...
##  $ Changesfromlastyear         : chr  "Score: Rank:" "Score: Rank:" "Score: Rank:" "Score: Rank:" ...

# Scrape the World Happiness Report table from Wikipedia and tidy it up.
library(htmltab)

happyL <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
happyPath <- '//*[@id="mw-content-text"]/div/table/tbody'

happy <- htmltab(doc = happyL, which = happyPath, encoding = "UTF-8")

# Remove leading/trailing blanks from every column.
happy[] <- lapply(happy, trimws, whitespace = "[\\h\\v]")

library(stringr)
# Shorten each header to its first space-delimited word, then rename the
# "Score" header to "ScoreHappy".
first_word <- str_split(names(happy), " ", simplify = TRUE)[, 1]
first_word[first_word == "Score"] <- "ScoreHappy"
names(happy) <- first_word

# Must run after the header renaming above.
happy$Overall <- NULL

# Convert columns 2 through 8 to numeric.
happy[, 2:8] <- lapply(happy[, 2:8], as.numeric)

# Must run after the conversion above: drops rows containing missing values.
happy <- na.omit(happy)

str(happy)

## 'data.frame':    156 obs. of  8 variables:
##  $ Country    : chr  "Finland" "Denmark" "Norway" "Iceland" ...
##  $ ScoreHappy : num  7.77 7.6 7.55 7.49 7.49 ...
##  $ GDP        : num  1.34 1.38 1.49 1.38 1.4 ...
##  $ Social     : num  1.59 1.57 1.58 1.62 1.52 ...
##  $ Healthy    : num  0.986 0.996 1.028 1.026 0.999 ...
##  $ Freedom    : num  0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
##  $ Generosity : num  0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
##  $ Perceptions: num  0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...

# Join both tables on their shared column name(s), then drop columns 8 to 10
# of the merged result.
demohappy <- merge(demo, happy)
demohappy <- demohappy[, -(8:10)]

library(cluster)

set.seed(123)
# Use the country names as row labels.
rownames(demohappy) <- demohappy$Country
# Gower dissimilarities computed over columns 3 to 14.
g.dist <- daisy(demohappy[, 3:14], metric = "gower")

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# PAM with 3 clusters on the Gower dissimilarities; the full result object is
# kept (not only the cluster vector) and the assignment is stored per country.
res.pam <- pam(g.dist, k = 3, cluster.only = FALSE)
demohappy$pam <- res.pam$cluster
# Hierarchical clustering via agnes, cut into 3 groups, on the same input.
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes")
demohappy$agnes <- res.agnes$cluster

# Gap-statistic plot for choosing the number of clusters with PAM (up to 10).
fviz_nbclust(
  demohappy[, 3:14],
  pam,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)

Respuesta: No, se debió obtener 6 clusters con pam

# Gap-statistic plot for choosing the number of clusters with hcut (up to 10).
fviz_nbclust(
  demohappy[, 3:14],
  hcut,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)

Respuesta: No, se debió obtener 6 clusters con hcut

# Silhouette plot and per-cluster summary for the PAM solution.
fviz_silhouette(sil.obj = res.pam)

##   cluster size ave.sil.width
## 1       1   56          0.26
## 2       2   63          0.22
## 3       3   28          0.44

# Silhouette plot and per-cluster summary for the agnes solution.
fviz_silhouette(sil.obj = res.agnes)

##   cluster size ave.sil.width
## 1       1   68          0.18
## 2       2   43          0.35
## 3       3   36          0.32

Respuesta: PAM es mejor, pues su silueta promedio es 0.28 y la de AGNES es 0.26

# Countries with a negative silhouette width under PAM; the country name is
# taken from the row names of the silhouette table.
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM$country <- row.names(silPAM)
poorPAM <- silPAM$country[silPAM$sil_width < 0]

# Same extraction for the agnes solution.
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- row.names(silAGNES)
poorAGNES <- silAGNES$country[silAGNES$sil_width < 0]

# answer: THIRTEEN
# Countries poorly clustered by agnes but not by PAM.
setdiff(poorAGNES, poorPAM)

##  [1] "Liberia"    "Kuwait"     "Turkey"     "Madagascar" "Bhutan"    
##  [6] "Morocco"    "Kyrgyzstan" "Bolivia"    "Guatemala"  "Honduras"  
## [11] "Cyprus"     "Italy"      "Greece"

# Project the Gower dissimilarities onto two dimensions with cmdscale()
# and store both coordinates next to each country.
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE)
mds_points <- proyeccion$points
demohappy$dim1 <- mds_points[, 1]
demohappy$dim2 <- mds_points[, 2]

# Euclidean distances between countries in the two-dimensional projection.
g.dist.cmd <- daisy(demohappy[, c("dim1", "dim2")], metric = "euclidean")

library(dbscan)
# k-nearest-neighbour distance plot with k = 6.
kNNdistplot(g.dist.cmd, k = 6)

library(fpc)

## 
## Attaching package: 'fpc'

## The following object is masked from 'package:dbscan':
## 
##     dbscan

# Density-based clustering on the distances of the two-dimensional projection.
# The call is namespace-qualified because both attached packages, dbscan and
# fpc, export a function named dbscan(); the arguments used here (MinPts,
# method = "dist") belong to fpc's version, so the result must not depend on
# the order in which the packages were attached.
db.cmd <- fpc::dbscan(g.dist.cmd, eps = 0.09, MinPts = 6, method = "dist")

# answer: 1 cluster

db.cmd

## dbscan Pts=147 MinPts=6 eps=0.09
##         0   1
## border 10   5
## seed    0 132
## total  10 137

# Countries left unclustered: store the dbscan label of every country.
demohappy$dbscan <- db.cmd$cluster

# answer: the countries whose dbscan label is 0
demohappy$Country[demohappy$dbscan == 0]

##  [1] "Afghanistan"              "Bahrain"                 
##  [3] "Central African Republic" "Hong Kong"               
##  [5] "Qatar"                    "Saudi Arabia"            
##  [7] "Singapore"                "Syria"                   
##  [9] "United Arab Emirates"     "Uzbekistan"

# Outliers (dbscan label 0) that are also poorly clustered by agnes
atiDB <- demohappy$Country[demohappy$dbscan == 0]

# answer: none

intersect(atiDB, poorAGNES)

## character(0)

# Countries in the lowest cluster (agnes cluster 1 is taken to be the lowest
# one -- TODO confirm) that dbscan also left as outliers.
poorcluster <- demohappy$Country[demohappy$agnes == 1]
intersect(poorcluster, atiDB)

## [1] "Afghanistan"              "Bahrain"                 
## [3] "Central African Republic" "Qatar"                   
## [5] "Saudi Arabia"             "Syria"                   
## [7] "United Arab Emirates"     "Uzbekistan"

#respuesta: