# Clustering demo y happy
library(htmltab)
library(stringr)
library(magrittr)

# Scrape the Democracy Index table from Wikipedia.
demolink <- "https://en.wikipedia.org/wiki/Democracy_Index"
demopath <- '//*[@id="mw-content-text"]/div/table[2]/tbody'
demo <- htmltab(doc = demolink, which = demopath)

# Democracy data cleanup: keep only the header text before ">>" and remove
# every whitespace character from the column names.
names(demo) <- str_split(names(demo), ">>", simplify = TRUE)[, 1] %>%
  gsub("\\s", "", .)

# Preparation: drop Rank (first column) BEFORE any positional indexing.
# With Rank still in place, the index vector c(1, 8, 9, 10) pointed one
# column off and the trimming step skipped Civilliberties.
demo <- demo[, -1]

# No blanks: strip horizontal/vertical whitespace from every column,
# including Country, which is later used as the merge key.
demo[] <- lapply(demo, trimws, whitespace = "[\\h\\v]")

# To numeric: everything except Country (1), Regimetype (8), Region (9) and
# Changesfromlastyear (10).
demo[, -c(1, 8, 9, 10)] <- lapply(demo[, -c(1, 8, 9, 10)], as.numeric)

# Let's see what we have:
str(demo)
## 'data.frame': 167 obs. of 10 variables:
## $ Country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ Score : num 9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
## $ Electoralprocessandpluralism: num 10 10 9.58 10 10 10 10 9.58 10 9.58 ...
## $ Functioningofgovernment : num 9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
## $ Politicalparticipation : num 10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
## $ Politicalculture : num 10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
## $ Civilliberties : num 9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
## $ Regimetype : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
## $ Region : chr "Western Europe" "Western Europe" "Western Europe" "Asia & Australasia" ...
## $ Changesfromlastyear : chr "Score: Rank:" "Score: Rank:" "Score: Rank:" "Score: Rank:" ...
library(htmltab)
library(stringr)

# Scrape the World Happiness Report table from Wikipedia.
happyL <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
happyPath <- '//*[@id="mw-content-text"]/div/table/tbody'
happy <- htmltab(doc = happyL, which = happyPath, encoding = "UTF-8")

# No blanks: strip horizontal/vertical whitespace from every column.
happy[] <- lapply(happy, trimws, whitespace = "[\\h\\v]")

# Keep only the first word of each header, then rename Score so it does not
# share a name with the Score column of the democracy table.
names(happy) <- str_split(names(happy), " ", simplify = TRUE)[, 1]
names(happy)[names(happy) == "Score"] <- "ScoreHappy"

# This must come after the renaming above.
happy[["Overall"]] <- NULL

# Columns 2 to 8 become numeric; rows left with NA are then discarded
# (this must come after the conversion).
happy[2:8] <- lapply(happy[2:8], as.numeric)
happy <- na.omit(happy)

str(happy)
## 'data.frame': 156 obs. of 8 variables:
## $ Country : chr "Finland" "Denmark" "Norway" "Iceland" ...
## $ ScoreHappy : num 7.77 7.6 7.55 7.49 7.49 ...
## $ GDP : num 1.34 1.38 1.49 1.38 1.4 ...
## $ Social : num 1.59 1.57 1.58 1.62 1.52 ...
## $ Healthy : num 0.986 0.996 1.028 1.026 0.999 ...
## $ Freedom : num 0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
## $ Generosity : num 0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
## $ Perceptions: num 0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...
# Join both tables on the country name. The key is spelled out so the join
# cannot silently widen to every column name the two tables happen to share.
demohappy <- merge(demo, happy, by = "Country")

# Drop columns 8 to 10 of the merged frame: Regimetype, Region and
# Changesfromlastyear (the non-numeric democracy columns).
demohappy <- demohappy[, -c(8, 9, 10)]
library(cluster)

# Fix the random seed, label rows by country, and compute Gower
# dissimilarities over columns 3 to 14 of the merged table.
set.seed(123)
rownames(demohappy) <- demohappy$Country
g.dist <- daisy(demohappy[, 3:14], metric = "gower")

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Partitioning around medoids with three clusters; store the assignment.
res.pam <- pam(g.dist, k = 3, cluster.only = FALSE)
demohappy$pam <- res.pam$cluster

# Agglomerative hierarchical clustering cut at three clusters; store the
# assignment.
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes")
demohappy$agnes <- res.agnes$cluster
# Gap-statistic plot for PAM, exploring up to 10 clusters.
fviz_nbclust(
  demohappy[, 3:14],
  FUNcluster = pam,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)
# Respuesta: No, se debió obtener 6 clusters con pam
# Gap-statistic plot for hierarchical clustering (hcut), exploring up to
# 10 clusters.
fviz_nbclust(
  demohappy[, 3:14],
  FUNcluster = hcut,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)
# Respuesta: No, se debió obtener 6 clusters
# Silhouette plot and per-cluster summary for the PAM solution.
fviz_silhouette(res.pam)
## cluster size ave.sil.width
## 1 1 56 0.26
## 2 2 63 0.22
## 3 3 28 0.44

# Silhouette plot and per-cluster summary for the AGNES solution.
fviz_silhouette(res.agnes)
## cluster size ave.sil.width
## 1 1 68 0.18
## 2 2 43 0.35
## 3 3 36 0.32

# Countries with a negative silhouette width under PAM.
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM$country <- rownames(silPAM)
poorPAM <- silPAM$country[silPAM$sil_width < 0]

# Countries with a negative silhouette width under AGNES.
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- rownames(silAGNES)
poorAGNES <- silAGNES$country[silAGNES$sil_width < 0]

# Answer: THIRTEEN (poorly clustered by AGNES but not by PAM)
setdiff(poorAGNES, poorPAM)
## [1] "Liberia" "Kuwait" "Turkey" "Madagascar" "Bhutan"
## [6] "Morocco" "Kyrgyzstan" "Bolivia" "Guatemala" "Honduras"
## [11] "Cyprus" "Italy" "Greece"
# Project the Gower dissimilarities onto two dimensions with classical
# multidimensional scaling and keep both coordinates in the table.
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE)
demohappy[["dim1"]] <- proyeccion$points[, 1]
demohappy[["dim2"]] <- proyeccion$points[, 2]

# Euclidean distances between countries on the two projected dimensions.
g.dist.cmd <- daisy(demohappy[, c("dim1", "dim2")], metric = "euclidean")

# k-nearest-neighbour distance plot (k = 6) on the projected distances.
library(dbscan)
kNNdistplot(g.dist.cmd, k = 6)
library(fpc)
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan

# Density-based clustering on the projected distances. Both fpc and dbscan
# export a function named dbscan(); the call is namespace-qualified so it no
# longer depends on the order in which the two packages were attached.
db.cmd <- fpc::dbscan(g.dist.cmd, eps = 0.09, MinPts = 6, method = "dist")
# Answer: 1 cluster
db.cmd
## dbscan Pts=147 MinPts=6 eps=0.09
## 0 1
## border 10 5
## seed 0 132
## total 10 137
# Countries left unclustered: store the DBSCAN assignment; label 0 marks the
# points that were not placed in any cluster.
demohappy$dbscan <- db.cmd$cluster
# Answer:
demohappy$Country[demohappy$dbscan == 0]
## [1] "Afghanistan" "Bahrain"
## [3] "Central African Republic" "Hong Kong"
## [5] "Qatar" "Saudi Arabia"
## [7] "Singapore" "Syria"
## [9] "United Arab Emirates" "Uzbekistan"

# Outliers that are also poorly clustered by AGNES.
atiDB <- demohappy$Country[demohappy$dbscan == 0]
# Answer: none
intersect(atiDB, poorAGNES)
## character(0)

# Countries in the lowest cluster.
# NOTE(review): assumes AGNES cluster 1 is the lowest-scoring group — confirm.
poorcluster <- demohappy$Country[demohappy$agnes == "1"]
intersect(poorcluster, atiDB)
## [1] "Afghanistan" "Bahrain"
## [3] "Central African Republic" "Qatar"
## [5] "Saudi Arabia" "Syria"
## [7] "United Arab Emirates" "Uzbekistan"
# Answer: