# Pasos previos:
# Preliminary steps: download the World Happiness Report table from Wikipedia,
# clean it, and inspect the result.
library(htmltab)
library(stringr)

# Page to scrape and XPath of the table body to extract.
happyL <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
happyPath <- '//*[@id="mw-content-text"]/div/table/tbody'
happy <- htmltab(doc = happyL, which = happyPath, encoding = "UTF-8")

# Trim horizontal/vertical whitespace from every cell (no blanks).
happy[] <- lapply(happy, trimws, whitespace = "[\\h\\v]")

# Keep only the first word of each column name, then rename "Score".
names(happy) <- str_split(names(happy), " ", simplify = TRUE)[, 1]
names(happy)[names(happy) == "Score"] <- "ScoreHappy"

# Remove the "Overall" column; this must run after the renaming above.
happy$Overall <- NULL

# Coerce columns 2 to 8 to numeric, then drop rows containing NA; the NA
# removal must run after the coercion.
happy[, 2:8] <- lapply(happy[, 2:8], as.numeric)
happy <- na.omit(happy)

str(happy)
## 'data.frame': 155 obs. of 8 variables:
## $ Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ ScoreHappy : num 7.63 7.59 7.55 7.5 7.49 ...
## $ GDP : num 1.3 1.46 1.35 1.34 1.42 ...
## $ Social : num 1.59 1.58 1.59 1.64 1.55 ...
## $ Healthy : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ Freedom : num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ Generosity : num 0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ Perceptions: num 0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
## - attr(*, "na.action")= 'omit' Named int 20
## ..- attr(*, "names")= chr "21"
summary(happy)
## Country ScoreHappy GDP Social
## Length:155 Min. :2.905 Min. :0.0000 Min. :0.000
## Class :character 1st Qu.:4.452 1st Qu.:0.6125 1st Qu.:1.075
## Mode :character Median :5.358 Median :0.9400 Median :1.258
## Mean :5.367 Mean :0.8837 Mean :1.216
## 3rd Qu.:6.154 3rd Qu.:1.1925 3rd Qu.:1.464
## Max. :7.632 Max. :1.6490 Max. :1.644
## Healthy Freedom Generosity Perceptions
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.:0.4205 1st Qu.:0.3575 1st Qu.:0.109 1st Qu.:0.051
## Median :0.6430 Median :0.4930 Median :0.173 Median :0.082
## Mean :0.5969 Mean :0.4556 Mean :0.181 Mean :0.112
## 3rd Qu.:0.7785 3rd Qu.:0.5790 3rd Qu.:0.240 3rd Qu.:0.137
## Max. :1.0300 Max. :0.7240 Max. :0.598 Max. :0.457
# Bin the happiness score using the break points 0, 5, 7 and 10, and store
# the resulting factor as a new column.
cortes <- c(0, 5, 7, 10)
happy[["intervalos"]] <- cut(happy[["ScoreHappy"]], breaks = cortes)
# Count how many rows fall in each interval.
table(happy[["intervalos"]])
##
## (0,5] (5,7] (7,10]
## 59 83 13
# Cluster the countries into 3 groups with two methods (PAM and DIANA),
# both driven by the same Gower dissimilarity matrix.
library(cluster)
library(factoextra)

set.seed(123)

# Use the country as row name so it labels the rows of the dissimilarity
# matrix and of the clustering results.
row.names(happy) <- happy$Country

# Gower dissimilarity computed on columns 3 to 8 of `happy`.
g.dist <- daisy(happy[, 3:8], metric = "gower")

# Partitioning around medoids with 3 clusters; cluster.only = FALSE keeps the
# full result object instead of only the cluster vector.
res.pam <- pam(g.dist, 3, cluster.only = FALSE)
happy$pam <- res.pam$cluster

# Divisive hierarchical clustering (DIANA) cut into 3 clusters.
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
happy$diana <- res.diana$cluster
# Gap-statistic plot for PAM, trying up to k.max = 10 clusters.
# NOTE(review): the factoextra docs describe `diss` as used for the
# silhouette/wss methods and hierarchical clustering; confirm whether it has
# any effect with method = "gap_stat".
fviz_nbclust(happy[, 3:8], pam, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE)
# answer: with PAM, 5 was better
# Gap-statistic plot for DIANA. hcut() defaults to hc_func = "hclust"
# (agglomerative), so the divisive algorithm must be requested explicitly;
# extra arguments of fviz_nbclust() are forwarded to the clustering function.
fviz_nbclust(happy[, 3:8], hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE, hc_func = "diana")
# answer (recorded from the run made without hc_func): with DIANA, 4 was
# better -- NOTE(review): re-check this value now that DIANA is really used.
# Silhouette summary and plot for the PAM solution.
fviz_silhouette(res.pam)
## cluster size ave.sil.width
## 1 1 22 0.58
## 2 2 89 0.32
## 3 3 44 0.34
# Silhouette summary and plot for the DIANA solution.
fviz_silhouette(res.diana)
## cluster size ave.sil.width
## 1 1 97 0.23
## 2 2 18 0.28
## 3 3 40 0.07
# answer: PAM is better since its average silhouette is 0.36, while DIANA's is 0.19

# Per-observation silhouette widths for PAM; the row names are copied into a
# `country` column, and the entries with a negative width are collected.
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM[["country"]] <- row.names(silPAM)
poorPAM <- silPAM$country[silPAM$sil_width < 0]

# Same extraction for DIANA.
silDIANA <- data.frame(res.diana$silinfo$widths)
silDIANA[["country"]] <- row.names(silDIANA)
poorDIANA <- silDIANA$country[silDIANA$sil_width < 0]
##
# answer: FIVE (the same ones as for PAM, since there was no intersection)
setdiff(poorPAM, poorDIANA)
## [1] "Myanmar" "Nepal" "Japan" "Bahrain" "France"
# Project the Gower dissimilarities onto two dimensions with classical
# multidimensional scaling. With add = TRUE, cmdscale() returns a list, so the
# coordinates are read from its `points` element.
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE) # k is the number of dimensions
# Data frame preparation: store both coordinates as columns of `happy`.
happy$dim1 <- proyeccion$points[, 1]
happy$dim2 <- proyeccion$points[, 2]
# Euclidean distances between the projected points.
g.dist.cmd <- daisy(happy[, c("dim1", "dim2")], metric = "euclidean")
# NOT used here, because the epsilon value was already provided:
# library(dbscan)
# kNNdistplot(g.dist.cmd, k=6) # 6 input columns
library(fpc)
# Density-based clustering on the distances between the projected points;
# method = "dist" tells dbscan() that its input is a distance object.
db.cmd <- fpc::dbscan(g.dist.cmd, eps = 0.08, MinPts = 6, method = "dist")
# answer: 3 clusters
db.cmd
## dbscan Pts=155 MinPts=6 eps=0.08
## 0 1 2 3
## border 14 2 9 9
## seed 0 19 77 25
## total 14 21 86 34
# Store the cluster label of every row as a new column.
happy[["dbscan"]] <- db.cmd$cluster
# answer: countries that dbscan labelled 0 (not assigned to clusters 1-3)
atiDB <- happy$Country[happy$dbscan == 0]
atiDB
## [1] "Uzbekistan" "Tajikistan" "Bhutan" "Somalia" "Laos"
## [6] "Mozambique" "Myanmar" "Sudan" "Angola" "Afghanistan"
## [11] "Malawi" "Rwanda" "Yemen" "Burundi"
# answer: countries labelled 0 that also have a negative PAM silhouette width
intersect(atiDB, poorPAM)
## [1] "Myanmar"
# Countries in the lowest score interval that were also labelled 0.
poorIntervalo <- happy$Country[happy$intervalos == "(0,5]"]
intersect(poorIntervalo, atiDB)
## [1] "Somalia" "Laos" "Mozambique" "Myanmar" "Sudan"
## [6] "Angola" "Afghanistan" "Malawi" "Rwanda" "Yemen"
## [11] "Burundi"