CLASE 6 - MAGALLANES ### PASO PREVIO PREVIO - LIBRERÍAS
# bibliotecas:
library(stringr)
library(magrittr)
library(htmltab)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
# coleccion
library(htmltab)
# Scrape the Democracy Index table from Wikipedia.
# `demopath` is the XPath handed to htmltab() to pick the table on the page.
demolink <- "https://en.wikipedia.org/wiki/Democracy_Index"
demopath <- '//*[@id="mw-content-text"]/div[1]/table[3]'
demo <- htmltab(doc = demolink, which = demopath)
# Cleaning ----
# Keep only the header text before ">>", then strip every whitespace character.
names(demo) <- str_split(names(demo), ">>", simplify = TRUE)[, 1] %>%
  gsub("\\s", "", .)
# Trim horizontal/vertical whitespace from every text column. Columns are
# picked by type rather than by position because Rank is still the first
# column at this point, so the positional indices used for the numeric
# conversion further down would be shifted by one here.
is_text <- vapply(demo, is.character, logical(1))
demo[is_text] <- lapply(demo[is_text], trimws, whitespace = "[\\h\\v]")
# Preparation ----
demo <- na.omit(demo)
demo <- demo[, -1] # drop Rank
# After dropping Rank, columns 1, 8 and 9 (Country, Regimetype, Region) stay
# as text; every other column is converted to numeric.
demo[, -c(1, 8, 9)] <- lapply(demo[, -c(1, 8, 9)], as.numeric)
# The recorded run emitted the warning below; the later str() output shows
# Changesfromlastyear as all NA, which is presumably where it comes from.
## Warning in lapply(demo[, -c(1, 8, 9)], as.numeric): NAs introduced by coercion
row.names(demo) <- demo$Country # use the country name as row name
# Inspect the prepared table (recorded output follows).
# NOTE(review): the "Â" inside two column names below looks like a
# mis-encoded character carried over from the scraped headers — confirm
# before referring to those columns by name.
str(demo)
## 'data.frame': 167 obs. of 10 variables:
## $ Country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ Score : num 9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
## $ Electoralprocessandpluralism: num 10 10 9.58 10 10 10 10 9.58 10 9.58 ...
## $ FunctioÂningofgovernÂment : num 9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
## $ PoliticalparticiÂpation : num 10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
## $ Politicalculture : num 10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
## $ Civilliberties : num 9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
## $ Regimetype : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
## $ Region : chr "Western Europe" "Western Europe" "Western Europe" "Asia & Australasia" ...
## $ Changesfromlastyear : num NA NA NA NA NA NA NA NA NA NA ...
# Fix the RNG seed before the steps that follow.
set.seed(2019)
# Columns 3 to 7 are the five component scores listed in the str() output.
inputData <- demo[, 3:7]
# Pairwise Gower dissimilarities between the rows of inputData.
g.dist <- daisy(inputData, metric = "gower")
A - RECOMENDACIÓN PARA PARTICIÓN
# Gap-statistic suggestion for the number of clusters, using PAM (up to 10).
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
fviz_nbclust(inputData, pam, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE)
B - RECOMENDACIÓN PARA JERARQUIZACIÓN
# Gap-statistic suggestion for the number of clusters, using hcut (up to 10).
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
fviz_nbclust(inputData, hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE)
# Partitioning: PAM with 5 clusters on the Gower dissimilarities.
# cluster.only = FALSE keeps the full result object (FALSE spelled out; the
# shorthand F is a reassignable variable).
res.pam <- pam(g.dist, 5, cluster.only = FALSE)
fviz_silhouette(res.pam) # silhouette plot
## cluster size ave.sil.width
## 1 1 13 0.59
## 2 2 32 0.29
## 3 3 35 0.32
## 4 4 29 0.16
## 5 5 58 0.39
# Hierarchical clustering via hcut() with the agnes function and "ward.D"
# method, cut at k = 5.
res.agnes <- hcut(g.dist, k = 5, hc_func = "agnes", hc_method = "ward.D")
fviz_silhouette(res.agnes) # silhouette plot
## cluster size ave.sil.width
## 1 1 12 0.64
## 2 2 34 0.29
## 3 3 42 0.26
## 4 4 38 0.12
## 5 5 41 0.41
# Hierarchical clustering via hcut() with the diana function, cut at k = 5.
res.diana <- hcut(g.dist, k = 5, hc_func = "diana")
fviz_silhouette(res.diana) # silhouette plot
## cluster size ave.sil.width
## 1 1 39 0.39
## 2 2 35 0.30
## 3 3 19 0.15
## 4 4 33 0.10
## 5 5 41 0.37
# The silhouette widths live in $silinfo$widths; negative values mark the
# poorly assigned cases.
head(res.pam$silinfo$widths)
## cluster neighbor sil_width
## Iceland 1 2 0.7195487
## Denmark 1 2 0.6967576
## Sweden 1 2 0.6783678
## Finland 1 2 0.6745581
## Canada 1 2 0.6569306
## Norway 1 2 0.6408634

# Build a data frame from a clustering result's silhouette widths and copy
# its row names into a `country` column.
silhouette_table <- function(fit) {
  widths <- data.frame(fit$silinfo$widths)
  widths$country <- row.names(widths)
  widths
}

# Return the `country` values of the rows whose silhouette width is below 0.
negative_silhouette_cases <- function(sil_table) {
  sil_table[sil_table$sil_width < 0, "country"]
}

# PAM
poorPAM <- silhouette_table(res.pam)
poorPAMcases <- negative_silhouette_cases(poorPAM)
poorPAMcases
## [1] "United Kingdom" "South Africa" "Trinidad and Tobago"
## [4] "Ukraine" "Madagascar" "Fiji"
## [7] "Honduras" "Papua New Guinea" "Sri Lanka"
## [10] "Burkina Faso" "Mauritania" "Ivory Coast"
# To count them:
# length(poorPAMcases)
# AGNES
poorAGNES <- silhouette_table(res.agnes)
poorAGNEScases <- negative_silhouette_cases(poorAGNES)
poorAGNEScases
## [1] "Hungary" "Germany" "Pakistan" "Argentina" "Malaysia"
## [6] "Hong Kong" "Togo" "Liberia" "Myanmar" "Madagascar"
## [11] "Ukraine" "Georgia" "Angola"
# DIANA
poorDIANA <- silhouette_table(res.diana)
poorDIANAcases <- negative_silhouette_cases(poorDIANA)
poorDIANAcases
## [1] "Lithuania" "Slovenia" "Italy"
## [4] "Uganda" "Kyrgyzstan" "Pakistan"
## [7] "Zambia" "Morocco" "Benin"
## [10] "Sierra Leone" "Bolivia" "Mali"
## [13] "Tanzania" "Bosnia and Herzegovina"
PREGUNTA: PAISES MAL ASIGNADOS EN EL CLUSTER DE DOS OBJETOS, POR EJEMPLO, AGNES Y PAM
# Cases with a negative silhouette width under both AGNES and PAM.
intersect(poorAGNEScases,poorPAMcases)# works like a logical AND
## [1] "Madagascar" "Ukraine"
PREGUNTA: PAISES MAL ASIGNADOS EN UNO, PERO NO EN EL OTRO, POR EJEMPLO MAL ASIGNADOS POR AGNES, PERO NO POR PAM
# Cases with a negative silhouette width under AGNES that are not in the PAM list.
setdiff(poorAGNEScases,poorPAMcases)# works like "but not"
## [1] "Hungary" "Germany" "Pakistan" "Argentina" "Malaysia" "Hong Kong"
## [7] "Togo" "Liberia" "Myanmar" "Georgia" "Angola"
PREGUNTA: PAISES MAL ASIGNADOS POR PAM O POR AGNES (O=+)
# Cases with a negative silhouette width under PAM or AGNES (each listed once).
union(poorPAMcases,poorAGNEScases)
## [1] "United Kingdom" "South Africa" "Trinidad and Tobago"
## [4] "Ukraine" "Madagascar" "Fiji"
## [7] "Honduras" "Papua New Guinea" "Sri Lanka"
## [10] "Burkina Faso" "Mauritania" "Ivory Coast"
## [13] "Hungary" "Germany" "Pakistan"
## [16] "Argentina" "Malaysia" "Hong Kong"
## [19] "Togo" "Liberia" "Myanmar"
## [22] "Georgia" "Angola"
1- MAPA DE POSICIONES
# Project the Gower dissimilarities onto two dimensions with cmdscale().
# TRUE is spelled out because the shorthand T is a reassignable variable.
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE) # k = number of dimensions
# Store both coordinates next to the input data.
inputData$dim1 <- proyeccion$points[, 1]
inputData$dim2 <- proyeccion$points[, 2]
2- LA GRAFICACIÓN
# Plot the two projected coordinates, drawing each row name as a text label.
base <- ggplot(
  inputData,
  aes(x = dim1, y = dim2, label = row.names(inputData))
)
base + geom_text(size = 2)
3 creando los clusters
# Attach each method's cluster membership as a factor column.
inputData$pam <- as.factor(res.pam$clustering)
inputData$agnes <- as.factor(res.agnes$cluster)
inputData$diana <- as.factor(res.diana$cluster)
# Smallest and largest value across both coordinates, to estimate axis limits.
min(inputData[, c("dim1", "dim2")])
max(inputData[, c("dim1", "dim2")])
## [1] -0.6359377
## [1] 0.6579872
4- la graficación segun cada uno
# Shared axis limits for both coordinates.
limites <- c(-0.7, 0.7)
# Common base: same limits on both axes and a fixed aspect ratio.
base <- ggplot(inputData, aes(x = dim1, y = dim2)) +
  ylim(limites) +
  xlim(limites) +
  coord_fixed()
# One scatter per clustering method, coloured by that method's cluster.
base +
  geom_point(size = 2, aes(color = pam)) +
  labs(title = "PAM")
base +
  geom_point(size = 2, aes(color = agnes)) +
  labs(title = "AGNES")
base +
  geom_point(size = 2, aes(color = diana)) +
  labs(title = "DIANA")
Nuevas distancias: las posiciones (dim1, dim2) son el insumo para dbscan.
# Euclidean (not Gower) distances, computed on the two projected coordinates.
g.dist.cmd = daisy(inputData[,c('dim1','dim2')], metric = 'euclidean')
CALCULANDO EPSILON
# k-nearest-neighbour distance plot (k = 5) on the projected distances, used
# here to pick the epsilon value.
library(dbscan)
kNNdistplot(g.dist.cmd, k=5)
OBTENIENDO CLUSTERS
library(fpc)
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
# Both fpc and dbscan export a dbscan() function (see the masking message
# above). Qualify the call so it always reaches fpc's version, whatever order
# the two packages were attached in.
db.cmd <- fpc::dbscan(g.dist.cmd, eps = 0.06, MinPts = 5, method = "dist")
db.cmd
## dbscan Pts=167 MinPts=5 eps=0.06
## 0 1 2
## border 11 12 5
## seed 0 86 53
## total 11 98 58
# Keep the DBSCAN membership as one more factor column.
inputData$dbCMD <- as.factor(db.cmd$cluster)
library(ggrepel)
# Same base as the earlier scatters: shared limits and fixed aspect ratio.
base <- ggplot(inputData, aes(x = dim1, y = dim2)) +
  ylim(limites) +
  xlim(limites) +
  coord_fixed()
dbplot <- base + geom_point(aes(color = dbCMD))
dbplot
# Every point labelled with its row name.
dbplot + geom_text_repel(size = 5, aes(label = row.names(inputData)))
# Label only the rows whose DBSCAN cluster is 0 (presumably the points left
# outside any cluster — confirm); all other labels are blank.
LABEL <- ifelse(inputData$dbCMD == 0, row.names(inputData), "")
dbplot +
  geom_text_repel(
    aes(label = LABEL),
    size = 5,
    direction = "y",
    ylim = 0.45,
    angle = 45,
    segment.colour = "grey"
  )