R Notebook

CLASE 6 - MAGALLANES ### PASO PREVIO PREVIO - LIBRERÍAS

# bibliotecas:
library(stringr)
library(magrittr)
library(htmltab)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)

# Data collection: read one table from the Wikipedia "Democracy Index" page.
library(htmltab)
demolink <- "https://en.wikipedia.org/wiki/Democracy_Index"
# XPath that selects the table to extract.
demopath <- '//*[@id="mw-content-text"]/div[1]/table[3]'
demo <- htmltab(which = demopath, doc = demolink)

TRAYENDO LA DATA

# Cleaning
# Keep only the part of each column name before ">>" and drop all whitespace.
names(demo) <- str_split(names(demo), ">>", simplify = TRUE)[, 1] %>%
  gsub('\\s', '', .)
# Trim horizontal/vertical whitespace from every column except 1, 8 and 9.
# NOTE(review): the later numeric conversion reuses the same -c(1, 8, 9) index
# after the first column has been dropped, so the two steps skip different
# columns; confirm that this is intended.
demo[, -c(1, 8, 9)] <- lapply(demo[, -c(1, 8, 9)], trimws,
                              whitespace = "[\\h\\v]")

# Preparation
demo <- na.omit(demo)
# Drop the first column (Rank).
demo <- demo[, -1]
# Convert every column except 1, 8 and 9 to numeric.
demo[, -c(1, 8, 9)] <- lapply(demo[, -c(1, 8, 9)], as.numeric)

## Warning in lapply(demo[, -c(1, 8, 9)], as.numeric): NAs introduced by coercion

# Use the country names as row names.
row.names(demo) <- demo[["Country"]]


# Inspect what we have:
str(demo)

## 'data.frame':    167 obs. of  10 variables:
##  $ Country                     : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ Score                       : num  9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
##  $ Electoralprocessandpluralism: num  10 10 9.58 10 10 10 10 9.58 10 9.58 ...
##  $ Functioningofgovernment   : num  9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
##  $ Politicalparticipation     : num  10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
##  $ Politicalculture            : num  10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
##  $ Civilliberties              : num  9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
##  $ Regimetype                  : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
##  $ Region                      : chr  "Western Europe" "Western Europe" "Western Europe" "Asia & Australasia" ...
##  $ Changesfromlastyear         : num  NA NA NA NA NA NA NA NA NA NA ...

PASO 0 - CALCULAR LAS DISTANCIAS - SEMILLA DE TODO

set.seed(2019)
# Columns 3 to 7 of demo are the clustering input.
inputData <- demo[, 3:7]
# Pairwise dissimilarities between rows, using the Gower metric.
g.dist <- daisy(inputData, metric = "gower")

PASO 1 DETERMINANDO CANTIDAD DE CLUSTERS

A - RECOMENDACIÓN PARA PARTICIÓN

# Suggested number of clusters for partitioning (pam), using the gap statistic
# and trying up to 10 clusters.
fviz_nbclust(inputData, pam, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE)

B - RECOMENDACIÓN PARA JERARQUIZACIÓN

# Suggested number of clusters for hierarchical clustering (hcut), using the
# gap statistic and trying up to 10 clusters.
fviz_nbclust(inputData, hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE)

PASO 2: EVALUACIÓN DE LOS CLUSTERS

# Partitioning: pam with 5 clusters on the precomputed dissimilarities.
res.pam <- pam(g.dist, k = 5, cluster.only = FALSE)
# Silhouette plot for the pam solution.
fviz_silhouette(res.pam)

##   cluster size ave.sil.width
## 1       1   13          0.59
## 2       2   32          0.29
## 3       3   35          0.32
## 4       4   29          0.16
## 5       5   58          0.39

# Hierarchical clustering with agnes, cut into 5 clusters.
res.agnes <- hcut(
  g.dist,
  k = 5,
  hc_func = "agnes",
  hc_method = "ward.D"
)
# Silhouette plot for the agnes solution.
fviz_silhouette(res.agnes)

##   cluster size ave.sil.width
## 1       1   12          0.64
## 2       2   34          0.29
## 3       3   42          0.26
## 4       4   38          0.12
## 5       5   41          0.41

# Hierarchical clustering with diana, cut into 5 clusters.
res.diana <- hcut(g.dist, hc_func = "diana", k = 5)
# Silhouette plot for the diana solution.
fviz_silhouette(res.diana)

##   cluster size ave.sil.width
## 1       1   39          0.39
## 2       2   35          0.30
## 3       3   19          0.15
## 4       4   33          0.10
## 5       5   41          0.37

PASO 3: EVALUACIÓN NUMÉRICA - PAM

# The silhouette widths are where the negative values show up.
head(res.pam[["silinfo"]][["widths"]])

##         cluster neighbor sil_width
## Iceland       1        2 0.7195487
## Denmark       1        2 0.6967576
## Sweden        1        2 0.6783678
## Finland       1        2 0.6745581
## Canada        1        2 0.6569306
## Norway        1        2 0.6408634

## Build a data frame from the pam silhouette widths.
poorPAM <- data.frame(res.pam[["silinfo"]][["widths"]])
# Keep the row names (countries) as a regular column.
poorPAM[["country"]] <- row.names(poorPAM)

# Countries whose silhouette width is below 0, i.e. the negative cases.
poorPAMcases <- poorPAM[["country"]][poorPAM[["sil_width"]] < 0]
poorPAMcases

##  [1] "United Kingdom"      "South Africa"        "Trinidad and Tobago"
##  [4] "Ukraine"             "Madagascar"          "Fiji"               
##  [7] "Honduras"            "Papua New Guinea"    "Sri Lanka"          
## [10] "Burkina Faso"        "Mauritania"          "Ivory Coast"

### To find out how many there are:

# length(poorPAMcases)

# agnes: same extraction of negative silhouette widths.
poorAGNES <- data.frame(res.agnes[["silinfo"]][["widths"]])
poorAGNES[["country"]] <- row.names(poorAGNES)
is_negative <- poorAGNES[["sil_width"]] < 0
poorAGNEScases <- poorAGNES[is_negative, "country"]
rm(is_negative)
poorAGNEScases

##  [1] "Hungary"    "Germany"    "Pakistan"   "Argentina"  "Malaysia"  
##  [6] "Hong Kong"  "Togo"       "Liberia"    "Myanmar"    "Madagascar"
## [11] "Ukraine"    "Georgia"    "Angola"

# diana: same extraction of negative silhouette widths.
poorDIANA <- data.frame(res.diana[["silinfo"]][["widths"]])
poorDIANA[["country"]] <- row.names(poorDIANA)
poorDIANAcases <- poorDIANA[["country"]][poorDIANA[["sil_width"]] < 0]
poorDIANAcases

##  [1] "Lithuania"              "Slovenia"               "Italy"                 
##  [4] "Uganda"                 "Kyrgyzstan"             "Pakistan"              
##  [7] "Zambia"                 "Morocco"                "Benin"                 
## [10] "Sierra Leone"           "Bolivia"                "Mali"                  
## [13] "Tanzania"               "Bosnia and Herzegovina"

CREO QUE ESTA ES FIJA ASI QUE OJO ARIANNA AAAAH

PREGUNTA: PAISES MAL ASIGNADOS EN EL CLUSTER DE DOS OBJETOS, POR EJEMPLO, AGNES Y PAM

# Cases flagged by both agnes and pam (works like a logical AND).
intersect(poorAGNEScases, poorPAMcases)

## [1] "Madagascar" "Ukraine"

PREGUNTA: PAISES MAL ASIGNADOS EN UNO, PERO NO EN EL OTRO; POR EJEMPLO, MAL ASIGNADOS POR AGNES, PERO NO POR PAM

# Cases flagged by agnes but not by pam (works like "but not").
setdiff(poorAGNEScases, poorPAMcases)

##  [1] "Hungary"   "Germany"   "Pakistan"  "Argentina" "Malaysia"  "Hong Kong"
##  [7] "Togo"      "Liberia"   "Myanmar"   "Georgia"   "Angola"

PREGUNTA: PAISES MAL ASIGNADOS POR PAM O POR AGNES (O=+)

# Cases flagged by pam or by agnes.
union(poorPAMcases, poorAGNEScases)

##  [1] "United Kingdom"      "South Africa"        "Trinidad and Tobago"
##  [4] "Ukraine"             "Madagascar"          "Fiji"               
##  [7] "Honduras"            "Papua New Guinea"    "Sri Lanka"          
## [10] "Burkina Faso"        "Mauritania"          "Ivory Coast"        
## [13] "Hungary"             "Germany"             "Pakistan"           
## [16] "Argentina"           "Malaysia"            "Hong Kong"          
## [19] "Togo"                "Liberia"             "Myanmar"            
## [22] "Georgia"             "Angola"

DENSIDAD

1- MAPA DE POSICIONES

# Multidimensional scaling of the dissimilarities; k is the number of
# dimensions to produce.
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE)
# Data frame prep: store the two coordinates as columns.
inputData$dim1 <- proyeccion$points[, 1]
inputData$dim2 <- proyeccion$points[, 2]

2- LA GRAFICACIÓN

# Base plot: each row is placed at (dim1, dim2) and labelled with its row name.
base <- ggplot(
  inputData,
  aes(x = dim1, y = dim2, label = row.names(inputData))
)
base + geom_text(size = 2)

3 creando los clusters

# Add each method's cluster assignment as a factor column.
inputData[["pam"]] <- as.factor(res.pam$clustering)
inputData[["agnes"]] <- as.factor(res.agnes$cluster)
inputData[["diana"]] <- as.factor(res.diana$cluster)
# Estimating the limits: overall minimum and maximum across both coordinates.
min(inputData[, c("dim1", "dim2")])
max(inputData[, c("dim1", "dim2")])

## [1] -0.6359377

## [1] 0.6579872

4- la graficación segun cada uno

# Shared limits applied to both axes.
limites <- c(-0.7, 0.7)

base <- ggplot(inputData, aes(x = dim1, y = dim2)) +
  ylim(limites) +
  xlim(limites) +
  coord_fixed()

# One scatter plot per method, coloured by that method's cluster column.
base + geom_point(aes(color = pam), size = 2) + labs(title = "PAM")

base + geom_point(aes(color = agnes), size = 2) + labs(title = "AGNES")

base + geom_point(aes(color = diana), size = 2) + labs(title = "DIANA")

CALCULANDO USANDO DBSCAN

Nuevas distancias: dbscan se calcula sobre las posiciones (dim1, dim2) del mapa.

# Euclidean distances this time, computed on the two map coordinates.
g.dist.cmd <- daisy(inputData[, c("dim1", "dim2")], metric = "euclidean")

CALCULANDO EPSILON

library(dbscan)
# k-nearest-neighbour distance plot with k = 5.
kNNdistplot(g.dist.cmd, k = 5)

OBTENIENDO CLUSTERS

library(fpc)

## 
## Attaching package: 'fpc'

## The following object is masked from 'package:dbscan':
## 
##     dbscan

# DBSCAN on the precomputed distance object. The call is namespaced because
# both the dbscan and fpc packages export a function named dbscan(), and the
# MinPts and method arguments used here belong to the fpc version; without the
# prefix the result depends on which package was attached last.
db.cmd <- fpc::dbscan(g.dist.cmd, eps = 0.06, MinPts = 5, method = "dist")
db.cmd

## dbscan Pts=167 MinPts=5 eps=0.06
##         0  1  2
## border 11 12  5
## seed    0 86 53
## total  11 98 58

## Store the result in another column (probably not needed for tomorrow).
inputData[["dbCMD"]] <- as.factor(db.cmd[["cluster"]])

LA ULTIMA GRAN GRAFICACION

library(ggrepel)
# Scatter plot of the map positions, coloured by the dbCMD column.
base <- ggplot(inputData, aes(x = dim1, y = dim2)) +
  ylim(limites) +
  xlim(limites) +
  coord_fixed()
dbplot <- base + geom_point(aes(color = dbCMD))
dbplot

# Same plot with a repelled text label for every row.
dbplot + geom_text_repel(aes(label = row.names(inputData)), size = 5)

Para que salgan los atipicos

# Label only the rows whose dbCMD value is 0; all other rows get an empty label.
LABEL <- ifelse(inputData$dbCMD == 0, row.names(inputData), "")
dbplot +
  geom_text_repel(
    aes(label = LABEL),
    size = 5,
    direction = "y",
    ylim = 0.45,
    angle = 45,
    segment.colour = "grey"
  )