Ejercicio de repaso

Paola Nieto (20150967)

Traer data

# Scrape the two Wikipedia tables used in this exercise.
library(htmltab)

# Democracy Index: second <table> under the article-body <div>.
linkPage1 <- "https://en.wikipedia.org/wiki/Democracy_Index"
linkPath1 <- '//*[@id="mw-content-text"]/div/table[2]'
demo <- htmltab(doc = linkPage1, which = linkPath1)

# World Happiness Report: <table> under the article-body <div>.
linkPage2 <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
linkPath2 <- '//*[@id="mw-content-text"]/div/table'
feliz <- htmltab(doc = linkPage2, which = linkPath2)

Limpieza de data

Democracia

library(stringr)
library(magrittr)

# Keep the header text that precedes ">>" and remove all whitespace from it.
names(demo) <- gsub(
  "\\s", "",
  str_split(names(demo), ">>", simplify = TRUE)[, 1]
)

# Trim horizontal/vertical whitespace around every cell.
demo[, ] <- lapply(demo[, ], trimws, whitespace = "[\\h\\v]")

# Drop columns 1, 10 and 11.
demo[, c(1, 10, 11)] <- NULL

# Everything except columns 1 and 8 (Country, Regimetype) becomes numeric.
demo[, -c(1, 8)] <- lapply(demo[, -c(1, 8)], as.numeric)

# Rename the score so it does not clash with the happiness score after merging.
names(demo)[names(demo) == "Score"] <- "ScoreDemo"
str(demo)

## 'data.frame':    167 obs. of  8 variables:
##  $ Country                     : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ ScoreDemo                   : num  9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
##  $ Electoralprocessandpluralism: num  10 10 9.58 10 10 10 10 9.58 10 9.58 ...
##  $ Functioningofgovernment   : num  9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
##  $ Politicalparticipation     : num  10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
##  $ Politicalculture            : num  10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
##  $ Civilliberties              : num  9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
##  $ Regimetype                  : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...

Felicidad

# Trim horizontal/vertical whitespace around every cell.
feliz[, ] <- lapply(feliz[, ], trimws, whitespace = "[\\h\\v]")

# Drop the first column.
feliz[, 1] <- NULL

# Everything except the first remaining column becomes numeric.
feliz[, -1] <- lapply(feliz[, -1], as.numeric)

# Rename the score and align the key column name with the democracy table.
names(feliz)[names(feliz) == "Score"] <- "ScoreFeliz"
names(feliz)[names(feliz) == "Country or region"] <- "Country"
str(feliz)

## 'data.frame':    156 obs. of  8 variables:
##  $ Country                     : chr  "Finland" "Denmark" "Norway" "Iceland" ...
##  $ ScoreFeliz                  : num  7.77 7.6 7.55 7.49 7.49 ...
##  $ GDP per capita              : num  1.34 1.38 1.49 1.38 1.4 ...
##  $ Social support              : num  1.59 1.57 1.58 1.62 1.52 ...
##  $ Healthy life expectancy     : num  0.986 0.996 1.028 1.026 0.999 ...
##  $ Freedom to make life choices: num  0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
##  $ Generosity                  : num  0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
##  $ Perceptions of corruption   : num  0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...

Merging

# Join both tables on the country name; countries missing from either table
# are not kept (merge() default).
datajunta <- merge(demo, feliz, by = "Country")
str(datajunta)

## 'data.frame':    147 obs. of  15 variables:
##  $ Country                     : chr  "Afghanistan" "Albania" "Algeria" "Argentina" ...
##  $ ScoreDemo                   : num  2.85 5.89 4.01 7.02 5.54 9.09 8.29 2.75 2.55 5.88 ...
##  $ Electoralprocessandpluralism: num  3.42 7 3.08 9.17 7.5 10 9.58 0.5 0.83 7.83 ...
##  $ Functioningofgovernment   : num  0.64 5.36 2.86 5.36 5.36 8.93 7.86 3.21 2.71 6.07 ...
##  $ Politicalparticipation     : num  3.89 4.44 5 6.11 6.11 7.78 8.33 2.78 2.78 6.11 ...
##  $ Politicalculture            : num  2.5 5 5 6.25 3.13 8.75 6.88 3.75 4.38 4.38 ...
##  $ Civilliberties              : num  3.82 7.65 4.12 8.24 5.59 10 8.82 3.53 2.06 5 ...
##  $ Regimetype                  : chr  "Authoritarian" "Hybrid regime" "Hybrid regime" "Flawed democracy" ...
##  $ ScoreFeliz                  : num  3.2 4.72 5.21 6.09 4.56 ...
##  $ GDP per capita              : num  0.35 0.947 1.002 1.092 0.85 ...
##  $ Social support              : num  0.517 0.848 1.16 1.432 1.055 ...
##  $ Healthy life expectancy     : num  0.361 0.874 0.785 0.881 0.815 ...
##  $ Freedom to make life choices: num  0 0.383 0.086 0.471 0.283 0.557 0.532 0.351 0.536 0.527 ...
##  $ Generosity                  : num  0.158 0.178 0.073 0.066 0.095 0.332 0.244 0.035 0.255 0.166 ...
##  $ Perceptions of corruption   : num  0.025 0.027 0.114 0.05 0.064 0.29 0.226 0.182 0.11 0.143 ...

Clusterización

Nos quedamos con los elementos indispensables

# Remove columns 2, 8 and 9 (ScoreDemo, Regimetype, ScoreFeliz).
datajunta[, c(2, 8, 9)] <- NULL

# Use the country names as row names so later results are labelled by country.
row.names(datajunta) <- datajunta$Country
str(datajunta)

## 'data.frame':    147 obs. of  12 variables:
##  $ Country                     : chr  "Afghanistan" "Albania" "Algeria" "Argentina" ...
##  $ Electoralprocessandpluralism: num  3.42 7 3.08 9.17 7.5 10 9.58 0.5 0.83 7.83 ...
##  $ Functioningofgovernment   : num  0.64 5.36 2.86 5.36 5.36 8.93 7.86 3.21 2.71 6.07 ...
##  $ Politicalparticipation     : num  3.89 4.44 5 6.11 6.11 7.78 8.33 2.78 2.78 6.11 ...
##  $ Politicalculture            : num  2.5 5 5 6.25 3.13 8.75 6.88 3.75 4.38 4.38 ...
##  $ Civilliberties              : num  3.82 7.65 4.12 8.24 5.59 10 8.82 3.53 2.06 5 ...
##  $ GDP per capita              : num  0.35 0.947 1.002 1.092 0.85 ...
##  $ Social support              : num  0.517 0.848 1.16 1.432 1.055 ...
##  $ Healthy life expectancy     : num  0.361 0.874 0.785 0.881 0.815 ...
##  $ Freedom to make life choices: num  0 0.383 0.086 0.471 0.283 0.557 0.532 0.351 0.536 0.527 ...
##  $ Generosity                  : num  0.158 0.178 0.073 0.066 0.095 0.332 0.244 0.035 0.255 0.166 ...
##  $ Perceptions of corruption   : num  0.025 0.027 0.114 0.05 0.064 0.29 0.226 0.182 0.11 0.143 ...

### Colocamos la semilla aleatoria 123 y usamos la métrica gower

library(cluster)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(plyr)
# Fix the random seed for the steps that follow.
set.seed(123)

# Clustering input: columns 2 to 12 (every column except Country).
inputData <- datajunta[, 2:12]

# Pairwise dissimilarities between countries using the Gower metric.
g.dist <- daisy(inputData, metric = "gower")

Calculamos 3 clusters

# Partitioning (k-medoids) with 3 clusters.
res.pam <- pam(g.dist, 3, cluster.only = FALSE)

# Agglomerative hierarchical clustering (agnes) cut at 3 clusters.
res.agnes <- hcut(
  g.dist,
  k = 3,
  hc_func = "agnes",
  hc_method = "ward.D"
)

# Divisive hierarchical clustering (diana) cut at 3 clusters.
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")

Preguntas

1. ¿Se debió obtener 3 clusters usando k-medoides, u otro valor era mejor?

Si determinamos la cantidad de clusters con la medida gap, en el caso de la técnica de partición se recomiendan 6 clusters.

# Gap-statistic plot for choosing the number of clusters with pam (up to 10).
fviz_nbclust(
  inputData,
  pam,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)

2. ¿Se debió obtener 3 clusters usando jerarquizacion aglomerativa, u otro valor era mejor?

Si determinamos la cantidad de clusters con la medida gap, en el caso de la técnica de jerarquización aglomerativa se recomiendan 6 clusters.

# Gap-statistic plot for choosing the number of clusters with hcut (up to 10).
fviz_nbclust(
  inputData,
  hcut,
  diss = g.dist,
  method = "gap_stat",
  k.max = 10,
  verbose = FALSE
)

3. Si se mantiene pedir tres clusters en ambos procedimientos ¿Cuál clusterizó mejor?

Evaluación Gráfica

# Silhouette plot and per-cluster summary (size, average width) for PAM.
fviz_silhouette(res.pam)

##   cluster size ave.sil.width
## 1       1   58          0.25
## 2       2   59          0.29
## 3       3   30          0.42

# Silhouette plot and per-cluster summary (size, average width) for AGNES.
fviz_silhouette(res.agnes)

##   cluster size ave.sil.width
## 1       1   49          0.31
## 2       2   80          0.22
## 3       3   18          0.60

# Silhouette plot and per-cluster summary (size, average width) for DIANA.
fviz_silhouette(res.diana)

##   cluster size ave.sil.width
## 1       1   68          0.22
## 2       2   49          0.36
## 3       3   30          0.38

# Silhouette information of every country under the PAM solution.
poorPAM <- data.frame(res.pam$silinfo$widths)
poorPAM$Country <- row.names(poorPAM)

# Countries whose silhouette width is negative.
poorPAMcases <- poorPAM$Country[poorPAM$sil_width < 0]
poorPAMcases

## [1] "Kenya"          "Liberia"        "Nepal"          "Czech Republic"
## [5] "Kuwait"         "Slovenia"       "South Korea"

length(poorPAMcases)

## [1] 7

# AGNES: silhouette information of every country.
poorAGNES <- data.frame(res.agnes$silinfo$widths)
poorAGNES$country <- row.names(poorAGNES)

# Countries whose silhouette width is negative.
poorAGNEScases <- poorAGNES$country[poorAGNES$sil_width < 0]
poorAGNEScases

##  [1] "Portugal"   "Lebanon"    "Zambia"     "Mauritius"  "Benin"
##  [6] "Kuwait"     "Pakistan"   "Uganda"     "Costa Rica" "Estonia"
## [11] "Israel"     "Spain"      "Japan"      "Uruguay"    "Gambia"
## [16] "Belgium"    "France"

length(poorAGNEScases)

## [1] 17

# DIANA: silhouette information of every country.
poorDIANA <- data.frame(res.diana$silinfo$widths)
poorDIANA$country <- row.names(poorDIANA)

# Countries whose silhouette width is negative.
poorDIANAcases <- poorDIANA$country[poorDIANA$sil_width < 0]
poorDIANAcases

##  [1] "Liberia"                "Lebanon"                "Nepal"
##  [4] "Madagascar"             "Morocco"                "Kyrgyzstan"
##  [7] "Bolivia"                "Senegal"                "Bosnia and Herzegovina"
## [10] "Georgia"

length(poorDIANAcases)

## [1] 10

Según la evaluación gráfica y la información de las siluetas, el método que tiene menos elementos con silueta negativa es preferible a los demás: PAM tiene 7, frente a 17 de AGNES y 10 de DIANA. Entonces, la técnica de partición clusterizó mejor.

4. Si se mantiene pedir tres clusters en ambos procedimientos, ¿cuántos países de los mal clusterizados por k-medoides nos quedarían si no contásemos los mal clusterizados por jerarquización aglomerativa?

Los países mal asignados por pam pero no por agnes:

Quedarían 6 países.

# Countries with negative silhouette under PAM that are not in the AGNES list.
setdiff(poorPAMcases,poorAGNEScases)

## [1] "Kenya"          "Liberia"        "Nepal"          "Czech Republic"
## [5] "Slovenia"       "South Korea"

5. Si usamos dbscan, ¿Cuántos clusters se formarían si usamos un epsilon de 0.09?

Calculando Clusters de Densidad

Necesitamos un mapa de posiciones para todos los casos, y para ello usamos el escalamiento multidimensional.

# Project the Gower dissimilarities onto two dimensions (k = 2) with cmdscale.
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE)

# Store both coordinates next to the clustering variables.
inputData$dim1 <- proyeccion$points[, 1]
inputData$dim2 <- proyeccion$points[, 2]

# Map of the countries, each drawn as its own name.
base <- ggplot(
  inputData,
  aes(x = dim1, y = dim2, label = row.names(inputData))
)
base + geom_text(size = 2)

# Keep each method's cluster assignment as a factor.
inputData$pam <- as.factor(res.pam$clustering)
inputData$agnes <- as.factor(res.agnes$cluster)
inputData$diana <- as.factor(res.diana$cluster)

# Estimating limits: overall minimum and maximum across both coordinates.
min(inputData[, c("dim1", "dim2")])
max(inputData[, c("dim1", "dim2")])

## [1] -0.5581733

## [1] 0.5123219

#### Procedemos a graficar:

# PAM
# Shared axis limits are computed from the coordinates themselves rather than
# pasted from the rounded console output: rounded values can fall just inside
# the true extremes, and xlim()/ylim() drop any point outside the limits. The
# limits also stay correct if the scraped tables change.
limites <- range(inputData[, c("dim1", "dim2")])

# Same limits on both axes plus coord_fixed() so both axes share one scale.
base <- ggplot(inputData, aes(x = dim1, y = dim2)) +
  ylim(limites) +
  xlim(limites) +
  coord_fixed()
base + geom_point(size = 2, aes(color = pam)) + labs(title = "PAM")

# AGNES
base + geom_point(size = 2, aes(color = agnes)) + labs(title = "AGNES")

# DIANA
base + geom_point(size = 2, aes(color = diana)) + labs(title = "DIANA")

Ahora calculemos usando dbscan:

Nuevas distancias: Las posiciones son la información para dbscan.

# Euclidean this time: distances between the two projected coordinates.
g.dist.cmd <- daisy(
  inputData[, c("dim1", "dim2")],
  metric = "euclidean"
)

Obteniendo clusters

library(fpc)

# Density-based clustering on the precomputed distances (method = "dist").
db.cmd <- dbscan(
  g.dist.cmd,
  eps = 0.09,
  MinPts = 5,
  method = "dist"
)
db.cmd

## dbscan Pts=147 MinPts=5 eps=0.09
##        0   1
## border 8   7
## seed   0 132
## total  8 139

# Attach the dbscan labels and colour the projected map by them.
inputData$dbCMD <- as.factor(db.cmd$cluster)
library(ggrepel)
base <- ggplot(inputData, aes(x = dim1, y = dim2)) +
  ylim(limites) +
  xlim(limites) +
  coord_fixed()
dbplot <- base + geom_point(aes(color = dbCMD))
dbplot

6. Si usamos dbscan, ¿Qué países no fueron clusterizados (atípicos)?

# Country name for points with dbscan label 0; empty string for all others.
LABEL <- ifelse(inputData$dbCMD == 0, row.names(inputData), "")

# Add the repelled labels on top of the dbscan map.
dbplot +
  geom_text_repel(
    aes(label = LABEL),
    size = 5,
    direction = "y",
    ylim = 0.45,
    angle = 45,
    segment.colour = "grey"
  )

# One row per country: dbscan-label-0 flag, country name, and a filler column.
atipicos <- data.frame(inputData$dbCMD == 0, row.names(inputData), "")
names(atipicos)

## [1] "inputData.dbCMD....0" "row.names.inputData." "X.."

# Discard the filler column.
atipicos[, 3] <- NULL

# Names of the flagged countries, kept as a one-column data frame.
atipicos2 <- data.frame(
  atipicos2 = atipicos$row.names.inputData.[
    atipicos$inputData.dbCMD....0 == "TRUE"
  ]
)
atipicos2

##                  atipicos2
## 1              Afghanistan
## 2                   Bhutan
## 3                  Burundi
## 4 Central African Republic
## 5                Hong Kong
## 6                Singapore
## 7                    Syria
## 8               Uzbekistan

7. ¿Qué paises coinciden entre los atipicos de dbscan y los mal clusterizados por jerarquizacion aglomerativa?

# atipicos2 is a one-column data frame, and intersect() compares plain vectors.
# Passing the data frame itself returned an empty data frame instead of a
# vector of country names, so the comparison is made against its column.
intersect(poorAGNEScases, as.character(atipicos2[[1]]))

## character(0)

8. ¿Hay alguno de los países del cluster más bajo calculado con agnes que sea parte de los países difíciles de clusterizar en dbscan?

# One row per country: AGNES-cluster-1 flag, country name, and a filler column.
peores <- data.frame(inputData$agnes == 1, row.names(inputData), "")

# Discard the filler column.
peores[, 3] <- NULL

# Names of the countries in AGNES cluster 1, kept as a one-column data frame.
peores2 <- data.frame(
  peores2 = peores$row.names.inputData.[
    peores$inputData.agnes....1 == "TRUE"
  ]
)
peores2

##                     peores2
## 1               Afghanistan
## 2                   Algeria
## 3                Azerbaijan
## 4                   Bahrain
## 5                   Belarus
## 6              Burkina Faso
## 7                   Burundi
## 8                  Cambodia
## 9                  Cameroon
## 10 Central African Republic
## 11                     Chad
## 12                    China
## 13                  Comoros
## 14                    Egypt
## 15                 Eswatini
## 16                 Ethiopia
## 17                    Gabon
## 18                   Guinea
## 19                    Haiti
## 20                     Iran
## 21                     Iraq
## 22              Ivory Coast
## 23                   Jordan
## 24               Kazakhstan
## 25                     Laos
## 26                    Libya
## 27                     Mali
## 28               Mauritania
## 29               Mozambique
## 30                  Myanmar
## 31                Nicaragua
## 32                    Niger
## 33                  Nigeria
## 34                    Qatar
## 35                   Russia
## 36                   Rwanda
## 37             Saudi Arabia
## 38             Sierra Leone
## 39                    Syria
## 40               Tajikistan
## 41                     Togo
## 42                   Turkey
## 43             Turkmenistan
## 44     United Arab Emirates
## 45               Uzbekistan
## 46                Venezuela
## 47                  Vietnam
## 48                    Yemen
## 49                 Zimbabwe

Hay 5 países en ambos grupos.

# Give both one-column data frames the same column name so that merge() joins
# on it and returns only the countries present in both.
names(peores2) <- "a"
names(atipicos2) <- "a"
merge(atipicos2, peores2)

##                          a
## 1              Afghanistan
## 2                  Burundi
## 3 Central African Republic
## 4                    Syria
## 5               Uzbekistan