1. Con la data original del Democracy Index pide 4 clusters y compara el resultado con lo propuesto por The Economist. ¿Qué observas?
# Load and clean the Democracy Index table scraped from Wikipedia.
# NOTE: the former `rm(list = ls())` was dropped; wiping the caller's workspace
# is a surprising side effect and nothing below depends on it.
library(htmltab)

# Source page and XPath of the table to scrape.
WhereDEMO <- list(page = "https://en.wikipedia.org/wiki/Democracy_Index",
                  xpath = '//*[@id="mw-content-text"]/div[1]/table[6]/tbody')
demo <- htmltab(doc = WhereDEMO$page,
                which = WhereDEMO$xpath,
                encoding = "UTF-8")

# Keep only the columns of interest (drop positions 1, 2 and 6).
demo <- demo[, -c(1, 2, 6)]
# Rename the remaining columns.
newDemo <- c("Pais", "RegimeType", "Score", "Electoral", "Functioning",
             "participation", "culture", "Civilliberties")
names(demo) <- newDemo

# Strip horizontal/vertical whitespace from every column.
# This must run BEFORE the factor conversion: trimws() returns character
# vectors, so applying it afterwards silently turned the ordered factor back
# into plain text and lost the regime ordering.
demo[, ] <- lapply(demo[, ], trimws, whitespace = "[\\h\\v]")

# Text to ordinal: regime types ordered from least to most democratic.
OrdinalVector <- c("Authoritarian", "Hybrid regime",
                   "Flawed democracy", "Full democracy")
demo$RegimeType <- factor(demo$RegimeType,
                          levels = OrdinalVector,
                          ordered = TRUE)

# To numeric: the overall score (column 3) and the five components (4 to 8).
# Score used to be left as character.
demo[, 3:8] <- lapply(demo[, 3:8], as.numeric)
# Drop rows with missing values.
demo <- demo[complete.cases(demo), ]
# NOTE(review): console output recorded further down in this document (the
# summary and the regime cross-table) predates this fix; there RegimeType and
# Score still appear as character. Re-run to refresh it.
# Standardize the five component columns (4 to 8) with BBmisc.
library(BBmisc)
## 
## Attaching package: 'BBmisc'
## The following object is masked from 'package:base':
## 
##     isFALSE
demo[4:8] <- normalize(demo[4:8], method = "standardize")
summary(demo)
##      Pais            RegimeType           Score             Electoral      
##  Length:167         Length:167         Length:167         Min.   :-1.4719  
##  Class :character   Class :character   Class :character   1st Qu.:-1.0901  
##  Mode  :character   Mode  :character   Mode  :character   Median : 0.3588  
##                                                           Mean   : 0.0000  
##                                                           3rd Qu.: 0.9263  
##                                                           Max.   : 1.1433  
##   Functioning      participation         culture        Civilliberties   
##  Min.   :-1.7971   Min.   :-2.74693   Min.   :-2.2501   Min.   :-1.9810  
##  1st Qu.:-0.7882   1st Qu.:-0.76540   1st Qu.:-0.8882   1st Qu.:-0.7857  
##  Median : 0.1394   Median : 0.08528   Median :-0.2072   Median : 0.0812  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6933   3rd Qu.: 0.65071   3rd Qu.: 0.4737   3rd Qu.: 0.8412  
##  Max.   : 1.9365   Max.   : 2.34698   Max.   : 2.5165   Max.   : 1.6011
# Pairwise correlations between the five standardized components.
cor(demo[4:8])
##                Electoral Functioning participation   culture Civilliberties
## Electoral      1.0000000   0.8104563     0.7960529 0.5166325      0.9122255
## Functioning    0.8104563   1.0000000     0.7143889 0.6411997      0.8427678
## participation  0.7960529   0.7143889     1.0000000 0.5640728      0.7937461
## culture        0.5166325   0.6411997     0.5640728 1.0000000      0.6329489
## Civilliberties 0.9122255   0.8427678     0.7937461 0.6329489      1.0000000
# Clustering input: the five component columns, with the country names moved
# to the row names (so they are no longer a column of the data).
dataClus2 <- demo[, 4:8]
rownames(dataClus2) <- demo$Pais
# Distance matrix (Gower).
library(cluster)
g.dist <- daisy(dataClus2, metric = "gower")
set.seed(123)
# PAM partition into 4 clusters; the fitted object also carries the medoids.
res.pam <- pam(g.dist, k = 4, cluster.only = FALSE)

# Store each country's cluster label as a new column.
dataClus2$pam <- res.pam$cluster
# Silhouettes:
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_silhouette(res.pam, print.summary = FALSE)

# Countries with a negative silhouette width, sorted alphabetically.
library(magrittr)
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM$country <- row.names(silPAM)
lowPAM <- sort(silPAM[silPAM$sil_width < 0, "country"])
lowPAM
##  [1] "Albania"          "Bangladesh"       "East Timor"       "Ghana"           
##  [5] "Guyana"           "Kyrgyzstan"       "Mexico"           "Namibia"         
##  [9] "Papua New Guinea" "Sri Lanka"        "Tunisia"          "Ukraine"
# Mean of every component within each PAM cluster.
aggregate(. ~ pam, data = dataClus2, FUN = mean)
##   pam   Electoral  Functioning participation     culture Civilliberties
## 1   1  1.03937191  1.179368991    1.04256707  1.29708282    1.187022196
## 2   2  0.81947606  0.463258126    0.42986317 -0.52424631    0.528650986
## 3   3  0.06710133 -0.007315625   -0.05012824 -0.06408559    0.008139621
## 4   4 -1.19379075 -1.058504543   -0.91976935 -0.58680811   -1.107612152
# Carry the results back to `demo`: a flag for negative-silhouette countries
# and the cluster label as an ordered factor; then drop the helper column.
demo$pamDEMOlow <- demo$Pais %in% lowPAM
demo$pamDEMO <- as.ordered(dataClus2$pam)
dataClus2$pam <- NULL
# Comparison
# See how closely the PAM partition matches the published regime types:

# Cross-tabulate the partition against the regime type.
table(demo$pamDEMO, demo$RegimeType, dnn = c('Particion', 'DEMO'))
##          DEMO
## Particion Authoritarian Flawed democracy Full democracy Hybrid regime
##         1             0               20             21             0
##         2             0               25              0             6
##         3             0                8              0            28
##         4            59                0              0             0

14. Con la data original del IDH averigua cuántos clusters se recomiendan siguiendo la técnica aglomerativa. R.: SE RECOMIENDAN 3 CLUSTERS. (Nota: por defecto se normaliza estandarizando; solo se normaliza por rangos si la pregunta lo especifica.)

# IDH (Human Development Index) data

library(rio)
DataIDH <- "https://github.com/taiyonoJoel/CONTROL_3/raw/main/HDR21-22_Statistical_Annex_HDI_Table.csv"
idh <- import(DataIDH)

# Rename the columns, coerce everything except the country name to numeric
# and keep only the rows with complete indicator values.
newIDH <- c("Pais", "EsperanzaVida", "EscolaridadDuracion",
            "EscolaridadPromedio", "PBI")
names(idh) <- newIDH
idh[, -1] <- lapply(idh[, -1], as.numeric)
idh <- idh[complete.cases(idh[, -1]), ]
row.names(idh) <- NULL  # reset the row numbering after dropping rows
# Clustering subset: the four standardized indicators, countries as row names.
dataClus <- BBmisc::normalize(idh[, 2:5], method = "standardize")
row.names(dataClus) <- idh$Pais
# Distance matrix (Gower).
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")
set.seed(123)

# Suggested number of clusters for the agglomerative technique (AGNES),
# using the gap statistic.
library(factoextra)
fviz_nbclust(dataClus, hcut,
             diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE, hc_func = "agnes")

15. Con la data original del Democracy Index pide los clusters que se recomienden, ¿Cuántos países quedan mal clusterizados? RPTA: 6

# PAM on the Democracy Index components using the recommended number of
# clusters, then list the countries with a negative silhouette width.
# (The former `demo$pam=NULL` was removed: `demo` has no `pam` column, so the
# statement did nothing.)
library(cluster)
library(factoextra)
library(magrittr)

# Clustering input: the five component columns, countries as row names.
dataClus2 <- demo[, 4:8]
row.names(dataClus2) <- demo$Pais

# Distance matrix (Gower).
g.dist <- daisy(dataClus2, metric = "gower")

## for PAM
# The gap statistic is estimated from Monte Carlo ("bootstrap") samples, so
# the seed has to be fixed BEFORE this call for the suggestion to be
# reproducible; it used to be set only afterwards.
set.seed(123)
fviz_nbclust(dataClus2, pam, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE)

# Re-seed so the random-number state after this point is the same as before
# the seed was moved.
set.seed(123)
# PAM partition into 6 clusters; the fitted object also carries the medoids.
res.pam <- pam(g.dist, 6, cluster.only = FALSE)

# Store each country's cluster label as a new column.
dataClus2$pam <- res.pam$cluster
# Silhouettes
fviz_silhouette(res.pam, print.summary = FALSE)

# Countries with a negative silhouette width, sorted alphabetically.
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM$country <- row.names(silPAM)
poorPAM <- sort(silPAM[silPAM$sil_width < 0, "country"])
poorPAM
##  [1] "Albania"             "Angola"              "Eswatini"           
##  [4] "Ghana"               "Guyana"              "Hong Kong"          
##  [7] "Mexico"              "Namibia"             "Slovakia"           
## [10] "Trinidad and Tobago" "Tunisia"             "Turkey"             
## [13] "Ukraine"
  16. Haz un merge con IDH y Democracy Index, pero recuperando todos los países aun cuando no se escriban igual. Normalice los valores del IDH con el método de rango (del 0 al 10). ¿Hay diferencias en cuanto a la sugerencia de la cantidad de clusters para cada técnica jerárquica si usa los componentes de ambos conceptos?

Se cambia el rango (de 0 a 10) para equipararlo con la escala del Democracy Index.

# MERGE: load the already merged IDH + Democracy Index table and rescale the
# IDH indicators to a 0-10 range.

library(rio)
Dataunificada <- 'https://github.com/taiyonoJoel/CONTROL_3/raw/main/idhdemo.csv'
idhdemo <- import(Dataunificada)
str(idhdemo)
## 'data.frame':    145 obs. of  12 variables:
##  $ Pais               : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ EsperanzaVida      : num  62 76.5 76.4 61.6 75.4 ...
##  $ EscolaridadDuracion: num  10.3 14.4 14.6 12.2 17.9 13.1 21.1 16 13.5 16.3 ...
##  $ EscolaridadPromedio: num  3 11.3 8.1 5.4 11.1 11.3 12.7 12.3 10.5 11 ...
##  $ PBI                : int  1824 14131 10800 5466 20925 13158 49238 53619 14257 39497 ...
##  $ RegimeType         : chr  "Authoritarian" "Flawed democracy" "Authoritarian" "Authoritarian" ...
##  $ Score              : num  0.32 6.11 3.77 3.37 6.81 5.49 8.9 8.07 2.68 2.52 ...
##  $ Electoral          : num  0 7 3.08 1.33 9.17 7.5 10 9.58 0.5 0.42 ...
##  $ Functioning        : num  0.07 6.43 2.5 2.86 5 5.71 8.57 6.79 2.5 2.71 ...
##  $ participation      : num  0 4.44 4.44 5 7.22 6.11 7.78 8.89 2.78 3.33 ...
##  $ culture            : num  1.25 5.63 5 5 5 3.13 8.75 6.88 5 4.38 ...
##  $ Civilliberties     : num  0.29 7.06 3.82 2.65 7.65 5 9.41 8.24 2.65 1.76 ...
# Make sure every indicator column is numeric: the four IDH indicators
# (columns 2 to 5) and the five democracy components (columns 8 to 12).
# The previous statement indexed `idhdemo[2:5, 8:12]`, i.e. ROWS 2 to 5 of
# columns 8 to 12, so it never coerced the columns it was meant to.
idhdemo[, c(2:5, 8:12)] <- lapply(idhdemo[, c(2:5, 8:12)], as.numeric)
#str(idhdemo)
# Range normalization of the IDH indicators
library(BBmisc)

idhdemo[, 2:5] <- normalize(idhdemo[, 2:5], method = "range", range = c(0, 10))

# The IDH indicators now run from 0 to 10.
# Descriptive statistics:
summary(idhdemo)
##      Pais           EsperanzaVida    EscolaridadDuracion EscolaridadPromedio
##  Length:145         Min.   : 0.000   Min.   : 0.000      Min.   : 0.000     
##  Class :character   1st Qu.: 3.950   1st Qu.: 3.191      1st Qu.: 3.167     
##  Mode  :character   Median : 6.050   Median : 4.752      Median : 6.083     
##                     Mean   : 5.841   Mean   : 4.705      Mean   : 5.716     
##                     3rd Qu.: 7.631   3rd Qu.: 6.241      3rd Qu.: 8.000     
##                     Max.   :10.000   Max.   :10.000      Max.   :10.000     
##       PBI           RegimeType            Score         Electoral     
##  Min.   : 0.0000   Length:145         Min.   :0.320   Min.   : 0.000  
##  1st Qu.: 0.4251   Class :character   1st Qu.:3.400   1st Qu.: 2.080  
##  Median : 1.3626   Mode  :character   Median :5.720   Median : 7.420  
##  Mean   : 2.2255                      Mean   :5.425   Mean   : 5.918  
##  3rd Qu.: 3.5545                      3rd Qu.:7.160   3rd Qu.: 9.170  
##  Max.   :10.0000                      Max.   :9.750   Max.   :10.000  
##   Functioning    participation       culture       Civilliberties 
##  Min.   :0.000   Min.   : 0.000   Min.   : 1.250   Min.   :0.290  
##  1st Qu.:2.860   1st Qu.: 3.890   1st Qu.: 4.380   1st Qu.:3.530  
##  Median :5.000   Median : 5.560   Median : 5.000   Median :5.590  
##  Mean   :4.763   Mean   : 5.471   Mean   : 5.446   Mean   :5.532  
##  3rd Qu.:6.430   3rd Qu.: 6.670   3rd Qu.: 6.250   3rd Qu.:7.650  
##  Max.   :9.640   Max.   :10.000   Max.   :10.000   Max.   :9.710
# Build the clustering input from the merged data: every column except the
# country name, the regime type and the overall score.
dontselect <- c("Pais", "RegimeType", "Score")
select <- setdiff(names(idhdemo), dontselect)

# Subset the data.
theData <- idhdemo[, select]
dataClus <- theData
# Country names become row names only; they are no longer a column.
row.names(dataClus) <- idhdemo$Pais
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")  # Gower distance matrix
## HIERARCHICAL (AGNES)
# The gap statistic is estimated from Monte Carlo ("bootstrap") samples; fix
# the seed so the suggested number of clusters is reproducible, as is done
# for the IDH-only analysis earlier in this script.
set.seed(123)
fviz_nbclust(dataClus, hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE, hc_func = "agnes")

-DIANA

## HIERARCHICAL (DIANA)
# Suggested number of clusters for the divisive technique. The gap statistic
# is estimated from Monte Carlo ("bootstrap") samples, so the seed is fixed
# first to make the suggestion reproducible.
set.seed(123)
fviz_nbclust(dataClus, hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE, hc_func = "diana")

  17. Para el caso anterior, si Ud. pide 3 clusters, ¿qué diferencias encuentra al clusterizar por ambas técnicas? Las únicas diferencias en las que hay que fijarse son: 1) el valor de la silueta promedio y 2) la cantidad de países mal clusterizados. Ojo: la pregunta solo se refiere a la clusterización jerárquica.
# Agglomerative clustering (AGNES) cut into 3 clusters, followed by its
# silhouette plot.
res.agnes <- hcut(
  g.dist,
  k = 3,
  hc_func = "agnes",
  hc_method = "ward.D"
)
fviz_silhouette(res.agnes, print.summary = FALSE)

- 2) Paises mal clusterizados por AGNES

# Countries with a negative silhouette width under AGNES, listed in the order
# of the silhouette table (not sorted).
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- row.names(silAGNES)
poorAGNES <- silAGNES[silAGNES$sil_width < 0, "country"]
poorAGNES
##  [1] "Guatemala"        "Madagascar"       "Zambia"           "Malawi"          
##  [5] "Morocco"          "Papua New Guinea" "Honduras"         "Lesotho"         
##  [9] "Bhutan"           "Lithuania"        "Latvia"
# Divisive clustering (DIANA) cut into 3 clusters, followed by its silhouette
# plot.
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
fviz_silhouette(res.diana, print.summary = FALSE)

- 2) Paises mal clusterizados por DIANA

# Countries with a negative silhouette width under DIANA, listed in the order
# of the silhouette table (not sorted).
silDIANA <- data.frame(res.diana$silinfo$widths)
silDIANA$country <- row.names(silDIANA)
poorDIANA <- silDIANA[silDIANA$sil_width < 0, "country"]
poorDIANA
## [1] "Namibia"                "Bosnia and Herzegovina" "Bhutan"
  18. Rehaga las preguntas 16 y 17, pero normalizando toda la data con la técnica de estandarización. NOTA: si no se solicita nada, se normaliza por estandarización (no usar el método del rango a menos que se solicite en la pregunta).
# Redo of the two hierarchical analyses, this time standardizing every
# clustering variable of the merged IDH + Democracy Index data.

# Join both tables on the country name. Only countries present in both tables
# under the same name are kept.
idhdemo2 <- merge(idh, demo, by = "Pais")

# Select the clustering variables BY NAME. The previous positional selection
# c(3:6, 9:13) was off by one for this merge: it picked up RegimeType and
# pamDEMOlow and left out EsperanzaVida and Electoral.
clusVars <- c("EsperanzaVida", "EscolaridadDuracion", "EscolaridadPromedio",
              "PBI", "Electoral", "Functioning", "participation", "culture",
              "Civilliberties")
idhdemo2[, clusVars] <- BBmisc::normalize(idhdemo2[, clusVars],
                                          method = "standardize")
dataClus <- idhdemo2[, clusVars]
row.names(dataClus) <- idhdemo2$Pais

library(cluster)
# Distances and cluster suggestions must be computed on `dataClus`, the merged
# and standardized data built above. The calls below used to take `dataClus2`,
# which holds only the Democracy Index components plus the `pam` label column.
g.dist <- daisy(dataClus, metric = "gower")
# The gap statistic is estimated from Monte Carlo ("bootstrap") samples; fix
# the seed so the suggestion is reproducible.
set.seed(123)
fviz_nbclust(dataClus, hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE, hc_func = "agnes")

# AGNES cut into 3 clusters and its silhouette plot.
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes", hc_method = "ward.D")
fviz_silhouette(res.agnes, print.summary = FALSE)

# Countries with a negative silhouette width under AGNES.
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- row.names(silAGNES)
poorAGNES <- silAGNES[silAGNES$sil_width < 0, "country"]
poorAGNES
# NOTE(review): the output below was recorded while the analysis still ran on
# `dataClus2`; re-run to refresh it.
## [1] "Hong Kong"
set.seed(123)
fviz_nbclust(dataClus, hcut, diss = g.dist, method = "gap_stat",
             k.max = 10, verbose = FALSE, hc_func = "diana")

# DIANA cut into 3 clusters and its silhouette plot.
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
fviz_silhouette(res.diana, print.summary = FALSE)

# Countries with a negative silhouette width under DIANA.
silDIANA <- data.frame(res.diana$silinfo$widths)
silDIANA$country <- row.names(silDIANA)
poorDIANA <- silDIANA[silDIANA$sil_width < 0, "country"]
poorDIANA
# NOTE(review): the output below was recorded while the analysis still ran on
# `dataClus2`; re-run to refresh it.
## [1] "Turkey"    "Hong Kong" "Kenya"     "Pakistan"  "Nepal"