# Scrape the Democracy Index table from Wikipedia.
# The former `rm(list = ls())` call was dropped: a script should not wipe the
# caller's workspace, and every object used below is created by this script.
library(htmltab)

# Links: source page and XPath of the table to read.
WhereDEMO <- list(
  page = "https://en.wikipedia.org/wiki/Democracy_Index",
  xpath = '//*[@id="mw-content-text"]/div[1]/table[6]/tbody'
)

# Read the table located by the XPath into a data frame.
demo <- htmltab(
  doc = WhereDEMO$page,
  which = WhereDEMO$xpath,
  encoding = "UTF-8"
)
# Select columns: drop the 1st, 2nd and 6th columns of the scraped table.
demo <- demo[, -c(1, 2, 6)]

# Rename the remaining columns.
newDemo <- c(
  "Pais", "RegimeType", "Score", "Electoral", "Functioning",
  "participation", "culture", "Civilliberties"
)
names(demo) <- newDemo

# Strip leading/trailing horizontal and vertical whitespace from every column.
# This runs BEFORE the factor conversion on purpose: trimws() returns character
# vectors, so trimming afterwards would silently turn RegimeType back into
# plain text (and untrimmed labels would not match the factor levels).
demo[, ] <- lapply(demo[, ], trimws, whitespace = "[\\h\\v]")

# Formatting: text to ordinal, ordered from least to most democratic.
OrdinalVector <- c(
  "Authoritarian", "Hybrid regime", "Flawed democracy", "Full democracy"
)
demo$RegimeType <- factor(
  demo$RegimeType,
  levels = OrdinalVector,
  ordered = TRUE
)

# To numeric: the overall Score (column 3) and the five component indices
# (columns 4 to 8). Score used to be left as character.
demo[, 3:8] <- lapply(demo[, 3:8], as.numeric)

# Drop rows with missing data.
demo <- demo[complete.cases(demo), ]
# Normalization: standardize the five component indices (columns 4 to 8).
library(BBmisc)
##
## Attaching package: 'BBmisc'
## The following object is masked from 'package:base':
##
## isFALSE
demo[, 4:8] <- normalize(demo[, 4:8], method = "standardize")

# Descriptive statistics of the cleaned table (recorded output below).
summary(demo)
## Pais RegimeType Score Electoral
## Length:167 Length:167 Length:167 Min. :-1.4719
## Class :character Class :character Class :character 1st Qu.:-1.0901
## Mode :character Mode :character Mode :character Median : 0.3588
## Mean : 0.0000
## 3rd Qu.: 0.9263
## Max. : 1.1433
## Functioning participation culture Civilliberties
## Min. :-1.7971 Min. :-2.74693 Min. :-2.2501 Min. :-1.9810
## 1st Qu.:-0.7882 1st Qu.:-0.76540 1st Qu.:-0.8882 1st Qu.:-0.7857
## Median : 0.1394 Median : 0.08528 Median :-0.2072 Median : 0.0812
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6933 3rd Qu.: 0.65071 3rd Qu.: 0.4737 3rd Qu.: 0.8412
## Max. : 1.9365 Max. : 2.34698 Max. : 2.5165 Max. : 1.6011

# Correlation matrix of the five standardized indices.
cor(demo[, 4:8])
## Electoral Functioning participation culture Civilliberties
## Electoral 1.0000000 0.8104563 0.7960529 0.5166325 0.9122255
## Functioning 0.8104563 1.0000000 0.7143889 0.6411997 0.8427678
## participation 0.7960529 0.7143889 1.0000000 0.5640728 0.7937461
## culture 0.5166325 0.6411997 0.5640728 1.0000000 0.6329489
## Civilliberties 0.9122255 0.8427678 0.7937461 0.6329489 1.0000000
# Clustering input: the five component indices, labelled by country.
dataClus2 <- demo[, 4:8]
# Country names become the row names only; they are no longer a column.
row.names(dataClus2) <- demo$Pais

# Distance matrix (Gower metric).
library(cluster)
g.dist <- daisy(dataClus2, metric = "gower")

# PAM with 4 clusters; the full result object is kept (not just the cluster
# vector), so the medoids and silhouette information are available.
set.seed(123)
res.pam <- pam(g.dist, k = 4, cluster.only = FALSE)

# New column: cluster assigned to each country.
dataClus2$pam <- res.pam$cluster

# Silhouettes.
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_silhouette(res.pam, print.summary = FALSE)

# Countries with a negative silhouette width, sorted alphabetically.
library(magrittr)
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM$country <- row.names(silPAM)
lowPAM <- sort(silPAM[silPAM$sil_width < 0, "country"])
lowPAM
## [1] "Albania" "Bangladesh" "East Timor" "Ghana"
## [5] "Guyana" "Kyrgyzstan" "Mexico" "Namibia"
## [9] "Papua New Guinea" "Sri Lanka" "Tunisia" "Ukraine"
# Mean of every index within each PAM cluster.
aggregate(. ~ pam, data = dataClus2, FUN = mean)
## pam Electoral Functioning participation culture Civilliberties
## 1 1 1.03937191 1.179368991 1.04256707 1.29708282 1.187022196
## 2 2 0.81947606 0.463258126 0.42986317 -0.52424631 0.528650986
## 3 3 0.06710133 -0.007315625 -0.05012824 -0.06408559 0.008139621
## 4 4 -1.19379075 -1.058504543 -0.91976935 -0.58680811 -1.107612152

# Store the results in `demo`: a flag for the countries with negative
# silhouette width and the cluster label as an ordered factor.
demo$pamDEMOlow <- demo$Pais %in% lowPAM
demo$pamDEMO <- as.ordered(dataClus2$pam)

# Remove the helper column so dataClus2 holds only the indices again.
dataClus2$pam <- NULL

# Comparison: cross-tabulate the PAM partition against the regime type
# reported by the Democracy Index (also serves to verify the recoding).
table(demo$pamDEMO, demo$RegimeType, dnn = c("Particion", "DEMO"))
## DEMO
## Particion Authoritarian Flawed democracy Full democracy Hybrid regime
## 1 0 20 21 0
## 2 0 25 0 6
## 3 0 8 0 28
## 4 59 0 0 0
# HDI (Human Development Index) table.
library(rio)
DataIDH <- "https://github.com/taiyonoJoel/CONTROL_3/raw/main/HDR21-22_Statistical_Annex_HDI_Table.csv"
idh <- import(DataIDH)

# Rename the columns.
newIDH <- c(
  "Pais", "EsperanzaVida", "EscolaridadDuracion", "EscolaridadPromedio", "PBI"
)
names(idh) <- newIDH

# Every column except the first (country) to numeric, then keep only the rows
# where all of those columns are present.
idh[, -1] <- lapply(idh[, -1], as.numeric)
idh <- idh[complete.cases(idh[, -1]), ]
row.names(idh) <- NULL # reset the row numbering

# Sub-data for clustering: the standardized variables, labelled by country.
dataClus <- BBmisc::normalize(idh[, 2:5], method = "standardize")
row.names(dataClus) <- idh$Pais

# Distance matrix (Gower metric).
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")

# Gap statistic for hierarchical (AGNES) clustering, up to 10 clusters.
set.seed(123)
library(factoextra)
fviz_nbclust(
  dataClus, hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "agnes"
)
# Packages used in this section.
library(cluster)
library(factoextra)
library(magrittr)

# Rebuild the clustering input from the democracy indices (columns 4 to 8),
# labelled by country.
demo$pam <- NULL
dataClus2 <- demo[, 4:8]
row.names(dataClus2) <- demo$Pais

# Distance matrix (Gower metric).
g.dist <- daisy(dataClus2, metric = "gower")

# Gap statistic for PAM, up to 10 clusters.
fviz_nbclust(
  dataClus2, pam,
  diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE
)

# PAM with 6 clusters; the full result object (clusters and medoids) is kept.
set.seed(123)
res.pam <- pam(g.dist, k = 6, cluster.only = FALSE)

# New column: cluster assigned to each country.
dataClus2$pam <- res.pam$cluster

# Silhouettes.
fviz_silhouette(res.pam, print.summary = FALSE)

# Countries with a negative silhouette width, sorted alphabetically.
silPAM <- data.frame(res.pam$silinfo$widths)
silPAM$country <- row.names(silPAM)
poorPAM <- sort(silPAM[silPAM$sil_width < 0, "country"])
poorPAM
## [1] "Albania" "Angola" "Eswatini"
## [4] "Ghana" "Guyana" "Hong Kong"
## [7] "Mexico" "Namibia" "Slovakia"
## [10] "Trinidad and Tobago" "Tunisia" "Turkey"
## [13] "Ukraine"
# Se cambia el rango de las variables del IDH para equipararlo con el del Democracy Index (escala de 0 a 10).
# MERGE: import the already-merged HDI + Democracy Index table.
library(rio)
Dataunificada <- "https://github.com/taiyonoJoel/CONTROL_3/raw/main/idhdemo.csv"
idhdemo <- import(Dataunificada)
str(idhdemo)
## 'data.frame': 145 obs. of 12 variables:
## $ Pais : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ EsperanzaVida : num 62 76.5 76.4 61.6 75.4 ...
## $ EscolaridadDuracion: num 10.3 14.4 14.6 12.2 17.9 13.1 21.1 16 13.5 16.3 ...
## $ EscolaridadPromedio: num 3 11.3 8.1 5.4 11.1 11.3 12.7 12.3 10.5 11 ...
## $ PBI : int 1824 14131 10800 5466 20925 13158 49238 53619 14257 39497 ...
## $ RegimeType : chr "Authoritarian" "Flawed democracy" "Authoritarian" "Authoritarian" ...
## $ Score : num 0.32 6.11 3.77 3.37 6.81 5.49 8.9 8.07 2.68 2.52 ...
## $ Electoral : num 0 7 3.08 1.33 9.17 7.5 10 9.58 0.5 0.42 ...
## $ Functioning : num 0.07 6.43 2.5 2.86 5 5.71 8.57 6.79 2.5 2.71 ...
## $ participation : num 0 4.44 4.44 5 7.22 6.11 7.78 8.89 2.78 3.33 ...
## $ culture : num 1.25 5.63 5 5 5 3.13 8.75 6.88 5 4.38 ...
## $ Civilliberties : num 0.29 7.06 3.82 2.65 7.65 5 9.41 8.24 2.65 1.76 ...

# Make sure the HDI variables (columns 2 to 5) and the democracy indices
# (columns 8 to 12) are numeric. The previous code indexed ROWS 2:5 of columns
# 8:12 (`idhdemo[2:5, 8:12]`), which is not a column conversion at all.
numCols <- c(2:5, 8:12)
idhdemo[, numCols] <- lapply(idhdemo[, numCols], as.numeric)
#str(idhdemo)

# Range normalization: rescale the four HDI variables to 0-10 so they share
# the scale of the democracy indices.
library(BBmisc)
idhdemo[, 2:5] <- normalize(idhdemo[, 2:5], method = "range", range = c(0, 10))

# Descriptive statistics of the rescaled table (recorded output below).
summary(idhdemo)
## Pais EsperanzaVida EscolaridadDuracion EscolaridadPromedio
## Length:145 Min. : 0.000 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.: 3.950 1st Qu.: 3.191 1st Qu.: 3.167
## Mode :character Median : 6.050 Median : 4.752 Median : 6.083
## Mean : 5.841 Mean : 4.705 Mean : 5.716
## 3rd Qu.: 7.631 3rd Qu.: 6.241 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000
## PBI RegimeType Score Electoral
## Min. : 0.0000 Length:145 Min. :0.320 Min. : 0.000
## 1st Qu.: 0.4251 Class :character 1st Qu.:3.400 1st Qu.: 2.080
## Median : 1.3626 Mode :character Median :5.720 Median : 7.420
## Mean : 2.2255 Mean :5.425 Mean : 5.918
## 3rd Qu.: 3.5545 3rd Qu.:7.160 3rd Qu.: 9.170
## Max. :10.0000 Max. :9.750 Max. :10.000
## Functioning participation culture Civilliberties
## Min. :0.000 Min. : 0.000 Min. : 1.250 Min. :0.290
## 1st Qu.:2.860 1st Qu.: 3.890 1st Qu.: 4.380 1st Qu.:3.530
## Median :5.000 Median : 5.560 Median : 5.000 Median :5.590
## Mean :4.763 Mean : 5.471 Mean : 5.446 Mean :5.532
## 3rd Qu.:6.430 3rd Qu.: 6.670 3rd Qu.: 6.250 3rd Qu.:7.650
## Max. :9.640 Max. :10.000 Max. :10.000 Max. :9.710
# Ahora procedemos con AGNES y DIANA para ambos índices,
# pero antes hay que subsetear la data para quedarnos solo con las variables.
# Leave out the score, the regime type and the country name; every other
# column is a clustering variable.
dontselect <- c("Pais", "RegimeType", "Score")
select <- setdiff(names(idhdemo), dontselect)

# Subset the data.
theData <- idhdemo[, select]
dataClus <- theData
# Country names become the row names only; they are no longer a column.
row.names(dataClus) <- idhdemo$Pais

# Distance matrix (Gower metric) computed with daisy().
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")

# Hierarchical clustering: gap statistic for AGNES, up to 10 clusters.
fviz_nbclust(
  dataClus, hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "agnes"
)
# - DIANA
# Hierarchical clustering: gap statistic for DIANA, up to 10 clusters.
fviz_nbclust(
  dataClus, hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "diana"
)

# AGNES with 3 clusters (linkage method "ward.D") and its silhouette plot.
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes", hc_method = "ward.D")
fviz_silhouette(res.agnes, print.summary = FALSE)
# - 2) Países mal clusterizados por AGNES
# Countries with a negative silhouette width under AGNES, listed in the order
# of the silhouette table (sorting is disabled).
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- row.names(silAGNES)
isPoorAGNES <- silAGNES$sil_width < 0
poorAGNES <- silAGNES[isPoorAGNES, "country"]
poorAGNES
## [1] "Guatemala" "Madagascar" "Zambia" "Malawi"
## [5] "Morocco" "Papua New Guinea" "Honduras" "Lesotho"
## [9] "Bhutan" "Lithuania" "Latvia"

# DIANA with 3 clusters and its silhouette plot.
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
fviz_silhouette(res.diana, print.summary = FALSE)
# - 2) Países mal clusterizados por DIANA
# Countries with a negative silhouette width under DIANA, listed in the order
# of the silhouette table (sorting is disabled).
silDIANA <- data.frame(res.diana$silinfo$widths)
silDIANA$country <- row.names(silDIANA)
isPoorDIANA <- silDIANA$sil_width < 0
poorDIANA <- silDIANA[isPoorDIANA, "country"]
poorDIANA
## [1] "Namibia" "Bosnia and Herzegovina" "Bhutan"
# Merge the HDI table with the Democracy Index table (merge() joins on the
# column the two tables share, "Pais") and cluster the countries on the four
# HDI variables plus the five democracy indices.
idhdemo2 <- merge(idh, demo)

# Pick the clustering variables by NAME. The previous positional selection
# c(3:6, 9:13) was shifted by one column: it skipped EsperanzaVida and
# Electoral and pulled in RegimeType and pamDEMOlow instead.
idhVars <- c(
  "EsperanzaVida", "EscolaridadDuracion", "EscolaridadPromedio", "PBI"
)
demoVars <- c(
  "Electoral", "Functioning", "participation", "culture", "Civilliberties"
)
clusVars <- c(idhVars, demoVars)

# Standardize the selected variables and label the rows by country.
idhdemo2[, clusVars] <- BBmisc::normalize(
  idhdemo2[, clusVars],
  method = "standardize"
)
dataClus <- idhdemo2[, clusVars]
row.names(dataClus) <- idhdemo2$Pais

# Distance matrix (Gower metric) on the MERGED data. The previous code passed
# `dataClus2` here and below, i.e. the democracy-only table from an earlier
# section, so the merged indicators were never actually clustered.
library(cluster)
g.dist <- daisy(dataClus, metric = "gower")

# NOTE(review): the recorded outputs kept below were produced by the earlier
# run on `dataClus2`; re-run the script to refresh them.

# AGNES: gap statistic (seeded, since it relies on random reference data),
# fit with 3 clusters, silhouette plot and badly clustered countries.
set.seed(123)
fviz_nbclust(
  dataClus, hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "agnes"
)
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes", hc_method = "ward.D")
fviz_silhouette(res.agnes, print.summary = FALSE)
silAGNES <- data.frame(res.agnes$silinfo$widths)
silAGNES$country <- row.names(silAGNES)
poorAGNES <- silAGNES[silAGNES$sil_width < 0, "country"]
poorAGNES
## [1] "Hong Kong"

# DIANA: same steps with divisive clustering.
set.seed(123)
fviz_nbclust(
  dataClus, hcut,
  diss = g.dist, method = "gap_stat", k.max = 10,
  verbose = FALSE, hc_func = "diana"
)
res.diana <- hcut(g.dist, k = 3, hc_func = "diana")
fviz_silhouette(res.diana, print.summary = FALSE)
silDIANA <- data.frame(res.diana$silinfo$widths)
silDIANA$country <- row.names(silDIANA)
poorDIANA <- silDIANA[silDIANA$sil_width < 0, "country"]
poorDIANA
## [1] "Turkey" "Hong Kong" "Kenya" "Pakistan" "Nepal"