library(htmltab)
# Download the two Wikipedia tables used in the analysis.
# Each table is identified by the page URL plus an XPath expression.
linkPage1 <- "https://en.wikipedia.org/wiki/Democracy_Index"
linkPage2 <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
linkPath1 <- '//*[@id="mw-content-text"]/div/table[2]'
linkPath2 <- '//*[@id="mw-content-text"]/div/table'

# Democracy Index table
demo <- htmltab(which = linkPath1, doc = linkPage1)
# World Happiness Report table
feliz <- htmltab(which = linkPath2, doc = linkPage2)
library(stringr)
library(magrittr)
# Tidy the scraped Democracy Index table.
# Column names: keep the text before the first ">>" and remove all whitespace.
# (`TRUE` is spelled out because `T` is an ordinary, reassignable variable.)
names(demo) <- gsub(
  "\\s", "",
  str_split(names(demo), ">>", simplify = TRUE)[, 1]
)
# Trim horizontal and vertical whitespace around every cell.
demo[, ] <- lapply(demo[, ], trimws, whitespace = "[\\h\\v]")
# Drop columns 1, 10 and 11, which are not used later.
demo[, c(1, 10, 11)] <- NULL
# After the drop, every column except 1 and 8 is converted to numeric.
demo[, -c(1, 8)] <- lapply(demo[, -c(1, 8)], as.numeric)
# Rename the overall score so it does not clash with the happiness score.
names(demo)[names(demo) == "Score"] <- "ScoreDemo"
str(demo)
## 'data.frame': 167 obs. of 8 variables:
## $ Country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ ScoreDemo : num 9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
## $ Electoralprocessandpluralism: num 10 10 9.58 10 10 10 10 9.58 10 9.58 ...
## $ Functioningofgovernment : num 9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
## $ Politicalparticipation : num 10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
## $ Politicalculture : num 10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
## $ Civilliberties : num 9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
## $ Regimetype : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
# Tidy the scraped World Happiness Report table.
# Trim horizontal and vertical whitespace around every cell.
feliz[] <- lapply(feliz, trimws, whitespace = "[\\h\\v]")
# Remove the first column, then convert everything but the new first
# column to numeric.
feliz[1] <- NULL
feliz[-1] <- lapply(feliz[-1], as.numeric)
# Rename the score and the key column in a single pass.
rename_map <- c("Score" = "ScoreFeliz", "Country or region" = "Country")
to_rename <- names(feliz) %in% names(rename_map)
names(feliz)[to_rename] <- unname(rename_map[names(feliz)[to_rename]])
str(feliz)
## 'data.frame': 156 obs. of 8 variables:
## $ Country : chr "Finland" "Denmark" "Norway" "Iceland" ...
## $ ScoreFeliz : num 7.77 7.6 7.55 7.49 7.49 ...
## $ GDP per capita : num 1.34 1.38 1.49 1.38 1.4 ...
## $ Social support : num 1.59 1.57 1.58 1.62 1.52 ...
## $ Healthy life expectancy : num 0.986 0.996 1.028 1.026 0.999 ...
## $ Freedom to make life choices: num 0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
## $ Generosity : num 0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
## $ Perceptions of corruption : num 0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...
# Join both tables on their shared Country column (merge() keeps only the
# countries present in both) and inspect the result.
datajunta <- merge(x = demo, y = feliz, by = "Country")
str(datajunta)
## 'data.frame': 147 obs. of 15 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Argentina" ...
## $ ScoreDemo : num 2.85 5.89 4.01 7.02 5.54 9.09 8.29 2.75 2.55 5.88 ...
## $ Electoralprocessandpluralism: num 3.42 7 3.08 9.17 7.5 10 9.58 0.5 0.83 7.83 ...
## $ Functioningofgovernment : num 0.64 5.36 2.86 5.36 5.36 8.93 7.86 3.21 2.71 6.07 ...
## $ Politicalparticipation : num 3.89 4.44 5 6.11 6.11 7.78 8.33 2.78 2.78 6.11 ...
## $ Politicalculture : num 2.5 5 5 6.25 3.13 8.75 6.88 3.75 4.38 4.38 ...
## $ Civilliberties : num 3.82 7.65 4.12 8.24 5.59 10 8.82 3.53 2.06 5 ...
## $ Regimetype : chr "Authoritarian" "Hybrid regime" "Hybrid regime" "Flawed democracy" ...
## $ ScoreFeliz : num 3.2 4.72 5.21 6.09 4.56 ...
## $ GDP per capita : num 0.35 0.947 1.002 1.092 0.85 ...
## $ Social support : num 0.517 0.848 1.16 1.432 1.055 ...
## $ Healthy life expectancy : num 0.361 0.874 0.785 0.881 0.815 ...
## $ Freedom to make life choices: num 0 0.383 0.086 0.471 0.283 0.557 0.532 0.351 0.536 0.527 ...
## $ Generosity : num 0.158 0.178 0.073 0.066 0.095 0.332 0.244 0.035 0.255 0.166 ...
## $ Perceptions of corruption : num 0.025 0.027 0.114 0.05 0.064 0.29 0.226 0.182 0.11 0.143 ...
Nos quedamos con los elementos indispensables
# Keep only the variables needed for clustering: remove the two overall
# scores and the regime label. They are dropped by name (they sit at
# positions 2, 8 and 9 of the merged table) so that a change in column
# order cannot silently remove the wrong variables.
datajunta[, c("ScoreDemo", "Regimetype", "ScoreFeliz")] <- NULL
# Use the country as row name so that clustering results are labelled.
row.names(datajunta) <- datajunta$Country
str(datajunta)
## 'data.frame': 147 obs. of 12 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Argentina" ...
## $ Electoralprocessandpluralism: num 3.42 7 3.08 9.17 7.5 10 9.58 0.5 0.83 7.83 ...
## $ Functioningofgovernment : num 0.64 5.36 2.86 5.36 5.36 8.93 7.86 3.21 2.71 6.07 ...
## $ Politicalparticipation : num 3.89 4.44 5 6.11 6.11 7.78 8.33 2.78 2.78 6.11 ...
## $ Politicalculture : num 2.5 5 5 6.25 3.13 8.75 6.88 3.75 4.38 4.38 ...
## $ Civilliberties : num 3.82 7.65 4.12 8.24 5.59 10 8.82 3.53 2.06 5 ...
## $ GDP per capita : num 0.35 0.947 1.002 1.092 0.85 ...
## $ Social support : num 0.517 0.848 1.16 1.432 1.055 ...
## $ Healthy life expectancy : num 0.361 0.874 0.785 0.881 0.815 ...
## $ Freedom to make life choices: num 0 0.383 0.086 0.471 0.283 0.557 0.532 0.351 0.536 0.527 ...
## $ Generosity : num 0.158 0.178 0.073 0.066 0.095 0.332 0.244 0.035 0.255 0.166 ...
## $ Perceptions of corruption : num 0.025 0.027 0.114 0.05 0.064 0.29 0.226 0.182 0.11 0.143 ...
### Colocamos 123 como semilla aleatoria y usamos la métrica de Gower
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plyr)
# Fix the random seed before clustering.
set.seed(123)
# Clustering input: columns 2 to 12 of the merged table (everything but
# Country, which is column 1).
inputData <- datajunta[, c(2:12)]
# Pairwise Gower dissimilarities between countries.
g.dist <- daisy(inputData, metric = "gower")
# Three-cluster solutions from three techniques. `FALSE` is spelled out
# because `F` is an ordinary, reassignable variable.
res.pam <- pam(g.dist, 3, cluster.only = FALSE) # partitioning
res.agnes <- hcut(g.dist, k = 3, hc_func = "agnes", hc_method = "ward.D") # agglomerative
res.diana <- hcut(g.dist, k = 3, hc_func = "diana") # divisive
Si determinamos la cantidad de clusters con la medida gap, en el caso de la técnica de partición se nos recomiendan 6 clusters.
# Gap statistic for PAM with up to 10 clusters (`FALSE` spelled out; `F` is
# reassignable).
# NOTE(review): fviz_nbclust appears to use `diss` only for the
# "silhouette" and "wss" methods - confirm whether the Gower distances
# actually influence this gap-statistic result.
fviz_nbclust(inputData, pam, diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE)
Si determinamos la cantidad de clusters con la medida gap, en el caso de la técnica de jerarquización aglomerativa se nos recomiendan 6 clusters.
# Gap statistic for hierarchical clustering (hcut) with up to 10 clusters
# (`FALSE` spelled out; `F` is reassignable).
# NOTE(review): fviz_nbclust appears to use `diss` only for the
# "silhouette" and "wss" methods - confirm whether the Gower distances
# actually influence this gap-statistic result.
fviz_nbclust(inputData, hcut, diss = g.dist, method = "gap_stat", k.max = 10, verbose = FALSE)
# Silhouette plot for each three-cluster solution. Each call also prints
# the size and average silhouette width of every cluster (recorded below).
# PAM (partitioning)
fviz_silhouette(res.pam)
## cluster size ave.sil.width
## 1 1 58 0.25
## 2 2 59 0.29
## 3 3 30 0.42
# AGNES (agglomerative)
fviz_silhouette(res.agnes)
## cluster size ave.sil.width
## 1 1 49 0.31
## 2 2 80 0.22
## 3 3 18 0.60
# DIANA (divisive)
fviz_silhouette(res.diana)
## cluster size ave.sil.width
## 1 1 68 0.22
## 2 2 49 0.36
## 3 3 30 0.38
# The same three steps were repeated for every technique; they now live in
# two small helpers. The global data frames and their label column names
# ("Country" for PAM, "country" for AGNES/DIANA) are kept as they were.

# Silhouette table of a clustering result, with its row names copied into
# the column named `label_col`.
silhouette_table <- function(fit, label_col) {
  widths <- data.frame(fit$silinfo$widths)
  widths[[label_col]] <- row.names(widths)
  widths
}

# Labels of the rows whose silhouette width is negative.
negative_silhouette <- function(widths, label_col) {
  widths[widths$sil_width < 0, label_col]
}

# PAM
poorPAM <- silhouette_table(res.pam, "Country")
poorPAMcases <- negative_silhouette(poorPAM, "Country")
poorPAMcases
## [1] "Kenya" "Liberia" "Nepal" "Czech Republic"
## [5] "Kuwait" "Slovenia" "South Korea"
length(poorPAMcases)
## [1] 7
# AGNES
poorAGNES <- silhouette_table(res.agnes, "country")
poorAGNEScases <- negative_silhouette(poorAGNES, "country")
poorAGNEScases
## [1] "Portugal" "Lebanon" "Zambia" "Mauritius" "Benin"
## [6] "Kuwait" "Pakistan" "Uganda" "Costa Rica" "Estonia"
## [11] "Israel" "Spain" "Japan" "Uruguay" "Gambia"
## [16] "Belgium" "France"
length(poorAGNEScases)
## [1] 17
# DIANA
poorDIANA <- silhouette_table(res.diana, "country")
poorDIANAcases <- negative_silhouette(poorDIANA, "country")
poorDIANAcases
## [1] "Liberia" "Lebanon" "Nepal"
## [4] "Madagascar" "Morocco" "Kyrgyzstan"
## [7] "Bolivia" "Senegal" "Bosnia and Herzegovina"
## [10] "Georgia"
length(poorDIANAcases)
## [1] 10
Según la evaluación gráfica y la información de las siluetas, el método que tiene menos elementos con silueta negativa es preferible a los demás. Entonces, la técnica de partición (PAM, con 7 casos negativos) clusterizó mejor.
Si descontamos los casos que AGNES también clusterizó mal, quedarían 6 países:
# Negative-silhouette countries under PAM that AGNES does not also flag.
unique(poorPAMcases[!(poorPAMcases %in% poorAGNEScases)])
## [1] "Kenya" "Liberia" "Nepal" "Czech Republic"
## [5] "Slovenia" "South Korea"
Necesitamos un mapa de posiciones para todos los casos, así que usamos el escalamiento multidimensional.
# Classical multidimensional scaling of the Gower dissimilarities into two
# coordinates per country (`TRUE` spelled out; `T` is reassignable).
proyeccion <- cmdscale(g.dist, k = 2, add = TRUE) # k is the number of dimensions
inputData$dim1 <- proyeccion$points[, 1]
inputData$dim2 <- proyeccion$points[, 2]
# Map of positions, every country drawn as its own name.
base <- ggplot(inputData, aes(x = dim1, y = dim2, label = row.names(inputData)))
base + geom_text(size = 2)
# Attach the cluster assigned by each technique as a factor column.
inputData[c("pam", "agnes", "diana")] <- lapply(
  list(res.pam$clustering, res.agnes$cluster, res.diana$cluster),
  as.factor
)
# Estimating the limits: smallest and largest coordinate over both dimensions
min(inputData[,c('dim1','dim2')]); max(inputData[,c('dim1','dim2')])
## [1] -0.5581733
## [1] 0.5123219
#### Plot the clusters on the map of positions:
# Shared axis limits, computed from the data. Copying the rounded values
# printed above can leave the extreme points just outside the limits, in
# which case ggplot drops them from the plot.
limites <- range(unlist(inputData[, c("dim1", "dim2")]))
base <- ggplot(inputData, aes(x = dim1, y = dim2)) + ylim(limites) + xlim(limites) + coord_fixed()
# PAM
base + geom_point(size = 2, aes(color = pam)) + labs(title = "PAM")
# AGNES
base + geom_point(size = 2, aes(color = agnes)) + labs(title = "AGNES")
# DIANA
base + geom_point(size = 2, aes(color = diana)) + labs(title = "DIANA")
Nuevas distancias: Las posiciones son la información para dbscan.
# The input is now the pair of map coordinates, so the distance is
# Euclidean this time.
g.dist.cmd <- daisy(x = inputData[c("dim1", "dim2")], metric = "euclidean")
library(fpc)
# Density-based clustering run directly on the distance object.
db.cmd <- dbscan(data = g.dist.cmd, method = "dist", eps = 0.09, MinPts = 5)
print(db.cmd)
## dbscan Pts=147 MinPts=5 eps=0.09
## 0 1
## border 8 7
## seed 0 132
## total 8 139
# Store the dbscan assignment as a factor column.
inputData[["dbCMD"]] <- as.factor(db.cmd$cluster)
library(ggrepel)
# Map of positions coloured by dbscan cluster.
base <- ggplot(data = inputData, mapping = aes(x = dim1, y = dim2)) +
  xlim(limites) +
  ylim(limites) +
  coord_fixed()
dbplot <- base + geom_point(mapping = aes(color = dbCMD))
print(dbplot)
# Label only the countries that dbscan put in cluster 0; all others get an
# empty label.
LABEL <- ifelse(inputData$dbCMD == 0, row.names(inputData), "")
# NOTE(review): `ylim` receives a single value here; confirm against the
# ggrepel documentation that this constrains the labels as intended.
dbplot +
  geom_text_repel(
    mapping = aes(label = LABEL),
    segment.colour = "grey",
    angle = 45,
    size = 5,
    direction = "y",
    ylim = 0.45
  )
# Outliers: countries that dbscan left in cluster 0.
# `atipicos` pairs the outlier flag with the country name. The throwaway
# empty column of the earlier version is no longer created.
atipicos <- data.frame(inputData$dbCMD == 0, row.names(inputData))
names(atipicos)
## [1] "inputData.dbCMD....0" "row.names.inputData."
# Keep the names of the flagged countries. The flag is used directly as a
# logical index instead of being compared with the string "TRUE".
atipicos2 <- data.frame(atipicos2 = atipicos[[2]][atipicos[[1]]])
atipicos2
## atipicos2
## 1 Afghanistan
## 2 Bhutan
## 3 Burundi
## 4 Central African Republic
## 5 Hong Kong
## 6 Singapore
## 7 Syria
## 8 Uzbekistan
# Compare names with names: intersect() needs two vectors, so pass the
# column of `atipicos2` rather than the data frame itself (which produced
# an empty data frame regardless of the contents).
intersect(poorAGNEScases, as.character(atipicos2$atipicos2))
## character(0)
¿Qué países del cluster 1 de AGNES quedaron sin clusterizar en dbscan?
# Countries that AGNES assigned to its cluster 1.
# `peores` pairs the membership flag with the country name. The throwaway
# empty column of the earlier version is no longer created.
peores <- data.frame(inputData$agnes == 1, row.names(inputData))
# The flag is used directly as a logical index instead of being compared
# with the string "TRUE".
peores2 <- data.frame(peores2 = peores[[2]][peores[[1]]])
peores2
## peores2
## 1 Afghanistan
## 2 Algeria
## 3 Azerbaijan
## 4 Bahrain
## 5 Belarus
## 6 Burkina Faso
## 7 Burundi
## 8 Cambodia
## 9 Cameroon
## 10 Central African Republic
## 11 Chad
## 12 China
## 13 Comoros
## 14 Egypt
## 15 Eswatini
## 16 Ethiopia
## 17 Gabon
## 18 Guinea
## 19 Haiti
## 20 Iran
## 21 Iraq
## 22 Ivory Coast
## 23 Jordan
## 24 Kazakhstan
## 25 Laos
## 26 Libya
## 27 Mali
## 28 Mauritania
## 29 Mozambique
## 30 Myanmar
## 31 Nicaragua
## 32 Niger
## 33 Nigeria
## 34 Qatar
## 35 Russia
## 36 Rwanda
## 37 Saudi Arabia
## 38 Sierra Leone
## 39 Syria
## 40 Tajikistan
## 41 Togo
## 42 Turkey
## 43 Turkmenistan
## 44 United Arab Emirates
## 45 Uzbekistan
## 46 Venezuela
## 47 Vietnam
## 48 Yemen
## 49 Zimbabwe
Hay 5 países en ambos grupos:
# Give both single-column tables the same column name, then join them on
# it to list the countries present in both.
colnames(peores2) <- "a"
colnames(atipicos2) <- "a"
merge(x = atipicos2, y = peores2, by = "a")
## a
## 1 Afghanistan
## 2 Burundi
## 3 Central African Republic
## 4 Syria
## 5 Uzbekistan