library(htmltab)
# coleccion
# Source of the data: the page URL and the XPath that locates the table
# body inside that page.
links <- list(
  web = "https://en.wikipedia.org/wiki/Democracy_Index",
  xpath = '//*[@id="mw-content-text"]/div/table[2]/tbody'
)
# Scrape the table selected by the XPath into `demo`.
demo <- htmltab(doc = links$web, which = links$xpath)
2.1. Inspeccion:
Veamos la estructura
# Show the column names exactly as they came out of the scrape.
names(demo)
## [1] "Rank >> Rank"
## [2] "Country >> Country"
## [3] "Score >> Score"
## [4] "Electoral processand pluralism >> Electoral processand pluralism"
## [5] "Functioning ofgovernment >> Functioning ofgovernment"
## [6] "Politicalparticipation >> Politicalparticipation"
## [7] "Politicalculture >> Politicalculture"
## [8] "Civilliberties >> Civilliberties"
## [9] "Regimetype >> Regimetype"
## [10] "Continent >> Continent"
2.2. Pre procesamiento:
Nombres sin espacios
library(stringr)
# Keep only the first word of each column name (the text before the first
# space), so that no name contains spaces. TRUE is spelled out because T is
# an ordinary variable that can be reassigned.
names(demo) <- str_split(names(demo), " ", simplify = TRUE)[, 1]
nombres con simbolos raros
# Strip every non-ASCII character from the column names.
names(demo) <- str_replace_all(names(demo), "[^[:ascii:]]", "")
valores del data frame sin simbolos raros:
# Apply the same non-ASCII clean-up to the values of every column.
demo[, ] <- lapply(
  demo[, ],
  str_replace_all,
  pattern = "[^[:ascii:]]",
  replacement = ""
)
eliminar columnas que no se usaran y que se podrian recalcular
# Remove the Rank column from the data frame.
demo[["Rank"]] <- NULL
recuperar numeros
library(readr)
# Convert columns 2 through 7 from text to numbers.
demo[, 2:7] <- lapply(demo[, 2:7], parse_number)
configurar categoriales # ordinales
# Count how many rows fall in each regime type.
table(demo$Regimetype)
##
## Authoritarian Flawed democracy Full democracy Hybrid regime
## 53 55 20 39
# Level order for the ordinal regime-type factor, lowest level first.
ordenok <- c(
  "Authoritarian", "Hybrid regime",
  "Flawed democracy", "Full democracy"
)
# Regimetype becomes an ordered factor using that level order.
demo$Regimetype <- ordered(demo$Regimetype, levels = ordenok)
# Continent becomes a plain (unordered) factor.
demo$Continent <- as.factor(demo$Continent)
# Check the resulting column types.
str(demo)
## 'data.frame': 167 obs. of 9 variables:
## $ Country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ Score : num 9.87 9.58 9.39 9.26 9.22 9.15 9.15 9.14 9.09 9.03 ...
## $ Electoral : num 10 10 9.58 10 10 9.58 9.58 10 10 9.58 ...
## $ Functioning : num 9.64 9.29 9.64 9.29 9.29 7.86 9.64 8.93 8.93 9.29 ...
## $ Politicalparticipation: num 10 8.89 8.33 8.89 8.33 8.33 7.78 8.33 7.78 7.78 ...
## $ Politicalculture : num 10 10 10 8.13 9.38 10 8.75 8.75 8.75 9.38 ...
## $ Civilliberties : num 9.71 9.71 9.41 10 9.12 10 10 9.71 10 9.12 ...
## $ Regimetype : Ord.factor w/ 4 levels "Authoritarian"<..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Continent : Factor w/ 6 levels "Africa","Asia",..: 3 3 3 5 3 3 4 3 5 3 ...
# Inspect the structure of the recoded Regimetype column on its own.
str(demo$Regimetype)
## Ord.factor w/ 4 levels "Authoritarian"<..: 4 4 4 4 4 4 4 4 4 4 ...
library(DescTools)
# Median category of the ordered regime-type factor.
Median(demo$Regimetype)
## [1] Hybrid regime
## 4 Levels: Authoritarian < Hybrid regime < ... < Full democracy
library(questionr)
library(magrittr)
# Frequency table for regime type: counts, percentages and cumulative
# percentages, with NAs excluded and no total row. TRUE/FALSE are spelled
# out because T and F are ordinary variables that can be reassigned.
ordef <- freq(
  demo$Regimetype,
  total = FALSE,
  exclude = c(NA),
  cum = TRUE
) %>%
  data.frame()
# Move the category labels out of the row names into a regular first column.
ordef <- data.frame(row.names(ordef), ordef, row.names = NULL)
names(ordef) <- c("categoria", "conteo", "porcentaje", "porcentaje acumulado")
ordef
## categoria conteo porcentaje porcentaje acumulado
## 1 Authoritarian 53 31.7 31.7
## 2 Hybrid regime 39 23.4 55.1
## 3 Flawed democracy 55 32.9 88.0
## 4 Full democracy 20 12.0 100.0
3.1. ¿Cual es el valor representativo de Continente?
# Most frequent continent (the mode).
Mode(demo$Continent)
## [1] "Africa"
3.2. ¿Es el valor representativo de continente muy prominente? (¿quizas otros continentes no envian informacion?)
# Proportion of rows per continent.
tablacontinente <- prop.table(table(demo$Continent))
# Herfindahl concentration index computed on those proportions.
Herfindahl(tablacontinente)
## [1] 0.238266
library(questionr)
library(magrittr)
# Frequency table for continent: counts, percentages and cumulative
# percentages, with NAs excluded and no total row. TRUE/FALSE are spelled
# out because T and F are ordinary variables that can be reassigned.
OrdDf <- freq(
  demo$Continent,
  total = FALSE,
  exclude = c(NA),
  cum = TRUE
) %>%
  data.frame()
# Move the category labels out of the row names into a regular first column.
OrdDf <- data.frame(row.names(OrdDf), OrdDf, row.names = NULL)
names(OrdDf) <- c("Categoria", "Conteo", "Porcentaje", "Porcentaje Acumulado")
# Display the frequency table (OrdDf).
OrdDf
## Categoria Conteo Porcentaje Porcentaje Acumulado
## 1 Africa 50 29.9 29.9
## 2 Asia 42 25.1 55.1
## 3 Europe 45 26.9 82.0
## 4 North America 14 8.4 90.4
## 5 Oceania 4 2.4 92.8
## 6 South America 12 7.2 100.0
library(ggplot2)
# Base layer: the frequency table is the data; category names go on the
# x axis and counts on the y axis.
base <- ggplot(data = OrdDf, aes(x = Categoria, y = Conteo))
# Add the bar geometry; stat "identity" plots the counts as given.
bar1 <- base + geom_bar(stat = "identity")
# Final chart, with the x axis following the row order of the table.
bar1 + scale_x_discrete(limits = OrdDf$Categoria)
library(ggplot2)
# Caution: Continent is a factor, so as.numeric() returns its integer level
# codes, and the boxplot is drawn over those codes.
# NOTE(review): Continent is unordered, so quartiles of the codes may not be
# meaningful - confirm this plot is intended.
basep <- ggplot(data = demo, aes(y = as.numeric(Continent)))
# Horizontal boxplot.
basep + geom_boxplot() + coord_flip()
library(ggplot2)
# Bar chart of the regime-type frequency table: categories on x, counts on y.
base <- ggplot(data = ordef, aes(x = categoria, y = conteo))
bar1 <- base + geom_bar(stat = "identity")
# Keep the x axis in the row order of the table.
bar1 + scale_x_discrete(limits = ordef$categoria)
library(ggplot2)
# Caution: Regimetype is an ordered factor, so as.numeric() returns its
# integer level codes, and the boxplot is drawn over those codes.
basepp <- ggplot(data = demo, aes(y = as.numeric(Regimetype)))
# Horizontal boxplot.
basepp + geom_boxplot() + coord_flip()
6. ¿El valor representativo de indice de democracia es robusto?
# Minimum, quartiles, median, mean and maximum of the democracy score.
summary(demo$Score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.080 3.545 5.690 5.479 7.175 9.870
# Histogram of the democracy score using 10 bins.
baser <- ggplot(data = demo, aes(x = Score))
baser + geom_histogram(bins = 10)
# Skewness of the democracy score with a 95% confidence interval.
# conf.level is the confidence level itself, so 0.95 (not 0.05) gives the
# conventional interval.
# NOTE(review): output recorded with conf.level = 0.05 must be regenerated.
Skew(demo$Score, conf.level = 0.95)
## skew lwr.ci upr.ci
## -0.07107195 -0.08036376 -0.06774729
# Gini coefficient of the democracy score.
Gini(demo$Score)
## [1] 0.2316285
library(ggplot2)
library(gglorenz)
## Registered S3 methods overwritten by 'ineq':
## method from
## plot.Lc DescTools
## lines.Lc DescTools
# Lorenz curve of the democracy score (purple) against a dashed reference
# line, drawn with a fixed aspect ratio.
ggplot(demo, aes(x = Score)) +
  gglorenz::stat_lorenz(color = "purple") +
  geom_abline(linetype = "dashed") +
  coord_fixed() +
  labs(
    x = "% Paises ordenados por Indice de Democracia",
    y = "% Acumulado de Puntuación de ID",
    title = "Relación pais/Indice de democracia",
    caption = "Fuente: The Economist"
  ) +
  # Axis ticks: every 0.15 on y and every 0.2 on x.
  scale_y_continuous(breaks = seq(0, 1, 0.15)) +
  scale_x_continuous(breaks = seq(0, 1, 0.2))
9.1. ¿Hay atipicos en el score de democracia?
# First and third quartiles of the democracy score.
q1 <- quantile(demo$Score, 0.25)
q3 <- quantile(demo$Score, 0.75)
# Outlier thresholds: 1.5 times the interquartile range beyond each quartile.
umbralalto <- q3 + 1.5 * IQR(demo$Score)
umbralbajo <- q1 - 1.5 * IQR(demo$Score)
# Show the upper threshold.
umbralalto
## 75%
## 12.62
# Show the lower outlier threshold.
umbralbajo
## 25%
## -1.9
9.2. ¿Ausencia de atipicos grandes?
# Rows whose score is at or above the upper threshold (high outliers).
demo[demo$Score >= umbralalto, ]
## [1] Country Score Electoral
## [4] Functioning Politicalparticipation Politicalculture
## [7] Civilliberties Regimetype Continent
## <0 rows> (or 0-length row.names)
# Rows whose score is at or below the lower threshold (low outliers).
demo[demo$Score <= umbralbajo, ]
## [1] Country Score Electoral
## [4] Functioning Politicalparticipation Politicalculture
## [7] Civilliberties Regimetype Continent
## <0 rows> (or 0-length row.names)