library(htmltab)

# Data collection: address of the Wikipedia page and XPath of the table
# that will be scraped from it.
links <- list(
  web = "https://en.wikipedia.org/wiki/Democracy_Index",
  xpath = '//*[@id="mw-content-text"]/div/table[2]/tbody'
)
# Fetch the page and read the selected table into `demo`.
demo <- htmltab(doc = links$web, which = links$xpath)
  2. Limpieza de datos

2.1. Inspeccion:

Veamos la estructura

# Inspect the column names of the scraped table.
colnames(demo)
##  [1] "Rank >> Rank"                                                          
##  [2] "Country >> Country"                                                    
##  [3] "Score >> Score"                                                        
##  [4] "Elec­toral pro­cessand plura­lism >> Elec­toral pro­cessand plura­lism"
##  [5] "Functio­ning ofgovern­ment >> Functio­ning ofgovern­ment"              
##  [6] "Poli­ticalpartici­pation >> Poli­ticalpartici­pation"                  
##  [7] "Poli­ticalculture >> Poli­ticalculture"                                
##  [8] "Civilliber­ties >> Civilliber­ties"                                    
##  [9] "Regimetype >> Regimetype"                                              
## [10] "Conti­nent >> Conti­nent"

2.2. Pre procesamiento:

Nombres sin espacios

library(stringr)
# Keep only the text before the first space of each column name.
names(demo) <- str_split(names(demo), pattern = " ", simplify = TRUE)[, 1]

nombres sin simbolos raros

# Remove every non-ASCII character from the column names.
names(demo) <- str_replace_all(
  names(demo),
  pattern = "[^[:ascii:]]",
  replacement = ""
)

valores del data frame sin simbolos raros:

# Remove every non-ASCII character from the values of all columns.
demo[, ] <- lapply(
  demo[, ],
  str_replace_all,
  pattern = "[^[:ascii:]]",
  replacement = ""
)

eliminar columnas que no se usaran y que se podrian recalcular

# Drop the Rank column: it is not used and could be recomputed if needed.
demo[["Rank"]] <- NULL

recuperar numeros

library(readr)
# Convert columns 2 to 7 from text to numbers.
demo[, 2:7] <- lapply(demo[, 2:7], parse_number)

configurar variables categoricas: ordinales

# Count the observations in each regime type.
table(demo[["Regimetype"]])
## 
##    Authoritarian Flawed democracy   Full democracy    Hybrid regime 
##               53               55               20               39
# Level order for the regime type, lowest level first.
ordenok <- c(
  "Authoritarian", "Hybrid regime", "Flawed democracy", "Full democracy"
)
# Recode the regime type as an ordered factor that follows that order.
demo$Regimetype <- factor(demo$Regimetype, levels = ordenok, ordered = TRUE)

nominales

# Continent is nominal: store it as an (unordered) factor.
demo$Continent <- as.factor(demo$Continent)
# Review the resulting column types.
str(demo)
## 'data.frame':    167 obs. of  9 variables:
##  $ Country               : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ Score                 : num  9.87 9.58 9.39 9.26 9.22 9.15 9.15 9.14 9.09 9.03 ...
##  $ Electoral             : num  10 10 9.58 10 10 9.58 9.58 10 10 9.58 ...
##  $ Functioning           : num  9.64 9.29 9.64 9.29 9.29 7.86 9.64 8.93 8.93 9.29 ...
##  $ Politicalparticipation: num  10 8.89 8.33 8.89 8.33 8.33 7.78 8.33 7.78 7.78 ...
##  $ Politicalculture      : num  10 10 10 8.13 9.38 10 8.75 8.75 8.75 9.38 ...
##  $ Civilliberties        : num  9.71 9.71 9.41 10 9.12 10 10 9.71 10 9.12 ...
##  $ Regimetype            : Ord.factor w/ 4 levels "Authoritarian"<..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Continent             : Factor w/ 6 levels "Africa","Asia",..: 3 3 3 5 3 3 4 3 5 3 ...
# Check the structure of the regime type column alone.
str(demo[["Regimetype"]])
##  Ord.factor w/ 4 levels "Authoritarian"<..: 4 4 4 4 4 4 4 4 4 4 ...

preguntas

  1. ¿Cual es el valor representativo de Tipo de Regimen?
library(DescTools)

# Median category of the ordered regime type.
Median(demo[["Regimetype"]])
## [1] Hybrid regime
## 4 Levels: Authoritarian < Hybrid regime < ... < Full democracy
  2. ¿Podemos afirmar que 3 cuartas partes de los paises del mundo que tenemos en nuestra tabla no superan el tipo hibrido de democracia?
library(questionr)
library(magrittr)
# Frequency table of the regime type: no total row, NAs excluded and
# cumulative percentages included, stored as a plain data frame.
ordef <- data.frame(
  freq(demo$Regimetype, total = FALSE, exclude = c(NA), cum = TRUE)
)
# Move the row names (the categories) into a regular first column.
ordef <- data.frame(row.names(ordef), ordef, row.names = NULL)

names(ordef) <- c("categoria", "conteo", "porcentaje", "porcentaje acumulado")

# Show the resulting table.
ordef
##          categoria conteo porcentaje porcentaje acumulado
## 1    Authoritarian     53       31.7                 31.7
## 2    Hybrid regime     39       23.4                 55.1
## 3 Flawed democracy     55       32.9                 88.0
## 4   Full democracy     20       12.0                100.0

3.1. ¿Cual es el valor representativo de Continente?

# Most frequent continent.
Mode(demo[["Continent"]])
## [1] "Africa"

3.2. ¿Es el valor representativo de continente muy prominente? (quizas otros continentes no envian informacion?)

# Proportion of observations that falls in each continent.
tablacontinente <- prop.table(table(demo$Continent))
# Herfindahl index computed on those proportions.
Herfindahl(tablacontinente)
## [1] 0.238266
  4. ¿La variable continente se distribuye asimetricamente?
library(questionr)
library(magrittr)
# Frequency table of the continent: no total row, NAs excluded and
# cumulative percentages included, stored as a plain data frame.
OrdDf <- data.frame(
  freq(demo$Continent, total = FALSE, exclude = c(NA), cum = TRUE)
)
# Move the row names (the categories) into a regular first column.
OrdDf <- data.frame(row.names(OrdDf), OrdDf, row.names = NULL)

names(OrdDf) <- c("Categoria", "Conteo", "Porcentaje", "Porcentaje Acumulado")
# Show the resulting table (OrdDf).
OrdDf
##       Categoria Conteo Porcentaje Porcentaje Acumulado
## 1        Africa     50       29.9                 29.9
## 2          Asia     42       25.1                 55.1
## 3        Europe     45       26.9                 82.0
## 4 North America     14        8.4                 90.4
## 5       Oceania      4        2.4                 92.8
## 6 South America     12        7.2                100.0
library(ggplot2)
# The base layer only declares the frequency table as the data and maps
# the category names to 'x' and the counts to 'y'.
base <- ggplot(data = OrdDf, mapping = aes(x = Categoria, y = Conteo))
# The desired geometry is then added on top of the base:
bar1 <- base + geom_bar(stat = "identity")

# Result, with the x axis limited to the categories in table order:
bar1 + scale_x_discrete(limits = OrdDf$Categoria)

library(ggplot2)
# Box plot of the continent factor converted to its numeric codes
# (note the as.numeric conversion), flipped to lie horizontally.
basep <- ggplot(data = demo, mapping = aes(y = as.numeric(Continent)))
basep + geom_boxplot() + coord_flip()

  5. ¿La variable regimetype se distribuye asimetricamente?
library(ggplot2)
# Bar chart of the regime type counts taken from the frequency table.
base <- ggplot(data = ordef, mapping = aes(x = categoria, y = conteo))
bar1 <- base + geom_bar(stat = "identity")
# Limit the x axis to the categories in table order.
bar1 + scale_x_discrete(limits = ordef$categoria)

library(ggplot2)
# Box plot of the regime type converted to its numeric codes
# (note the as.numeric conversion), flipped to lie horizontally.
basepp <- ggplot(data = demo, mapping = aes(y = as.numeric(Regimetype)))
basepp + geom_boxplot() + coord_flip()

6. ¿El valor representativo de indice de democracia es robusto?

# Five-number summary plus mean of the democracy score.
summary(demo[["Score"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.080   3.545   5.690   5.479   7.175   9.870
# Histogram of the democracy score using 10 bins.
baser <- ggplot(data = demo, mapping = aes(x = Score))
baser + geom_histogram(bins = 10)

# Skewness of the democracy score with a confidence interval.
# `conf.level` is the confidence level of the interval (not the significance
# level), so a 95% interval requires 0.95; the former 0.05 requested a 5%
# interval, which is far too narrow to be informative.
Skew(demo$Score, conf.level = 0.95)
## NOTE: the output below was recorded with conf.level = 0.05 and must be
## regenerated; the point estimate is unchanged, the interval will be wider.
##        skew      lwr.ci      upr.ci 
## -0.07107195 -0.08036376 -0.06774729
  1. ¿Puedes concluir que hay mucha desigualdad en este indice?
# Gini coefficient of the democracy score.
Gini(demo[["Score"]])
## [1] 0.2316285
library(ggplot2)
library(gglorenz)
## Registered S3 methods overwritten by 'ineq':
##   method   from     
##   plot.Lc  DescTools
##   lines.Lc DescTools
# Lorenz curve of the democracy score, drawn with a dashed diagonal
# reference line and a fixed aspect ratio.
ggplot(data = demo, mapping = aes(x = Score)) +
  gglorenz::stat_lorenz(color = "purple") +
  geom_abline(linetype = "dashed") +
  coord_fixed() +
  labs(
    x = "% Paises ordenados por Indice de Democracia",
    y = "% Acumulado de Puntuación de ID",
    title = "Relación pais/Indice de democracia",
    caption = "Fuente: The Economist"
  ) +
  scale_y_continuous(breaks = seq(0, 1, 0.15)) +
  scale_x_continuous(breaks = seq(0, 1, 0.2))

9.1. ¿Hay atipicos en el score de democracia?

# First and third quartiles of the score (each keeps its percentile name).
q1 <- quantile(demo$Score, probs = 0.25)
q3 <- quantile(demo$Score, probs = 0.75)
# Outlier thresholds: 1.5 times the interquartile range beyond each quartile.
umbralalto <- q3 + 1.5 * IQR(demo$Score)
umbralbajo <- q1 - 1.5 * IQR(demo$Score)

# Show the upper threshold.
print(umbralalto)
##   75% 
## 12.62
# Show the lower threshold.
print(umbralbajo)
##  25% 
## -1.9

9.2. ¿Ausencia de atipicos grandes?

# Rows whose score is at or above the upper threshold (high outliers).
demo[demo$Score >= umbralalto, ]
## [1] Country                Score                  Electoral             
## [4] Functioning            Politicalparticipation Politicalculture      
## [7] Civilliberties         Regimetype             Continent             
## <0 rows> (or 0-length row.names)
# Rows whose score is at or below the lower threshold (low outliers).
demo[demo$Score <= umbralbajo, ]
## [1] Country                Score                  Electoral             
## [4] Functioning            Politicalparticipation Politicalculture      
## [7] Civilliberties         Regimetype             Continent             
## <0 rows> (or 0-length row.names)