Facultad de Derecho y Ciencia Política

Escuela de Ciencia Política

Guía de Clase de ESTADÍSTICA


Práctica Dirigida: ANÁLISIS BIVARIADO

Cargando y pre procesando la data:

Data a utilizar:

library(htmltab)
# Wikipedia article that hosts the Democracy Index tables
link <- "https://en.wikipedia.org/wiki/Democracy_Index"
# XPath pointing at the second table inside the article's content container
path <- '//*[@id="mw-content-text"]/div/table[2]'

# Scrape that table into a data frame
demo <- htmltab(doc = link, which = path)

Revisando los nombres:

names(demo)
##  [1] "Rank >> Rank"                                                          
##  [2] "Country >> Country"                                                    
##  [3] "Score >> Score"                                                        
##  [4] "Elec­toral pro­cessand plura­lism >> Elec­toral pro­cessand plura­lism"
##  [5] "Functio­ning ofgovern­ment >> Functio­ning ofgovern­ment"              
##  [6] "Poli­ticalpartici­pation >> Poli­ticalpartici­pation"                  
##  [7] "Poli­ticalculture >> Poli­ticalculture"                                
##  [8] "Civilliber­ties >> Civilliber­ties"                                    
##  [9] "Regimetype >> Regimetype"                                              
## [10] "Conti­nent >> Conti­nent"

Mejorando nombres:

# Short, syntactic replacements for the ten scraped column headers
newNames <- c(
  "rank", "country", "score", "electoral", "functioning",
  "participation", "culture", "civilliber", "regimetype", "continent"
)
# Apply the new headers to the data frame
demo <- setNames(demo, newNames)

Revisando tipo de datos:

str(demo)
## 'data.frame':    167 obs. of  10 variables:
##  $ rank         : chr  "1" "2" "3" "4" ...
##  $ country      : chr  " Norway" " Iceland" " Sweden" " New Zealand" ...
##  $ score        : chr  "9.87" "9.58" "9.39" "9.26" ...
##  $ electoral    : chr  "10.00" "10.00" "9.58" "10.00" ...
##  $ functioning  : chr  "9.64" "9.29" "9.64" "9.29" ...
##  $ participation: chr  "10.00" "8.89" "8.33" "8.89" ...
##  $ culture      : chr  "10.00" "10.00" "10.00" "8.13" ...
##  $ civilliber   : chr  "9.71" "9.71" "9.41" "10.00" ...
##  $ regimetype   : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
##  $ continent    : chr  "Europe" "Europe" "Europe" "Oceania" ...

Mejorando datos con problemas de formato:

  • Evitando presencia de espacios en blanco:
# While every column is still text, strip leading/trailing horizontal and
# vertical whitespace from all of them
demo[] <- lapply(demo, trimws, whitespace = "[\\h\\v]")
  • Convirtiendo en variable categórica nominal:
demo$continent=as.factor(demo$continent)
  • Convirtiendo en variable categórica ordinal:

  • Viendo niveles (levels):

table(demo$regimetype)
## 
##    Authoritarian Flawed democracy   Full democracy    Hybrid regime 
##               53               55               20               39
  • Ajustando niveles:
# Regime types listed from least to most democratic
ordenOK <- c(
  "Authoritarian", "Hybrid regime", "Flawed democracy", "Full democracy"
)

# Turn the column into an ordered factor that follows that ranking
demo$regimetype <- factor(demo$regimetype, levels = ordenOK, ordered = TRUE)
  • Convirtiendo en variable numérica:
# Convert every column except 2, 9 and 10 (country, regimetype, continent)
# to numeric. Entries that cannot be parsed as numbers become NA, which is
# what triggers the coercion warning.
demo[,-c(2,9,10)]=lapply(demo[,-c(2,9,10)],as.numeric)
## Warning in lapply(demo[, -c(2, 9, 10)], as.numeric): NAs introduced by
## coercion
  • Explorando valores perdidos:
# estos son:
demo[!complete.cases(demo),]
##     rank      country score electoral functioning participation culture
## 24    NA        Chile  7.97      9.58        8.57          4.44    8.13
## 25    NA      Estonia  7.97      9.58        8.21          6.67    6.88
## 37    NA     Slovenia  7.50      9.58        6.79          6.67    6.25
## 38    NA    Lithuania  7.50      9.58        6.43          6.11    6.25
## 48    NA    Argentina  7.02      9.17        5.36          6.11    6.25
## 49    NA      Jamaica  7.02      8.75        7.14          4.44    6.25
## 55    NA       Poland  6.67      9.17        6.07          6.11    4.38
## 56    NA       Guyana  6.67      9.17        5.71          6.11    5.00
## 58    NA        Ghana  6.63      8.33        5.71          6.67    6.25
## 59    NA      Hungary  6.63      8.75        6.07          5.00    6.25
## 64    NA       Serbia  6.41      8.25        5.36          6.11    5.00
## 65    NA      Tunisia  6.41      6.42        5.71          7.78    6.25
## 67    NA    Singapore  6.38      4.33        7.86          6.11    6.25
## 68    NA      Romania  6.38      9.17        5.71          5.00    4.38
## 72    NA    Sri Lanka  6.19      7.83        5.71          5.00    6.25
## 73    NA       Mexico  6.19      8.33        6.07          7.22    3.13
## 74    NA    Hong Kong  6.15      3.08        6.07          5.56    7.50
## 75    NA      Senegal  6.15      7.50        6.07          4.44    6.25
## 80    NA      Moldova  5.85      7.08        4.64          6.11    4.38
## 81    NA         Fiji  5.85      6.58        5.36          6.11    5.63
## 82    NA   Montenegro  5.74      6.08        5.36          6.11    4.38
## 83    NA        Benin  5.74      6.50        5.71          5.00    5.63
## 92    NA     Tanzania  5.41      7.00        5.00          5.00    5.63
## 93    NA         Mali  5.41      7.42        3.93          3.89    5.63
## 99    NA        Kenya  5.11      3.50        5.36          6.67    5.63
## 100   NA   Kyrgyzstan  5.11      6.58        2.93          6.67    4.38
## 107   NA      Lebanon  4.63      3.92        2.21          6.67    5.63
## 108   NA     Thailand  4.63      3.00        4.29          5.00    5.00
## 117   NA   Mozambique  3.85      3.58        2.14          5.00    5.00
## 118   NA       Kuwait  3.85      3.17        4.29          3.89    4.38
## 129   NA     Ethiopia  3.35      0.00        3.57          5.56    5.00
## 130   NA       Rwanda  3.35      1.67        5.00          2.78    4.38
## 135   NA     Zimbabwe  3.16      0.50        2.00          4.44    5.63
## 136   NA    Venezuela  3.16      1.67        1.79          4.44    4.38
## 145   NA   Kazakhstan  2.94      0.50        2.14          4.44    4.38
## 146   NA       Russia  2.94      2.17        1.79          5.00    2.50
## 152   NA      Eritrea  2.37      0.00        2.14          1.67    6.88
## 153   NA         Laos  2.37      0.83        2.86          1.67    5.00
## 160   NA Saudi Arabia  1.93      0.00        2.86          2.22    3.13
## 161   NA   Tajikistan  1.93      0.08        0.79          1.67    6.25
##     civilliber       regimetype     continent
## 24        9.12 Flawed democracy South America
## 25        8.53 Flawed democracy        Europe
## 37        8.24 Flawed democracy        Europe
## 38        9.12 Flawed democracy        Europe
## 48        8.24 Flawed democracy South America
## 49        8.53 Flawed democracy North America
## 55        7.65 Flawed democracy        Europe
## 56        7.35 Flawed democracy South America
## 58        6.18 Flawed democracy        Africa
## 59        7.06 Flawed democracy        Europe
## 64        7.35 Flawed democracy        Europe
## 65        5.88 Flawed democracy        Africa
## 67        7.35 Flawed democracy          Asia
## 68        7.65 Flawed democracy        Europe
## 72        6.18 Flawed democracy          Asia
## 73        6.18 Flawed democracy North America
## 74        8.53 Flawed democracy          Asia
## 75        6.47 Flawed democracy        Africa
## 80        7.06    Hybrid regime        Europe
## 81        5.59    Hybrid regime       Oceania
## 82        6.76    Hybrid regime        Europe
## 83        5.88    Hybrid regime        Africa
## 92        4.41    Hybrid regime        Africa
## 93        6.18    Hybrid regime        Africa
## 99        4.41    Hybrid regime        Africa
## 100       5.00    Hybrid regime          Asia
## 107       4.71    Hybrid regime          Asia
## 108       5.88    Hybrid regime          Asia
## 117       3.53    Authoritarian        Africa
## 118       3.53    Authoritarian          Asia
## 129       2.65    Authoritarian        Africa
## 130       2.94    Authoritarian        Africa
## 135       3.24    Authoritarian        Africa
## 136       3.53    Authoritarian South America
## 145       3.24    Authoritarian        Europe
## 146       3.24    Authoritarian        Europe
## 152       1.18    Authoritarian        Africa
## 153       1.47    Authoritarian          Asia
## 160       1.47    Authoritarian          Asia
## 161       0.88    Authoritarian          Asia
  • Desechar la columna con valores perdidos (si no fuera importante):
demo$rank=NULL
  • Pedir resumen estadistico:
summary(demo)
##    country              score         electoral       functioning   
##  Length:167         Min.   :1.080   Min.   : 0.000   Min.   :0.000  
##  Class :character   1st Qu.:3.545   1st Qu.: 3.000   1st Qu.:2.860  
##  Mode  :character   Median :5.690   Median : 6.580   Median :5.000  
##                     Mean   :5.479   Mean   : 5.903   Mean   :4.885  
##                     3rd Qu.:7.175   3rd Qu.: 9.170   3rd Qu.:6.790  
##                     Max.   :9.870   Max.   :10.000   Max.   :9.640  
##  participation      culture         civilliber                regimetype
##  Min.   : 1.11   Min.   : 1.250   Min.   : 0.000   Authoritarian   :53  
##  1st Qu.: 3.89   1st Qu.: 4.380   1st Qu.: 3.530   Hybrid regime   :39  
##  Median : 5.56   Median : 5.630   Median : 5.880   Flawed democracy:55  
##  Mean   : 5.25   Mean   : 5.594   Mean   : 5.768   Full democracy  :20  
##  3rd Qu.: 6.67   3rd Qu.: 6.250   3rd Qu.: 8.240                        
##  Max.   :10.00   Max.   :10.000   Max.   :10.000                        
##          continent 
##  Africa       :50  
##  Asia         :42  
##  Europe       :45  
##  North America:14  
##  Oceania      : 4  
##  South America:12

Responder preguntas

1. Analizar la relación entre el score (índice) y el continente

  1. Determinando tipo de relación: A partir del resumen estadístico se determina que es Numérica - Categórica

  2. Determinando si la variable numérica se comporta de manera normal:

library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
ggqqplot(data=demo,x="score") + facet_grid(. ~ continent)

f1=formula(score~continent)
# Ad-hoc helper: Shapiro-Wilk normality test for one numeric vector.
#
# x: numeric vector (e.g. the scores of one group).
# Returns a length-2 numeric vector: the W statistic (named "W") followed by
# the p-value.
# shapiro.test() stops with an error unless there are between 3 and 5000
# non-missing values; in that case NA is returned for both entries so that a
# single tiny group does not abort a grouped computation such as aggregate().
normalidadTest <- function(x) {
  n_ok <- sum(!is.na(x))
  if (n_ok < 3 || n_ok > 5000) {
    return(c(W = NA_real_, NA_real_))
  }
  sw <- shapiro.test(x)
  c(sw$statistic, sw$p.value)
}
# Run the Shapiro-Wilk helper on the score within each continent
resultado <- aggregate(f1, data = demo, FUN = normalidadTest)


# Display the per-continent results as a table
library(knitr)

# The second column of `resultado` holds the statistic and the p-value;
# spread it into two labelled columns
shapiroTest <- setNames(
  as.data.frame(resultado[, 2]),
  c("SW_Statistic", "Probabilidad")
)
kable(cbind(resultado[1], shapiroTest))
continent SW_Statistic Probabilidad
Africa 0.9653422 0.1487353
Asia 0.9486915 0.0579857
Europe 0.9370922 0.0168389
North America 0.9740888 0.9260364
Oceania 0.7752667 0.0647579
South America 0.8433322 0.0304021
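  3. Usando la prueba respectiva: como al menos un grupo no se distribuye normalmente (p < 0.05 en Europe y South America), se aplica la prueba no paramétrica de Kruskal-Wallis: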
kruskal.test(f1,demo)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  score by continent
## Kruskal-Wallis chi-squared = 52.932, df = 5, p-value = 3.473e-10

Aquí no se muestran asteriscos, pero la probabilidad (p-value) también es menor a 0.05, por lo que al menos un continente difiere de los demás.

Visualmente, para saber cuál es diferente a los demás:

# Notched boxplots of the score, one box per continent.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
ggplot(data = demo, aes(x = continent, y = score)) +
  geom_boxplot(notch = TRUE)
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

2. Analizar la relación entre Continente y Tipo de Régimen

  1. Determinando tipo de relación: A partir del resumen estadístico se determina que es Categórica - Categórica

  2. Construir tabla de contingencia:

# Cross-tabulate regime type (rows) against continent (columns)
columna <- demo$continent
fila <- demo$regimetype

# The outer parentheses print the table as it is assigned.
# NOTE: the object `t` shares its name with base::t(); calls such as t(x) still
# find the function because R skips non-function objects when resolving a call.
(t <- table(fila, columna))
##                   columna
## fila               Africa Asia Europe North America Oceania South America
##   Authoritarian        26   20      4             2       0             1
##   Hybrid regime        15    9      9             4       1             1
##   Flawed democracy      8   13     18             6       1             9
##   Full democracy        1    0     14             2       2             1

Mostrar porcentajes:

# Column-wise proportions: margin = 2 makes each column (not each row) sum to 1
prop_t=prop.table(t,margin = 2)
round(prop_t,2)
##                   columna
## fila               Africa Asia Europe North America Oceania South America
##   Authoritarian      0.52 0.48   0.09          0.14    0.00          0.08
##   Hybrid regime      0.30 0.21   0.20          0.29    0.25          0.08
##   Flawed democracy   0.16 0.31   0.40          0.43    0.25          0.75
##   Full democracy     0.02 0.00   0.31          0.14    0.50          0.08
library("gplots")
# Note the call to t(): the proportions table is transposed before plotting.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
balloonplot(t(prop_t), main = "tabla",
            label = TRUE, show.margins = FALSE)

  3. Determinar si hay independencia estadística entre las variables:
# Chi-squared test of independence on the contingency table. The p-value is
# obtained by Monte Carlo simulation, so it may vary slightly between runs.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
chisq.test(t, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  t
## X-squared = 64.445, df = NA, p-value = 0.0004998
  4. Si no hay independencia, calcular la intensidad de la asociación:
library(oii)
association.measures(fila,columna)
## Chi-square-based measures of association:
##    Phi:                      0.621 
##    Contingency coefficient:  0.528 
##    Cramer's V:               0.359 
## 
## Ordinal measures of association:
##    Total number of pairs:   13861 
##    Concordant pairs:        6064   ( 43.75 %)
##    Discordant pairs:        1872   ( 13.51 %)
##    Tied on first variable:  2686   ( 19.38 %)
##    Tied on second variable: 2131   ( 15.37 %)
##    Tied on both variables:  1108   ( 7.99 %)
## 
##    Goodman-Kruskal Gamma: 0.528 
##    Somers' d (col dep.):  0.416 
##    Kendall's tau-b:       0.405 
##    Stuart's tau-c:        0.401

El coeficiente de contingencia (0.528) y la V de Cramér (0.359) sugieren una asociación de intensidad relevante (mayor a 0.3).

3. Analizar la relación entre Participación Política y todos los otros componentes del índice (no analizar el índice)

  1. Determinando tipo de relación: A partir del resumen estadístico se determina que es Numérica - Numérica

  2. Determinando si las variables numéricas se comportan de manera normal:

library(dlookr)
normality(demo[,c(3:7)])
## # A tibble: 5 x 4
##   vars          statistic  p_value sample
##   <chr>             <dbl>    <dbl>  <dbl>
## 1 electoral         0.865 4.14e-11    167
## 2 functioning       0.977 6.52e- 3    167
## 3 participation     0.981 2.12e- 2    167
## 4 culture           0.964 2.69e- 4    167
## 5 civilliber        0.954 2.87e- 5    167
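  3. Calculando las correlaciones solicitadas: como ninguna de las variables se distribuye normalmente (p < 0.05 en todas), se usa la correlación de Spearman: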
library(ggpubr)
# Scatter plot of one index component against participation, annotated with
# the Spearman correlation coefficient
scatter_vs_participation <- function(component) {
  ggscatter(demo,
            x = component, y = "participation",
            cor.coef = TRUE,
            cor.method = "spearman")
}

# One plot per remaining component of the index
p1 <- scatter_vs_participation("electoral")
p2 <- scatter_vs_participation("functioning")
p3 <- scatter_vs_participation("culture")
p4 <- scatter_vs_participation("civilliber")
# Step 1: arrange the four scatter plots in a 2 x 2 grid
all_ps=ggarrange(p1,p2,p3,p4,
          ncol = 2, nrow = 2) 

# Step 2: add a shared title above the grid
annotate_figure(all_ps,
               top = text_grob("Correlacion con PARTICIPATION", 
                               color = "blue", 
                               face = "bold", 
                               size = 14))

Sin los gráficos lo puedes ver así:

# Participation goes first so that it can be addressed as column 1 below.
# Columns are selected by name rather than by position, so the selection stays
# correct regardless of whether the `rank` column has already been dropped.
dataForCor <- demo[, c("participation", "electoral", "functioning",
                       "culture", "civilliber")]
# cor.test() compares one pair of vectors per call, so it is applied to each
# remaining component in turn against participation.
# exact = FALSE requests the asymptotic (non-exact) p-value.
lapply(dataForCor[, -1],
       cor.test, y = dataForCor[, 1], method = "spearman", exact = FALSE)
## $electoral
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 183345, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7637965 
## 
## 
## $functioning
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 219483, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7172393 
## 
## 
## $culture
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 389588, p-value = 7.43e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4980935 
## 
## 
## $civilliber
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 177123, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7718116