Practica Bivariada

Data:

library(htmltab)

# Source page and the XPath that selects the table to read from it.
link <- "https://en.wikipedia.org/wiki/Democracy_Index"
path <- "//*/div/table[2]/tbody"

# Read the selected HTML table into a data frame.
democracia <- htmltab(doc = link, which = path)

Mejorando contenido de la data

Revisamos los nombres:

# Inspect the raw column names of the data frame.
colnames(democracia)
##  [1] "Rank >> Rank"                                                    
##  [2] "Country >> Country"                                              
##  [3] "Score >> Score"                                                  
##  [4] "Electoral processand pluralism >> Electoral processand pluralism"
##  [5] "Functioning ofgovernment >> Functioning ofgovernment"            
##  [6] "Politicalparticipation >> Politicalparticipation"                
##  [7] "Politicalculture >> Politicalculture"                            
##  [8] "Civilliberties >> Civilliberties"                                
##  [9] "Regimetype >> Regimetype"                                        
## [10] "Continent >> Continent"

Mejoramos nombres

# Short replacement names, in the same order as the original columns.
newN <- c(
  "rank", "country", "score", "electoral", "functioning",
  "participacion", "culture", "civilliber", "regimentype", "continet"
)

# Apply the new names and show the resulting structure.
names(democracia) <- newN
str(democracia)
## 'data.frame':    167 obs. of  10 variables:
##  $ rank         : chr  "1" "2" "3" "4" ...
##  $ country      : chr  " Norway" " Iceland" " Sweden" " New Zealand" ...
##  $ score        : chr  "9.87" "9.58" "9.39" "9.26" ...
##  $ electoral    : chr  "10.00" "10.00" "9.58" "10.00" ...
##  $ functioning  : chr  "9.64" "9.29" "9.64" "9.29" ...
##  $ participacion: chr  "10.00" "8.89" "8.33" "8.89" ...
##  $ culture      : chr  "10.00" "10.00" "10.00" "8.13" ...
##  $ civilliber   : chr  "9.71" "9.71" "9.41" "10.00" ...
##  $ regimentype  : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
##  $ continet     : chr  "Europe" "Europe" "Europe" "Oceania" ...

Mejorando datos internos

* Evitamos espacios en blanco

# Strip leading/trailing horizontal and vertical whitespace from every column.
democracia[] <- lapply(democracia, trimws, whitespace = "[\\h\\v]")

# Continent is a nominal category.
democracia$continet <- as.factor(democracia$continet)

# Frequency of each regime type.
table(democracia[["regimentype"]])
## 
##    Authoritarian Flawed democracy   Full democracy    Hybrid regime 
##               53               55               20               39

Ajusto los niveles de tipos de régimen:

# Regime types ordered from least to most democratic. "Flawed democracy" must
# rank below "Full democracy"; the previous order had these two swapped, which
# distorts anything that uses the factor as ordinal (e.g. the ordinal
# association measures computed later from this column).
# NOTE(review): printed outputs further down were produced with the old level
# order; re-knit the document to refresh them.
ordenOK <- c(
  "Authoritarian",
  "Hybrid regime",
  "Flawed democracy",
  "Full democracy"
)
democracia$regimentype <- ordered(democracia$regimentype, levels = ordenOK)

* Convertimos en variable numérica

# Convert every column except positions 2, 9 and 10 (country, regimentype,
# continet) to numeric.
democracia[, -c(2, 9, 10)] <- lapply(democracia[, -c(2, 9, 10)], as.numeric)

# List any rows left with missing values after the conversion.
democracia[!complete.cases(democracia), ]
##  [1] rank          country       score         electoral     functioning  
##  [6] participacion culture       civilliber    regimentype   continet     
## <0 rows> (or 0-length row.names)
# Drop the rank column and re-check the structure.
democracia[["rank"]] <- NULL
str(democracia)
## 'data.frame':    167 obs. of  9 variables:
##  $ country      : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ score        : num  9.87 9.58 9.39 9.26 9.22 9.15 9.15 9.14 9.09 9.03 ...
##  $ electoral    : num  10 10 9.58 10 10 9.58 9.58 10 10 9.58 ...
##  $ functioning  : num  9.64 9.29 9.64 9.29 9.29 7.86 9.64 8.93 8.93 9.29 ...
##  $ participacion: num  10 8.89 8.33 8.89 8.33 8.33 7.78 8.33 7.78 7.78 ...
##  $ culture      : num  10 10 10 8.13 9.38 10 8.75 8.75 8.75 9.38 ...
##  $ civilliber   : num  9.71 9.71 9.41 10 9.12 10 10 9.71 10 9.12 ...
##  $ regimentype  : Ord.factor w/ 4 levels "Authoritarian"<..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ continet     : Factor w/ 6 levels "Africa","Asia",..: 3 3 3 5 3 3 4 3 5 3 ...

Pedimos resumen estadistico

# Per-column statistical summary of the cleaned data.
summary(object = democracia)
##    country              score         electoral       functioning   
##  Length:167         Min.   :1.080   Min.   : 0.000   Min.   :0.000  
##  Class :character   1st Qu.:3.545   1st Qu.: 3.000   1st Qu.:2.860  
##  Mode  :character   Median :5.690   Median : 6.580   Median :5.000  
##                     Mean   :5.479   Mean   : 5.903   Mean   :4.885  
##                     3rd Qu.:7.175   3rd Qu.: 9.170   3rd Qu.:6.790  
##                     Max.   :9.870   Max.   :10.000   Max.   :9.640  
##  participacion      culture         civilliber               regimentype
##  Min.   : 1.11   Min.   : 1.250   Min.   : 0.000   Authoritarian   :53  
##  1st Qu.: 3.89   1st Qu.: 4.380   1st Qu.: 3.530   Hybrid regime   :39  
##  Median : 5.56   Median : 5.630   Median : 5.880   Full democracy  :20  
##  Mean   : 5.25   Mean   : 5.594   Mean   : 5.768   Flawed democracy:55  
##  3rd Qu.: 6.67   3rd Qu.: 6.250   3rd Qu.: 8.240                        
##  Max.   :10.00   Max.   :10.000   Max.   :10.000                        
##           continet 
##  Africa       :50  
##  Asia         :42  
##  Europe       :45  
##  North America:14  
##  Oceania      : 4  
##  South America:12

2) Analizar la relacion entre El score (indice) y el continente

Relación: **Numérica - Categórica**. Debemos determinar si la distribución es NORMAL o no.

library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# Q-Q plot of the score, one panel per continent.
ggqqplot(democracia, x = "score") +
  facet_grid(. ~ continet)

# Model formula: score as a function of continent.
f1 <- formula(score ~ continet)

# Ad-hoc helper: Shapiro-Wilk normality test for one numeric vector.
#
# x: numeric vector (missing values are ignored).
# Returns a length-2 numeric vector: the W statistic (named "W") followed by
# the p-value. When the test cannot be computed, both entries are NA so that a
# single small or constant group does not abort a grouped call such as
# aggregate().
normalidadTest <- function(x) {
  valores <- x[!is.na(x)]
  n <- length(valores)
  # shapiro.test() stops for sample sizes outside 3..5000 and when all values
  # are (numerically) identical; report NA for those cases instead.
  if (n < 3L || n > 5000L || diff(range(valores)) < 1e-10) {
    return(c(W = NA_real_, NA_real_))
  }
  sw <- shapiro.test(valores)
  c(sw$statistic, sw$p.value)
}

# Compute: run the normality helper on the score within each continent.
resultado <- aggregate(f1, data = democracia, FUN = normalidadTest)

# Display: split the two-column test result into named columns and print it
# next to the grouping column.
library(knitr)
shapiroTest <- setNames(
  as.data.frame(resultado[, 2]),
  c("SW_Statistic", "Probabilidad")
)
kable(cbind(resultado[1], shapiroTest))
| continet      | SW_Statistic | Probabilidad |
|:--------------|-------------:|-------------:|
| Africa        |    0.9653422 |    0.1487353 |
| Asia          |    0.9486915 |    0.0579857 |
| Europe        |    0.9370922 |    0.0168389 |
| North America |    0.9740888 |    0.9260364 |
| Oceania       |    0.7752667 |    0.0647579 |
| South America |    0.8433322 |    0.0304021 |

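Como al menos un grupo no es normal (p < 0.05 en Europe y South America), la relación es NO PARAMÉTRICA: usamos la prueba de Kruskal-Wallis.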

# Kruskal-Wallis rank sum test of score across continents.
kruskal.test(f1, data = democracia)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  score by continet
## Kruskal-Wallis chi-squared = 52.932, df = 5, p-value = 3.473e-10

No muestra asteriscos, pero la probabilidad es menor a 0.05: hay diferencias entre continentes. Con un gráfico veremos cuáles son los más diferentes.

# Notched box plots of the score for each continent.
ggplot(democracia, aes(x = continet, y = score)) +
  geom_boxplot(notch = TRUE)
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

3) Analizar la relacion entre Continente y Tipo de Regimen

Relación: CATEGÓRICA-CATEGÓRICA. Construir tabla de contingencia.

# Contingency table: regime type in rows, continent in columns.
columna <- democracia[["continet"]]
fila <- democracia[["regimentype"]]
(t <- table(fila, columna))
##                   columna
## fila               Africa Asia Europe North America Oceania South America
##   Authoritarian        26   20      4             2       0             1
##   Hybrid regime        15    9      9             4       1             1
##   Full democracy        1    0     14             2       2             1
##   Flawed democracy      8   13     18             6       1             9

Mostrando porcentajes:

# Proportions within each column (margin = 2), shown with two decimals.
prop_t <- prop.table(t, margin = 2)
round(prop_t, digits = 2)
##                   columna
## fila               Africa Asia Europe North America Oceania South America
##   Authoritarian      0.52 0.48   0.09          0.14    0.00          0.08
##   Hybrid regime      0.30 0.21   0.20          0.29    0.25          0.08
##   Full democracy     0.02 0.00   0.31          0.14    0.50          0.08
##   Flawed democracy   0.16 0.31   0.40          0.43    0.25          0.75

Uso la función t() para transponer la tabla:

library("gplots")
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Balloon plot of the transposed proportion table. base::t is spelled out
# because a variable named `t` also exists in this script.
balloonplot(
  base::t(prop_t),
  main = "tabla",
  label = TRUE,
  show.margins = FALSE
)

Determinar si hay independencia estadística entre las variables

# Chi-squared test of independence on the contingency table, with a
# simulated p-value.
chisq.test(t, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  t
## X-squared = 64.445, df = NA, p-value = 0.0004998

Si no hay independencia, calcular la intensidad de la asociación

library(oii)

# Association measures between regime type (fila) and continent (columna).
association.measures(fila, columna)
## Chi-square-based measures of association:
##    Phi:                      0.621 
##    Contingency coefficient:  0.528 
##    Cramer's V:               0.359 
## 
## Ordinal measures of association:
##    Total number of pairs:   13861 
##    Concordant pairs:        5865   ( 42.31 %)
##    Discordant pairs:        2071   ( 14.94 %)
##    Tied on first variable:  2686   ( 19.38 %)
##    Tied on second variable: 2131   ( 15.37 %)
##    Tied on both variables:  1108   ( 7.99 %)
## 
##    Goodman-Kruskal Gamma: 0.478 
##    Somers' d (col dep.):  0.377 
##    Kendall's tau-b:       0.367 
##    Stuart's tau-c:        0.363

El coeficiente de contingencia o la V de Cramér sugieren una intensidad relevante cuando son mayores a 0.3

4) Analizar la relacion entre Participacion Politica y todas las otras componentes del indice (no analizar el indice)

Relación: NUMÉRICA-NUMÉRICA. Determinar si las variables numéricas son normales o no.

library(dlookr)
## Loading required package: mice
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
# Normality test for columns 3 to 7 (the five index components).
normality(democracia[, 3:7])
## # A tibble: 5 x 4
##   vars          statistic  p_value sample
##   <chr>             <dbl>    <dbl>  <dbl>
## 1 electoral         0.865 4.14e-11    167
## 2 functioning       0.977 6.52e- 3    167
## 3 participacion     0.981 2.12e- 2    167
## 4 culture           0.964 2.69e- 4    167
## 5 civilliber        0.954 2.87e- 5    167

Calculando correlaciones

library(ggpubr)

# One scatter plot per index component against political participation,
# each annotated with the Spearman correlation coefficient.
p1 <- ggscatter(
  democracia,
  x = "electoral", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)

p2 <- ggscatter(
  democracia,
  x = "functioning", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)

p3 <- ggscatter(
  democracia,
  x = "culture", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)

p4 <- ggscatter(
  democracia,
  x = "civilliber", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)

# Step 1: arrange the four plots on a 2 x 2 grid.
all_ps <- ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

# Step 2: add a common title above the grid.
annotate_figure(
  all_ps,
  top = text_grob(
    "Correlacion con PARTICIPACION",
    color = "blue", face = "bold", size = 14
  )
)

Si no quieres gráficos:

# Participation (column 5) goes first, followed by the other four components.
dataForCor <- democracia[, c(5, 3, 4, 6, 7)]

# Spearman correlation test of each remaining column against participation.
lapply(
  dataForCor[, -1],
  cor.test,
  y = dataForCor[, 1],
  method = "spearman",
  exact = FALSE
)
## $electoral
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 183345, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7637965 
## 
## 
## $functioning
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 219483, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7172393 
## 
## 
## $culture
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 389588, p-value = 7.43e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4980935 
## 
## 
## $civilliber
## 
##  Spearman's rank correlation rho
## 
## data:  X[[i]] and dataForCor[, 1]
## S = 177123, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7718116