Data:
library(htmltab)
# Source page and XPath of the table body to scrape.
link <- "https://en.wikipedia.org/wiki/Democracy_Index"
path <- "//*/div/table[2]/tbody"
# Download the selected HTML table as a data frame.
democracia <- htmltab(doc = link, which = path)
Revisamos los nombres:
# Show the column headers exactly as they were scraped.
colnames(democracia)
## [1] "Rank >> Rank"
## [2] "Country >> Country"
## [3] "Score >> Score"
## [4] "Electoral processand pluralism >> Electoral processand pluralism"
## [5] "Functioning ofgovernment >> Functioning ofgovernment"
## [6] "Politicalparticipation >> Politicalparticipation"
## [7] "Politicalculture >> Politicalculture"
## [8] "Civilliberties >> Civilliberties"
## [9] "Regimetype >> Regimetype"
## [10] "Continent >> Continent"
Mejoramos nombres
# Short column names, given in the same order as the scraped columns.
newN <- c(
  "rank", "country", "score", "electoral", "functioning",
  "participacion", "culture", "civilliber", "regimentype", "continet"
)
# Result:
names(democracia) <- newN
str(democracia)
## 'data.frame': 167 obs. of 10 variables:
## $ rank : chr "1" "2" "3" "4" ...
## $ country : chr "Â Norway" "Â Iceland" "Â Sweden" "Â New Zealand" ...
## $ score : chr "9.87" "9.58" "9.39" "9.26" ...
## $ electoral : chr "10.00" "10.00" "9.58" "10.00" ...
## $ functioning : chr "9.64" "9.29" "9.64" "9.29" ...
## $ participacion: chr "10.00" "8.89" "8.33" "8.89" ...
## $ culture : chr "10.00" "10.00" "10.00" "8.13" ...
## $ civilliber : chr "9.71" "9.71" "9.41" "10.00" ...
## $ regimentype : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
## $ continet : chr "Europe" "Europe" "Europe" "Oceania" ...
Mejorando datos internos:
* Evitamos espacios en blanco
# Strip leading/trailing horizontal and vertical whitespace from every column,
# keeping the data frame structure.
democracia[] <- lapply(democracia, trimws, whitespace = "[\\h\\v]")
# Continent becomes a nominal factor.
democracia$continet <- as.factor(democracia$continet)
# Frequency of each regime type.
table(democracia$regimentype)
##
## Authoritarian Flawed democracy Full democracy Hybrid regime
## 53 55 20 39
Ajusto los niveles de tipos de regimen;
# Regime types listed from least to most democratic. "Flawed democracy" has to
# sit below "Full democracy"; with the two swapped, any ordinal statistic
# computed on this factor treats flawed democracies as the top category.
ordenOK <- c(
  "Authoritarian", "Hybrid regime", "Flawed democracy", "Full democracy"
)
# Convert the regime type into an ordered factor using that scale.
democracia$regimentype <- ordered(democracia$regimentype, levels = ordenOK)
* Convertimos en variable numérica
# Columns 2, 9 and 10 are left out of the numeric conversion; every other
# column is converted with as.numeric.
non_numeric_cols <- c(2, 9, 10)
democracia[, -non_numeric_cols] <- lapply(
  democracia[, -non_numeric_cols],
  as.numeric
)
# List any row that ended up with a missing value.
democracia[!complete.cases(democracia), ]
## [1] rank country score electoral functioning
## [6] participacion culture civilliber regimentype continet
## <0 rows> (or 0-length row.names)
# Remove the rank column, then inspect the resulting structure.
democracia[["rank"]] <- NULL
str(democracia)
## 'data.frame': 167 obs. of 9 variables:
## $ country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ score : num 9.87 9.58 9.39 9.26 9.22 9.15 9.15 9.14 9.09 9.03 ...
## $ electoral : num 10 10 9.58 10 10 9.58 9.58 10 10 9.58 ...
## $ functioning : num 9.64 9.29 9.64 9.29 9.29 7.86 9.64 8.93 8.93 9.29 ...
## $ participacion: num 10 8.89 8.33 8.89 8.33 8.33 7.78 8.33 7.78 7.78 ...
## $ culture : num 10 10 10 8.13 9.38 10 8.75 8.75 8.75 9.38 ...
## $ civilliber : num 9.71 9.71 9.41 10 9.12 10 10 9.71 10 9.12 ...
## $ regimentype : Ord.factor w/ 4 levels "Authoritarian"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ continet : Factor w/ 6 levels "Africa","Asia",..: 3 3 3 5 3 3 4 3 5 3 ...
Pedimos resumen estadistico
# Per-column statistical summary of the cleaned data frame.
summary(object = democracia)
## country score electoral functioning
## Length:167 Min. :1.080 Min. : 0.000 Min. :0.000
## Class :character 1st Qu.:3.545 1st Qu.: 3.000 1st Qu.:2.860
## Mode :character Median :5.690 Median : 6.580 Median :5.000
## Mean :5.479 Mean : 5.903 Mean :4.885
## 3rd Qu.:7.175 3rd Qu.: 9.170 3rd Qu.:6.790
## Max. :9.870 Max. :10.000 Max. :9.640
## participacion culture civilliber regimentype
## Min. : 1.11 Min. : 1.250 Min. : 0.000 Authoritarian :53
## 1st Qu.: 3.89 1st Qu.: 4.380 1st Qu.: 3.530 Hybrid regime :39
## Median : 5.56 Median : 5.630 Median : 5.880 Full democracy :20
## Mean : 5.25 Mean : 5.594 Mean : 5.768 Flawed democracy:55
## 3rd Qu.: 6.67 3rd Qu.: 6.250 3rd Qu.: 8.240
## Max. :10.00 Max. :10.000 Max. :10.000
## continet
## Africa :50
## Asia :42
## Europe :45
## North America:14
## Oceania : 4
## South America:12
Relación: **Numérica - Categórica**. Debemos determinar si la variable numérica se distribuye de forma normal o no en cada grupo.
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# QQ plot of the overall score, one panel per continent.
ggqqplot(democracia, x = "score") +
  facet_grid(. ~ continet)
# Model formula reused by the tests below: score explained by continent.
f1 <- score ~ continet
# Ad-hoc helper: run shapiro.test on `x` and return a length-2 numeric vector
# holding the test statistic followed by the p-value.
normalidadTest <- function(x) {
  sw <- shapiro.test(x)
  c(sw[["statistic"]], sw[["p.value"]])
}
# Compute: apply the normality helper to the score within each continent.
resultado <- aggregate(f1, data = democracia, FUN = normalidadTest)
#mostrando resultado
library(knitr)
# The second column of `resultado` holds the statistic/p-value pairs; turn it
# into a data frame with readable column names.
shapiroTest <- setNames(
  as.data.frame(resultado[, 2]),
  c("SW_Statistic", "Probabilidad")
)
# Render the group column next to the test results as a table.
kable(cbind(resultado[1], shapiroTest))
| continet | SW_Statistic | Probabilidad |
|---|---|---|
| Africa | 0.9653422 | 0.1487353 |
| Asia | 0.9486915 | 0.0579857 |
| Europe | 0.9370922 | 0.0168389 |
| North America | 0.9740888 | 0.9260364 |
| Oceania | 0.7752667 | 0.0647579 |
| South America | 0.8433322 | 0.0304021 |
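Como en algunos grupos (Europe y South America) la probabilidad es menor a 0.05, se rechaza la normalidad: la relación es NO PARAMÉTRICA y usamos la prueba de Kruskal-Wallis.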
# Kruskal-Wallis rank sum test of score across continents.
kruskal.test(f1, data = democracia)
##
## Kruskal-Wallis rank sum test
##
## data: score by continet
## Kruskal-Wallis chi-squared = 52.932, df = 5, p-value = 3.473e-10
No muestra asteriscos, pero la probabilidad es menor a 0.05. Con un gráfico veremos cuáles grupos son los más diferentes.
# Notched boxplots of score by continent.
ggplot(democracia, aes(x = continet, y = score)) +
  geom_boxplot(notch = TRUE)
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
Relacion CATEGORICA-CATEGORICA construir tabla de contingencia
# Rows are regime types, columns are continents.
fila <- democracia[["regimentype"]]
columna <- democracia[["continet"]]
# Build and print the contingency table. The variable is named `t`, which
# shadows base::t as a variable; calls such as t(x) still resolve to the
# function because R skips non-function bindings when looking up a call.
(t <- table(fila, columna))
## columna
## fila Africa Asia Europe North America Oceania South America
## Authoritarian 26 20 4 2 0 1
## Hybrid regime 15 9 9 4 1 1
## Full democracy 1 0 14 2 2 1
## Flawed democracy 8 13 18 6 1 9
Mostrando POrcentajes:
# Column-wise proportions (margin = 2): each continent's column sums to 1.
prop_t <- prop.table(t, margin = 2)
round(prop_t, digits = 2)
## columna
## fila Africa Asia Europe North America Oceania South America
## Authoritarian 0.52 0.48 0.09 0.14 0.00 0.08
## Hybrid regime 0.30 0.21 0.20 0.29 0.25 0.08
## Full democracy 0.02 0.00 0.31 0.14 0.50 0.08
## Flawed democracy 0.16 0.31 0.40 0.43 0.25 0.75
Uso la función t() para transponer la tabla antes de graficarla:
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Balloon plot of the transposed proportion table, with cell labels and
# without marginal totals.
balloonplot(
  t(prop_t),
  main = "tabla",
  label = TRUE,
  show.margins = FALSE
)
Determinar si hay indep entre las variables estadisticas
# Chi-squared test of independence on the contingency table, with the p-value
# obtained by simulation.
chisq.test(t, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: t
## X-squared = 64.445, df = NA, p-value = 0.0004998
Si no hay indep, calcular Intensidad
library(oii)
# Association measures between `fila` (regime type) and `columna` (continent).
association.measures(fila,columna)
## Chi-square-based measures of association:
## Phi: 0.621
## Contingency coefficient: 0.528
## Cramer's V: 0.359
##
## Ordinal measures of association:
## Total number of pairs: 13861
## Concordant pairs: 5865 ( 42.31 %)
## Discordant pairs: 2071 ( 14.94 %)
## Tied on first variable: 2686 ( 19.38 %)
## Tied on second variable: 2131 ( 15.37 %)
## Tied on both variables: 1108 ( 7.99 %)
##
## Goodman-Kruskal Gamma: 0.478
## Somers' d (col dep.): 0.377
## Kendall's tau-b: 0.367
## Stuart's tau-c: 0.363
El coeficiente de contingencia o el V de Cramér sugieren una intensidad relevante cuando son mayores a 0.3
Relacion: NUMERICA-NUMERICA determinar si la variable num es normal o no
library(dlookr)
## Loading required package: mice
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
# Normality check on the columns at positions 3 to 7 of the data frame.
normality(democracia[, 3:7])
## # A tibble: 5 x 4
## vars statistic p_value sample
## <chr> <dbl> <dbl> <dbl>
## 1 electoral 0.865 4.14e-11 167
## 2 functioning 0.977 6.52e- 3 167
## 3 participacion 0.981 2.12e- 2 167
## 4 culture 0.964 2.69e- 4 167
## 5 civilliber 0.954 2.87e- 5 167
Calculando correlaciones
library(ggpubr)
# One scatter plot per sub-index against "participacion", each annotated with
# the Spearman correlation coefficient.
p1 <- ggscatter(
  democracia,
  x = "electoral", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)
p2 <- ggscatter(
  democracia,
  x = "functioning", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)
p3 <- ggscatter(
  democracia,
  x = "culture", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)
p4 <- ggscatter(
  democracia,
  x = "civilliber", y = "participacion",
  cor.coef = TRUE, cor.method = "spearman"
)
# Step 1: arrange the four plots in a 2 x 2 grid.
all_ps <- ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
# Step 2: add a shared title above the grid.
annotate_figure(
  all_ps,
  top = text_grob(
    "Correlacion con PARTICIPACION",
    color = "blue", face = "bold", size = 14
  )
)
Si no quieres graficos :
# Reorder so that column 5 of `democracia` comes first; it is the common `y`
# variable in every test below.
dataForCor=democracia[,c(5,3,4,6,7)]
# Run cor.test (Spearman) for each remaining column against the first one.
# The argument expressions are kept exactly as written because the "data:"
# label printed by cor.test is built from them.
lapply(dataForCor[,-1],cor.test,y=dataForCor[,1],method="spearman",exact=FALSE)
## $electoral
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 183345, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7637965
##
##
## $functioning
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 219483, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7172393
##
##
## $culture
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 389588, p-value = 7.43e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4980935
##
##
## $civilliber
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 177123, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7718116