Data a utilizar:
library(htmltab)
# Source page and XPath of the table to scrape from it.
link <- "https://en.wikipedia.org/wiki/Democracy_Index"
path <- '//*[@id="mw-content-text"]/div/table[2]'
# Fetch the page and parse the selected table into a data frame.
demo <- htmltab(doc = link, which = path)
Revisando los nombres:
# Inspect the column names exactly as they came out of the scraped table.
names(demo)
## [1] "Rank >> Rank"
## [2] "Country >> Country"
## [3] "Score >> Score"
## [4] "ElecÂtoral proÂcessand pluraÂlism >> ElecÂtoral proÂcessand pluraÂlism"
## [5] "FunctioÂning ofgovernÂment >> FunctioÂning ofgovernÂment"
## [6] "PoliÂticalparticiÂpation >> PoliÂticalparticiÂpation"
## [7] "PoliÂticalculture >> PoliÂticalculture"
## [8] "CivilliberÂties >> CivilliberÂties"
## [9] "Regimetype >> Regimetype"
## [10] "ContiÂnent >> ContiÂnent"
Mejorando nombres:
# Short replacement names for the ten scraped columns, in column order.
newNames <- c(
  "rank", "country", "score", "electoral", "functioning",
  "participation", "culture", "civilliber", "regimetype", "continent"
)
# Apply the replacement names to the data frame.
names(demo) <- newNames
Revisando tipo de datos:
# Check the storage type of every column.
str(demo)
## 'data.frame': 167 obs. of 10 variables:
## $ rank : chr "1" "2" "3" "4" ...
## $ country : chr "Â Norway" "Â Iceland" "Â Sweden" "Â New Zealand" ...
## $ score : chr "9.87" "9.58" "9.39" "9.26" ...
## $ electoral : chr "10.00" "10.00" "9.58" "10.00" ...
## $ functioning : chr "9.64" "9.29" "9.64" "9.29" ...
## $ participation: chr "10.00" "8.89" "8.33" "8.89" ...
## $ culture : chr "10.00" "10.00" "10.00" "8.13" ...
## $ civilliber : chr "9.71" "9.71" "9.41" "10.00" ...
## $ regimetype : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
## $ continent : chr "Europe" "Europe" "Europe" "Oceania" ...
Mejorando datos con problemas de formato:
# Values that arrive as text may carry stray blanks: strip leading and
# trailing horizontal/vertical whitespace from every column.
demo[] <- lapply(demo, trimws, whitespace = "[\\h\\v]")
# Store continent as a categorical (factor) variable.
demo$continent <- as.factor(demo$continent)
Convirtiendo en variable categórica ordinal:
Viendo niveles (levels):
# Count the rows per regime type; the labels shown are the candidate levels.
table(demo$regimetype)
##
## Authoritarian Flawed democracy Full democracy Hybrid regime
## 53 55 20 39
# Level order to impose on the ordinal regime-type factor.
ordenOK <- c(
  "Authoritarian", "Hybrid regime", "Flawed democracy", "Full democracy"
)
demo$regimetype <- factor(demo$regimetype, levels = ordenOK, ordered = TRUE)
# Convert every column except country (2), regimetype (9) and continent (10)
# to numeric; text that is not a plain number becomes NA with a warning.
demo[, -c(2, 9, 10)] <- lapply(demo[, -c(2, 9, 10)], as.numeric)
## Warning in lapply(demo[, -c(2, 9, 10)], as.numeric): NAs introduced by
## coercion
# These are the affected rows: every row with at least one NA.
demo[!complete.cases(demo),]
## rank country score electoral functioning participation culture
## 24 NA Chile 7.97 9.58 8.57 4.44 8.13
## 25 NA Estonia 7.97 9.58 8.21 6.67 6.88
## 37 NA Slovenia 7.50 9.58 6.79 6.67 6.25
## 38 NA Lithuania 7.50 9.58 6.43 6.11 6.25
## 48 NA Argentina 7.02 9.17 5.36 6.11 6.25
## 49 NA Jamaica 7.02 8.75 7.14 4.44 6.25
## 55 NA Poland 6.67 9.17 6.07 6.11 4.38
## 56 NA Guyana 6.67 9.17 5.71 6.11 5.00
## 58 NA Ghana 6.63 8.33 5.71 6.67 6.25
## 59 NA Hungary 6.63 8.75 6.07 5.00 6.25
## 64 NA Serbia 6.41 8.25 5.36 6.11 5.00
## 65 NA Tunisia 6.41 6.42 5.71 7.78 6.25
## 67 NA Singapore 6.38 4.33 7.86 6.11 6.25
## 68 NA Romania 6.38 9.17 5.71 5.00 4.38
## 72 NA Sri Lanka 6.19 7.83 5.71 5.00 6.25
## 73 NA Mexico 6.19 8.33 6.07 7.22 3.13
## 74 NA Hong Kong 6.15 3.08 6.07 5.56 7.50
## 75 NA Senegal 6.15 7.50 6.07 4.44 6.25
## 80 NA Moldova 5.85 7.08 4.64 6.11 4.38
## 81 NA Fiji 5.85 6.58 5.36 6.11 5.63
## 82 NA Montenegro 5.74 6.08 5.36 6.11 4.38
## 83 NA Benin 5.74 6.50 5.71 5.00 5.63
## 92 NA Tanzania 5.41 7.00 5.00 5.00 5.63
## 93 NA Mali 5.41 7.42 3.93 3.89 5.63
## 99 NA Kenya 5.11 3.50 5.36 6.67 5.63
## 100 NA Kyrgyzstan 5.11 6.58 2.93 6.67 4.38
## 107 NA Lebanon 4.63 3.92 2.21 6.67 5.63
## 108 NA Thailand 4.63 3.00 4.29 5.00 5.00
## 117 NA Mozambique 3.85 3.58 2.14 5.00 5.00
## 118 NA Kuwait 3.85 3.17 4.29 3.89 4.38
## 129 NA Ethiopia 3.35 0.00 3.57 5.56 5.00
## 130 NA Rwanda 3.35 1.67 5.00 2.78 4.38
## 135 NA Zimbabwe 3.16 0.50 2.00 4.44 5.63
## 136 NA Venezuela 3.16 1.67 1.79 4.44 4.38
## 145 NA Kazakhstan 2.94 0.50 2.14 4.44 4.38
## 146 NA Russia 2.94 2.17 1.79 5.00 2.50
## 152 NA Eritrea 2.37 0.00 2.14 1.67 6.88
## 153 NA Laos 2.37 0.83 2.86 1.67 5.00
## 160 NA Saudi Arabia 1.93 0.00 2.86 2.22 3.13
## 161 NA Tajikistan 1.93 0.08 0.79 1.67 6.25
## civilliber regimetype continent
## 24 9.12 Flawed democracy South America
## 25 8.53 Flawed democracy Europe
## 37 8.24 Flawed democracy Europe
## 38 9.12 Flawed democracy Europe
## 48 8.24 Flawed democracy South America
## 49 8.53 Flawed democracy North America
## 55 7.65 Flawed democracy Europe
## 56 7.35 Flawed democracy South America
## 58 6.18 Flawed democracy Africa
## 59 7.06 Flawed democracy Europe
## 64 7.35 Flawed democracy Europe
## 65 5.88 Flawed democracy Africa
## 67 7.35 Flawed democracy Asia
## 68 7.65 Flawed democracy Europe
## 72 6.18 Flawed democracy Asia
## 73 6.18 Flawed democracy North America
## 74 8.53 Flawed democracy Asia
## 75 6.47 Flawed democracy Africa
## 80 7.06 Hybrid regime Europe
## 81 5.59 Hybrid regime Oceania
## 82 6.76 Hybrid regime Europe
## 83 5.88 Hybrid regime Africa
## 92 4.41 Hybrid regime Africa
## 93 6.18 Hybrid regime Africa
## 99 4.41 Hybrid regime Africa
## 100 5.00 Hybrid regime Asia
## 107 4.71 Hybrid regime Asia
## 108 5.88 Hybrid regime Asia
## 117 3.53 Authoritarian Africa
## 118 3.53 Authoritarian Asia
## 129 2.65 Authoritarian Africa
## 130 2.94 Authoritarian Africa
## 135 3.24 Authoritarian Africa
## 136 3.53 Authoritarian South America
## 145 3.24 Authoritarian Europe
## 146 3.24 Authoritarian Europe
## 152 1.18 Authoritarian Africa
## 153 1.47 Authoritarian Asia
## 160 1.47 Authoritarian Asia
## 161 0.88 Authoritarian Asia
# Drop the rank column, then summarise the remaining variables.
demo[["rank"]] <- NULL
summary(demo)
## country score electoral functioning
## Length:167 Min. :1.080 Min. : 0.000 Min. :0.000
## Class :character 1st Qu.:3.545 1st Qu.: 3.000 1st Qu.:2.860
## Mode :character Median :5.690 Median : 6.580 Median :5.000
## Mean :5.479 Mean : 5.903 Mean :4.885
## 3rd Qu.:7.175 3rd Qu.: 9.170 3rd Qu.:6.790
## Max. :9.870 Max. :10.000 Max. :9.640
## participation culture civilliber regimetype
## Min. : 1.11 Min. : 1.250 Min. : 0.000 Authoritarian :53
## 1st Qu.: 3.89 1st Qu.: 4.380 1st Qu.: 3.530 Hybrid regime :39
## Median : 5.56 Median : 5.630 Median : 5.880 Flawed democracy:55
## Mean : 5.25 Mean : 5.594 Mean : 5.768 Full democracy :20
## 3rd Qu.: 6.67 3rd Qu.: 6.250 3rd Qu.: 8.240
## Max. :10.00 Max. :10.000 Max. :10.000
## continent
## Africa :50
## Asia :42
## Europe :45
## North America:14
## Oceania : 4
## South America:12
Determinando tipo de relación: A partir del resumen estadístico se determina que es Numérica - Categórica
Determinando si la variable numérica se comporta de manera normal:
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# Q-Q plot of score, one panel per continent.
ggqqplot(data = demo, x = "score") +
  facet_grid(. ~ continent)
# Formula describing score grouped by continent.
f1 <- score ~ continent
# Ad-hoc helper: Shapiro-Wilk normality test for one group of values.
#
# x: numeric vector (missing values are ignored by shapiro.test()).
# Returns a length-2 numeric vector: the W statistic followed by the p-value.
#
# shapiro.test() only accepts between 3 and 5000 non-missing values and stops
# with an error otherwise, which would abort a grouped aggregate() call.
# Groups outside that range therefore yield NA for both entries instead.
normalidadTest <- function(x) {
  n_ok <- sum(!is.na(x))
  if (n_ok < 3L || n_ok > 5000L) {
    return(c(W = NA_real_, NA_real_))
  }
  y <- shapiro.test(x)
  c(y$statistic, y$p.value)
}
# Apply the normality helper to score within each continent.
resultado <- aggregate(f1, data = demo, FUN = normalidadTest)
# mostrando resultado
library(knitr)
# The second column of resultado holds the helper's two values per group;
# expand it into a data frame with descriptive column names.
shapiroTest <- setNames(
  as.data.frame(resultado[, 2]),
  c("SW_Statistic", "Probabilidad")
)
# Print the group column next to its test results as a formatted table.
kable(cbind(resultado[1], shapiroTest))
| continent | SW_Statistic | Probabilidad |
|---|---|---|
| Africa | 0.9653422 | 0.1487353 |
| Asia | 0.9486915 | 0.0579857 |
| Europe | 0.9370922 | 0.0168389 |
| North America | 0.9740888 | 0.9260364 |
| Oceania | 0.7752667 | 0.0647579 |
| South America | 0.8433322 | 0.0304021 |
# Kruskal-Wallis rank sum test of score across continents.
kruskal.test(f1, data = demo)
##
## Kruskal-Wallis rank sum test
##
## data: score by continent
## Kruskal-Wallis chi-squared = 52.932, df = 5, p-value = 3.473e-10
Aquí no muestra asteriscos, pero la probabilidad (p-value) es también menor a 0.05.
Visualmente, para saber cuál es diferente a los demás:
# Notched boxplots of score by continent. TRUE is spelled out because T is an
# ordinary variable that can be reassigned, whereas TRUE is a reserved word.
ggplot(data = demo, aes(x = continent, y = score)) +
  geom_boxplot(notch = TRUE)
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
Determinando tipo de relación: A partir del resumen estadístico se determina que es Categórica - Categórica
Construir tabla de contingencia:
# Cross-tabulate regime type (rows) against continent (columns).
columna <- demo$continent
fila <- demo$regimetype
# NOTE: the table is stored under the name `t`, the same name as base R's
# transpose function; the surrounding parentheses print it on assignment.
(t <- table(fila, columna))
## columna
## fila Africa Asia Europe North America Oceania South America
## Authoritarian 26 20 4 2 0 1
## Hybrid regime 15 9 9 4 1 1
## Flawed democracy 8 13 18 6 1 9
## Full democracy 1 0 14 2 2 1
Mostrar porcentajes:
# Column-wise proportions: each column sums to 1 (rows do not).
prop_t <- prop.table(t, margin = 2)
round(prop_t, digits = 2)
## columna
## fila Africa Asia Europe North America Oceania South America
## Authoritarian 0.52 0.48 0.09 0.14 0.00 0.08
## Hybrid regime 0.30 0.21 0.20 0.29 0.25 0.08
## Flawed democracy 0.16 0.31 0.40 0.43 0.25 0.75
## Full democracy 0.02 0.00 0.31 0.14 0.50 0.08
library("gplots")
# Note the call to t(): it transposes the proportions table before plotting.
# TRUE is spelled out because T is an ordinary, reassignable variable.
balloonplot(t(prop_t),
  main = "tabla",
  label = TRUE, show.margins = FALSE
)
# Pearson chi-squared test on the contingency table with a simulated p-value.
# TRUE is spelled out because T is an ordinary, reassignable variable.
chisq.test(t, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: t
## X-squared = 64.445, df = NA, p-value = 0.0004998
library(oii)
# Association measures between the row (regime type) and column (continent)
# variables of the contingency table.
association.measures(fila, columna)
## Chi-square-based measures of association:
## Phi: 0.621
## Contingency coefficient: 0.528
## Cramer's V: 0.359
##
## Ordinal measures of association:
## Total number of pairs: 13861
## Concordant pairs: 6064 ( 43.75 %)
## Discordant pairs: 1872 ( 13.51 %)
## Tied on first variable: 2686 ( 19.38 %)
## Tied on second variable: 2131 ( 15.37 %)
## Tied on both variables: 1108 ( 7.99 %)
##
## Goodman-Kruskal Gamma: 0.528
## Somers' d (col dep.): 0.416
## Kendall's tau-b: 0.405
## Stuart's tau-c: 0.401
El coeficiente de contingencia o la V de Cramer sugieren una intensidad relevante (mayor a 0.3).
Determinando tipo de relación: A partir del resumen estadístico se determina que es Numérica - Numérica
Determinando si la variable numérica se comporta de manera normal:
library(dlookr)
# Normality check on columns 3 to 7 of demo.
normality(demo[, 3:7])
## # A tibble: 5 x 4
## vars statistic p_value sample
## <chr> <dbl> <dbl> <dbl>
## 1 electoral 0.865 4.14e-11 167
## 2 functioning 0.977 6.52e- 3 167
## 3 participation 0.981 2.12e- 2 167
## 4 culture 0.964 2.69e- 4 167
## 5 civilliber 0.954 2.87e- 5 167
library(ggpubr)
# Scatter plot of one demo column against participation, annotated with the
# Spearman correlation coefficient.
scatter_vs_participation <- function(x_var) {
  ggscatter(demo,
    x = x_var, y = "participation",
    cor.coef = TRUE,
    cor.method = "spearman"
  )
}

p1 <- scatter_vs_participation("electoral")
p2 <- scatter_vs_participation("functioning")
p3 <- scatter_vs_participation("culture")
p4 <- scatter_vs_participation("civilliber")

# Step 1: lay the four plots out on a 2 x 2 grid.
all_ps <- ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

# Step 2: add a shared title above the grid.
annotate_figure(all_ps,
  top = text_grob("Correlacion con PARTICIPATION",
    color = "blue", face = "bold", size = 14
  )
)
Sin los gráficos lo puedes ver así:
# Participation goes first, followed by the four columns to correlate with it.
dataForCor <- demo[, c(
  "participation", "electoral", "functioning", "culture", "civilliber"
)]
# Disabled in the original: a single call over all columns at once.
# cor.test(dataForCor[, -1], dataForCor[, 1], method = "spearman")
# Instead, run one Spearman test per column against participation.
lapply(dataForCor[, -1], cor.test,
  y = dataForCor[, 1], method = "spearman", exact = FALSE
)
## $electoral
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 183345, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7637965
##
##
## $functioning
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 219483, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7172393
##
##
## $culture
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 389588, p-value = 7.43e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4980935
##
##
## $civilliber
##
## Spearman's rank correlation rho
##
## data: X[[i]] and dataForCor[, 1]
## S = 177123, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7718116