1.1 Carga de datos Para esta etapa vamos a proceder a scrapear las dos páginas web, con la ayuda de la biblioteca htmltab. La descarga utilizará el xpath de la tabla de interés.
rm(list = ls())
library(htmltab)
library(rio)
# links
WhereDEMO=list(page="https://en.wikipedia.org/wiki/Democracy_Index",
xpath='/html/body/div[3]/div[3]/div[5]/div[1]/table[6]/tbody')
IDH="https://github.com/CarlosChavarri23/CarlosChavarri23/blob/main/HDR%202.0.xlsx?raw=true"
Idh=import(IDH)
demo = htmltab(doc = WhereDEMO$page,
which = WhereDEMO$xpath,
encoding = "UTF-8")1.2 Limpieza
#Nos quesamos con los que nos interesa
demo=demo[,-c(2,6)]
Idh=Idh[,-c(8)]Los nombres en ambas datas presentaban serios problemas. Se hicieron algunos cambios para que sea más práctico y entendible la realización de pruebas.
newNames=c('Rank.Idh','Country','Score.Idh','Lifeexpectancy','SchoolExpected','SchoolMean','GNI')
names(Idh)=newNamesnewNames1=c('Rank.Dem','Country','Type','Score.Dem','Electprocess','Fgovernment','Polparticipation','Polculture','Civilliberties')
names(demo)=newNames1Se incluyeron estos código sin embargo no eran necesarios pues los nombres de la misma se mantuvieron en inglés.
# en DEMO
## Capitalizar
library(stringr)
names(demo)=str_to_title(names(demo))## sin tildes ni ñs.
library(stringi)
names(demo)=stri_trans_general(str = names(demo),
id = "Latin-ASCII")## sin espacios
names(demo)=gsub(" ","",names(demo))Valores en las celdas: Por lo general, hay que asegurarse que no haya espacios en blanco ni al inicio ni al final de cada valor en una celda.
Idh[,]=lapply(Idh[,], trimws,whitespace = "[\\h\\v]")
demo[,]=lapply(demo[,], trimws,whitespace = "[\\h\\v]")FORMATEO
Hablamos de formateo cuando buscamos que los valores de cada celda estén el correcto tipo de dato. Para ello debemos primero ver qué tipo ha sido asignado por R.
str(Idh)## 'data.frame': 222 obs. of 7 variables:
## $ Rank.Idh : chr "1" "2" "3" "4" ...
## $ Country : chr "Switzerland" "Norway" "Iceland" "Hong Kong, China (SAR)" ...
## $ Score.Idh : chr "0.96199999999999997" "0.96099999999999997" "0.95899999999999996" "0.95199999999999996" ...
## $ Lifeexpectancy: chr "83.9872" "83.2339" "82.6782" "85.4734" ...
## $ SchoolExpected: chr "16.50029945" "18.185199740000002" "19.163059230000002" "17.278169630000001" ...
## $ SchoolMean : chr "13.85966015" "13.00362968" "13.76716995" "12.22620964" ...
## $ GNI : chr "66933.004539999994" "64660.106220000001" "55782.049809999997" "62606.845399999998" ...
str(demo)## 'data.frame': 167 obs. of 9 variables:
## $ Rank.dem : chr "1" "2" "3" "4" ...
## $ Country : chr "Norway" "New Zealand" "Finland" "Sweden" ...
## $ Type : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
## $ Score.dem : chr "9.75" "9.37" "9.27" "9.26" ...
## $ Electprocess : chr "10.00" "10.00" "10.00" "9.58" ...
## $ Fgovernment : chr "9.64" "8.93" "9.29" "9.29" ...
## $ Polparticipation: chr "10.00" "9.44" "8.89" "8.33" ...
## $ Polculture : chr "10.00" "8.75" "8.75" "10.00" ...
## $ Civilliberties : chr "9.12" "9.71" "9.41" "9.12" ...
Conversión a tipo numérico: Vemos que muchos valores que deberian ser numéricos han sido reconocidos como texto. Normalmente eso sucede pues hay caracteres dentro del numero que evitan que se lea adecuadamente. Luego de esa corrección recién se puede cambiar el tipo.
# eliminar coma en los miles:
Idh$GNI=gsub(',','',Idh$GNI)
# ahora a numerico
Idh[,-2]=lapply(Idh[,-2], as.numeric)## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion
## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion
## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion
## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion
# cambiar coma en los decimales:
demo[,-c(2,3)]=lapply(demo[,-c(2,3)],
function(x){gsub(",",".",x)})
# ahora a numerico
demo$Rank.dem=as.numeric(demo$Rank)
demo$Score.dem=as.numeric(demo$Score)
demo$Electprocess=as.numeric(demo$Electprocess)
demo$Fgovernment=as.numeric(demo$Fgovernment)
demo$Polparticipation=as.numeric(demo$Polparticipation)
demo$Polculture=as.numeric(demo$Polculture)
demo$Civilliberties=as.numeric(demo$Civilliberties)Luego de pasar a tipo numérico, las celdas que no tenían un valor numérico adecuado se convirtieron en NAs. Aquí hay que revisar las filas donde eso se generó
1.4 Merge
setdiff(demo$Country,Idh$Country)## [1] "Taiwan" "South Korea"
## [3] "Czech Republic" "Cape Verde"
## [5] "East Timor" "Moldova"
## [7] "Hong Kong" "Tanzania"
## [9] "Bolivia" "Turkey"
## [11] "Ivory Coast" "Palestine"
## [13] "Russia" "Eswatini"
## [15] "Vietnam" "Republic of the Congo"
## [17] "Venezuela" "Iran"
## [19] "Laos" "Syria"
## [21] "Democratic Republic of the Congo" "North Korea"
setdiff(Idh$Country,demo$Country)## [1] "Hong Kong, China (SAR)" "Liechtenstein"
## [3] "Korea (Republic of)" "Czechia"
## [5] "Andorra" "San Marino"
## [7] "Türkiye" "Brunei Darussalam"
## [9] "Russian Federation" "Bahamas"
## [11] "Grenada" "Barbados"
## [13] "Antigua and Barbuda" "Seychelles"
## [15] "Saint Kitts and Nevis" "Iran (Islamic Republic of)"
## [17] "Moldova (Republic of)" "Palau"
## [19] "Saint Vincent and the Grenadines" "Maldives"
## [21] "Tonga" "Dominica"
## [23] "Palestine, State of" "Saint Lucia"
## [25] "Samoa" "Viet Nam"
## [27] "Bolivia (Plurinational State of)" "Venezuela (Bolivarian Republic of)"
## [29] "Belize" "Cabo Verde"
## [31] "Tuvalu" "Marshall Islands"
## [33] "Micronesia (Federated States of)" "Kiribati"
## [35] "Sao Tome and Principe" "Lao People's Democratic Republic"
## [37] "Timor-Leste" "Vanuatu"
## [39] "Eswatini (Kingdom of)" "Syrian Arab Republic"
## [41] "Congo" "Solomon Islands"
## [43] "Côte d'Ivoire" "Tanzania (United Republic of)"
## [45] "Congo (Democratic Republic of the)" "South Sudan"
Se puede corroborar que sí hay valores que pueden ser corregidos.
……………………………………………………….
Idh[Idh$Country=='Hong Kong, China (SAR)','Country']='Hong Kong'
Idh[Idh$Country=='Iran (Islamic Republic of)','Country']='Iran'
Idh[Idh$Country=='Venezuela (Bolivarian Republic of)','Country']='Venezuela'
Idh[Idh$Country=='Russian Federation','Country']='Russia'
Idh[Idh$Country=='Congo','Country']='Republic of the Congo'
Idh[Idh$Country=='Congo (Democratic Republic of the)','Country']='Democratic Republic of the Congo'
Idh[Idh$Country=='Bolivia (Plurinational State of)','Country']='Bolivia'
Idh[Idh$Country=='Korea (Republic of)','Country']='South Korea'
Idh[Idh$Country=='Timor-Leste','Country']='East Timor'
Idh[Idh$Country=='Eswatini (Kingdom of)','Country']='Eswatini'
Idh[Idh$Country=='Czechia','Country']='Czech Republic'
Idh[Idh$Country=='Moldova (Republic of)','Country']='Moldova'
Idh[Idh$Country=='Viet Nam','Country']='Vietnam'Este proceso me demandó más tiempo del que pensé.
Ahora si hay más comodidad para hacer el merge:
idhdemo=merge(Idh,demo,by="Country")
head(idhdemo)## Country Rank.Idh Score.Idh Lifeexpectancy SchoolExpected SchoolMean
## 1 Afghanistan 180 0.478 61.9824 10.26384 2.985070
## 2 Albania 67 0.796 76.4626 14.44800 11.286455
## 3 Algeria 91 0.745 76.3767 14.62690 8.069284
## 4 Angola 148 0.586 61.6434 12.17210 5.417391
## 5 Argentina 47 0.842 75.3899 17.87487 11.147269
## 6 Armenia 85 0.759 72.0431 13.11676 11.330300
## GNI Rank.dem Type Score.dem Electprocess Fgovernment
## 1 1824.191 167 Authoritarian 0.32 0.00 0.07
## 2 14131.110 68 Flawed democracy 6.11 7.00 6.43
## 3 10800.225 113 Authoritarian 3.77 3.08 2.50
## 4 5465.618 122 Authoritarian 3.37 1.33 2.86
## 5 20925.268 50 Flawed democracy 6.81 9.17 5.00
## 6 13157.994 89 Hybrid regime 5.49 7.50 5.71
## Polparticipation Polculture Civilliberties
## 1 0.00 1.25 0.29
## 2 4.44 5.63 7.06
## 3 4.44 5.00 3.82
## 4 5.00 5.00 2.65
## 5 7.22 5.00 7.65
## 6 6.11 3.13 5.00
library(rio)
str(idhdemo)## 'data.frame': 158 obs. of 15 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ Rank.Idh : num 180 67 91 148 47 85 5 25 91 35 ...
## $ Score.Idh : num 0.478 0.796 0.745 0.586 0.842 0.759 0.951 0.916 0.745 0.875 ...
## $ Lifeexpectancy : num 62 76.5 76.4 61.6 75.4 ...
## $ SchoolExpected : num 10.3 14.4 14.6 12.2 17.9 ...
## $ SchoolMean : num 2.99 11.29 8.07 5.42 11.15 ...
## $ GNI : num 1824 14131 10800 5466 20925 ...
## $ Rank.dem : num 167 68 113 122 50 89 9 20 141 144 ...
## $ Type : chr "Authoritarian" "Flawed democracy" "Authoritarian" "Authoritarian" ...
## $ Score.dem : num 0.32 6.11 3.77 3.37 6.81 5.49 8.9 8.07 2.68 2.52 ...
## $ Electprocess : num 0 7 3.08 1.33 9.17 7.5 10 9.58 0.5 0.42 ...
## $ Fgovernment : num 0.07 6.43 2.5 2.86 5 5.71 8.57 6.79 2.5 2.71 ...
## $ Polparticipation: num 0 4.44 4.44 5 7.22 6.11 7.78 8.89 2.78 3.33 ...
## $ Polculture : num 1.25 5.63 5 5 5 3.13 8.75 6.88 5 4.38 ...
## $ Civilliberties : num 0.29 7.06 3.82 2.65 7.65 5 9.41 8.24 2.65 1.76 ...
export(idhdemo,"DATA.csv")Luego de haber realizado la limpieza previa…..Viene propiamente el Análisis Factorial Exploratorio
library(rio)
link="https://github.com/CarlosChavarri23/An-lsis-Factorial-/raw/main/DATA.csv"
idhdemo=import(link)dontselect=c("Country","Rank.dem","Rank.Idh","Score.Idh","Score.dem", "Type")
select=setdiff(names(idhdemo),dontselect)
theData=idhdemo[,select]str(theData)## 'data.frame': 158 obs. of 9 variables:
## $ Lifeexpectancy : num 62 76.5 76.4 61.6 75.4 ...
## $ SchoolExpected : num 10.3 14.4 14.6 12.2 17.9 ...
## $ SchoolMean : num 2.99 11.29 8.07 5.42 11.15 ...
## $ GNI : num 1824 14131 10800 5466 20925 ...
## $ Electprocess : num 0 7 3.08 1.33 9.17 7.5 10 9.58 0.5 0.42 ...
## $ Fgovernment : num 0.07 6.43 2.5 2.86 5 5.71 8.57 6.79 2.5 2.71 ...
## $ Polparticipation: num 0 4.44 4.44 5 7.22 6.11 7.78 8.89 2.78 3.33 ...
## $ Polculture : num 1.25 5.63 5 5 5 3.13 8.75 6.88 5 4.38 ...
## $ Civilliberties : num 0.29 7.06 3.82 2.65 7.65 5 9.41 8.24 2.65 1.76 ...
library(polycor)
corMatrix=polycor::hetcor(theData)$correlationslibrary(ggcorrplot)## Loading required package: ggplot2
ggcorrplot(corMatrix)library(psych)##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:polycor':
##
## polyserial
psych::KMO(corMatrix) ## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA = 0.89
## MSA for each item =
## Lifeexpectancy SchoolExpected SchoolMean GNI
## 0.90 0.90 0.89 0.89
## Electprocess Fgovernment Polparticipation Polculture
## 0.82 0.96 0.96 0.88
## Civilliberties
## 0.86
cortest.bartlett(corMatrix,n=nrow(theData))$p.value>0.05## [1] FALSE
library(matrixcalc)library(matrixcalc)
is.singular.matrix(corMatrix)## [1] FALSE
fa.parallel(theData, fa = 'fa',correct = T,plot = F)## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
library(GPArotation)
resfa <- fa(theData,
nfactors = 2,
cor = 'mixed',
rotate = "varimax",
fm="minres")
print(resfa$loadings)##
## Loadings:
## MR1 MR2
## Lifeexpectancy 0.308 0.858
## SchoolExpected 0.397 0.812
## SchoolMean 0.302 0.793
## GNI 0.303 0.801
## Electprocess 0.909 0.227
## Fgovernment 0.763 0.489
## Polparticipation 0.773 0.338
## Polculture 0.511 0.415
## Civilliberties 0.908 0.331
##
## MR1 MR2
## SS loadings 3.528 3.350
## Proportion Var 0.392 0.372
## Cumulative Var 0.392 0.764
print(resfa$loadings,cutoff = 0.5)##
## Loadings:
## MR1 MR2
## Lifeexpectancy 0.858
## SchoolExpected 0.812
## SchoolMean 0.793
## GNI 0.801
## Electprocess 0.909
## Fgovernment 0.763
## Polparticipation 0.773
## Polculture 0.511
## Civilliberties 0.908
##
## MR1 MR2
## SS loadings 3.528 3.350
## Proportion Var 0.392 0.372
## Cumulative Var 0.392 0.764
fa.diagram(resfa,main = "Resultados del EFA")sort(resfa$communality)## Polculture Polparticipation SchoolMean GNI
## 0.4329945 0.7113670 0.7199503 0.7329995
## SchoolExpected Fgovernment Lifeexpectancy Electprocess
## 0.8167403 0.8210855 0.8306177 0.8776938
## Civilliberties
## 0.9341848
sort(resfa$complexity)## Electprocess Lifeexpectancy Civilliberties GNI
## 1.124793 1.254550 1.260408 1.281374
## SchoolMean Polparticipation SchoolExpected Fgovernment
## 1.284787 1.368459 1.453542 1.702795
## Polculture
## 1.918847
library(magrittr)
as.data.frame(resfa$scores)%>%head()## MR1 MR2
## 1 -1.7977793 -0.83798415
## 2 0.3324736 0.27861886
## 3 -0.8749699 0.43423600
## 4 -0.7793731 -0.64292948
## 5 0.6584106 0.45134811
## 6 0.1048445 -0.05519384
library(ggplot2)idhdemo$demos_efa=resfa$scores[,1]
idhdemo$desahu_efa=resfa$scores[,2]
ggplot(data=idhdemo,aes(x=Score.dem,y=demos_efa)) + geom_point() + theme_minimal() + labs(x="Indice de Democracia (original)", y="Indice de Democracia EFA")library(BBmisc)##
## Attaching package: 'BBmisc'
## The following object is masked from 'package:base':
##
## isFALSE
efa_scores_ok=normalize(resfa$scores,
method = "range",
margin=2, # by column
range = c(0, 10))
idhdemo$demos_efa_ok=efa_scores_ok[,1]
idhdemo$desahu_efa_ok=efa_scores_ok[,2]
ggplot(data=idhdemo,aes(x=Score.dem,y=demos_efa_ok)) + geom_point() + theme_minimal() + labs(x="Indice de Democracia (original)", y="Indice de Democracia EFA (cambiado)")Análisis Factorial Exploratorio:
modelCFA <- ' democracia =~ Electprocess + Fgovernment + Polparticipation + Polculture + Civilliberties
desaHumano=~ Lifeexpectancy +SchoolExpected +SchoolMean +GNI'# normalizar las variables:
theDataNorm=scale(theData)
library(lavaan)## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.
##
## Attaching package: 'lavaan'
## The following object is masked from 'package:psych':
##
## cor2cov
cfa_fit <- cfa(modelCFA, data=theDataNorm,
std.lv=TRUE,
missing="fiml")
summary(cfa_fit)## lavaan 0.6-12 ended normally after 27 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 28
##
## Number of observations 158
## Number of missing patterns 1
##
## Model Test User Model:
##
## Test statistic 122.880
## Degrees of freedom 26
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## democracia =~
## Electprocess 0.925 0.060 15.356 0.000
## Fgovernment 0.883 0.062 14.125 0.000
## Polparticipatn 0.835 0.065 12.911 0.000
## Polculture 0.633 0.072 8.790 0.000
## Civilliberties 0.964 0.058 16.521 0.000
## desaHumano =~
## Lifeexpectancy 0.897 0.062 14.369 0.000
## SchoolExpected 0.913 0.062 14.822 0.000
## SchoolMean 0.869 0.064 13.667 0.000
## GNI 0.826 0.066 12.560 0.000
##
## Covariances:
## Estimate Std.Err z-value P(>|z|)
## democracia ~~
## desaHumano 0.687 0.047 14.691 0.000
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|)
## .Electprocess -0.000 0.079 -0.000 1.000
## .Fgovernment -0.000 0.079 -0.000 1.000
## .Polparticipatn -0.000 0.079 -0.000 1.000
## .Polculture -0.000 0.079 -0.000 1.000
## .Civilliberties -0.000 0.079 -0.000 1.000
## .Lifeexpectancy 0.000 0.079 0.000 1.000
## .SchoolExpected 0.000 0.079 0.000 1.000
## .SchoolMean -0.000 0.079 -0.000 1.000
## .GNI -0.000 0.079 -0.000 1.000
## democracia 0.000
## desaHumano 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .Electprocess 0.138 0.020 6.827 0.000
## .Fgovernment 0.215 0.029 7.439 0.000
## .Polparticipatn 0.297 0.037 8.002 0.000
## .Polculture 0.593 0.069 8.644 0.000
## .Civilliberties 0.065 0.016 4.096 0.000
## .Lifeexpectancy 0.189 0.031 6.135 0.000
## .SchoolExpected 0.159 0.029 5.580 0.000
## .SchoolMean 0.238 0.034 6.989 0.000
## .GNI 0.311 0.042 7.428 0.000
## democracia 1.000
## desaHumano 1.000
Averigüemos qué tan bien salió el modelo:
allParamCFA=parameterEstimates(cfa_fit,standardized = T)
allFitCFA=as.list(fitMeasures(cfa_fit))El ChiSquare es NO significativo? (p_value debe ser mayor a 0.05 para que sea bueno)
allFitCFA[c("chisq", "df", "pvalue")] # pvalue>0.05 ## $chisq
## [1] 122.8801
##
## $df
## [1] 26
##
## $pvalue
## [1] 1.554312e-14
El Índice Tucker Lewis es mayor a 0.9?
allFitCFA$tli ## [1] 0.9061035
La Raíz del error cuadrático medio de aproximación es menor a 0.05?
allFitCFA[c('rmsea.ci.lower','rmsea' ,'rmsea.ci.upper')] ## $rmsea.ci.lower
## [1] 0.1268419
##
## $rmsea
## [1] 0.1535685
##
## $rmsea.ci.upper
## [1] 0.18142
No hay requisitos pero aún así cálculamos los resultados obtenidos.
scorescfa=normalize(lavPredict(cfa_fit),
method = "range",
margin=2, # by column
range = c(0, 10))
idhdemo$demos_cfa_ok=scorescfa[,1]
idhdemo$desahu_cfa_ok=scorescfa[,2]Veamos que tanto se parece el score obtenido via CFA con la puntuación original en la Figura 3.1.
ggplot(data=idhdemo,aes(x=Score.dem,y=demos_cfa_ok)) + geom_point() + theme_minimal() + labs(x="Indice de Democracia (original)", y="Indice de Democracia CFA (cambiado)")Figure 3.1: Comparación Indice de Democracia con Score CFA con puntuación original
Podemos ver los resultados del CFA en la Figura 3.2.
library(lavaanPlot)
lavaanPlot(model = cfa_fit, node_options = list(shape = "box", fontname = "Helvetica"), edge_options = list(color = "grey"), coefs = T)