1.1 Carga de datos Para esta etapa vamos a proceder a scrapear las dos páginas web, con la ayuda de la biblioteca htmltab. La descarga utilizará el xpath de la tabla de interés.

rm(list = ls())

library(htmltab)
library(rio)

# links
WhereDEMO=list(page="https://en.wikipedia.org/wiki/Democracy_Index", 
              xpath='/html/body/div[3]/div[3]/div[5]/div[1]/table[6]/tbody')

IDH="https://github.com/CarlosChavarri23/CarlosChavarri23/blob/main/HDR%202.0.xlsx?raw=true"
Idh=import(IDH)


demo  = htmltab(doc = WhereDEMO$page, 
               which  = WhereDEMO$xpath,
               encoding = "UTF-8")

1.2 Limpieza

#Nos quesamos con los que nos interesa
demo=demo[,-c(2,6)]
Idh=Idh[,-c(8)]

Los nombres en ambas datas presentaban serios problemas. Se hicieron algunos cambios para que sea más práctico y entendible la realización de pruebas.

newNames=c('Rank.Idh','Country','Score.Idh','Lifeexpectancy','SchoolExpected','SchoolMean','GNI')


names(Idh)=newNames
newNames1=c('Rank.Dem','Country','Type','Score.Dem','Electprocess','Fgovernment','Polparticipation','Polculture','Civilliberties')


names(demo)=newNames1

Se incluyeron estos código sin embargo no eran necesarios pues los nombres de la misma se mantuvieron en inglés.

# en DEMO
## Capitalizar
library(stringr)
names(demo)=str_to_title(names(demo))
## sin tildes ni ñs.
library(stringi)
names(demo)=stri_trans_general(str = names(demo), 
                               id = "Latin-ASCII")
## sin espacios
names(demo)=gsub(" ","",names(demo))

Valores en las celdas: Por lo general, hay que asegurarse que no haya espacios en blanco ni al inicio ni al final de cada valor en una celda.

Idh[,]=lapply(Idh[,], trimws,whitespace = "[\\h\\v]") 

demo[,]=lapply(demo[,], trimws,whitespace = "[\\h\\v]")

FORMATEO

Hablamos de formateo cuando buscamos que los valores de cada celda estén el correcto tipo de dato. Para ello debemos primero ver qué tipo ha sido asignado por R.

str(Idh)
## 'data.frame':    222 obs. of  7 variables:
##  $ Rank.Idh      : chr  "1" "2" "3" "4" ...
##  $ Country       : chr  "Switzerland" "Norway" "Iceland" "Hong Kong, China (SAR)" ...
##  $ Score.Idh     : chr  "0.96199999999999997" "0.96099999999999997" "0.95899999999999996" "0.95199999999999996" ...
##  $ Lifeexpectancy: chr  "83.9872" "83.2339" "82.6782" "85.4734" ...
##  $ SchoolExpected: chr  "16.50029945" "18.185199740000002" "19.163059230000002" "17.278169630000001" ...
##  $ SchoolMean    : chr  "13.85966015" "13.00362968" "13.76716995" "12.22620964" ...
##  $ GNI           : chr  "66933.004539999994" "64660.106220000001" "55782.049809999997" "62606.845399999998" ...
str(demo)
## 'data.frame':    167 obs. of  9 variables:
##  $ Rank.dem        : chr  "1" "2" "3" "4" ...
##  $ Country         : chr  "Norway" "New Zealand" "Finland" "Sweden" ...
##  $ Type            : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
##  $ Score.dem       : chr  "9.75" "9.37" "9.27" "9.26" ...
##  $ Electprocess    : chr  "10.00" "10.00" "10.00" "9.58" ...
##  $ Fgovernment     : chr  "9.64" "8.93" "9.29" "9.29" ...
##  $ Polparticipation: chr  "10.00" "9.44" "8.89" "8.33" ...
##  $ Polculture      : chr  "10.00" "8.75" "8.75" "10.00" ...
##  $ Civilliberties  : chr  "9.12" "9.71" "9.41" "9.12" ...

Conversión a tipo numérico: Vemos que muchos valores que deberian ser numéricos han sido reconocidos como texto. Normalmente eso sucede pues hay caracteres dentro del numero que evitan que se lea adecuadamente. Luego de esa corrección recién se puede cambiar el tipo.

# eliminar coma en los miles:
Idh$GNI=gsub(',','',Idh$GNI)

# ahora a numerico
Idh[,-2]=lapply(Idh[,-2], as.numeric)
## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion

## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion

## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion

## Warning in lapply(Idh[, -2], as.numeric): NAs introduced by coercion
# cambiar coma en los decimales:
demo[,-c(2,3)]=lapply(demo[,-c(2,3)],
                      function(x){gsub(",",".",x)})


# ahora a numerico
demo$Rank.dem=as.numeric(demo$Rank)
demo$Score.dem=as.numeric(demo$Score)
demo$Electprocess=as.numeric(demo$Electprocess)
demo$Fgovernment=as.numeric(demo$Fgovernment)
demo$Polparticipation=as.numeric(demo$Polparticipation)
demo$Polculture=as.numeric(demo$Polculture)
demo$Civilliberties=as.numeric(demo$Civilliberties)

Luego de pasar a tipo numérico, las celdas que no tenían un valor numérico adecuado se convirtieron en NAs. Aquí hay que revisar las filas donde eso se generó

1.4 Merge

setdiff(demo$Country,Idh$Country)
##  [1] "Taiwan"                           "South Korea"                     
##  [3] "Czech Republic"                   "Cape Verde"                      
##  [5] "East Timor"                       "Moldova"                         
##  [7] "Hong Kong"                        "Tanzania"                        
##  [9] "Bolivia"                          "Turkey"                          
## [11] "Ivory Coast"                      "Palestine"                       
## [13] "Russia"                           "Eswatini"                        
## [15] "Vietnam"                          "Republic of the Congo"           
## [17] "Venezuela"                        "Iran"                            
## [19] "Laos"                             "Syria"                           
## [21] "Democratic Republic of the Congo" "North Korea"
setdiff(Idh$Country,demo$Country)
##  [1] "Hong Kong, China (SAR)"             "Liechtenstein"                     
##  [3] "Korea (Republic of)"                "Czechia"                           
##  [5] "Andorra"                            "San Marino"                        
##  [7] "Türkiye"                            "Brunei Darussalam"                 
##  [9] "Russian Federation"                 "Bahamas"                           
## [11] "Grenada"                            "Barbados"                          
## [13] "Antigua and Barbuda"                "Seychelles"                        
## [15] "Saint Kitts and Nevis"              "Iran (Islamic Republic of)"        
## [17] "Moldova (Republic of)"              "Palau"                             
## [19] "Saint Vincent and the Grenadines"   "Maldives"                          
## [21] "Tonga"                              "Dominica"                          
## [23] "Palestine, State of"                "Saint Lucia"                       
## [25] "Samoa"                              "Viet Nam"                          
## [27] "Bolivia (Plurinational State of)"   "Venezuela (Bolivarian Republic of)"
## [29] "Belize"                             "Cabo Verde"                        
## [31] "Tuvalu"                             "Marshall Islands"                  
## [33] "Micronesia (Federated States of)"   "Kiribati"                          
## [35] "Sao Tome and Principe"              "Lao People's Democratic Republic"  
## [37] "Timor-Leste"                        "Vanuatu"                           
## [39] "Eswatini (Kingdom of)"              "Syrian Arab Republic"              
## [41] "Congo"                              "Solomon Islands"                   
## [43] "Côte d'Ivoire"                      "Tanzania (United Republic of)"     
## [45] "Congo (Democratic Republic of the)" "South Sudan"

Se puede corroborar que sí hay valores que pueden ser corregidos.

……………………………………………………….

Idh[Idh$Country=='Hong Kong, China (SAR)','Country']='Hong Kong'
Idh[Idh$Country=='Iran (Islamic Republic of)','Country']='Iran'
Idh[Idh$Country=='Venezuela (Bolivarian Republic of)','Country']='Venezuela'
Idh[Idh$Country=='Russian Federation','Country']='Russia'
Idh[Idh$Country=='Congo','Country']='Republic of the Congo'
Idh[Idh$Country=='Congo (Democratic Republic of the)','Country']='Democratic Republic of the Congo'
Idh[Idh$Country=='Bolivia (Plurinational State of)','Country']='Bolivia'
Idh[Idh$Country=='Korea (Republic of)','Country']='South Korea'
Idh[Idh$Country=='Timor-Leste','Country']='East Timor'
Idh[Idh$Country=='Eswatini (Kingdom of)','Country']='Eswatini'
Idh[Idh$Country=='Czechia','Country']='Czech Republic'
Idh[Idh$Country=='Moldova (Republic of)','Country']='Moldova'
Idh[Idh$Country=='Viet Nam','Country']='Vietnam'

Este proceso me demandó más tiempo del que pensé.

Ahora si hay más comodidad para hacer el merge:

idhdemo=merge(Idh,demo,by="Country")
head(idhdemo)
##       Country Rank.Idh Score.Idh Lifeexpectancy SchoolExpected SchoolMean
## 1 Afghanistan      180     0.478        61.9824       10.26384   2.985070
## 2     Albania       67     0.796        76.4626       14.44800  11.286455
## 3     Algeria       91     0.745        76.3767       14.62690   8.069284
## 4      Angola      148     0.586        61.6434       12.17210   5.417391
## 5   Argentina       47     0.842        75.3899       17.87487  11.147269
## 6     Armenia       85     0.759        72.0431       13.11676  11.330300
##         GNI Rank.dem             Type Score.dem Electprocess Fgovernment
## 1  1824.191      167    Authoritarian      0.32         0.00        0.07
## 2 14131.110       68 Flawed democracy      6.11         7.00        6.43
## 3 10800.225      113    Authoritarian      3.77         3.08        2.50
## 4  5465.618      122    Authoritarian      3.37         1.33        2.86
## 5 20925.268       50 Flawed democracy      6.81         9.17        5.00
## 6 13157.994       89    Hybrid regime      5.49         7.50        5.71
##   Polparticipation Polculture Civilliberties
## 1             0.00       1.25           0.29
## 2             4.44       5.63           7.06
## 3             4.44       5.00           3.82
## 4             5.00       5.00           2.65
## 5             7.22       5.00           7.65
## 6             6.11       3.13           5.00
library(rio)
str(idhdemo)
## 'data.frame':    158 obs. of  15 variables:
##  $ Country         : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ Rank.Idh        : num  180 67 91 148 47 85 5 25 91 35 ...
##  $ Score.Idh       : num  0.478 0.796 0.745 0.586 0.842 0.759 0.951 0.916 0.745 0.875 ...
##  $ Lifeexpectancy  : num  62 76.5 76.4 61.6 75.4 ...
##  $ SchoolExpected  : num  10.3 14.4 14.6 12.2 17.9 ...
##  $ SchoolMean      : num  2.99 11.29 8.07 5.42 11.15 ...
##  $ GNI             : num  1824 14131 10800 5466 20925 ...
##  $ Rank.dem        : num  167 68 113 122 50 89 9 20 141 144 ...
##  $ Type            : chr  "Authoritarian" "Flawed democracy" "Authoritarian" "Authoritarian" ...
##  $ Score.dem       : num  0.32 6.11 3.77 3.37 6.81 5.49 8.9 8.07 2.68 2.52 ...
##  $ Electprocess    : num  0 7 3.08 1.33 9.17 7.5 10 9.58 0.5 0.42 ...
##  $ Fgovernment     : num  0.07 6.43 2.5 2.86 5 5.71 8.57 6.79 2.5 2.71 ...
##  $ Polparticipation: num  0 4.44 4.44 5 7.22 6.11 7.78 8.89 2.78 3.33 ...
##  $ Polculture      : num  1.25 5.63 5 5 5 3.13 8.75 6.88 5 4.38 ...
##  $ Civilliberties  : num  0.29 7.06 3.82 2.65 7.65 5 9.41 8.24 2.65 1.76 ...
export(idhdemo,"DATA.csv")

Luego de haber realizado la limpieza previa…..Viene propiamente el Análisis Factorial Exploratorio

library(rio)
link="https://github.com/CarlosChavarri23/An-lsis-Factorial-/raw/main/DATA.csv"
idhdemo=import(link)
dontselect=c("Country","Rank.dem","Rank.Idh","Score.Idh","Score.dem", "Type")

select=setdiff(names(idhdemo),dontselect) 
theData=idhdemo[,select]
str(theData)
## 'data.frame':    158 obs. of  9 variables:
##  $ Lifeexpectancy  : num  62 76.5 76.4 61.6 75.4 ...
##  $ SchoolExpected  : num  10.3 14.4 14.6 12.2 17.9 ...
##  $ SchoolMean      : num  2.99 11.29 8.07 5.42 11.15 ...
##  $ GNI             : num  1824 14131 10800 5466 20925 ...
##  $ Electprocess    : num  0 7 3.08 1.33 9.17 7.5 10 9.58 0.5 0.42 ...
##  $ Fgovernment     : num  0.07 6.43 2.5 2.86 5 5.71 8.57 6.79 2.5 2.71 ...
##  $ Polparticipation: num  0 4.44 4.44 5 7.22 6.11 7.78 8.89 2.78 3.33 ...
##  $ Polculture      : num  1.25 5.63 5 5 5 3.13 8.75 6.88 5 4.38 ...
##  $ Civilliberties  : num  0.29 7.06 3.82 2.65 7.65 5 9.41 8.24 2.65 1.76 ...
library(polycor)
corMatrix=polycor::hetcor(theData)$correlations
library(ggcorrplot)
## Loading required package: ggplot2
ggcorrplot(corMatrix)

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:polycor':
## 
##     polyserial
psych::KMO(corMatrix) 
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.89
## MSA for each item = 
##   Lifeexpectancy   SchoolExpected       SchoolMean              GNI 
##             0.90             0.90             0.89             0.89 
##     Electprocess      Fgovernment Polparticipation       Polculture 
##             0.82             0.96             0.96             0.88 
##   Civilliberties 
##             0.86
cortest.bartlett(corMatrix,n=nrow(theData))$p.value>0.05
## [1] FALSE
library(matrixcalc)
library(matrixcalc)


is.singular.matrix(corMatrix)
## [1] FALSE
fa.parallel(theData, fa = 'fa',correct = T,plot = F)
## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA
library(GPArotation)
resfa <- fa(theData,
            nfactors = 2,
            cor = 'mixed',
            rotate = "varimax",
            fm="minres")
print(resfa$loadings)
## 
## Loadings:
##                  MR1   MR2  
## Lifeexpectancy   0.308 0.858
## SchoolExpected   0.397 0.812
## SchoolMean       0.302 0.793
## GNI              0.303 0.801
## Electprocess     0.909 0.227
## Fgovernment      0.763 0.489
## Polparticipation 0.773 0.338
## Polculture       0.511 0.415
## Civilliberties   0.908 0.331
## 
##                  MR1   MR2
## SS loadings    3.528 3.350
## Proportion Var 0.392 0.372
## Cumulative Var 0.392 0.764
print(resfa$loadings,cutoff = 0.5)
## 
## Loadings:
##                  MR1   MR2  
## Lifeexpectancy         0.858
## SchoolExpected         0.812
## SchoolMean             0.793
## GNI                    0.801
## Electprocess     0.909      
## Fgovernment      0.763      
## Polparticipation 0.773      
## Polculture       0.511      
## Civilliberties   0.908      
## 
##                  MR1   MR2
## SS loadings    3.528 3.350
## Proportion Var 0.392 0.372
## Cumulative Var 0.392 0.764
fa.diagram(resfa,main = "Resultados del EFA")

sort(resfa$communality)
##       Polculture Polparticipation       SchoolMean              GNI 
##        0.4329945        0.7113670        0.7199503        0.7329995 
##   SchoolExpected      Fgovernment   Lifeexpectancy     Electprocess 
##        0.8167403        0.8210855        0.8306177        0.8776938 
##   Civilliberties 
##        0.9341848
sort(resfa$complexity)
##     Electprocess   Lifeexpectancy   Civilliberties              GNI 
##         1.124793         1.254550         1.260408         1.281374 
##       SchoolMean Polparticipation   SchoolExpected      Fgovernment 
##         1.284787         1.368459         1.453542         1.702795 
##       Polculture 
##         1.918847
library(magrittr)
as.data.frame(resfa$scores)%>%head()
##          MR1         MR2
## 1 -1.7977793 -0.83798415
## 2  0.3324736  0.27861886
## 3 -0.8749699  0.43423600
## 4 -0.7793731 -0.64292948
## 5  0.6584106  0.45134811
## 6  0.1048445 -0.05519384
library(ggplot2)
idhdemo$demos_efa=resfa$scores[,1]
idhdemo$desahu_efa=resfa$scores[,2]


ggplot(data=idhdemo,aes(x=Score.dem,y=demos_efa)) + geom_point() + theme_minimal() + labs(x="Indice de Democracia (original)", y="Indice de Democracia EFA")

library(BBmisc)
## 
## Attaching package: 'BBmisc'
## The following object is masked from 'package:base':
## 
##     isFALSE
efa_scores_ok=normalize(resfa$scores, 
                       method = "range", 
                       margin=2, # by column
                       range = c(0, 10))

idhdemo$demos_efa_ok=efa_scores_ok[,1]
idhdemo$desahu_efa_ok=efa_scores_ok[,2]

ggplot(data=idhdemo,aes(x=Score.dem,y=demos_efa_ok)) + geom_point() + theme_minimal() + labs(x="Indice de Democracia (original)", y="Indice de Democracia EFA (cambiado)")

Análisis Factorial Exploratorio:

modelCFA <- ' democracia  =~ Electprocess  + Fgovernment + Polparticipation + Polculture  + Civilliberties 

desaHumano=~ Lifeexpectancy +SchoolExpected +SchoolMean +GNI'
# normalizar las variables:
theDataNorm=scale(theData)

library(lavaan)
## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.
## 
## Attaching package: 'lavaan'
## The following object is masked from 'package:psych':
## 
##     cor2cov
cfa_fit <- cfa(modelCFA, data=theDataNorm, 
           std.lv=TRUE,  
           missing="fiml")
summary(cfa_fit)
## lavaan 0.6-12 ended normally after 27 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        28
## 
##   Number of observations                           158
##   Number of missing patterns                         1
## 
## Model Test User Model:
##                                                       
##   Test statistic                               122.880
##   Degrees of freedom                                26
##   P-value (Chi-square)                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Observed
##   Observed information based on                Hessian
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   democracia =~                                       
##     Electprocess      0.925    0.060   15.356    0.000
##     Fgovernment       0.883    0.062   14.125    0.000
##     Polparticipatn    0.835    0.065   12.911    0.000
##     Polculture        0.633    0.072    8.790    0.000
##     Civilliberties    0.964    0.058   16.521    0.000
##   desaHumano =~                                       
##     Lifeexpectancy    0.897    0.062   14.369    0.000
##     SchoolExpected    0.913    0.062   14.822    0.000
##     SchoolMean        0.869    0.064   13.667    0.000
##     GNI               0.826    0.066   12.560    0.000
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   democracia ~~                                       
##     desaHumano        0.687    0.047   14.691    0.000
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .Electprocess     -0.000    0.079   -0.000    1.000
##    .Fgovernment      -0.000    0.079   -0.000    1.000
##    .Polparticipatn   -0.000    0.079   -0.000    1.000
##    .Polculture       -0.000    0.079   -0.000    1.000
##    .Civilliberties   -0.000    0.079   -0.000    1.000
##    .Lifeexpectancy    0.000    0.079    0.000    1.000
##    .SchoolExpected    0.000    0.079    0.000    1.000
##    .SchoolMean       -0.000    0.079   -0.000    1.000
##    .GNI              -0.000    0.079   -0.000    1.000
##     democracia        0.000                           
##     desaHumano        0.000                           
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .Electprocess      0.138    0.020    6.827    0.000
##    .Fgovernment       0.215    0.029    7.439    0.000
##    .Polparticipatn    0.297    0.037    8.002    0.000
##    .Polculture        0.593    0.069    8.644    0.000
##    .Civilliberties    0.065    0.016    4.096    0.000
##    .Lifeexpectancy    0.189    0.031    6.135    0.000
##    .SchoolExpected    0.159    0.029    5.580    0.000
##    .SchoolMean        0.238    0.034    6.989    0.000
##    .GNI               0.311    0.042    7.428    0.000
##     democracia        1.000                           
##     desaHumano        1.000

Averigüemos qué tan bien salió el modelo:

allParamCFA=parameterEstimates(cfa_fit,standardized = T)
allFitCFA=as.list(fitMeasures(cfa_fit))

El ChiSquare es NO significativo? (p_value debe ser mayor a 0.05 para que sea bueno)

allFitCFA[c("chisq", "df", "pvalue")] # pvalue>0.05  
## $chisq
## [1] 122.8801
## 
## $df
## [1] 26
## 
## $pvalue
## [1] 1.554312e-14

El Índice Tucker Lewis es mayor a 0.9?

allFitCFA$tli 
## [1] 0.9061035

La Raíz del error cuadrático medio de aproximación es menor a 0.05?

allFitCFA[c('rmsea.ci.lower','rmsea' ,'rmsea.ci.upper')] 
## $rmsea.ci.lower
## [1] 0.1268419
## 
## $rmsea
## [1] 0.1535685
## 
## $rmsea.ci.upper
## [1] 0.18142

No hay requisitos pero aún así cálculamos los resultados obtenidos.

scorescfa=normalize(lavPredict(cfa_fit),
                    method = "range", 
                    margin=2, # by column
                    range = c(0, 10))

idhdemo$demos_cfa_ok=scorescfa[,1]
idhdemo$desahu_cfa_ok=scorescfa[,2]

Veamos que tanto se parece el score obtenido via CFA con la puntuación original en la Figura 3.1.

ggplot(data=idhdemo,aes(x=Score.dem,y=demos_cfa_ok)) + geom_point() + theme_minimal() + labs(x="Indice de Democracia (original)", y="Indice de Democracia CFA (cambiado)")

Figure 3.1: Comparación Indice de Democracia con Score CFA con puntuación original

Podemos ver los resultados del CFA en la Figura 3.2.

library(lavaanPlot)
lavaanPlot(model = cfa_fit, node_options = list(shape = "box", fontname = "Helvetica"), edge_options = list(color = "grey"), coefs = T)