Paola Nieto (20150967)

DATA 1

library(readxl)
IDH <- read_excel("2018_Statistical_Annex_Table_1.xlsx")
## New names:
## * `` -> ...1
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...
str(IDH)
## tibble [265 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1                                               : chr [1:265] NA NA "HDI rank" NA ...
##  $ Table 1. Human Development Index and its components: chr [1:265] NA NA "Country" NA ...
##  $ ...3                                               : chr [1:265] NA "Human Development Index (HDI)" "Value" "2017" ...
##  $ ...4                                               : logi [1:265] NA NA NA NA NA NA ...
##  $ ...5                                               : chr [1:265] "SDG 3" "Life expectancy at birth" "(years)" "2017" ...
##  $ ...6                                               : chr [1:265] NA NA NA NA ...
##  $ ...7                                               : chr [1:265] "SDG 4.3" "Expected years of schooling" "(years)" "2017" ...
##  $ ...8                                               : chr [1:265] NA NA NA "a" ...
##  $ ...9                                               : chr [1:265] "SDG 4.6" "Mean years of schooling" "(years)" "2017" ...
##  $ ...10                                              : chr [1:265] NA NA NA "a" ...
##  $ ...11                                              : chr [1:265] "SDG 8.5" "Gross national income (GNI) per capita" "(2011 PPP $)" "2017" ...
##  $ ...12                                              : chr [1:265] NA NA NA NA ...
##  $ ...13                                              : chr [1:265] NA "GNI per capita rank minus HDI rank" NA "2017" ...
##  $ ...14                                              : logi [1:265] NA NA NA NA NA NA ...
##  $ ...15                                              : chr [1:265] NA "HDI rank" NA "2016" ...
IDH[,c(1,4,6,8,10,12,14)]=NULL
names(IDH) = c("country", "hdi", "Life expectancy", "Expected years of schooling",  "Mean years of schooling", "GNI per capita",    "GNI per capita rank minus HDI rank", "rank")
HumanDevelopment = c("Very High")
VeryHigh = IDH[5:64,]
VeryHigh=data.frame(VeryHigh, HumanDevelopment, stringsAsFactors = F)
VeryHigh = VeryHigh [-1,]

HumanDevelopment = c("High")
High = IDH[65:118,]
High = data.frame(High, HumanDevelopment, stringsAsFactors = F)
High = High [-1,]

HumanDevelopment = c("Medium")
Medium = IDH[119:158,]
Medium = data.frame(Medium, HumanDevelopment, stringsAsFactors = F)
Medium = Medium [-1,]

HumanDevelopment = c("Low")
Low = IDH[159:197,]
Low =data.frame(Low, HumanDevelopment, stringsAsFactors = F)
Low = Low [-1,]

HumanDevelopment = c("Other")
Other = IDH[198:204,]
Other =data.frame(Other, HumanDevelopment, stringsAsFactors = F)
Other = Other [-1,]
IDH2=rbind(VeryHigh, High, Medium, Low, Other)
IDH2[,c(7,8)]=NULL
str(IDH2)
## 'data.frame':    195 obs. of  7 variables:
##  $ country                    : chr  "Norway" "Switzerland" "Australia" "Ireland" ...
##  $ hdi                        : chr  "0.95252201967581829" "0.94399757027811748" "0.9386312851065749" "0.93841005899505603" ...
##  $ Life.expectancy            : chr  "82.328000000000003" "83.472999999999999" "83.067999999999998" "81.643000000000001" ...
##  $ Expected.years.of.schooling: chr  "17.852060000000002" "16.208819999999999" "22.921250000000001" "19.61374" ...
##  $ Mean.years.of.schooling    : chr  "12.56682" "13.407999999999999" "12.855040000000001" "12.526289999999999" ...
##  $ GNI.per.capita             : chr  "68012.492920000004" "57625.069710000003" "43560.057739999997" "53754.186260000002" ...
##  $ HumanDevelopment           : chr  "Very High" "Very High" "Very High" "Very High" ...
library(readr)
IDH2[,2:6]=lapply(IDH2[,2:6], as.numeric)
## Warning in lapply(IDH2[, 2:6], as.numeric): NAs introduced by coercion

## Warning in lapply(IDH2[, 2:6], as.numeric): NAs introduced by coercion

## Warning in lapply(IDH2[, 2:6], as.numeric): NAs introduced by coercion

## Warning in lapply(IDH2[, 2:6], as.numeric): NAs introduced by coercion

## Warning in lapply(IDH2[, 2:6], as.numeric): NAs introduced by coercion
str(IDH2)
## 'data.frame':    195 obs. of  7 variables:
##  $ country                    : chr  "Norway" "Switzerland" "Australia" "Ireland" ...
##  $ hdi                        : num  0.953 0.944 0.939 0.938 0.936 ...
##  $ Life.expectancy            : num  82.3 83.5 83.1 81.6 81.2 ...
##  $ Expected.years.of.schooling: num  17.9 16.2 22.9 19.6 17 ...
##  $ Mean.years.of.schooling    : num  12.6 13.4 12.9 12.5 14.1 ...
##  $ GNI.per.capita             : num  68012 57625 43560 53754 46136 ...
##  $ HumanDevelopment           : chr  "Very High" "Very High" "Very High" "Very High" ...

DATA 2

EPI <- read_excel("2018-epi.xlsx", 
    sheet = "2018EPI_ScoresCurrent")
EPI[,c(1:2)]=NULL
EPI[,c(3:14)]=NULL
EPI[,c(9:26)]=NULL
library(stringr)
names(EPI)=str_split(names(EPI),".cu",simplify = T)[,1]%>%gsub('\\s','',.)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
EPI[2:8]= replace(EPI[2:8], EPI[2:8]==-9999, NA)
str(EPI)
## tibble [180 × 8] (S3: tbl_df/tbl/data.frame)
##  $ country: chr [1:180] "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ EPI    : num [1:180] 37.7 65.5 57.2 37.4 59.2 ...
##  $ HAD    : num [1:180] 0 30.9 87.4 14.8 72.8 ...
##  $ PME    : num [1:180] 71.8 88.9 100 60.7 100 ...
##  $ PMW    : num [1:180] 77.1 88.1 96.7 68.5 100 ...
##  $ USD    : num [1:180] 24.88 67.15 64.36 9.42 54.69 ...
##  $ UWD    : num [1:180] 26.62 65.97 56.17 8.99 51.96 ...
##  $ PBD    : num [1:180] 0 62.9 35.6 40.1 52 ...

MERGE

nrow(merge(IDH2,EPI))
## [1] 162
datajunta0=merge(IDH2,EPI,all.x=T, all.y=T)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(datajunta0[!complete.cases(datajunta0),],type='html')%>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  font_size = 10)
country hdi Life.expectancy Expected.years.of.schooling Mean.years.of.schooling GNI.per.capita HumanDevelopment EPI HAD PME PMW USD UWD PBD
4 Andorra 0.8576836 81.663 13.52402 10.155450 47573.8701 Very High NA NA NA NA NA NA NA
21 Bolivia NA NA NA NA NA NA 55.98 30.12 100.00 96.22 44.96 44.18 43.51
22 Bolivia (Plurinational State of) 0.6925370 69.473 14.02423 8.915070 6714.0272 Medium NA NA NA NA NA NA NA
31 Côte d’Ivoire NA NA NA NA NA NA 45.25 5.15 86.69 86.04 9.53 9.53 33.42
41 Congo 0.6062826 65.088 11.37073 6.310000 5694.2210 Medium NA NA NA NA NA NA NA
42 Congo (Democratic Republic of the) 0.4574692 60.031 9.75000 6.759480 795.8264 Low NA NA NA NA NA NA NA
44 Côte d’Ivoire 0.4923045 54.102 9.03737 5.192000 3481.2253 Low NA NA NA NA NA NA NA
48 Czech Republic NA NA NA NA NA NA 67.68 72.86 56.56 65.55 75.00 64.96 98.82
49 Czechia 0.8875614 78.877 16.85478 12.740350 30588.3008 Very High NA NA NA NA NA NA NA
50 Dem. Rep. Congo NA NA NA NA NA NA 30.41 5.97 31.83 35.45 9.80 10.21 40.47
61 Eswatini (Kingdom of) 0.5883164 58.268 11.19615 6.521370 7619.9435 Medium NA NA NA NA NA NA NA
79 Hong Kong, China (SAR) 0.9325829 84.097 16.32567 12.038130 58419.7099 Very High NA NA NA NA NA NA NA
84 Iran NA NA NA NA NA NA 58.16 84.14 86.08 85.46 62.45 55.03 21.21
85 Iran (Islamic Republic of) 0.7980573 76.153 14.88064 9.840175 19130.2400 High NA NA NA NA NA NA NA
96 Korea (Democratic People’s Rep. of) NA 71.887 12.00025 NA NA Other NA NA NA NA NA NA NA
97 Korea (Republic of) 0.9025611 82.361 16.49749 12.116330 35944.7095 Very High NA NA NA NA NA NA NA
100 Lao People’s Democratic Republic 0.6012757 67.021 11.20924 5.193850 6070.1156 Medium NA NA NA NA NA NA NA
101 Laos NA NA NA NA NA NA 42.94 4.41 33.75 38.28 26.05 29.12 33.69
107 Liechtenstein 0.9160829 80.410 14.72093 12.548460 97335.7496 Very High NA NA NA NA NA NA NA
110 Macedonia NA NA NA NA NA NA 61.06 30.74 90.78 89.67 69.78 68.53 70.11
117 Marshall Islands 0.7079474 73.620 13.00000 10.865770 5124.8962 High NA NA NA NA NA NA NA
121 Micronesia NA NA NA NA NA NA 49.80 17.69 100.00 100.00 48.14 44.63 47.40
122 Micronesia (Federated States of) 0.6272547 69.316 11.70000 7.954740 3842.9071 Medium NA NA NA NA NA NA NA
123 Moldova NA NA NA NA NA NA 51.97 45.65 68.56 70.31 58.20 63.83 60.77
124 Moldova (Republic of) 0.6997534 71.718 11.63386 11.595360 5553.8504 High NA NA NA NA NA NA NA
125 Monaco NA NA NA NA NA Other NA NA NA NA NA NA NA
132 Nauru NA NA 10.31429 NA 18572.9566 Other NA NA NA NA NA NA NA
142 Palau 0.7984784 73.445 15.60380 12.327280 12830.5903 High NA NA NA NA NA NA NA
143 Palestine, State of 0.6858355 73.646 12.82014 9.104640 5055.0862 Medium NA NA NA NA NA NA NA
152 Republic of Congo NA NA NA NA NA NA 42.39 12.49 30.40 32.40 9.95 10.57 46.29
154 Russia NA NA NA NA NA NA 63.79 76.75 81.37 82.94 61.33 66.52 86.20
155 Russian Federation 0.8162755 71.222 15.53573 12.019990 24232.5558 Very High NA NA NA NA NA NA NA
157 São Tomé and Príncipe NA NA NA NA NA NA 54.01 12.74 100.00 96.57 30.40 28.24 45.76
158 Saint Kitts and Nevis 0.7778446 74.372 14.38947 8.400000 23977.6260 High NA NA NA NA NA NA NA
162 San Marino NA NA 15.11120 NA NA Other NA NA NA NA NA NA NA
163 Sao Tome and Principe 0.5894763 66.762 12.46685 6.333470 2941.3128 Medium NA NA NA NA NA NA NA
173 Somalia NA 56.714 NA NA NA Other NA NA NA NA NA NA NA
175 South Korea NA NA NA NA NA NA 62.30 100.00 30.21 40.43 100.00 93.05 91.48
176 South Sudan 0.3877252 57.288 4.87162 4.849130 963.1741 Low NA NA NA NA NA NA NA
181 Swaziland NA NA NA NA NA NA 40.32 15.38 54.85 65.66 9.03 9.68 44.73
184 Syrian Arab Republic 0.5357037 70.963 8.75280 5.062500 2337.1701 Low NA NA NA NA NA NA NA
185 Taiwan NA NA NA NA NA NA 72.84 65.32 67.35 75.70 74.41 65.01 81.17
187 Tanzania NA NA NA NA NA NA 50.83 7.64 100.00 91.65 11.43 12.18 42.14
188 Tanzania (United Republic of) 0.5377147 66.310 8.92369 5.780000 2655.3938 Low NA NA NA NA NA NA NA
190 The former Yugoslav Republic of Macedonia 0.7566872 75.851 13.32826 9.631960 12504.9456 High NA NA NA NA NA NA NA
198 Tuvalu NA NA NA NA 5887.7243 Other NA NA NA NA NA NA NA
203 United States 0.9239136 79.541 16.46821 13.379990 54941.1093 Very High NA NA NA NA NA NA NA
204 United States of America NA NA NA NA NA NA 71.19 100.00 99.01 92.71 81.84 100.00 64.99
208 Venezuela NA NA NA NA NA NA 63.89 76.03 100.00 99.93 52.40 48.38 37.32
209 Venezuela (Bolivarian Republic of) 0.7607730 74.726 14.30000 10.323430 10671.5415 High NA NA NA NA NA NA NA
211 Yemen 0.4519004 65.157 8.97700 3.000000 1239.2914 Low NA NA NA NA NA NA NA
IDH2[IDH2$country=="United States",'country']="United States of America"
IDH2[IDH2$country=="Bolivia (Plurinational State of)",'country']="Bolivia"
IDH2[IDH2$country=="Venezuela (Bolivarian Republic of)",'country']="Venezuela"
IDH2[IDH2$country=="The former Yugoslav Republic of Macedonia",'country']="Macedonia"
IDH2[IDH2$country=="Tanzania (United Republic of)",'country']="Tanzania"
IDH2[IDH2$country=="Eswatini (Kingdom of)",'country']="Swaziland"
IDH2[IDH2$country=="Korea (Republic of)",'country']="South Korea"
IDH2[IDH2$country=="Russian Federation",'country']="Russia"
IDH2[IDH2$country=="Lao People's Democratic Republic",'country']="Laos"
EPI[EPI$country=="Côte d'Ivoire",'country']="Côte d'Ivoire"
EPI[EPI$country=="Czech Republic",'country']="Czechia"
EPI[EPI$country=="Republic of Congo",'country']="Congo"
IDH2[IDH2$country=="Congo (Democratic Republic of the)",'country']="Dem. Rep. Congo"
IDH2[IDH2$country=="Iran (Islamic Republic of)",'country']="Iran"
IDH2[IDH2$country=="Micronesia (Federated States of)",'country']="Micronesia"
IDH2[IDH2$country=="Moldova (Republic of)",'country']="Moldova"

Nuevo merge

datajunta=merge(IDH2,EPI) 
nrow(datajunta)
## [1] 178
str(datajunta)
## 'data.frame':    178 obs. of  14 variables:
##  $ country                    : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ hdi                        : num  0.498 0.785 0.754 0.581 0.78 ...
##  $ Life.expectancy            : num  64 78.5 76.3 61.8 76.5 ...
##  $ Expected.years.of.schooling: num  10.4 14.8 14.4 11.8 13.2 ...
##  $ Mean.years.of.schooling    : num  3.78 10.03 7.97 5.13 9.24 ...
##  $ GNI.per.capita             : num  1824 11886 13802 5790 20764 ...
##  $ HumanDevelopment           : chr  "Low" "High" "High" "Medium" ...
##  $ EPI                        : num  37.7 65.5 57.2 37.4 59.2 ...
##  $ HAD                        : num  0 30.9 87.4 14.8 72.8 ...
##  $ PME                        : num  71.8 88.9 100 60.7 100 ...
##  $ PMW                        : num  77.1 88.1 96.7 68.5 100 ...
##  $ USD                        : num  24.88 67.15 64.36 9.42 54.69 ...
##  $ UWD                        : num  26.62 65.97 56.17 8.99 51.96 ...
##  $ PBD                        : num  0 62.9 35.6 40.1 52 ...

Calculamos y graficamos la matriz de correlación.

dontselect=c("country","hdi","HumanDevelopment","EPI")


select=setdiff(names(datajunta),dontselect) 
theData= datajunta[,select] 

# esta es:
library(polycor)
corMatrix=polycor::hetcor(theData)$correlations
#Explorar correlaciones:
#Sin evaluar significancia:
library(ggcorrplot)
## Loading required package: ggplot2
ggcorrplot(corMatrix)

#Evaluando significancia:
ggcorrplot(corMatrix,
          p.mat = cor_pmat(corMatrix),
          insig = "blank")

Calcule el KMO

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:polycor':
## 
##     polyserial
psych::KMO(corMatrix) 
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.87
## MSA for each item = 
##             Life.expectancy Expected.years.of.schooling 
##                        0.96                        0.97 
##     Mean.years.of.schooling              GNI.per.capita 
##                        0.94                        0.93 
##                         HAD                         PME 
##                        0.93                        0.51 
##                         PMW                         USD 
##                        0.51                        0.86 
##                         UWD                         PBD 
##                        0.87                        0.92
# Es mayor de 0.5, si podemos factorizar

Prueba de identidad

cortest.bartlett(corMatrix,n=nrow(theData))$p.value>0.05
## [1] FALSE

Prueba de singularidad

library(matrixcalc)
is.singular.matrix(corMatrix)
## [1] FALSE
fa.parallel(theData,fm = 'ML', fa = 'fa')

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

#Redimensionamos

library(GPArotation)
resfa <- fa(theData,nfactors = 2,cor = 'mixed',rotate = "varimax",fm="minres")
print(resfa$loadings)
## 
## Loadings:
##                             MR1    MR2   
## Life.expectancy              0.896       
## Expected.years.of.schooling  0.878       
## Mean.years.of.schooling      0.871       
## GNI.per.capita               0.764       
## HAD                          0.869  0.126
## PME                                 0.996
## PMW                                 0.988
## USD                          0.983       
## UWD                          0.978       
## PBD                          0.735       
## 
##                  MR1   MR2
## SS loadings    6.141 1.997
## Proportion Var 0.614 0.200
## Cumulative Var 0.614 0.814
print(resfa$loadings,cutoff = 0.5) #cortando solo los que aportan más (kmo mayor a 0.5)
## 
## Loadings:
##                             MR1    MR2   
## Life.expectancy              0.896       
## Expected.years.of.schooling  0.878       
## Mean.years.of.schooling      0.871       
## GNI.per.capita               0.764       
## HAD                          0.869       
## PME                                 0.996
## PMW                                 0.988
## USD                          0.983       
## UWD                          0.978       
## PBD                          0.735       
## 
##                  MR1   MR2
## SS loadings    6.141 1.997
## Proportion Var 0.614 0.200
## Cumulative Var 0.614 0.814

Resultado visual:

fa.diagram(resfa)

Si bien todas las variables de HDI fueron factorizadas juntas, a estas se agregaron las variables relacionadas con agua y saneamiento, y expocisión a metales pesados

El otro facto creado agrupa a las variables relacionadas con calidad del aire, pero excluye a la variable de combustibles solidos domésticos (está incluida en el primer factor)

Evaluando el resultado

¿La Raíz del error cuadrático medio corregida está cerca a cero?

resfa$crms
## [1] 0.04354232

¿La Raíz del error cuadrático medio de aproximación es menor a 0.05?

resfa$RMSEA
##      RMSEA      lower      upper confidence 
##  0.1764266  0.1522265  0.2027940  0.9000000

¿El índice de Tucker-Lewis es mayor a 0.9?

resfa$TLI
## [1] 0.9020002

¿Qué variables aportaron mas a los factores?

sort(resfa$communality)
##                         PBD              GNI.per.capita 
##                   0.5415986                   0.5845982 
##     Mean.years.of.schooling                         HAD 
##                   0.7586788                   0.7711137 
## Expected.years.of.schooling             Life.expectancy 
##                   0.7711331                   0.8053404 
##                         UWD                         USD 
##                   0.9606300                   0.9695515 
##                         PMW                         PME 
##                   0.9796408                   0.9956592

¿Qué variables contribuyen a mas de un factor?

sort(resfa$complexity)
##     Mean.years.of.schooling Expected.years.of.schooling 
##                    1.000101                    1.001614 
##                         PBD              GNI.per.capita 
##                    1.004478                    1.004634 
##             Life.expectancy                         PME 
##                    1.006266                    1.007479 
##                         PMW                         USD 
##                    1.007744                    1.007872 
##                         UWD                         HAD 
##                    1.008031                    1.041965

Posibles valores proyectados:

Darles nombres

as.data.frame(resfa$scores)%>%head()
datajuntaFA=cbind(datajunta[1],as.data.frame(resfa$scores))

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(data=datajuntaFA, x = ~MR1, y = ~MR2, text=~country) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Factor1'),
                     yaxis = list(title = 'Factor2')))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
library(fpc)
library(cluster)
library(dbscan)
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:fpc':
## 
##     dbscan
g.dist.cmd = daisy(datajuntaFA[,c(2:3)], metric = 'euclidean')
kNNdistplot(g.dist.cmd, k=2)
abline(h=0.63,col='red')

Para tener una idea de cada quien:

resDB=fpc::dbscan(g.dist.cmd, eps=0.63, MinPts=2,method = 'dist')
datajuntaFA$clustDB=as.factor(resDB$cluster)
aggregate(cbind(MR1, MR2) # dependientes
          ~ clustDB, # nivel
          data = datajuntaFA,    # data
          max)            # operacion
plot_ly(data=datajuntaFA, x = ~MR1, y = ~MR2, text=~country, color = ~clustDB) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Factor1'),
                     yaxis = list(title = 'Factor2')))

Finalmente, veamos relaciones:

library(BBmisc)
## 
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse
## The following object is masked from 'package:base':
## 
##     isFALSE
datajunta$FA1=normalize(datajuntaFA$MR1, 
                       method = "range", 
                       margin=2, # by column
                       range = c(0, 10))

datajunta$FA2=normalize(datajuntaFA$MR2, 
                       method = "range", 
                       margin=2, # by column
                       range = c(0, 10))

You can see them all here:

plot(datajunta[,c("hdi","EPI","FA1","FA2")])