Abro data:
folder="data"
fileName="IDHmundo2015.xlsx"
fileToRead=file.path(folder,fileName)
library(openxlsx)
dataIDH=openxlsx::read.xlsx(fileToRead,
sheet=1,
startRow=3, #ese 3 se refiere a la fila 3 del Excel original [es decir, cuenta las filas sea que estén vacÃas o no; es bueno tener esto porque asà te quedas con el encabezado]
skipEmptyRows=T,skipEmptyCols=T)
Vemos la cola:
tail(dataIDH,10)
## X1
## 193 187
## 194 188
## 195 <NA>
## 196 <NA>
## 197 <NA>
## 198 <NA>
## 199 <NA>
## 200 <NA>
## 201 <NA>
## 202 <NA>
## X2
## 193 Niger
## 194 Central African Republic
## 195 Notes
## 196 a. Data refer to 2015 or the most recent year available.
## 197 b. In calculating the HDI value, expected years of schooling is capped at 18 years.
## 198 c. Updated by HDRO using Barro and Lee (2016) estimates.
## 199 d. Based on data from the national statistical office.
## 200 e. In calculating the HDI value, GNI per capita is capped at $75,000.
## 201 f. Based on Barro and Lee (2016).
## 202 g. Value from UNDESA (2011).
## Human.Development.Index.(HDI) Life.expectancy.at.birth X5
## 193 0.35264066960226614 61.936 <NA>
## 194 0.35244020015467942 51.457999999999998 <NA>
## 195 <NA> <NA> <NA>
## 196 <NA> <NA> <NA>
## 197 <NA> <NA> <NA>
## 198 <NA> <NA> <NA>
## 199 <NA> <NA> <NA>
## 200 <NA> <NA> <NA>
## 201 <NA> <NA> <NA>
## 202 <NA> <NA> <NA>
## Expected.years.of.schooling X7 Mean.years.of.schooling X9
## 193 5.42265 <NA> 1.6579999999999999 f
## 194 7.0989800000000001 <NA> 4.2300000000000004 n
## 195 <NA> <NA> <NA> <NA>
## 196 <NA> <NA> <NA> <NA>
## 197 <NA> <NA> <NA> <NA>
## 198 <NA> <NA> <NA> <NA>
## 199 <NA> <NA> <NA> <NA>
## 200 <NA> <NA> <NA> <NA>
## 201 <NA> <NA> <NA> <NA>
## 202 <NA> <NA> <NA> <NA>
## Gross.national.income.(GNI).per.capita X11
## 193 889.45564579999996 <NA>
## 194 587.47396070000002 <NA>
## 195 <NA> <NA>
## 196 <NA> <NA>
## 197 <NA> <NA>
## 198 <NA> <NA>
## 199 <NA> <NA>
## 200 <NA> <NA>
## 201 <NA> <NA>
## 202 <NA> <NA>
## GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 193 1 187 NO
## 194 4 188 NO
## 195 NA <NA> <NA>
## 196 NA <NA> <NA>
## 197 NA <NA> <NA>
## 198 NA <NA> <NA>
## 199 NA <NA> <NA>
## 200 NA <NA> <NA>
## 201 NA <NA> <NA>
## 202 NA <NA> <NA>
Elimino filas de abajo:
dataIDH=dataIDH[-c(195:202),]
tail(dataIDH)
## X1 X2 Human.Development.Index.(HDI)
## 189 183 Guinea 0.41418630622159636
## 190 184 Burundi 0.40376541480557104
## 191 185 Burkina Faso 0.40174010414463812
## 192 186 Chad 0.39607322446853288
## 193 187 Niger 0.35264066960226614
## 194 188 Central African Republic 0.35244020015467942
## Life.expectancy.at.birth X5 Expected.years.of.schooling X7
## 189 59.215000000000003 <NA> 8.8152699999999999 <NA>
## 190 57.118000000000002 <NA> 10.64743 <NA>
## 191 59.006999999999998 <NA> 7.6928099999999997 <NA>
## 192 51.896999999999998 <NA> 7.3002599999999997 <NA>
## 193 61.936 <NA> 5.42265 <NA>
## 194 51.457999999999998 <NA> 7.0989800000000001 <NA>
## Mean.years.of.schooling X9 Gross.national.income.(GNI).per.capita X11
## 189 2.5684814920000001 q 1058.2198760000001 <NA>
## 190 2.9688892610000002 c 691.1825738 <NA>
## 191 1.441531766 q 1537.499309 <NA>
## 192 2.3233277380000001 n 1990.8714050000001 <NA>
## 193 1.6579999999999999 f 889.45564579999996 <NA>
## 194 4.2300000000000004 n 587.47396070000002 <NA>
## GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 189 4 182 NO
## 190 5 184 NO
## 191 -8 185 NO
## 192 -19 186 NO
## 193 1 187 NO
## 194 4 188 NO
Veo la cabeza:
head(dataIDH)
## X1 X2 Human.Development.Index.(HDI)
## 1 HDI rank Country Value
## 2 <NA> <NA> 2015
## 3 <NA> VERY HIGH HUMAN DEVELOPMENT <NA>
## 4 1 Norway 0.94942283449106446
## 5 2 Australia 0.93867953564660933
## 6 2 Switzerland 0.93913086905938037
## Life.expectancy.at.birth X5 Expected.years.of.schooling X7
## 1 (years) <NA> (years) <NA>
## 2 2015 <NA> 2015 a
## 3 <NA> <NA> <NA> <NA>
## 4 81.710999999999999 <NA> 17.671869999999998 <NA>
## 5 82.537000000000006 <NA> 20.43272 b
## 6 83.132999999999996 <NA> 16.040410000000001 <NA>
## Mean.years.of.schooling X9 Gross.national.income.(GNI).per.capita X11
## 1 (years) <NA> (2011 PPP $) <NA>
## 2 2015 a 2015 <NA>
## 3 <NA> <NA> <NA> <NA>
## 4 12.746420000000001 <NA> 67614.353480000005 <NA>
## 5 13.1751 <NA> 42822.19627 <NA>
## 6 13.37 <NA> 56363.957799999996 <NA>
## GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 1 NA <NA> <NA>
## 2 2015 2014 2014
## 3 NA <NA> <NA>
## 4 5 1 NO
## 5 19 3 NO
## 6 7 2 NO
Elimino las filas innecesarias:
dataIDH=dataIDH[-c(1:3),]
head(dataIDH)
## X1 X2 Human.Development.Index.(HDI) Life.expectancy.at.birth
## 4 1 Norway 0.94942283449106446 81.710999999999999
## 5 2 Australia 0.93867953564660933 82.537000000000006
## 6 2 Switzerland 0.93913086905938037 83.132999999999996
## 7 4 Germany 0.9256689410716622 81.091999999999999
## 8 5 Denmark 0.92464936963656996 80.412000000000006
## 9 5 Singapore 0.9248659328300326 83.209000000000003
## X5 Expected.years.of.schooling X7 Mean.years.of.schooling X9
## 4 <NA> 17.671869999999998 <NA> 12.746420000000001 <NA>
## 5 <NA> 20.43272 b 13.1751 <NA>
## 6 <NA> 16.040410000000001 <NA> 13.37 <NA>
## 7 <NA> 17.095939999999999 <NA> 13.18762553 c
## 8 <NA> 19.188800000000001 b 12.70017 <NA>
## 9 <NA> 15.4 d 11.57241 <NA>
## Gross.national.income.(GNI).per.capita X11
## 4 67614.353480000005 <NA>
## 5 42822.19627 <NA>
## 6 56363.957799999996 <NA>
## 7 44999.647140000001 <NA>
## 8 44518.924019999999 <NA>
## 9 78162.323869999993 e
## GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 4 5 1 NO
## 5 19 3 NO
## 6 7 2 NO
## 7 13 4 NO
## 8 13 6 NO
## 9 -3 4 NO
Elimino columnas innecesarias:
names(dataIDH)
## [1] "X1"
## [2] "X2"
## [3] "Human.Development.Index.(HDI)"
## [4] "Life.expectancy.at.birth"
## [5] "X5"
## [6] "Expected.years.of.schooling"
## [7] "X7"
## [8] "Mean.years.of.schooling"
## [9] "X9"
## [10] "Gross.national.income.(GNI).per.capita"
## [11] "X11"
## [12] "GNI.per.capita.rank.minus.HDI.rank"
## [13] "HDI.rank"
## [14] "America.Latina"
dataIDH=dataIDH[-c(1,5,7,9,11:13)]
head(dataIDH)
## X2 Human.Development.Index.(HDI) Life.expectancy.at.birth
## 4 Norway 0.94942283449106446 81.710999999999999
## 5 Australia 0.93867953564660933 82.537000000000006
## 6 Switzerland 0.93913086905938037 83.132999999999996
## 7 Germany 0.9256689410716622 81.091999999999999
## 8 Denmark 0.92464936963656996 80.412000000000006
## 9 Singapore 0.9248659328300326 83.209000000000003
## Expected.years.of.schooling Mean.years.of.schooling
## 4 17.671869999999998 12.746420000000001
## 5 20.43272 13.1751
## 6 16.040410000000001 13.37
## 7 17.095939999999999 13.18762553
## 8 19.188800000000001 12.70017
## 9 15.4 11.57241
## Gross.national.income.(GNI).per.capita America.Latina
## 4 67614.353480000005 NO
## 5 42822.19627 NO
## 6 56363.957799999996 NO
## 7 44999.647140000001 NO
## 8 44518.924019999999 NO
## 9 78162.323869999993 NO
Cambio nombre de variables:
names(dataIDH)=c("pais","IDH","esperanza","escuela1","escuela2","pbi","alatina")
head(dataIDH)
## pais IDH esperanza escuela1
## 4 Norway 0.94942283449106446 81.710999999999999 17.671869999999998
## 5 Australia 0.93867953564660933 82.537000000000006 20.43272
## 6 Switzerland 0.93913086905938037 83.132999999999996 16.040410000000001
## 7 Germany 0.9256689410716622 81.091999999999999 17.095939999999999
## 8 Denmark 0.92464936963656996 80.412000000000006 19.188800000000001
## 9 Singapore 0.9248659328300326 83.209000000000003 15.4
## escuela2 pbi alatina
## 4 12.746420000000001 67614.353480000005 NO
## 5 13.1751 42822.19627 NO
## 6 13.37 56363.957799999996 NO
## 7 13.18762553 44999.647140000001 NO
## 8 12.70017 44518.924019999999 NO
## 9 11.57241 78162.323869999993 NO
Reseteo Ãndice:
row.names(dataIDH)=NULL
head(dataIDH)
## pais IDH esperanza escuela1
## 1 Norway 0.94942283449106446 81.710999999999999 17.671869999999998
## 2 Australia 0.93867953564660933 82.537000000000006 20.43272
## 3 Switzerland 0.93913086905938037 83.132999999999996 16.040410000000001
## 4 Germany 0.9256689410716622 81.091999999999999 17.095939999999999
## 5 Denmark 0.92464936963656996 80.412000000000006 19.188800000000001
## 6 Singapore 0.9248659328300326 83.209000000000003 15.4
## escuela2 pbi alatina
## 1 12.746420000000001 67614.353480000005 NO
## 2 13.1751 42822.19627 NO
## 3 13.37 56363.957799999996 NO
## 4 13.18762553 44999.647140000001 NO
## 5 12.70017 44518.924019999999 NO
## 6 11.57241 78162.323869999993 NO
Veo estructura y formateo:
str(dataIDH)
## 'data.frame': 191 obs. of 7 variables:
## $ pais : chr "Norway" "Australia" "Switzerland" "Germany" ...
## $ IDH : chr "0.94942283449106446" "0.93867953564660933" "0.93913086905938037" "0.9256689410716622" ...
## $ esperanza: chr "81.710999999999999" "82.537000000000006" "83.132999999999996" "81.091999999999999" ...
## $ escuela1 : chr "17.671869999999998" "20.43272" "16.040410000000001" "17.095939999999999" ...
## $ escuela2 : chr "12.746420000000001" "13.1751" "13.37" "13.18762553" ...
## $ pbi : chr "67614.353480000005" "42822.19627" "56363.957799999996" "44999.647140000001" ...
## $ alatina : chr "NO" "NO" "NO" "NO" ...
Las columnas 2 a 6 deben ser numéricas:
dataIDH[c(2:6)]=lapply(dataIDH[c(2:6)],as.numeric)
La última columna es factor:
dataIDH$alatina=factor(dataIDH$alatina,labels = c("NO","SI"))
Saco estructura:
str(dataIDH)
## 'data.frame': 191 obs. of 7 variables:
## $ pais : chr "Norway" "Australia" "Switzerland" "Germany" ...
## $ IDH : num 0.949 0.939 0.939 0.926 0.925 ...
## $ esperanza: num 81.7 82.5 83.1 81.1 80.4 ...
## $ escuela1 : num 17.7 20.4 16 17.1 19.2 ...
## $ escuela2 : num 12.7 13.2 13.4 13.2 12.7 ...
## $ pbi : num 67614 42822 56364 45000 44519 ...
## $ alatina : Factor w/ 2 levels "NO","SI": 1 1 1 1 1 1 1 1 1 1 ...
Eliminamos filas innecesarias con NAs:
dataIDH=dataIDH[complete.cases(dataIDH),]
Verifico casos perdidos:
summary(dataIDH)
## pais IDH esperanza escuela1
## Length:188 Min. :0.3524 Min. :48.94 Min. : 4.872
## Class :character 1st Qu.:0.5775 1st Qu.:65.68 1st Qu.:10.886
## Mode :character Median :0.7287 Median :73.41 Median :13.140
## Mean :0.6988 Mean :71.35 Mean :12.983
## 3rd Qu.:0.8114 3rd Qu.:76.98 3rd Qu.:14.989
## Max. :0.9494 Max. :84.16 Max. :20.433
## escuela2 pbi alatina
## Min. : 1.442 Min. : 587.5 NO:166
## 1st Qu.: 6.082 1st Qu.: 3511.4 SI: 22
## Median : 8.656 Median : 10416.0
## Mean : 8.372 Mean : 17313.9
## 3rd Qu.:11.086 3rd Qu.: 23517.3
## Max. :13.370 Max. :129915.6
En caso hubiera NAs, en este punto debiéramos imputar
Creamos las versiones ordinales de las variables:
dataIDH$pbiOrd=dataIDH$esc2Ord=dataIDH$esc1Ord=dataIDH$espOrd=dataIDH$idhOrd=NA
niveles=3
etiquetas1=c('bajo','medio','alto')
dataIDH[,c(8:12)]=lapply(dataIDH[,c(2:6)],cut,
breaks=niveles,
labels=etiquetas1,
ordered_result=T)
¡Listo!
[Cuando no sabes el número de clusters que quieres, tienes que usar jerárquico]
data_sub=dataIDH[c(1,3:6)]
head(data_sub)
## pais esperanza escuela1 escuela2 pbi
## 1 Norway 81.711 17.67187 12.74642 67614.35
## 2 Australia 82.537 20.43272 13.17510 42822.20
## 3 Switzerland 83.133 16.04041 13.37000 56363.96
## 4 Germany 81.092 17.09594 13.18763 44999.65
## 5 Denmark 80.412 19.18880 12.70017 44518.92
## 6 Singapore 83.209 15.40000 11.57241 78162.32
Cambio el Ãndice y elimino la variable pais
row.names(data_sub)=data_sub$pais
data_sub=data_sub[,-c(1)]
head(data_sub)
## esperanza escuela1 escuela2 pbi
## Norway 81.711 17.67187 12.74642 67614.35
## Australia 82.537 20.43272 13.17510 42822.20
## Switzerland 83.133 16.04041 13.37000 56363.96
## Germany 81.092 17.09594 13.18763 44999.65
## Denmark 80.412 19.18880 12.70017 44518.92
## Singapore 83.209 15.40000 11.57241 78162.32
data_sub.scaled=scale(data_sub)
head(data_sub.scaled)
## esperanza escuela1 escuela2 pbi
## Norway 1.248465 1.6184372 1.412492 2.637771
## Australia 1.348023 2.5713859 1.550909 1.337664
## Switzerland 1.419860 1.0553143 1.613841 2.047798
## Germany 1.173856 1.4196463 1.554954 1.451850
## Denmark 1.091894 2.1420283 1.397558 1.426641
## Singapore 1.429021 0.8342672 1.033414 3.190910
library(NbClust)
nb=NbClust(data_sub.scaled,method="complete")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
library(factoextra)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
algoritmo="hclust"
cuantosClusters=2
solucionJerarquica1=eclust(data_sub.scaled,
FUNcluster = algoritmo,
k= cuantosClusters,
method="complete",
graph=FALSE)
idhhc=as.data.frame(solucionJerarquica1$cluster)
colnames(idhhc)=c("hc")
dataIDH=merge(dataIDH,idhhc,by.x="pais",by.y=0)
head(dataIDH)
## pais IDH esperanza escuela1 escuela2 pbi
## 1 Afghanistan 0.4793750 60.704 10.05375 3.550000 1870.827
## 2 Albania 0.7641737 77.968 14.17968 9.646810 10252.470
## 3 Algeria 0.7448056 75.027 14.35778 7.785367 13532.961
## 4 Andorra 0.8584517 81.458 13.52402 10.255080 47979.454
## 5 Angola 0.5334641 52.698 11.39000 4.979858 6290.896
## 6 Antigua and Barbuda 0.7855890 76.236 13.94890 9.206913 20907.217
## alatina idhOrd espOrd esc1Ord esc2Ord pbiOrd hc
## 1 NO bajo medio bajo bajo bajo 2
## 2 NO alto alto medio alto bajo 1
## 3 NO medio alto medio medio bajo 1
## 4 NO alto alto medio alto medio 1
## 5 NO bajo bajo medio bajo bajo 2
## 6 NO alto alto medio medio bajo 1
table(dataIDH$hc)
##
## 1 2
## 115 73
fviz_silhouette(solucionJerarquica1)
## cluster size ave.sil.width
## 1 1 115 0.44
## 2 2 73 0.50
dataIDH[dataIDH$pais=="Peru",]$hc
## [1] 1
dataIDH[dataIDH$pais=="Chile",]$hc
## [1] 1
t.test(dataIDH$pbi,dataIDH$hc) #primero la num luego la cat
##
## Welch Two Sample t-test
##
## data: dataIDH$pbi and dataIDH$hc
## t = 12.448, df = 187, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 14568.86 20056.10
## sample estimates:
## mean of x mean of y
## 17313.866115 1.388298
cordata=cor(data_sub)
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
KMO(cordata) #se lee el overall MSA
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cordata)
## Overall MSA = 0.82
## MSA for each item =
## esperanza escuela1 escuela2 pbi
## 0.84 0.77 0.81 0.92
cortest.bartlett(cordata,n=nrow(dataIDH)) #en n ponemos el total de casos (filas) de la data original.
## $chisq
## [1] 522.2189
##
## $p.value
## [1] 1.372589e-109
##
## $df
## [1] 6
Tu p-value es menor a 0.05
eigenf=eigen(cordata)
eigenf$values
## [1] 3.1090096 0.4823381 0.2513709 0.1572813
¡Una dimensión!
resultadoPr=principal(cordata,1,rotate = "varimax",scores=T)
print(resultadoPr,digits=3,cut=0.40)
## Principal Components Analysis
## Call: principal(r = cordata, nfactors = 1, rotate = "varimax", scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 h2 u2 com
## esperanza 0.905 0.819 0.181 1
## escuela1 0.924 0.854 0.146 1
## escuela2 0.905 0.819 0.181 1
## pbi 0.786 0.617 0.383 1
##
## PC1
## SS loadings 3.109
## Proportion Var 0.777
##
## Mean item complexity = 1
## Test of the hypothesis that 1 component is sufficient.
##
## The root mean square of the residuals (RMSR) is 0.085
##
## Fit based upon off diagonal values = 0.985
Puntaje en cada dimensión:
regresFactors=factor.scores(data_sub,resultadoPr)$scores
head(regresFactors)
## PC1
## Norway 1.922142
## Australia 1.946096
## Switzerland 1.714175
## Germany 1.583111
## Denmark 1.721778
## Singapore 1.771082
Juntamos dimensiones con data original:
dataIDH=merge(dataIDH,regresFactors,by.x="pais",by.y=0)
head(dataIDH)
## pais IDH esperanza escuela1 escuela2 pbi
## 1 Afghanistan 0.4793750 60.704 10.05375 3.550000 1870.827
## 2 Albania 0.7641737 77.968 14.17968 9.646810 10252.470
## 3 Algeria 0.7448056 75.027 14.35778 7.785367 13532.961
## 4 Andorra 0.8584517 81.458 13.52402 10.255080 47979.454
## 5 Angola 0.5334641 52.698 11.39000 4.979858 6290.896
## 6 Antigua and Barbuda 0.7855890 76.236 13.94890 9.206913 20907.217
## alatina idhOrd espOrd esc1Ord esc2Ord pbiOrd hc PC1
## 1 NO bajo medio bajo bajo bajo 2 -1.3319439
## 2 NO alto alto medio alto bajo 1 0.3810940
## 3 NO medio alto medio medio bajo 1 0.1647170
## 4 NO alto alto medio alto medio 1 0.9934047
## 5 NO bajo bajo medio bajo bajo 2 -1.2828055
## 6 NO alto alto medio medio bajo 1 0.3965070
Comparo Perú con el puntaje promedio:
mean(dataIDH$PC1)
## [1] -1.037224e-16
dataIDH[dataIDH$pais=="Peru",]$PC1
## [1] 0.1433386
Comparo puntaje de Norway y Australia en ambas dimensiones (IDH y PC1)
dataIDH[dataIDH$pais=="Australia",]$IDH > dataIDH[dataIDH$pais=="Norway",]$IDH
## [1] FALSE
dataIDH[dataIDH$pais=="Australia",]$PC1 > dataIDH[dataIDH$pais=="Norway",]$PC1
## [1] TRUE
Calculamos las distancias:
datasub_d=dist(data_sub.scaled)
Algoritmo:
datasub_r=cmdscale(datasub_d,eig=TRUE,k=2)
datasub_r$GOF
## [1] 0.8978369 0.8978369
Vemos el mapa:
titulo="Mapa de Similitudes entre paÃses basado en el IDH"
x=datasub_r$points[,1]
y=datasub_r$points[,2]
plot(x, y, main=titulo)
Pongamos las etiquetas de los nombres de los paÃses para poder interpretar:
plot(x, y, xlab="Dimension 1", ylab="Dimension 2", main=titulo, type="n")
columnForLabels=dimnames(datasub_r[[1]])[[1]]
text(x, y, labels = columnForLabels, cex=0.5)