REPASO PARCIAL - 09 de mayo

Abro data:

folder="data"
fileName="IDHmundo2015.xlsx"
fileToRead=file.path(folder,fileName)
library(openxlsx)
dataIDH=openxlsx::read.xlsx(fileToRead,
                            sheet=1,
                            startRow=3, #ese 3 se refiere a la fila 3 del Excel original [es decir, cuenta las filas sea que estén vacías o no; es bueno tener esto porque así te quedas con el encabezado]
                            skipEmptyRows=T,skipEmptyCols=T)

Vemos la cola:

tail(dataIDH,10)
##       X1
## 193  187
## 194  188
## 195 <NA>
## 196 <NA>
## 197 <NA>
## 198 <NA>
## 199 <NA>
## 200 <NA>
## 201 <NA>
## 202 <NA>
##                                                                                      X2
## 193                                                                               Niger
## 194                                                            Central African Republic
## 195                                                                               Notes
## 196                            a. Data refer to 2015 or the most recent year available.
## 197 b. In calculating the HDI value, expected years of schooling is capped at 18 years.
## 198                            c. Updated by HDRO using Barro and Lee (2016) estimates.
## 199                              d. Based on data from the national statistical office.
## 200               e. In calculating the HDI value, GNI per capita is capped at $75,000.
## 201                                                   f. Based on Barro and Lee (2016).
## 202                                                        g. Value from UNDESA (2011).
##     Human.Development.Index.(HDI) Life.expectancy.at.birth   X5
## 193           0.35264066960226614                   61.936 <NA>
## 194           0.35244020015467942       51.457999999999998 <NA>
## 195                          <NA>                     <NA> <NA>
## 196                          <NA>                     <NA> <NA>
## 197                          <NA>                     <NA> <NA>
## 198                          <NA>                     <NA> <NA>
## 199                          <NA>                     <NA> <NA>
## 200                          <NA>                     <NA> <NA>
## 201                          <NA>                     <NA> <NA>
## 202                          <NA>                     <NA> <NA>
##     Expected.years.of.schooling   X7 Mean.years.of.schooling   X9
## 193                     5.42265 <NA>      1.6579999999999999    f
## 194          7.0989800000000001 <NA>      4.2300000000000004    n
## 195                        <NA> <NA>                    <NA> <NA>
## 196                        <NA> <NA>                    <NA> <NA>
## 197                        <NA> <NA>                    <NA> <NA>
## 198                        <NA> <NA>                    <NA> <NA>
## 199                        <NA> <NA>                    <NA> <NA>
## 200                        <NA> <NA>                    <NA> <NA>
## 201                        <NA> <NA>                    <NA> <NA>
## 202                        <NA> <NA>                    <NA> <NA>
##     Gross.national.income.(GNI).per.capita  X11
## 193                     889.45564579999996 <NA>
## 194                     587.47396070000002 <NA>
## 195                                   <NA> <NA>
## 196                                   <NA> <NA>
## 197                                   <NA> <NA>
## 198                                   <NA> <NA>
## 199                                   <NA> <NA>
## 200                                   <NA> <NA>
## 201                                   <NA> <NA>
## 202                                   <NA> <NA>
##     GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 193                                  1      187             NO
## 194                                  4      188             NO
## 195                                 NA     <NA>           <NA>
## 196                                 NA     <NA>           <NA>
## 197                                 NA     <NA>           <NA>
## 198                                 NA     <NA>           <NA>
## 199                                 NA     <NA>           <NA>
## 200                                 NA     <NA>           <NA>
## 201                                 NA     <NA>           <NA>
## 202                                 NA     <NA>           <NA>

Elimino filas de abajo:

dataIDH=dataIDH[-c(195:202),]
tail(dataIDH)
##      X1                       X2 Human.Development.Index.(HDI)
## 189 183                   Guinea           0.41418630622159636
## 190 184                  Burundi           0.40376541480557104
## 191 185             Burkina Faso           0.40174010414463812
## 192 186                     Chad           0.39607322446853288
## 193 187                    Niger           0.35264066960226614
## 194 188 Central African Republic           0.35244020015467942
##     Life.expectancy.at.birth   X5 Expected.years.of.schooling   X7
## 189       59.215000000000003 <NA>          8.8152699999999999 <NA>
## 190       57.118000000000002 <NA>                    10.64743 <NA>
## 191       59.006999999999998 <NA>          7.6928099999999997 <NA>
## 192       51.896999999999998 <NA>          7.3002599999999997 <NA>
## 193                   61.936 <NA>                     5.42265 <NA>
## 194       51.457999999999998 <NA>          7.0989800000000001 <NA>
##     Mean.years.of.schooling X9 Gross.national.income.(GNI).per.capita  X11
## 189      2.5684814920000001  q                     1058.2198760000001 <NA>
## 190      2.9688892610000002  c                            691.1825738 <NA>
## 191             1.441531766  q                            1537.499309 <NA>
## 192      2.3233277380000001  n                     1990.8714050000001 <NA>
## 193      1.6579999999999999  f                     889.45564579999996 <NA>
## 194      4.2300000000000004  n                     587.47396070000002 <NA>
##     GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 189                                  4      182             NO
## 190                                  5      184             NO
## 191                                 -8      185             NO
## 192                                -19      186             NO
## 193                                  1      187             NO
## 194                                  4      188             NO

Veo la cabeza:

head(dataIDH)
##         X1                          X2 Human.Development.Index.(HDI)
## 1 HDI rank                     Country                         Value
## 2     <NA>                        <NA>                          2015
## 3     <NA> VERY HIGH HUMAN DEVELOPMENT                          <NA>
## 4        1                      Norway           0.94942283449106446
## 5        2                   Australia           0.93867953564660933
## 6        2                 Switzerland           0.93913086905938037
##   Life.expectancy.at.birth   X5 Expected.years.of.schooling   X7
## 1                  (years) <NA>                     (years) <NA>
## 2                     2015 <NA>                        2015    a
## 3                     <NA> <NA>                        <NA> <NA>
## 4       81.710999999999999 <NA>          17.671869999999998 <NA>
## 5       82.537000000000006 <NA>                    20.43272    b
## 6       83.132999999999996 <NA>          16.040410000000001 <NA>
##   Mean.years.of.schooling   X9 Gross.national.income.(GNI).per.capita  X11
## 1                 (years) <NA>                           (2011 PPP $) <NA>
## 2                    2015    a                                   2015 <NA>
## 3                    <NA> <NA>                                   <NA> <NA>
## 4      12.746420000000001 <NA>                     67614.353480000005 <NA>
## 5                 13.1751 <NA>                            42822.19627 <NA>
## 6                   13.37 <NA>                     56363.957799999996 <NA>
##   GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 1                                 NA     <NA>           <NA>
## 2                               2015     2014           2014
## 3                                 NA     <NA>           <NA>
## 4                                  5        1             NO
## 5                                 19        3             NO
## 6                                  7        2             NO

Elimino las filas innecesarias:

dataIDH=dataIDH[-c(1:3),]
head(dataIDH)
##   X1          X2 Human.Development.Index.(HDI) Life.expectancy.at.birth
## 4  1      Norway           0.94942283449106446       81.710999999999999
## 5  2   Australia           0.93867953564660933       82.537000000000006
## 6  2 Switzerland           0.93913086905938037       83.132999999999996
## 7  4     Germany            0.9256689410716622       81.091999999999999
## 8  5     Denmark           0.92464936963656996       80.412000000000006
## 9  5   Singapore            0.9248659328300326       83.209000000000003
##     X5 Expected.years.of.schooling   X7 Mean.years.of.schooling   X9
## 4 <NA>          17.671869999999998 <NA>      12.746420000000001 <NA>
## 5 <NA>                    20.43272    b                 13.1751 <NA>
## 6 <NA>          16.040410000000001 <NA>                   13.37 <NA>
## 7 <NA>          17.095939999999999 <NA>             13.18762553    c
## 8 <NA>          19.188800000000001    b                12.70017 <NA>
## 9 <NA>                        15.4    d                11.57241 <NA>
##   Gross.national.income.(GNI).per.capita  X11
## 4                     67614.353480000005 <NA>
## 5                            42822.19627 <NA>
## 6                     56363.957799999996 <NA>
## 7                     44999.647140000001 <NA>
## 8                     44518.924019999999 <NA>
## 9                     78162.323869999993    e
##   GNI.per.capita.rank.minus.HDI.rank HDI.rank America.Latina
## 4                                  5        1             NO
## 5                                 19        3             NO
## 6                                  7        2             NO
## 7                                 13        4             NO
## 8                                 13        6             NO
## 9                                 -3        4             NO

Elimino columnas innecesarias:

names(dataIDH)
##  [1] "X1"                                    
##  [2] "X2"                                    
##  [3] "Human.Development.Index.(HDI)"         
##  [4] "Life.expectancy.at.birth"              
##  [5] "X5"                                    
##  [6] "Expected.years.of.schooling"           
##  [7] "X7"                                    
##  [8] "Mean.years.of.schooling"               
##  [9] "X9"                                    
## [10] "Gross.national.income.(GNI).per.capita"
## [11] "X11"                                   
## [12] "GNI.per.capita.rank.minus.HDI.rank"    
## [13] "HDI.rank"                              
## [14] "America.Latina"
dataIDH=dataIDH[-c(1,5,7,9,11:13)]
head(dataIDH)
##            X2 Human.Development.Index.(HDI) Life.expectancy.at.birth
## 4      Norway           0.94942283449106446       81.710999999999999
## 5   Australia           0.93867953564660933       82.537000000000006
## 6 Switzerland           0.93913086905938037       83.132999999999996
## 7     Germany            0.9256689410716622       81.091999999999999
## 8     Denmark           0.92464936963656996       80.412000000000006
## 9   Singapore            0.9248659328300326       83.209000000000003
##   Expected.years.of.schooling Mean.years.of.schooling
## 4          17.671869999999998      12.746420000000001
## 5                    20.43272                 13.1751
## 6          16.040410000000001                   13.37
## 7          17.095939999999999             13.18762553
## 8          19.188800000000001                12.70017
## 9                        15.4                11.57241
##   Gross.national.income.(GNI).per.capita America.Latina
## 4                     67614.353480000005             NO
## 5                            42822.19627             NO
## 6                     56363.957799999996             NO
## 7                     44999.647140000001             NO
## 8                     44518.924019999999             NO
## 9                     78162.323869999993             NO

Cambio nombre de variables:

names(dataIDH)=c("pais","IDH","esperanza","escuela1","escuela2","pbi","alatina")
head(dataIDH)
##          pais                 IDH          esperanza           escuela1
## 4      Norway 0.94942283449106446 81.710999999999999 17.671869999999998
## 5   Australia 0.93867953564660933 82.537000000000006           20.43272
## 6 Switzerland 0.93913086905938037 83.132999999999996 16.040410000000001
## 7     Germany  0.9256689410716622 81.091999999999999 17.095939999999999
## 8     Denmark 0.92464936963656996 80.412000000000006 19.188800000000001
## 9   Singapore  0.9248659328300326 83.209000000000003               15.4
##             escuela2                pbi alatina
## 4 12.746420000000001 67614.353480000005      NO
## 5            13.1751        42822.19627      NO
## 6              13.37 56363.957799999996      NO
## 7        13.18762553 44999.647140000001      NO
## 8           12.70017 44518.924019999999      NO
## 9           11.57241 78162.323869999993      NO

Reseteo índice:

row.names(dataIDH)=NULL
head(dataIDH)
##          pais                 IDH          esperanza           escuela1
## 1      Norway 0.94942283449106446 81.710999999999999 17.671869999999998
## 2   Australia 0.93867953564660933 82.537000000000006           20.43272
## 3 Switzerland 0.93913086905938037 83.132999999999996 16.040410000000001
## 4     Germany  0.9256689410716622 81.091999999999999 17.095939999999999
## 5     Denmark 0.92464936963656996 80.412000000000006 19.188800000000001
## 6   Singapore  0.9248659328300326 83.209000000000003               15.4
##             escuela2                pbi alatina
## 1 12.746420000000001 67614.353480000005      NO
## 2            13.1751        42822.19627      NO
## 3              13.37 56363.957799999996      NO
## 4        13.18762553 44999.647140000001      NO
## 5           12.70017 44518.924019999999      NO
## 6           11.57241 78162.323869999993      NO

Veo estructura y formateo:

str(dataIDH)
## 'data.frame':    191 obs. of  7 variables:
##  $ pais     : chr  "Norway" "Australia" "Switzerland" "Germany" ...
##  $ IDH      : chr  "0.94942283449106446" "0.93867953564660933" "0.93913086905938037" "0.9256689410716622" ...
##  $ esperanza: chr  "81.710999999999999" "82.537000000000006" "83.132999999999996" "81.091999999999999" ...
##  $ escuela1 : chr  "17.671869999999998" "20.43272" "16.040410000000001" "17.095939999999999" ...
##  $ escuela2 : chr  "12.746420000000001" "13.1751" "13.37" "13.18762553" ...
##  $ pbi      : chr  "67614.353480000005" "42822.19627" "56363.957799999996" "44999.647140000001" ...
##  $ alatina  : chr  "NO" "NO" "NO" "NO" ...

Las columnas 2 a 6 deben ser numéricas:

dataIDH[c(2:6)]=lapply(dataIDH[c(2:6)],as.numeric)

La última columna es factor:

dataIDH$alatina=factor(dataIDH$alatina,labels = c("NO","SI"))

Saco estructura:

str(dataIDH)
## 'data.frame':    191 obs. of  7 variables:
##  $ pais     : chr  "Norway" "Australia" "Switzerland" "Germany" ...
##  $ IDH      : num  0.949 0.939 0.939 0.926 0.925 ...
##  $ esperanza: num  81.7 82.5 83.1 81.1 80.4 ...
##  $ escuela1 : num  17.7 20.4 16 17.1 19.2 ...
##  $ escuela2 : num  12.7 13.2 13.4 13.2 12.7 ...
##  $ pbi      : num  67614 42822 56364 45000 44519 ...
##  $ alatina  : Factor w/ 2 levels "NO","SI": 1 1 1 1 1 1 1 1 1 1 ...

Eliminamos filas innecesarias con NAs:

dataIDH=dataIDH[complete.cases(dataIDH),]

Verifico casos perdidos:

summary(dataIDH)
##      pais                IDH           esperanza        escuela1     
##  Length:188         Min.   :0.3524   Min.   :48.94   Min.   : 4.872  
##  Class :character   1st Qu.:0.5775   1st Qu.:65.68   1st Qu.:10.886  
##  Mode  :character   Median :0.7287   Median :73.41   Median :13.140  
##                     Mean   :0.6988   Mean   :71.35   Mean   :12.983  
##                     3rd Qu.:0.8114   3rd Qu.:76.98   3rd Qu.:14.989  
##                     Max.   :0.9494   Max.   :84.16   Max.   :20.433  
##     escuela2           pbi           alatina 
##  Min.   : 1.442   Min.   :   587.5   NO:166  
##  1st Qu.: 6.082   1st Qu.:  3511.4   SI: 22  
##  Median : 8.656   Median : 10416.0           
##  Mean   : 8.372   Mean   : 17313.9           
##  3rd Qu.:11.086   3rd Qu.: 23517.3           
##  Max.   :13.370   Max.   :129915.6

En caso hubiera NAs, en este punto debiéramos imputar

Creamos las versiones ordinales de las variables:

dataIDH$pbiOrd=dataIDH$esc2Ord=dataIDH$esc1Ord=dataIDH$espOrd=dataIDH$idhOrd=NA
niveles=3
etiquetas1=c('bajo','medio','alto')
dataIDH[,c(8:12)]=lapply(dataIDH[,c(2:6)],cut,
                         breaks=niveles,
                         labels=etiquetas1,
                         ordered_result=T)

¡Listo!


EJERCICIOS

Clusters

[Cuando no sabes el número de clusters que quieres, tienes que usar jerárquico]

  1. Subconjunto:
data_sub=dataIDH[c(1,3:6)]
head(data_sub)
##          pais esperanza escuela1 escuela2      pbi
## 1      Norway    81.711 17.67187 12.74642 67614.35
## 2   Australia    82.537 20.43272 13.17510 42822.20
## 3 Switzerland    83.133 16.04041 13.37000 56363.96
## 4     Germany    81.092 17.09594 13.18763 44999.65
## 5     Denmark    80.412 19.18880 12.70017 44518.92
## 6   Singapore    83.209 15.40000 11.57241 78162.32

Cambio el índice y elimino la variable pais

row.names(data_sub)=data_sub$pais
data_sub=data_sub[,-c(1)]
head(data_sub)
##             esperanza escuela1 escuela2      pbi
## Norway         81.711 17.67187 12.74642 67614.35
## Australia      82.537 20.43272 13.17510 42822.20
## Switzerland    83.133 16.04041 13.37000 56363.96
## Germany        81.092 17.09594 13.18763 44999.65
## Denmark        80.412 19.18880 12.70017 44518.92
## Singapore      83.209 15.40000 11.57241 78162.32
  1. Estandarizar:
data_sub.scaled=scale(data_sub)
head(data_sub.scaled)
##             esperanza  escuela1 escuela2      pbi
## Norway       1.248465 1.6184372 1.412492 2.637771
## Australia    1.348023 2.5713859 1.550909 1.337664
## Switzerland  1.419860 1.0553143 1.613841 2.047798
## Germany      1.173856 1.4196463 1.554954 1.451850
## Denmark      1.091894 2.1420283 1.397558 1.426641
## Singapore    1.429021 0.8342672 1.033414 3.190910
  1. Identifico el número óptimo de clusters:
library(NbClust)
nb=NbClust(data_sub.scaled,method="complete")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 7 proposed 2 as the best number of clusters 
## * 7 proposed 3 as the best number of clusters 
## * 4 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
  1. Aplico el algoritmo, sabiendo que tengo que pedir 2 clusters:
library(factoextra)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
algoritmo="hclust"
cuantosClusters=2
solucionJerarquica1=eclust(data_sub.scaled,
                           FUNcluster = algoritmo,
                           k= cuantosClusters,
                           method="complete",
                           graph=FALSE)
  1. Guardamos el resultado de la aglomeración
idhhc=as.data.frame(solucionJerarquica1$cluster)
colnames(idhhc)=c("hc")
dataIDH=merge(dataIDH,idhhc,by.x="pais",by.y=0)
head(dataIDH)
##                  pais       IDH esperanza escuela1  escuela2       pbi
## 1         Afghanistan 0.4793750    60.704 10.05375  3.550000  1870.827
## 2             Albania 0.7641737    77.968 14.17968  9.646810 10252.470
## 3             Algeria 0.7448056    75.027 14.35778  7.785367 13532.961
## 4             Andorra 0.8584517    81.458 13.52402 10.255080 47979.454
## 5              Angola 0.5334641    52.698 11.39000  4.979858  6290.896
## 6 Antigua and Barbuda 0.7855890    76.236 13.94890  9.206913 20907.217
##   alatina idhOrd espOrd esc1Ord esc2Ord pbiOrd hc
## 1      NO   bajo  medio    bajo    bajo   bajo  2
## 2      NO   alto   alto   medio    alto   bajo  1
## 3      NO  medio   alto   medio   medio   bajo  1
## 4      NO   alto   alto   medio    alto  medio  1
## 5      NO   bajo   bajo   medio    bajo   bajo  2
## 6      NO   alto   alto   medio   medio   bajo  1
  1. Veo cuantos países hay en cada cluster:
table(dataIDH$hc)
## 
##   1   2 
## 115  73
  1. Veamos qué tan buena ha sido la aglomeración
fviz_silhouette(solucionJerarquica1)
##   cluster size ave.sil.width
## 1       1  115          0.44
## 2       2   73          0.50

  1. Veamos a qué grupo pertenecen Chile y Perú
dataIDH[dataIDH$pais=="Peru",]$hc
## [1] 1
dataIDH[dataIDH$pais=="Chile",]$hc
## [1] 1
  1. Veamos si hay diferencias estadísticamente significativas en PBI. Como tenemos dos grupos hacemos t-test:
t.test(dataIDH$pbi,dataIDH$hc) #primero la num luego la cat
## 
##  Welch Two Sample t-test
## 
## data:  dataIDH$pbi and dataIDH$hc
## t = 12.448, df = 187, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  14568.86 20056.10
## sample estimates:
##    mean of x    mean of y 
## 17313.866115     1.388298

PCA

  1. Matriz de correlación
cordata=cor(data_sub)
  1. KMO
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
KMO(cordata) #se lee el overall MSA
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cordata)
## Overall MSA =  0.82
## MSA for each item = 
## esperanza  escuela1  escuela2       pbi 
##      0.84      0.77      0.81      0.92
  1. Bartlett
cortest.bartlett(cordata,n=nrow(dataIDH)) #en n ponemos el total de casos (filas) de la data original.
## $chisq
## [1] 522.2189
## 
## $p.value
## [1] 1.372589e-109
## 
## $df
## [1] 6

Tu p-value es menor a 0.05

  1. Número de factores a extraer
eigenf=eigen(cordata)
eigenf$values
## [1] 3.1090096 0.4823381 0.2513709 0.1572813

¡Una dimensión!

  1. Algoritmo
resultadoPr=principal(cordata,1,rotate = "varimax",scores=T)
print(resultadoPr,digits=3,cut=0.40)
## Principal Components Analysis
## Call: principal(r = cordata, nfactors = 1, rotate = "varimax", scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1    h2    u2 com
## esperanza 0.905 0.819 0.181   1
## escuela1  0.924 0.854 0.146   1
## escuela2  0.905 0.819 0.181   1
## pbi       0.786 0.617 0.383   1
## 
##                  PC1
## SS loadings    3.109
## Proportion Var 0.777
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.085 
## 
## Fit based upon off diagonal values = 0.985

Puntaje en cada dimensión:

regresFactors=factor.scores(data_sub,resultadoPr)$scores
head(regresFactors)
##                  PC1
## Norway      1.922142
## Australia   1.946096
## Switzerland 1.714175
## Germany     1.583111
## Denmark     1.721778
## Singapore   1.771082

Juntamos dimensiones con data original:

dataIDH=merge(dataIDH,regresFactors,by.x="pais",by.y=0)
head(dataIDH)
##                  pais       IDH esperanza escuela1  escuela2       pbi
## 1         Afghanistan 0.4793750    60.704 10.05375  3.550000  1870.827
## 2             Albania 0.7641737    77.968 14.17968  9.646810 10252.470
## 3             Algeria 0.7448056    75.027 14.35778  7.785367 13532.961
## 4             Andorra 0.8584517    81.458 13.52402 10.255080 47979.454
## 5              Angola 0.5334641    52.698 11.39000  4.979858  6290.896
## 6 Antigua and Barbuda 0.7855890    76.236 13.94890  9.206913 20907.217
##   alatina idhOrd espOrd esc1Ord esc2Ord pbiOrd hc        PC1
## 1      NO   bajo  medio    bajo    bajo   bajo  2 -1.3319439
## 2      NO   alto   alto   medio    alto   bajo  1  0.3810940
## 3      NO  medio   alto   medio   medio   bajo  1  0.1647170
## 4      NO   alto   alto   medio    alto  medio  1  0.9934047
## 5      NO   bajo   bajo   medio    bajo   bajo  2 -1.2828055
## 6      NO   alto   alto   medio   medio   bajo  1  0.3965070

Comparo Perú con el puntaje promedio:

mean(dataIDH$PC1)
## [1] -1.037224e-16
dataIDH[dataIDH$pais=="Peru",]$PC1
## [1] 0.1433386

Comparo puntaje de Norway y Australia en ambas dimensiones (IDH y PC1)

dataIDH[dataIDH$pais=="Australia",]$IDH > dataIDH[dataIDH$pais=="Norway",]$IDH
## [1] FALSE
dataIDH[dataIDH$pais=="Australia",]$PC1 > dataIDH[dataIDH$pais=="Norway",]$PC1
## [1] TRUE

Mapa de similitudes

Calculamos las distancias:

datasub_d=dist(data_sub.scaled)

Algoritmo:

datasub_r=cmdscale(datasub_d,eig=TRUE,k=2)
datasub_r$GOF
## [1] 0.8978369 0.8978369

Vemos el mapa:

titulo="Mapa de Similitudes entre países basado en el IDH"
x=datasub_r$points[,1]
y=datasub_r$points[,2]
plot(x, y, main=titulo)

Pongamos las etiquetas de los nombres de los países para poder interpretar:

plot(x, y, xlab="Dimension 1", ylab="Dimension 2", main=titulo, type="n")
columnForLabels=dimnames(datasub_r[[1]])[[1]]
text(x, y, labels = columnForLabels, cex=0.5)