Examen Parcial - B

PRIMERA PARTE

Cargo la data:

# Wikipedia article that hosts the World Happiness Report tables.
link <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
library(XML)
library(RCurl)

## Loading required package: bitops

# Download the page's HTML, then parse the HTML tables it contains into a
# list of data frames. Text columns stay character instead of becoming factors.
wikiLinkContents <- getURL(link)
wikiTables <- readHTMLTable(wikiLinkContents, stringsAsFactors = FALSE)

Veo qué tabla es la que necesito: es la quinta tabla (en las cuatro tablas anteriores no hay nada, al parecer):

# Take the fifth parsed table. Single-bracket indexing keeps it wrapped in a
# one-element list, so as.data.frame() prefixes every column name with that
# element's name (the "NULL." prefix visible in the printed output).
data <- as.data.frame(wikiTables[5])
head(data)

##   NULL.Overall.Rank NULL.Country NULL.Score NULL.GDP.per.capita
## 1                 1      Finland      7.632               1.305
## 2                 2       Norway      7.594               1.456
## 3                 3      Denmark      7.555               1.351
## 4                 4      Iceland      7.495               1.343
## 5                 5  Switzerland      7.487               1.420
## 6                 6  Netherlands      7.441               1.361
##   NULL.Social.support NULL.Healthy.life.expectancy
## 1               1.592                        0.874
## 2               1.582                        0.861
## 3               1.590                        0.868
## 4               1.644                        0.914
## 5               1.549                        0.927
## 6               1.488                        0.878
##   NULL.Freedom.to.make.life.choices NULL.Generosity
## 1                             0.681           0.192
## 2                             0.686           0.286
## 3                             0.683           0.284
## 4                             0.677           0.353
## 5                             0.660           0.256
## 6                             0.638           0.333
##   NULL.Perceptions.of.corruption
## 1                          0.393
## 2                          0.340
## 3                          0.408
## 4                          0.138
## 5                          0.357
## 6                          0.295

Saco estructura:

# Inspect the column types of the scraped table.
str(data)

## 'data.frame':    156 obs. of  9 variables:
##  $ NULL.Overall.Rank                : chr  "1" "2" "3" "4" ...
##  $ NULL.Country                     : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ NULL.Score                       : chr  "7.632" "7.594" "7.555" "7.495" ...
##  $ NULL.GDP.per.capita              : chr  "1.305" "1.456" "1.351" "1.343" ...
##  $ NULL.Social.support              : chr  "1.592" "1.582" "1.590" "1.644" ...
##  $ NULL.Healthy.life.expectancy     : chr  "0.874" "0.861" "0.868" "0.914" ...
##  $ NULL.Freedom.to.make.life.choices: chr  "0.681" "0.686" "0.683" "0.677" ...
##  $ NULL.Generosity                  : chr  "0.192" "0.286" "0.284" "0.353" ...
##  $ NULL.Perceptions.of.corruption   : chr  "0.393" "0.340" "0.408" "0.138" ...

Formateo a numéricas:

# Coerce the overall score and the six component columns (positions 3 to 9)
# to numeric. Cells that are not valid numbers become NA, hence the warning.
data[, c(3:9)] <- lapply(data[, c(3:9)], as.numeric)

## Warning in lapply(data[, c(3:9)], as.numeric): NAs introducidos por
## coerción

# Confirm that columns 3 to 9 are now numeric.
str(data)

## 'data.frame':    156 obs. of  9 variables:
##  $ NULL.Overall.Rank                : chr  "1" "2" "3" "4" ...
##  $ NULL.Country                     : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ NULL.Score                       : num  7.63 7.59 7.55 7.5 7.49 ...
##  $ NULL.GDP.per.capita              : num  1.3 1.46 1.35 1.34 1.42 ...
##  $ NULL.Social.support              : num  1.59 1.58 1.59 1.64 1.55 ...
##  $ NULL.Healthy.life.expectancy     : num  0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
##  $ NULL.Freedom.to.make.life.choices: num  0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
##  $ NULL.Generosity                  : num  0.192 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
##  $ NULL.Perceptions.of.corruption   : num  0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...

# Preview the first rows after the conversion.
head(data)

##   NULL.Overall.Rank NULL.Country NULL.Score NULL.GDP.per.capita
## 1                 1      Finland      7.632               1.305
## 2                 2       Norway      7.594               1.456
## 3                 3      Denmark      7.555               1.351
## 4                 4      Iceland      7.495               1.343
## 5                 5  Switzerland      7.487               1.420
## 6                 6  Netherlands      7.441               1.361
##   NULL.Social.support NULL.Healthy.life.expectancy
## 1               1.592                        0.874
## 2               1.582                        0.861
## 3               1.590                        0.868
## 4               1.644                        0.914
## 5               1.549                        0.927
## 6               1.488                        0.878
##   NULL.Freedom.to.make.life.choices NULL.Generosity
## 1                             0.681           0.192
## 2                             0.686           0.286
## 3                             0.683           0.284
## 4                             0.677           0.353
## 5                             0.660           0.256
## 6                             0.638           0.333
##   NULL.Perceptions.of.corruption
## 1                          0.393
## 2                          0.340
## 3                          0.408
## 4                          0.138
## 5                          0.357
## 6                          0.295

# Drop the rank column (position 1) and label each row with its country name.
data <- data[, -1]
rownames(data) <- data[["NULL.Country"]]
head(data)

##             NULL.Country NULL.Score NULL.GDP.per.capita
## Finland          Finland      7.632               1.305
## Norway            Norway      7.594               1.456
## Denmark          Denmark      7.555               1.351
## Iceland          Iceland      7.495               1.343
## Switzerland  Switzerland      7.487               1.420
## Netherlands  Netherlands      7.441               1.361
##             NULL.Social.support NULL.Healthy.life.expectancy
## Finland                   1.592                        0.874
## Norway                    1.582                        0.861
## Denmark                   1.590                        0.868
## Iceland                   1.644                        0.914
## Switzerland               1.549                        0.927
## Netherlands               1.488                        0.878
##             NULL.Freedom.to.make.life.choices NULL.Generosity
## Finland                                 0.681           0.192
## Norway                                  0.686           0.286
## Denmark                                 0.683           0.284
## Iceland                                 0.677           0.353
## Switzerland                             0.660           0.256
## Netherlands                             0.638           0.333
##             NULL.Perceptions.of.corruption
## Finland                              0.393
## Norway                               0.340
## Denmark                              0.408
## Iceland                              0.138
## Switzerland                          0.357
## Netherlands                          0.295

# The country is already stored in the row names, so drop its column and give
# the seven remaining numeric columns short names.
data <- data[, -1]
colnames(data) <- c(
  "score", "gdp", "social", "lifeexp", "freedom", "generosity", "corruption"
)
head(data)

##             score   gdp social lifeexp freedom generosity corruption
## Finland     7.632 1.305  1.592   0.874   0.681      0.192      0.393
## Norway      7.594 1.456  1.582   0.861   0.686      0.286      0.340
## Denmark     7.555 1.351  1.590   0.868   0.683      0.284      0.408
## Iceland     7.495 1.343  1.644   0.914   0.677      0.353      0.138
## Switzerland 7.487 1.420  1.549   0.927   0.660      0.256      0.357
## Netherlands 7.441 1.361  1.488   0.878   0.638      0.333      0.295

Creo subset de mi data:

# Keep only the six component indicators (every column except the score).
datasub <- data[, 2:7]
head(datasub)

##               gdp social lifeexp freedom generosity corruption
## Finland     1.305  1.592   0.874   0.681      0.192      0.393
## Norway      1.456  1.582   0.861   0.686      0.286      0.340
## Denmark     1.351  1.590   0.868   0.683      0.284      0.408
## Iceland     1.343  1.644   0.914   0.677      0.353      0.138
## Switzerland 1.420  1.549   0.927   0.660      0.256      0.357
## Netherlands 1.361  1.488   0.878   0.638      0.333      0.295

Veo si hay valores perdidos:

# summary() adds an "NA's" row under each column that has missing values.
summary(datasub)

##       gdp             social         lifeexp          freedom      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.6162   1st Qu.:1.067   1st Qu.:0.4223   1st Qu.:0.3560  
##  Median :0.9495   Median :1.255   Median :0.6440   Median :0.4870  
##  Mean   :0.8914   Mean   :1.213   Mean   :0.5973   Mean   :0.4545  
##  3rd Qu.:1.1978   3rd Qu.:1.463   3rd Qu.:0.7772   3rd Qu.:0.5785  
##  Max.   :2.0960   Max.   :1.644   Max.   :1.0300   Max.   :0.7240  
##                                                                    
##    generosity       corruption   
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.1095   1st Qu.:0.051  
##  Median :0.1740   Median :0.082  
##  Mean   :0.1809   Mean   :0.112  
##  3rd Qu.:0.2390   3rd Qu.:0.137  
##  Max.   :0.5980   Max.   :0.457  
##                   NA's   :1

Hay solo un valor perdido, en la variable corruption. Lo imputo con la media de la variable (puedo aplicar este código a todas las columnas: en las que no tienen valores perdidos no cambia nada):

library(DescTools)
# Mean imputation, column by column: every NA is replaced by the mean of the
# observed values in the same column. Columns without NAs are left untouched
# because the logical row selector is all FALSE for them.
# seq_along() is used instead of 1:ncol() so the loop body is skipped (rather
# than run for indices 1 and 0) if the data frame ever has no columns, and
# base mean() is enough here because no weights are involved.
for (col_idx in seq_along(datasub)) {
  col_mean <- mean(datasub[[col_idx]], na.rm = TRUE)
  datasub[is.na(datasub[[col_idx]]), col_idx] <- col_mean
}

Veo cómo queda:

# Re-check the summaries: no "NA's" row should remain after the imputation.
summary(datasub)

##       gdp             social         lifeexp          freedom      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.6162   1st Qu.:1.067   1st Qu.:0.4223   1st Qu.:0.3560  
##  Median :0.9495   Median :1.255   Median :0.6440   Median :0.4870  
##  Mean   :0.8914   Mean   :1.213   Mean   :0.5973   Mean   :0.4545  
##  3rd Qu.:1.1978   3rd Qu.:1.463   3rd Qu.:0.7772   3rd Qu.:0.5785  
##  Max.   :2.0960   Max.   :1.644   Max.   :1.0300   Max.   :0.7240  
##    generosity       corruption    
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1095   1st Qu.:0.0510  
##  Median :0.1740   Median :0.0820  
##  Mean   :0.1809   Mean   :0.1120  
##  3rd Qu.:0.2390   3rd Qu.:0.1365  
##  Max.   :0.5980   Max.   :0.4570

¡Perfecto! Sigamos.

Ahora saco una matriz de correlación.

# Pairwise correlations between the six indicators (cor() defaults to Pearson).
cor(datasub)

##                    gdp     social    lifeexp   freedom  generosity
## gdp         1.00000000 0.67173252 0.84427323 0.3217754 -0.01414976
## social      0.67173252 1.00000000 0.66707865 0.4110874  0.01740677
## lifeexp     0.84427323 0.66707865 1.00000000 0.3491445  0.01868503
## freedom     0.32177543 0.41108744 0.34914449 1.0000000  0.29710627
## generosity -0.01414976 0.01740677 0.01868503 0.2971063  1.00000000
## corruption  0.30105386 0.21687701 0.31031301 0.4607881  0.36036993
##            corruption
## gdp         0.3010539
## social      0.2168770
## lifeexp     0.3103130
## freedom     0.4607881
## generosity  0.3603699
## corruption  1.0000000

Con eso podía responder la primera pregunta.

En la siguiente pregunta, piden hacer un mapa de similitudes para ver cómo se ubica Singapur en relación con los países nórdicos.

Para ello primero estandarizo:

# Standardise every indicator (centre on its mean, divide by its standard
# deviation) so that no variable dominates the distances because of its scale.
datasub_s <- scale(datasub)
summary(datasub_s)

##       gdp              social           lifeexp           freedom       
##  Min.   :-2.2746   Min.   :-4.0124   Min.   :-2.4128   Min.   :-2.7983  
##  1st Qu.:-0.7022   1st Qu.:-0.4845   1st Qu.:-0.7072   1st Qu.:-0.6065  
##  Median : 0.1481   Median : 0.1381   Median : 0.1884   Median : 0.2001  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7815   3rd Qu.: 0.8260   3rd Qu.: 0.7267   3rd Qu.: 0.7634  
##  Max.   : 3.0735   Max.   : 1.4246   Max.   : 1.7475   Max.   : 1.6592  
##    generosity         corruption     
##  Min.   :-1.83772   Min.   :-1.1645  
##  1st Qu.:-0.72560   1st Qu.:-0.6342  
##  Median :-0.07051   Median :-0.3119  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.58966   3rd Qu.: 0.2547  
##  Max.   : 4.23580   Max.   : 3.5870

Luego saco las distancias:

# Euclidean distance (dist()'s default, spelled out here) between every pair
# of countries in the standardised space.
datasub_d <- dist(datasub_s, method = "euclidean")

Aplico cmdscale:

# Classical multidimensional scaling. k is the number of output dimensions
# (keep it at 2 so the result can be drawn on a plane); the result holds the
# coordinates of each country.
datasub_r <- cmdscale(datasub_d, k = 2, eig = TRUE)
# Goodness of fit: the closer to 1, the better.
datasub_r[["GOF"]]

## [1] 0.721218 0.721218

Buen GOF. Ahora dibujo el mapa:

# First pass: plain scatter plot of the two MDS dimensions.
titulo <- "Mapa de Similitudes entre países basado en el Índice de Felicidad"
x <- datasub_r[["points"]][, 1]
y <- datasub_r[["points"]][, 2]
plot(x, y, main = titulo)

Pongo los nombres de los países:

# Second pass: draw an empty canvas (type = "n" suppresses the points) and
# write each country's name at its coordinates.
plot(
  x, y,
  type = "n",
  main = titulo,
  xlab = "Dimension 1",
  ylab = "Dimension 2"
)
# The labels are the row names of the MDS coordinate matrix.
columnForLabels <- rownames(datasub_r[["points"]])
# cex sets the text size.
text(x, y, labels = columnForLabels, cex = 0.5)

En el mapa, vemos que Singapur se ubica cerca de los países nórdicos. Con eso se respondía la pregunta.

Si ya tenemos la data estandarizada, puedo usar NbClust para ver cuántos clusters se podrían formar.

Aplico primero el método “complete”:

library(NbClust)
# Let NbClust's indices vote on the number of clusters, using complete-linkage
# hierarchical clustering on the standardised indicators.
nb <- NbClust(data = datasub_s, method = "complete")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 1 proposed 2 as the best number of clusters 
## * 13 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

¡3 clusters!

Ahora aplico método “kmeans”:

# Same vote on the number of clusters, this time partitioning with k-means.
nb2 <- NbClust(data = datasub_s, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 15 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

¡3 clusters!

Con eso se respondía la pregunta: tanto por “complete” como por “kmeans” son 3 clusters.

PCA:

library(psych)

## Warning: package 'psych' was built under R version 3.4.4

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:DescTools':
## 
##     AUC, ICC, SD

# Kaiser-Meyer-Olkin sampling adequacy, computed from the correlation matrix
# of the six indicators.
KMO(cor(datasub))

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(datasub))
## Overall MSA =  0.73
## MSA for each item = 
##        gdp     social    lifeexp    freedom generosity corruption 
##       0.70       0.84       0.71       0.76       0.63       0.72

Bartlett:

# Bartlett's test on the same correlation matrix; n is the number of rows
# (countries) the correlations were computed from.
cortest.bartlett(cor(datasub),n=nrow(datasub))

## $chisq
## [1] 395.2955
## 
## $p.value
## [1] 6.734106e-75
## 
## $df
## [1] 15

# Eigen-decomposition of the correlation matrix; the eigenvalues are printed
# to help decide how many components to retain.
eigenf <- eigen(cor(datasub))
eigenf[["values"]]

## [1] 2.9180016 1.4093064 0.6050568 0.5785018 0.3345613 0.1545721

# Extract a single principal component from the correlation matrix.
# `scores = TRUE` is spelled out in full: `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved constant.
resultadoPr <- principal(
  cor(datasub),
  nfactors = 1,
  rotate = "varimax",
  scores = TRUE
)
# Print with three decimals; loadings below the 0.40 cut are left blank.
print(resultadoPr, digits = 3, cut = 0.40)

## Principal Components Analysis
## Call: principal(r = cor(datasub), nfactors = 1, rotate = "varimax", 
##     scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              PC1    h2    u2 com
## gdp        0.859 0.738 0.262   1
## social     0.806 0.650 0.350   1
## lifeexp    0.869 0.755 0.245   1
## freedom    0.643 0.414 0.586   1
## generosity       0.046 0.954   1
## corruption 0.561 0.315 0.685   1
## 
##                  PC1
## SS loadings    2.918
## Proportion Var 0.486
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.168 
## 
## Fit based upon off diagonal values = 0.841

# Component score of each country, computed from the indicators and the
# fitted one-component solution.
regresFactors <- factor.scores(datasub, resultadoPr)[["scores"]]

# Attach the component score to the indicators, matching on row names
# (the country). merge() returns the key as a "Row.names" column and sorts
# the result by it.
datasub <- merge(datasub, regresFactors, by = "row.names")
head(datasub)

##     Row.names   gdp social lifeexp freedom generosity corruption
## 1 Afghanistan 0.332  0.537   0.255   0.085      0.191      0.036
## 2     Albania 0.916  0.817   0.790   0.419      0.149      0.032
## 3     Algeria 0.979  1.154   0.687   0.077      0.055      0.135
## 4      Angola 0.730  1.125   0.269   0.000      0.079      0.061
## 5   Argentina 1.073  1.468   0.744   0.570      0.062      0.054
## 6     Armenia 0.816  0.990   0.666   0.260      0.077      0.028
##          PC1
## 1 -2.0959561
## 2 -0.3440926
## 3 -0.4409254
## 4 -1.3917248
## 5  0.4974521
## 6 -0.6877787

PARTE 2:

# Wikipedia article that hosts the Democracy Index tables.
link <- "https://en.wikipedia.org/wiki/Democracy_Index"
library(XML)
library(RCurl)
# Download the Democracy Index page and parse its HTML tables into a list of
# data frames, keeping text columns as character. This overwrites the objects
# created for the first page.
wikiLinkContents <- getURL(link)
wikiTables <- readHTMLTable(wikiLinkContents, stringsAsFactors = FALSE)

# Take the second parsed table. Single-bracket indexing keeps the list
# wrapper, so the element's name is prefixed to every column name.
data2 <- as.data.frame(wikiTables[2])
head(data2)

##   Democracy.Index.2017.Rank Democracy.Index.2017.Country
## 1                         1                       Norway
## 2                         2                      Iceland
## 3                         3                       Sweden
## 4                         4                  New Zealand
## 5                         5                      Denmark
## 6                        =6                      Ireland
##   Democracy.Index.2017.Score
## 1                       9.87
## 2                       9.58
## 3                       9.39
## 4                       9.26
## 5                       9.22
## 6                       9.15
##   Democracy.Index.2017.Electoral.process.and.pluralism
## 1                                                10.00
## 2                                                10.00
## 3                                                 9.58
## 4                                                10.00
## 5                                                10.00
## 6                                                 9.58
##   Democracy.Index.2017.Functioning.of.government
## 1                                           9.64
## 2                                           9.29
## 3                                           9.64
## 4                                           9.29
## 5                                           9.29
## 6                                           7.86
##   Democracy.Index.2017.Political.participation
## 1                                        10.00
## 2                                         8.89
## 3                                         8.33
## 4                                         8.89
## 5                                         8.33
## 6                                         8.33
##   Democracy.Index.2017.Political.culture
## 1                                  10.00
## 2                                  10.00
## 3                                  10.00
## 4                                   8.13
## 5                                   9.38
## 6                                  10.00
##   Democracy.Index.2017.Civil.liberties Democracy.Index.2017.Category
## 1                                 9.71                Full democracy
## 2                                 9.71                Full democracy
## 3                                 9.41                Full democracy
## 4                                10.00                Full democracy
## 5                                 9.12                Full democracy
## 6                                10.00                Full democracy

# Inspect the column types of the scraped table.
str(data2)

## 'data.frame':    171 obs. of  9 variables:
##  $ Democracy.Index.2017.Rank                           : chr  "1" "2" "3" "4" ...
##  $ Democracy.Index.2017.Country                        : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ Democracy.Index.2017.Score                          : chr  "9.87" "9.58" "9.39" "9.26" ...
##  $ Democracy.Index.2017.Electoral.process.and.pluralism: chr  "10.00" "10.00" "9.58" "10.00" ...
##  $ Democracy.Index.2017.Functioning.of.government      : chr  "9.64" "9.29" "9.64" "9.29" ...
##  $ Democracy.Index.2017.Political.participation        : chr  "10.00" "8.89" "8.33" "8.89" ...
##  $ Democracy.Index.2017.Political.culture              : chr  "10.00" "10.00" "10.00" "8.13" ...
##  $ Democracy.Index.2017.Civil.liberties                : chr  "9.71" "9.71" "9.41" "10.00" ...
##  $ Democracy.Index.2017.Category                       : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...

# Replace the long scraped headers with short names, in column order.
names(data2) <- c(
  "rank", "pais", "score",
  "electo", "gobierno", "participacion", "cultura", "libertades",
  "nivelDemocracia"
)
head(data2)

##   rank        pais score electo gobierno participacion cultura libertades
## 1    1      Norway  9.87  10.00     9.64         10.00   10.00       9.71
## 2    2     Iceland  9.58  10.00     9.29          8.89   10.00       9.71
## 3    3      Sweden  9.39   9.58     9.64          8.33   10.00       9.41
## 4    4 New Zealand  9.26  10.00     9.29          8.89    8.13      10.00
## 5    5     Denmark  9.22  10.00     9.29          8.33    9.38       9.12
## 6   =6     Ireland  9.15   9.58     7.86          8.33   10.00      10.00
##   nivelDemocracia
## 1  Full democracy
## 2  Full democracy
## 3  Full democracy
## 4  Full democracy
## 5  Full democracy
## 6  Full democracy

# Coerce the overall score and the five sub-indices (positions 3 to 8) to
# numeric. Cells that are not valid numbers become NA, hence the warnings.
data2[, c(3:8)] <- lapply(data2[, c(3:8)], as.numeric)

## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción

# Confirm that columns 3 to 8 are now numeric.
str(data2)

## 'data.frame':    171 obs. of  9 variables:
##  $ rank           : chr  "1" "2" "3" "4" ...
##  $ pais           : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ score          : num  9.87 9.58 9.39 9.26 9.22 9.15 9.15 9.09 9.03 9.03 ...
##  $ electo         : num  10 10 9.58 10 10 9.58 9.58 10 10 9.58 ...
##  $ gobierno       : num  9.64 9.29 9.64 9.29 9.29 7.86 9.64 8.93 8.93 9.29 ...
##  $ participacion  : num  10 8.89 8.33 8.89 8.33 8.33 7.78 7.78 7.78 7.78 ...
##  $ cultura        : num  10 10 10 8.13 9.38 10 8.75 8.75 8.75 9.38 ...
##  $ libertades     : num  9.71 9.71 9.41 10 9.12 10 10 10 9.71 9.12 ...
##  $ nivelDemocracia: chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...

# Keep only the rows with no NA in any column.
filas_completas <- complete.cases(data2)
data2 <- data2[filas_completas, ]
head(data2)

##   rank        pais score electo gobierno participacion cultura libertades
## 1    1      Norway  9.87  10.00     9.64         10.00   10.00       9.71
## 2    2     Iceland  9.58  10.00     9.29          8.89   10.00       9.71
## 3    3      Sweden  9.39   9.58     9.64          8.33   10.00       9.41
## 4    4 New Zealand  9.26  10.00     9.29          8.89    8.13      10.00
## 5    5     Denmark  9.22  10.00     9.29          8.33    9.38       9.12
## 6   =6     Ireland  9.15   9.58     7.86          8.33   10.00      10.00
##   nivelDemocracia
## 1  Full democracy
## 2  Full democracy
## 3  Full democracy
## 4  Full democracy
## 5  Full democracy
## 6  Full democracy

# Move the "Row.names" key column produced by merge() back into the row names,
# then drop that column.
rownames(datasub) <- datasub[["Row.names"]]
datasub <- datasub[, -1]
head(datasub)

##               gdp social lifeexp freedom generosity corruption        PC1
## Afghanistan 0.332  0.537   0.255   0.085      0.191      0.036 -2.0959561
## Albania     0.916  0.817   0.790   0.419      0.149      0.032 -0.3440926
## Algeria     0.979  1.154   0.687   0.077      0.055      0.135 -0.4409254
## Angola      0.730  1.125   0.269   0.000      0.079      0.061 -1.3917248
## Argentina   1.073  1.468   0.744   0.570      0.062      0.054  0.4974521
## Armenia     0.816  0.990   0.666   0.260      0.077      0.028 -0.6877787

# Join the happiness table (keyed by row name) with the democracy table
# (keyed by its "pais" column). Only countries whose names match in both
# tables are kept; both tables have a "score" column, so merge() renames
# them score.x (happiness) and score.y (democracy).
grandata <- merge(data, data2, by.x = "row.names", by.y = "pais")
head(grandata)

##     Row.names score.x   gdp social lifeexp freedom generosity corruption
## 1 Afghanistan   3.632 0.332  0.537   0.255   0.085      0.191      0.036
## 2     Albania   4.586 0.916  0.817   0.790   0.419      0.149      0.032
## 3     Algeria   5.295 0.979  1.154   0.687   0.077      0.055      0.135
## 4      Angola   3.795 0.730  1.125   0.269   0.000      0.079      0.061
## 5   Argentina   6.388 1.073  1.468   0.744   0.570      0.062      0.054
## 6     Armenia   4.321 0.816  0.990   0.666   0.260      0.077      0.028
##   rank score.y electo gobierno participacion cultura libertades
## 1  149    2.55   2.50     1.14          2.78    2.50       3.82
## 2   77    5.98   7.00     4.71          5.56    5.00       7.65
## 3  128    3.56   2.58     2.21          3.89    5.00       4.12
## 4  125    3.62   1.75     2.86          5.56    5.00       2.94
## 5   48    6.96   9.17     5.00          6.11    6.88       7.65
## 6  111    4.11   5.25     2.86          5.00    1.88       5.59
##    nivelDemocracia
## 1    Authoritarian
## 2    Hybrid regime
## 3    Authoritarian
## 4    Authoritarian
## 5 Flawed democracy
## 6    Hybrid regime

# Correlation test between the happiness score (score.x) and the democracy
# score (score.y) over the matched countries.
cor.test(grandata$score.x,grandata$score.y)

## 
##  Pearson's product-moment correlation
## 
## data:  grandata$score.x and grandata$score.y
## t = 9.7281, df = 145, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5193690 0.7173363
## sample estimates:
##      cor 
## 0.628423

# Keep only the eleven indicator columns: drop the country key, both overall
# scores, the democracy rank and the regime category. Columns are dropped by
# name so the selection does not depend on column positions.
columnas_descartadas <- c(
  "Row.names", "score.x", "rank", "score.y", "nivelDemocracia"
)
grandatasub <- grandata[, !(names(grandata) %in% columnas_descartadas)]
rownames(grandatasub) <- grandata[["Row.names"]]
head(grandatasub)

##               gdp social lifeexp freedom generosity corruption electo
## Afghanistan 0.332  0.537   0.255   0.085      0.191      0.036   2.50
## Albania     0.916  0.817   0.790   0.419      0.149      0.032   7.00
## Algeria     0.979  1.154   0.687   0.077      0.055      0.135   2.58
## Angola      0.730  1.125   0.269   0.000      0.079      0.061   1.75
## Argentina   1.073  1.468   0.744   0.570      0.062      0.054   9.17
## Armenia     0.816  0.990   0.666   0.260      0.077      0.028   5.25
##             gobierno participacion cultura libertades
## Afghanistan     1.14          2.78    2.50       3.82
## Albania         4.71          5.56    5.00       7.65
## Algeria         2.21          3.89    5.00       4.12
## Angola          2.86          5.56    5.00       2.94
## Argentina       5.00          6.11    6.88       7.65
## Armenia         2.86          5.00    1.88       5.59

# Check that every remaining column is numeric.
str(grandatasub)

## 'data.frame':    147 obs. of  11 variables:
##  $ gdp          : num  0.332 0.916 0.979 0.73 1.073 ...
##  $ social       : num  0.537 0.817 1.154 1.125 1.468 ...
##  $ lifeexp      : num  0.255 0.79 0.687 0.269 0.744 0.666 0.91 0.891 0.603 0.698 ...
##  $ freedom      : num  0.085 0.419 0.077 0 0.57 0.26 0.647 0.617 0.43 0.594 ...
##  $ generosity   : num  0.191 0.149 0.055 0.079 0.062 0.077 0.361 0.242 0.031 0.243 ...
##  $ corruption   : num  0.036 0.032 0.135 0.061 0.054 0.028 0.302 0.224 0.176 0.123 ...
##  $ electo       : num  2.5 7 2.58 1.75 9.17 5.25 10 9.58 0.5 0.83 ...
##  $ gobierno     : num  1.14 4.71 2.21 2.86 5 2.86 8.93 8.21 2.14 3.21 ...
##  $ participacion: num  2.78 5.56 3.89 5.56 6.11 5 7.78 8.33 3.33 2.78 ...
##  $ cultura      : num  2.5 5 5 5 6.88 1.88 8.75 6.88 3.75 4.38 ...
##  $ libertades   : num  3.82 7.65 4.12 2.94 7.65 5.59 10 9.12 3.53 2.35 ...

# Look for missing values: summary() adds an "NA's" row under each column
# that has any.
summary(grandatasub)

##       gdp             social         lifeexp          freedom      
##  Min.   :0.0240   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.6360   1st Qu.:1.075   1st Qu.:0.4310   1st Qu.:0.3575  
##  Median :0.9650   Median :1.279   Median :0.6690   Median :0.4810  
##  Mean   :0.9063   Mean   :1.223   Mean   :0.6069   Mean   :0.4552  
##  3rd Qu.:1.1985   3rd Qu.:1.467   3rd Qu.:0.7830   3rd Qu.:0.5790  
##  Max.   :2.0960   Max.   :1.644   Max.   :1.0300   Max.   :0.7240  
##                                                                    
##    generosity       corruption         electo          gobierno    
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:0.1090   1st Qu.:0.0510   1st Qu.: 3.500   1st Qu.:3.210  
##  Median :0.1720   Median :0.0820   Median : 7.000   Median :5.360  
##  Mean   :0.1808   Mean   :0.1129   Mean   : 6.161   Mean   :5.061  
##  3rd Qu.:0.2425   3rd Qu.:0.1375   3rd Qu.: 9.170   3rd Qu.:7.055  
##  Max.   :0.5980   Max.   :0.4570   Max.   :10.000   Max.   :9.640  
##                   NA's   :1                                        
##  participacion       cultura         libertades    
##  Min.   : 1.110   Min.   : 1.880   Min.   : 0.000  
##  1st Qu.: 3.890   1st Qu.: 4.380   1st Qu.: 3.820  
##  Median : 5.560   Median : 5.630   Median : 6.180  
##  Mean   : 5.283   Mean   : 5.721   Mean   : 6.019  
##  3rd Qu.: 6.670   3rd Qu.: 6.880   3rd Qu.: 8.240  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000  
##

# Mean imputation, column by column: every NA is replaced by the mean of the
# observed values in the same column. It is needed again here because the
# merged table was built from `data`, not from the already-imputed `datasub`.
# seq_along() is used instead of 1:ncol() so the loop body is skipped (rather
# than run for indices 1 and 0) if the data frame ever has no columns, and
# base mean() is enough here because no weights are involved.
for (col_idx in seq_along(grandatasub)) {
  col_mean <- mean(grandatasub[[col_idx]], na.rm = TRUE)
  grandatasub[is.na(grandatasub[[col_idx]]), col_idx] <- col_mean
}

# Re-check the summaries: no "NA's" row should remain after the imputation.
summary(grandatasub)

##       gdp             social         lifeexp          freedom      
##  Min.   :0.0240   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.6360   1st Qu.:1.075   1st Qu.:0.4310   1st Qu.:0.3575  
##  Median :0.9650   Median :1.279   Median :0.6690   Median :0.4810  
##  Mean   :0.9063   Mean   :1.223   Mean   :0.6069   Mean   :0.4552  
##  3rd Qu.:1.1985   3rd Qu.:1.467   3rd Qu.:0.7830   3rd Qu.:0.5790  
##  Max.   :2.0960   Max.   :1.644   Max.   :1.0300   Max.   :0.7240  
##    generosity       corruption         electo          gobierno    
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:0.1090   1st Qu.:0.0510   1st Qu.: 3.500   1st Qu.:3.210  
##  Median :0.1720   Median :0.0820   Median : 7.000   Median :5.360  
##  Mean   :0.1808   Mean   :0.1129   Mean   : 6.161   Mean   :5.061  
##  3rd Qu.:0.2425   3rd Qu.:0.1370   3rd Qu.: 9.170   3rd Qu.:7.055  
##  Max.   :0.5980   Max.   :0.4570   Max.   :10.000   Max.   :9.640  
##  participacion       cultura         libertades    
##  Min.   : 1.110   Min.   : 1.880   Min.   : 0.000  
##  1st Qu.: 3.890   1st Qu.: 4.380   1st Qu.: 3.820  
##  Median : 5.560   Median : 5.630   Median : 6.180  
##  Mean   : 5.283   Mean   : 5.721   Mean   : 6.019  
##  3rd Qu.: 6.670   3rd Qu.: 6.880   3rd Qu.: 8.240  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000

# Pairwise correlation matrix between all columns of grandatasub.
cor(grandatasub)

##                        gdp     social    lifeexp   freedom   generosity
## gdp            1.000000000 0.66552882 0.83800846 0.3278154 -0.005575958
## social         0.665528820 1.00000000 0.65829343 0.4244563  0.023068484
## lifeexp        0.838008459 0.65829343 1.00000000 0.3638654  0.028594316
## freedom        0.327815360 0.42445627 0.36386540 1.0000000  0.307574777
## generosity    -0.005575958 0.02306848 0.02859432 0.3075748  1.000000000
## corruption     0.338800708 0.24852155 0.33697880 0.4676990  0.365945968
## electo         0.351863856 0.42250848 0.44891417 0.3228176 -0.007702789
## gobierno       0.605558181 0.58744061 0.65744729 0.5485786  0.169111516
## participacion  0.454287202 0.48755343 0.53777861 0.2584781  0.069957402
## cultura        0.456853938 0.48112521 0.50091925 0.4282252  0.278326334
## libertades     0.486068072 0.52188520 0.56553237 0.3615164 -0.001970897
##               corruption       electo  gobierno participacion   cultura
## gdp            0.3388007  0.351863856 0.6055582     0.4542872 0.4568539
## social         0.2485215  0.422508481 0.5874406     0.4875534 0.4811252
## lifeexp        0.3369788  0.448914171 0.6574473     0.5377786 0.5009192
## freedom        0.4676990  0.322817572 0.5485786     0.2584781 0.4282252
## generosity     0.3659460 -0.007702789 0.1691115     0.0699574 0.2783263
## corruption     1.0000000  0.070511796 0.3843728     0.2459468 0.4809135
## electo         0.0705118  1.000000000 0.7863587     0.6989020 0.5090414
## gobierno       0.3843728  0.786358743 1.0000000     0.6906340 0.6842663
## participacion  0.2459468  0.698901986 0.6906340     1.0000000 0.5856477
## cultura        0.4809135  0.509041443 0.6842663     0.5856477 1.0000000
## libertades     0.1885590  0.900643290 0.8304343     0.7345479 0.6155471
##                 libertades
## gdp            0.486068072
## social         0.521885201
## lifeexp        0.565532370
## freedom        0.361516367
## generosity    -0.001970897
## corruption     0.188558959
## electo         0.900643290
## gobierno       0.830434271
## participacion  0.734547869
## cultura        0.615547145
## libertades     1.000000000

# Kaiser-Meyer-Olkin factor adequacy, computed from the correlation matrix.
KMO(cor(grandatasub))

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(grandatasub))
## Overall MSA =  0.87
## MSA for each item = 
##           gdp        social       lifeexp       freedom    generosity 
##          0.81          0.93          0.87          0.84          0.70 
##    corruption        electo      gobierno participacion       cultura 
##          0.80          0.80          0.91          0.95          0.92 
##    libertades 
##          0.86

# Bartlett test on the correlation matrix; n is the number of rows
# (observations) in grandatasub.
cortest.bartlett(cor(grandatasub), n = nrow(grandatasub))

## $chisq
## [1] 1195.888
## 
## $p.value
## [1] 3.962941e-214
## 
## $df
## [1] 55

# Eigen-decomposition of the correlation matrix; show the eigenvalues only.
eigenf <- eigen(cor(grandatasub))
eigenf$values

##  [1] 5.67443754 1.58080581 1.15675205 0.64709382 0.56949856 0.40677485
##  [7] 0.33748823 0.26957468 0.15620530 0.12248755 0.07888163

# Principal components on the correlation matrix: two components with
# varimax rotation. TRUE is spelled out because the shorthand T is an
# ordinary variable that can be reassigned.
# NOTE(review): three of the eigenvalues of this matrix exceed 1, yet only
# two components are requested -- confirm that this is intended.
# NOTE(review): the input is a correlation matrix, so component scores
# presumably cannot be computed here; pass the raw data if scores are needed.
resultadoPr <- principal(
  cor(grandatasub),
  nfactors = 2,
  rotate = "varimax",
  scores = TRUE
)

# Print with 3 decimals; cut = 0.40 is the display threshold for loadings.
print(resultadoPr, digits = 3, cut = 0.40)

## Principal Components Analysis
## Call: principal(r = cor(grandatasub), nfactors = 2, rotate = "varimax", 
##     scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                  RC1    RC2    h2    u2  com
## gdp            0.698        0.547 0.453 1.24
## social         0.704        0.545 0.455 1.20
## lifeexp        0.759        0.634 0.366 1.20
## freedom               0.642 0.552 0.448 1.61
## generosity            0.752 0.577 0.423 1.04
## corruption            0.801 0.677 0.323 1.11
## electo         0.855        0.738 0.262 1.02
## gobierno       0.872        0.851 0.149 1.23
## participacion  0.810        0.661 0.339 1.02
## cultura        0.645  0.474 0.641 0.359 1.84
## libertades     0.913        0.833 0.167 1.00
## 
##                         RC1   RC2
## SS loadings           5.142 2.113
## Proportion Var        0.467 0.192
## Cumulative Var        0.467 0.660
## Proportion Explained  0.709 0.291
## Cumulative Proportion 0.709 1.000
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.099 
## 
## Fit based upon off diagonal values = 0.959

PARTE 3:

# Read the first sheet of data/gdi.xlsx into a data frame.
library(openxlsx)

folder <- "data"
fileName <- "gdi.xlsx"
fileToRead <- file.path(folder, fileName)

# TRUE is spelled out: the shorthand T is an ordinary variable that can be
# reassigned, which would silently change these options.
gdi <- openxlsx::read.xlsx(
  fileToRead,
  sheet = 1,
  skipEmptyRows = TRUE,
  skipEmptyCols = TRUE
)
head(gdi)

##         X1 Table.4..Gender.Development.Index                       X3
## 1     <NA>                              <NA> Gender Development Index
## 2     <NA>                              <NA>                    Value
## 3 HDI rank                           Country                     <NA>
## 4     <NA>                              <NA>                     2015
## 5     <NA>       VERY HIGH HUMAN DEVELOPMENT                     <NA>
## 6        1                            Norway      0.99346052864650269
##      X4   X5                            X6                  X7
## 1  <NA> <NA> Human Development Index (HDI)                <NA>
## 2 Group    b                         Value                <NA>
## 3  <NA> <NA>                        Female                Male
## 4  2015 <NA>                          2015                2015
## 5  <NA> <NA>                          <NA>                <NA>
## 6     1 <NA>           0.94442473100258917 0.95064142335808732
##                         X8                 X9                         X10
## 1 Life expectancy at birth               <NA> Expected years of schooling
## 2                  (years)               <NA>                     (years)
## 3                   Female               Male                      Female
## 4                     2015               2015                        2015
## 5                     <NA>               <NA>                        <NA>
## 6       83.691000000000003 79.715999999999994          18.306629999999998
##    X11      X12  X13                      X14  X15                X16  X17
## 1 <NA>     <NA> <NA> Mean years of schooling  <NA>               <NA> <NA>
## 2 <NA>     <NA> <NA>                  (years) <NA>               <NA> <NA>
## 3 <NA>     Male <NA>                   Female <NA>               Male <NA>
## 4    c     2015    c                     2015    c               2015    c
## 5 <NA>     <NA> <NA>                     <NA> <NA>               <NA> <NA>
## 6 <NA> 17.07009 <NA>                 12.79505 <NA> 12.701599999999999 <NA>
##                                           X18                X19  X20
## 1 Estimated gross national income per capita                <NA>    a
## 2                                (2011 PPP $)               <NA> <NA>
## 3                                      Female               Male <NA>
## 4                                        2015               2015 <NA>
## 5                                        <NA>               <NA> <NA>
## 6                          59799.658740179249 75314.004346281479 <NA>

Elimino columnas innecesarias:

# Keep only the first three columns of the sheet.
gdi <- gdi[, 1:3]
head(gdi)

##         X1 Table.4..Gender.Development.Index                       X3
## 1     <NA>                              <NA> Gender Development Index
## 2     <NA>                              <NA>                    Value
## 3 HDI rank                           Country                     <NA>
## 4     <NA>                              <NA>                     2015
## 5     <NA>       VERY HIGH HUMAN DEVELOPMENT                     <NA>
## 6        1                            Norway      0.99346052864650269

Elimino filas innecesarias:

# Drop the first five rows, which hold header text rather than country data.
gdi <- gdi[-(1:5), ]
head(gdi)

##    X1 Table.4..Gender.Development.Index                  X3
## 6   1                            Norway 0.99346052864650269
## 7   2                         Australia 0.97787279010186912
## 8   2                       Switzerland  0.9737969809733199
## 9   4                           Germany  0.9637034038409088
## 10  5                           Denmark 0.97037586022448263
## 11  5                         Singapore 0.98468003632222867

# Remove the first column and reset the row names to the default 1..n.
gdi <- gdi[, -1]
rownames(gdi) <- NULL
head(gdi)

##   Table.4..Gender.Development.Index                  X3
## 1                            Norway 0.99346052864650269
## 2                         Australia 0.97787279010186912
## 3                       Switzerland  0.9737969809733199
## 4                           Germany  0.9637034038409088
## 5                           Denmark 0.97037586022448263
## 6                         Singapore 0.98468003632222867

# Give the two remaining columns short names.
# NOTE(review): "coutry" is a misspelling of "country"; it is left untouched
# because the later merge() joins on this exact column name.
names(gdi) <- c("coutry", "gdi2015")
head(gdi)

##        coutry             gdi2015
## 1      Norway 0.99346052864650269
## 2   Australia 0.97787279010186912
## 3 Switzerland  0.9737969809733199
## 4     Germany  0.9637034038409088
## 5     Denmark 0.97037586022448263
## 6   Singapore 0.98468003632222867

# Recode the ".." placeholder as NA (presumably the workbook's marker for
# missing data -- confirm against the source file).
gdi[gdi == ".."] <- NA

Elimino filas innecesarias al final:

# Drop rows 192 to 244 at the bottom of the table.
# NOTE(review): these positions are hard-coded for this version of the
# workbook; re-check them if the file changes.
gdi <- gdi[-(192:244), ]
tail(gdi, 10)

##                       coutry             gdi2015
## 182                  Eritrea                <NA>
## 183             Sierra Leone 0.87075650936199367
## 184               Mozambique 0.87935751460598066
## 185              South Sudan                <NA>
## 186                   Guinea 0.78401993273328008
## 187                  Burundi  0.9187330348227295
## 188             Burkina Faso 0.87433676945691585
## 189                     Chad 0.76496475019421595
## 190                    Niger 0.73159473547481912
## 191 Central African Republic 0.77631694317566446

Me quedo con los casos completos:

# Keep only the rows without any NA.
gdi <- gdi[complete.cases(gdi), ]
head(gdi)

##        coutry             gdi2015
## 1      Norway 0.99346052864650269
## 2   Australia 0.97787279010186912
## 3 Switzerland  0.9737969809733199
## 4     Germany  0.9637034038409088
## 5     Denmark 0.97037586022448263
## 6   Singapore 0.98468003632222867

# Inspect column types before converting gdi2015.
str(gdi)

## 'data.frame':    160 obs. of  2 variables:
##  $ coutry : chr  "Norway" "Australia" "Switzerland" "Germany" ...
##  $ gdi2015: chr  "0.99346052864650269" "0.97787279010186912" "0.9737969809733199" "0.9637034038409088" ...

# Convert the index values to numeric, then re-check the structure.
gdi$gdi2015 <- as.numeric(gdi$gdi2015)
str(gdi)

## 'data.frame':    160 obs. of  2 variables:
##  $ coutry : chr  "Norway" "Australia" "Switzerland" "Germany" ...
##  $ gdi2015: num  0.993 0.978 0.974 0.964 0.97 ...

# Join the combined dataset with the GDI table on the country name:
# "Row.names" in grandata against "coutry" in gdi.
grandata2 <- merge(
  x = grandata,
  y = gdi,
  by.x = "Row.names",
  by.y = "coutry"
)
head(grandata2)

##     Row.names score.x   gdp social lifeexp freedom generosity corruption
## 1 Afghanistan   3.632 0.332  0.537   0.255   0.085      0.191      0.036
## 2     Albania   4.586 0.916  0.817   0.790   0.419      0.149      0.032
## 3     Algeria   5.295 0.979  1.154   0.687   0.077      0.055      0.135
## 4   Argentina   6.388 1.073  1.468   0.744   0.570      0.062      0.054
## 5     Armenia   4.321 0.816  0.990   0.666   0.260      0.077      0.028
## 6   Australia   7.272 1.340  1.573   0.910   0.647      0.361      0.302
##   rank score.y electo gobierno participacion cultura libertades
## 1  149    2.55   2.50     1.14          2.78    2.50       3.82
## 2   77    5.98   7.00     4.71          5.56    5.00       7.65
## 3  128    3.56   2.58     2.21          3.89    5.00       4.12
## 4   48    6.96   9.17     5.00          6.11    6.88       7.65
## 5  111    4.11   5.25     2.86          5.00    1.88       5.59
## 6    8    9.09  10.00     8.93          7.78    8.75      10.00
##    nivelDemocracia   gdi2015
## 1    Authoritarian 0.6089179
## 2    Hybrid regime 0.9593885
## 3    Authoritarian 0.8544616
## 4 Flawed democracy 0.9818548
## 5    Hybrid regime 0.9931695
## 6   Full democracy 0.9778728

# Keep the two score columns and the GDI. The columns are selected by name
# (they sit at positions 2, 10 and 17 of the merged table), so the subset
# stays correct if the merge ever yields a different column order.
grandata2sub <- grandata2[, c("score.x", "score.y", "gdi2015")]

# Label each row with its country so later output carries the names.
row.names(grandata2sub) <- grandata2$Row.names
head(grandata2sub)

##             score.x score.y   gdi2015
## Afghanistan   3.632    2.55 0.6089179
## Albania       4.586    5.98 0.9593885
## Algeria       5.295    3.56 0.8544616
## Argentina     6.388    6.96 0.9818548
## Armenia       4.321    4.11 0.9931695
## Australia     7.272    9.09 0.9778728

# Check the structure of the subset: number of rows and column types.
str(grandata2sub)

## 'data.frame':    128 obs. of  3 variables:
##  $ score.x: num  3.63 4.59 5.29 6.39 4.32 ...
##  $ score.y: num  2.55 5.98 3.56 6.96 4.11 9.09 8.42 2.65 2.71 5.43 ...
##  $ gdi2015: num  0.609 0.959 0.854 0.982 0.993 ...

# Descriptive statistics of the variables on their original scales.
summary(grandata2sub)

##     score.x         score.y         gdi2015      
##  Min.   :2.905   Min.   :1.500   Min.   :0.6089  
##  1st Qu.:4.454   1st Qu.:4.072   1st Qu.:0.9032  
##  Median :5.441   Median :6.230   Median :0.9614  
##  Mean   :5.459   Mean   :5.829   Mean   :0.9351  
##  3rd Qu.:6.350   3rd Qu.:7.522   3rd Qu.:0.9836  
##  Max.   :7.632   Max.   :9.870   Max.   :1.0316

# Standardise every column with scale() defaults, then summarise the result.
grandata2sub_s <- scale(grandata2sub)
summary(grandata2sub_s)

##     score.x            score.y           gdi2015       
##  Min.   :-2.23696   Min.   :-2.0380   Min.   :-4.4790  
##  1st Qu.:-0.88051   1st Qu.:-0.8270   1st Qu.:-0.4375  
##  Median :-0.01585   Median : 0.1885   Median : 0.3611  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.78029   3rd Qu.: 0.7970   3rd Qu.: 0.6669  
##  Max.   : 1.90311   Max.   : 1.9020   Max.   : 1.3258

# Pairwise distances between rows of the standardised matrix (dist() defaults).
grandata2sub_d <- dist(grandata2sub_s)

# Multidimensional scaling: k is the number of dimensions (keep k at 2).
# The result holds the coordinates (points) of each country.
grandata2sub_r <- cmdscale(d = grandata2sub_d, k = 2, eig = TRUE)

# GOF is the goodness of fit: the closer to 1, the better.
grandata2sub_r$GOF

## [1] 0.8766841 0.8766841

# Coordinates of every country on the two scaling dimensions.
titulo <- "Mapa de Similitudes entre países"
x <- grandata2sub_r$points[, 1]
y <- grandata2sub_r$points[, 2]

# First figure: plain scatter of the points.
plot(x, y, main = titulo)

# Second figure: empty canvas (type = "n" keeps the points from being drawn),
# then each country name is written at its coordinates.
plot(x, y, type = "n", main = titulo, xlab = "Dimension 1", ylab = "Dimension 2")
columnForLabels <- rownames(grandata2sub_r$points) # labels for the points
text(x, y, labels = columnForLabels, cex = 0.6) # cex sets the text size

¡Singapur no está en los nórdicos!