Cargo la data:
# Source page: the Wikipedia article on the World Happiness Report.
link <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
library(XML)
library(RCurl)
## Loading required package: bitops
# Download the raw HTML, then parse the HTML tables it contains into a list.
# Cell values are kept as character strings (no factor conversion).
wikiLinkContents <- getURL(link)
wikiTables <- readHTMLTable(wikiLinkContents, stringsAsFactors = FALSE)
Veo qué tabla es la que necesito: es la quinta tabla (en las cuatro tablas anteriores no hay nada, al parecer):
# Take the fifth parsed table. Single-bracket indexing keeps a one-element
# list, which is why the resulting column names carry the "NULL." prefix
# visible in the output below.
data <- as.data.frame(wikiTables[5])
head(data)
## NULL.Overall.Rank NULL.Country NULL.Score NULL.GDP.per.capita
## 1 1 Finland 7.632 1.305
## 2 2 Norway 7.594 1.456
## 3 3 Denmark 7.555 1.351
## 4 4 Iceland 7.495 1.343
## 5 5 Switzerland 7.487 1.420
## 6 6 Netherlands 7.441 1.361
## NULL.Social.support NULL.Healthy.life.expectancy
## 1 1.592 0.874
## 2 1.582 0.861
## 3 1.590 0.868
## 4 1.644 0.914
## 5 1.549 0.927
## 6 1.488 0.878
## NULL.Freedom.to.make.life.choices NULL.Generosity
## 1 0.681 0.192
## 2 0.686 0.286
## 3 0.683 0.284
## 4 0.677 0.353
## 5 0.660 0.256
## 6 0.638 0.333
## NULL.Perceptions.of.corruption
## 1 0.393
## 2 0.340
## 3 0.408
## 4 0.138
## 5 0.357
## 6 0.295
Saco estructura:
str(data)
## 'data.frame': 156 obs. of 9 variables:
## $ NULL.Overall.Rank : chr "1" "2" "3" "4" ...
## $ NULL.Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ NULL.Score : chr "7.632" "7.594" "7.555" "7.495" ...
## $ NULL.GDP.per.capita : chr "1.305" "1.456" "1.351" "1.343" ...
## $ NULL.Social.support : chr "1.592" "1.582" "1.590" "1.644" ...
## $ NULL.Healthy.life.expectancy : chr "0.874" "0.861" "0.868" "0.914" ...
## $ NULL.Freedom.to.make.life.choices: chr "0.681" "0.686" "0.683" "0.677" ...
## $ NULL.Generosity : chr "0.192" "0.286" "0.284" "0.353" ...
## $ NULL.Perceptions.of.corruption : chr "0.393" "0.340" "0.408" "0.138" ...
Formateo a numéricas:
# Convert columns 3 to 9 (score and its six components) from text to numeric.
# Cells that are not numeric text become NA, hence the coercion warning.
data[, c(3:9)] <- lapply(data[, c(3:9)], as.numeric)
## Warning in lapply(data[, c(3:9)], as.numeric): NAs introducidos por
## coerción
str(data)
## 'data.frame': 156 obs. of 9 variables:
## $ NULL.Overall.Rank : chr "1" "2" "3" "4" ...
## $ NULL.Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ NULL.Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ NULL.GDP.per.capita : num 1.3 1.46 1.35 1.34 1.42 ...
## $ NULL.Social.support : num 1.59 1.58 1.59 1.64 1.55 ...
## $ NULL.Healthy.life.expectancy : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ NULL.Freedom.to.make.life.choices: num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ NULL.Generosity : num 0.192 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ NULL.Perceptions.of.corruption : num 0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
head(data)
## NULL.Overall.Rank NULL.Country NULL.Score NULL.GDP.per.capita
## 1 1 Finland 7.632 1.305
## 2 2 Norway 7.594 1.456
## 3 3 Denmark 7.555 1.351
## 4 4 Iceland 7.495 1.343
## 5 5 Switzerland 7.487 1.420
## 6 6 Netherlands 7.441 1.361
## NULL.Social.support NULL.Healthy.life.expectancy
## 1 1.592 0.874
## 2 1.582 0.861
## 3 1.590 0.868
## 4 1.644 0.914
## 5 1.549 0.927
## 6 1.488 0.878
## NULL.Freedom.to.make.life.choices NULL.Generosity
## 1 0.681 0.192
## 2 0.686 0.286
## 3 0.683 0.284
## 4 0.677 0.353
## 5 0.660 0.256
## 6 0.638 0.333
## NULL.Perceptions.of.corruption
## 1 0.393
## 2 0.340
## 3 0.408
## 4 0.138
## 5 0.357
## 6 0.295
# Drop the rank column (position 1) and label each row with its country name.
data <- data[, -1]
rownames(data) <- data[["NULL.Country"]]
head(data)
## NULL.Country NULL.Score NULL.GDP.per.capita
## Finland Finland 7.632 1.305
## Norway Norway 7.594 1.456
## Denmark Denmark 7.555 1.351
## Iceland Iceland 7.495 1.343
## Switzerland Switzerland 7.487 1.420
## Netherlands Netherlands 7.441 1.361
## NULL.Social.support NULL.Healthy.life.expectancy
## Finland 1.592 0.874
## Norway 1.582 0.861
## Denmark 1.590 0.868
## Iceland 1.644 0.914
## Switzerland 1.549 0.927
## Netherlands 1.488 0.878
## NULL.Freedom.to.make.life.choices NULL.Generosity
## Finland 0.681 0.192
## Norway 0.686 0.286
## Denmark 0.683 0.284
## Iceland 0.677 0.353
## Switzerland 0.660 0.256
## Netherlands 0.638 0.333
## NULL.Perceptions.of.corruption
## Finland 0.393
## Norway 0.340
## Denmark 0.408
## Iceland 0.138
## Switzerland 0.357
## Netherlands 0.295
# The country column now duplicates the row names: remove it, then give the
# seven remaining columns short names.
data <- data[, -1]
names(data) <- c(
  "score", "gdp", "social", "lifeexp", "freedom", "generosity", "corruption"
)
head(data)
## score gdp social lifeexp freedom generosity corruption
## Finland 7.632 1.305 1.592 0.874 0.681 0.192 0.393
## Norway 7.594 1.456 1.582 0.861 0.686 0.286 0.340
## Denmark 7.555 1.351 1.590 0.868 0.683 0.284 0.408
## Iceland 7.495 1.343 1.644 0.914 0.677 0.353 0.138
## Switzerland 7.487 1.420 1.549 0.927 0.660 0.256 0.357
## Netherlands 7.441 1.361 1.488 0.878 0.638 0.333 0.295
Creo subset de mi data:
# Keep the six component indicators (columns 2 to 7); the overall score in
# column 1 is left out.
datasub <- data[, 2:7]
head(datasub)
## gdp social lifeexp freedom generosity corruption
## Finland 1.305 1.592 0.874 0.681 0.192 0.393
## Norway 1.456 1.582 0.861 0.686 0.286 0.340
## Denmark 1.351 1.590 0.868 0.683 0.284 0.408
## Iceland 1.343 1.644 0.914 0.677 0.353 0.138
## Switzerland 1.420 1.549 0.927 0.660 0.256 0.357
## Netherlands 1.361 1.488 0.878 0.638 0.333 0.295
Veo si hay perdidos:
summary(datasub)
## gdp social lifeexp freedom
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6162 1st Qu.:1.067 1st Qu.:0.4223 1st Qu.:0.3560
## Median :0.9495 Median :1.255 Median :0.6440 Median :0.4870
## Mean :0.8914 Mean :1.213 Mean :0.5973 Mean :0.4545
## 3rd Qu.:1.1978 3rd Qu.:1.463 3rd Qu.:0.7772 3rd Qu.:0.5785
## Max. :2.0960 Max. :1.644 Max. :1.0300 Max. :0.7240
##
## generosity corruption
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.1095 1st Qu.:0.051
## Median :0.1740 Median :0.082
## Mean :0.1809 Mean :0.112
## 3rd Qu.:0.2390 3rd Qu.:0.137
## Max. :0.5980 Max. :0.457
## NA's :1
Hay solo un valor perdido, en la variable corruption. Lo imputo con la media de la columna (no importa que el código recorra todas las variables: en las que no tienen valores perdidos no imputará nada):
library(DescTools)
# Mean imputation: for every column, replace its missing values with that
# column's mean (computed ignoring the NAs). Columns without NAs are left
# unchanged because the logical row index selects nothing.
# seq_along() replaces 1:ncol(): with zero columns 1:0 would iterate over
# c(1, 0) instead of skipping the loop.
for (i in seq_along(datasub)) {
  MEDIA <- Mean(datasub[, i], na.rm = TRUE)
  datasub[is.na(datasub[, i]), i] <- MEDIA
}
Veo cómo queda:
summary(datasub)
## gdp social lifeexp freedom
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6162 1st Qu.:1.067 1st Qu.:0.4223 1st Qu.:0.3560
## Median :0.9495 Median :1.255 Median :0.6440 Median :0.4870
## Mean :0.8914 Mean :1.213 Mean :0.5973 Mean :0.4545
## 3rd Qu.:1.1978 3rd Qu.:1.463 3rd Qu.:0.7772 3rd Qu.:0.5785
## Max. :2.0960 Max. :1.644 Max. :1.0300 Max. :0.7240
## generosity corruption
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1095 1st Qu.:0.0510
## Median :0.1740 Median :0.0820
## Mean :0.1809 Mean :0.1120
## 3rd Qu.:0.2390 3rd Qu.:0.1365
## Max. :0.5980 Max. :0.4570
¡Perfecto! Sigamos.
Ahora saco una matriz de correlación.
# Pairwise Pearson correlations between the six indicators.
cor(datasub, method = "pearson")
## gdp social lifeexp freedom generosity
## gdp 1.00000000 0.67173252 0.84427323 0.3217754 -0.01414976
## social 0.67173252 1.00000000 0.66707865 0.4110874 0.01740677
## lifeexp 0.84427323 0.66707865 1.00000000 0.3491445 0.01868503
## freedom 0.32177543 0.41108744 0.34914449 1.0000000 0.29710627
## generosity -0.01414976 0.01740677 0.01868503 0.2971063 1.00000000
## corruption 0.30105386 0.21687701 0.31031301 0.4607881 0.36036993
## corruption
## gdp 0.3010539
## social 0.2168770
## lifeexp 0.3103130
## freedom 0.4607881
## generosity 0.3603699
## corruption 1.0000000
Con eso podía responder la primera pregunta.
En la siguiente pregunta, piden hacer un mapa de similitudes para ver cómo se ubica Singapur en relación con los países nórdicos.
Para ello primero estandarizo:
# Standardize every column (centered on its mean, divided by its standard
# deviation) so that all indicators weigh equally in the distances.
datasub_s <- scale(datasub, center = TRUE, scale = TRUE)
summary(datasub_s)
## gdp social lifeexp freedom
## Min. :-2.2746 Min. :-4.0124 Min. :-2.4128 Min. :-2.7983
## 1st Qu.:-0.7022 1st Qu.:-0.4845 1st Qu.:-0.7072 1st Qu.:-0.6065
## Median : 0.1481 Median : 0.1381 Median : 0.1884 Median : 0.2001
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7815 3rd Qu.: 0.8260 3rd Qu.: 0.7267 3rd Qu.: 0.7634
## Max. : 3.0735 Max. : 1.4246 Max. : 1.7475 Max. : 1.6592
## generosity corruption
## Min. :-1.83772 Min. :-1.1645
## 1st Qu.:-0.72560 1st Qu.:-0.6342
## Median :-0.07051 Median :-0.3119
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.58966 3rd Qu.: 0.2547
## Max. : 4.23580 Max. : 3.5870
Luego saco las distancias:
# Distance matrix between countries (rows), using dist()'s default metric.
datasub_d <- dist(datasub_s)
Aplico cmdscale:
# Classical multidimensional scaling down to k = 2 dimensions. The result
# holds one pair of coordinates per country (one per row of the input).
datasub_r <- cmdscale(datasub_d, k = 2, eig = TRUE)
# Goodness of fit of the 2-D representation: the closer to 1, the better.
datasub_r[["GOF"]]
## [1] 0.721218 0.721218
Buen GOF. Ahora dibujo el mapa:
# Scatter plot of the two MDS coordinates, one point per country.
titulo <- "Mapa de Similitudes entre países basado en el Índice de Felicidad"
x <- datasub_r$points[, 1]
y <- datasub_r$points[, 2]
plot(x, y, main = titulo)
Pongo los nombres de los países:
# Redraw the map as an empty frame (type = "n" suppresses the point symbols)
# and write each country's name at its coordinates instead.
plot(
  x, y,
  type = "n", main = titulo,
  xlab = "Dimension 1", ylab = "Dimension 2"
)
# Labels are the row names of the coordinate matrix (the country names).
columnForLabels <- rownames(datasub_r$points)
text(x, y, labels = columnForLabels, cex = 0.5)  # cex sets the label size
En el mapa vemos que Singapur se encuentra junto a los países nórdicos. Con eso se respondía la pregunta.
Si ya tenemos la data estandarizada, puedo usar NbClust para ver cuántos clusters se podrían formar.
Aplico primero el método “complete”:
library(NbClust)
# Let NbClust's indices vote on the number of clusters, using the "complete"
# agglomeration method on the standardized data.
nb <- NbClust(data = datasub_s, method = "complete")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 2 as the best number of clusters
## * 13 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
¡3 clusters!
Ahora aplico método “kmeans”:
# Same vote on the number of clusters, this time with the k-means method.
nb2 <- NbClust(data = datasub_s, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
¡3 clusters!
Con eso se respondía la pregunta: tanto con “complete” como con “kmeans” se obtienen 3 clusters.
PCA:
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
##
## Attaching package: 'psych'
## The following objects are masked from 'package:DescTools':
##
## AUC, ICC, SD
# Kaiser-Meyer-Olkin sampling adequacy, computed from the correlation matrix.
KMO(r = cor(datasub))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(datasub))
## Overall MSA = 0.73
## MSA for each item =
## gdp social lifeexp freedom generosity corruption
## 0.70 0.84 0.71 0.76 0.63 0.72
Bartlett:
# Bartlett's test on the correlation matrix; n is the number of countries.
cortest.bartlett(cor(datasub), n = nrow(datasub))
## $chisq
## [1] 395.2955
##
## $p.value
## [1] 6.734106e-75
##
## $df
## [1] 15
# Eigenvalues of the correlation matrix, used to decide how many components
# to retain.
eigenf <- eigen(cor(datasub))
eigenf[["values"]]
## [1] 2.9180016 1.4093064 0.6050568 0.5785018 0.3345613 0.1545721
# Principal components analysis on the correlation matrix, keeping a single
# component. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
resultadoPr <- principal(
  cor(datasub),
  nfactors = 1,
  rotate = "varimax",
  scores = TRUE
)
# Show loadings with three decimals, hiding those below 0.40 in absolute value.
print(resultadoPr, digits = 3, cut = 0.40)
## Principal Components Analysis
## Call: principal(r = cor(datasub), nfactors = 1, rotate = "varimax",
## scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 h2 u2 com
## gdp 0.859 0.738 0.262 1
## social 0.806 0.650 0.350 1
## lifeexp 0.869 0.755 0.245 1
## freedom 0.643 0.414 0.586 1
## generosity 0.046 0.954 1
## corruption 0.561 0.315 0.685 1
##
## PC1
## SS loadings 2.918
## Proportion Var 0.486
##
## Mean item complexity = 1
## Test of the hypothesis that 1 component is sufficient.
##
## The root mean square of the residuals (RMSR) is 0.168
##
## Fit based upon off diagonal values = 0.841
# Compute the component score of every country from the raw data and the
# fitted solution, then attach it to datasub by matching on row names.
# merge() returns the key as a "Row.names" column, sorted alphabetically.
regresFactors <- factor.scores(datasub, resultadoPr)$scores
datasub <- merge(datasub, regresFactors, by = "row.names")
head(datasub)
## Row.names gdp social lifeexp freedom generosity corruption
## 1 Afghanistan 0.332 0.537 0.255 0.085 0.191 0.036
## 2 Albania 0.916 0.817 0.790 0.419 0.149 0.032
## 3 Algeria 0.979 1.154 0.687 0.077 0.055 0.135
## 4 Angola 0.730 1.125 0.269 0.000 0.079 0.061
## 5 Argentina 1.073 1.468 0.744 0.570 0.062 0.054
## 6 Armenia 0.816 0.990 0.666 0.260 0.077 0.028
## PC1
## 1 -2.0959561
## 2 -0.3440926
## 3 -0.4409254
## 4 -1.3917248
## 5 0.4974521
## 6 -0.6877787
PARTE 2:
# Second source: the Wikipedia article on the Democracy Index.
link <- "https://en.wikipedia.org/wiki/Democracy_Index"
library(XML)
library(RCurl)
# Download the page and parse its HTML tables, keeping text as character.
wikiLinkContents <- getURL(link)
wikiTables <- readHTMLTable(wikiLinkContents, stringsAsFactors = FALSE)
# The table of interest is the second one on the page.
data2 <- as.data.frame(wikiTables[2])
head(data2)
## Democracy.Index.2017.Rank Democracy.Index.2017.Country
## 1 1 Norway
## 2 2 Iceland
## 3 3 Sweden
## 4 4 New Zealand
## 5 5 Denmark
## 6 =6 Ireland
## Democracy.Index.2017.Score
## 1 9.87
## 2 9.58
## 3 9.39
## 4 9.26
## 5 9.22
## 6 9.15
## Democracy.Index.2017.Electoral.process.and.pluralism
## 1 10.00
## 2 10.00
## 3 9.58
## 4 10.00
## 5 10.00
## 6 9.58
## Democracy.Index.2017.Functioning.of.government
## 1 9.64
## 2 9.29
## 3 9.64
## 4 9.29
## 5 9.29
## 6 7.86
## Democracy.Index.2017.Political.participation
## 1 10.00
## 2 8.89
## 3 8.33
## 4 8.89
## 5 8.33
## 6 8.33
## Democracy.Index.2017.Political.culture
## 1 10.00
## 2 10.00
## 3 10.00
## 4 8.13
## 5 9.38
## 6 10.00
## Democracy.Index.2017.Civil.liberties Democracy.Index.2017.Category
## 1 9.71 Full democracy
## 2 9.71 Full democracy
## 3 9.41 Full democracy
## 4 10.00 Full democracy
## 5 9.12 Full democracy
## 6 10.00 Full democracy
str(data2)
## 'data.frame': 171 obs. of 9 variables:
## $ Democracy.Index.2017.Rank : chr "1" "2" "3" "4" ...
## $ Democracy.Index.2017.Country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ Democracy.Index.2017.Score : chr "9.87" "9.58" "9.39" "9.26" ...
## $ Democracy.Index.2017.Electoral.process.and.pluralism: chr "10.00" "10.00" "9.58" "10.00" ...
## $ Democracy.Index.2017.Functioning.of.government : chr "9.64" "9.29" "9.64" "9.29" ...
## $ Democracy.Index.2017.Political.participation : chr "10.00" "8.89" "8.33" "8.89" ...
## $ Democracy.Index.2017.Political.culture : chr "10.00" "10.00" "10.00" "8.13" ...
## $ Democracy.Index.2017.Civil.liberties : chr "9.71" "9.71" "9.41" "10.00" ...
## $ Democracy.Index.2017.Category : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
# Replace the long scraped headers with short column names.
names(data2) <- c(
  "rank", "pais", "score", "electo", "gobierno",
  "participacion", "cultura", "libertades", "nivelDemocracia"
)
head(data2)
## rank pais score electo gobierno participacion cultura libertades
## 1 1 Norway 9.87 10.00 9.64 10.00 10.00 9.71
## 2 2 Iceland 9.58 10.00 9.29 8.89 10.00 9.71
## 3 3 Sweden 9.39 9.58 9.64 8.33 10.00 9.41
## 4 4 New Zealand 9.26 10.00 9.29 8.89 8.13 10.00
## 5 5 Denmark 9.22 10.00 9.29 8.33 9.38 9.12
## 6 =6 Ireland 9.15 9.58 7.86 8.33 10.00 10.00
## nivelDemocracia
## 1 Full democracy
## 2 Full democracy
## 3 Full democracy
## 4 Full democracy
## 5 Full democracy
## 6 Full democracy
# Convert the score and the five sub-indices (columns 3 to 8) to numeric.
# Non-numeric cells become NA, which triggers one warning per column.
data2[, c(3:8)] <- lapply(data2[, c(3:8)], as.numeric)
## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(data2[, c(3:8)], as.numeric): NAs introducidos por
## coerción
str(data2)
## 'data.frame': 171 obs. of 9 variables:
## $ rank : chr "1" "2" "3" "4" ...
## $ pais : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ score : num 9.87 9.58 9.39 9.26 9.22 9.15 9.15 9.09 9.03 9.03 ...
## $ electo : num 10 10 9.58 10 10 9.58 9.58 10 10 9.58 ...
## $ gobierno : num 9.64 9.29 9.64 9.29 9.29 7.86 9.64 8.93 8.93 9.29 ...
## $ participacion : num 10 8.89 8.33 8.89 8.33 8.33 7.78 7.78 7.78 7.78 ...
## $ cultura : num 10 10 10 8.13 9.38 10 8.75 8.75 8.75 9.38 ...
## $ libertades : num 9.71 9.71 9.41 10 9.12 10 10 10 9.71 9.12 ...
## $ nivelDemocracia: chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
# Keep only the rows without any NA (drops the rows that failed coercion).
data2 <- data2[complete.cases(data2), ]
head(data2)
## rank pais score electo gobierno participacion cultura libertades
## 1 1 Norway 9.87 10.00 9.64 10.00 10.00 9.71
## 2 2 Iceland 9.58 10.00 9.29 8.89 10.00 9.71
## 3 3 Sweden 9.39 9.58 9.64 8.33 10.00 9.41
## 4 4 New Zealand 9.26 10.00 9.29 8.89 8.13 10.00
## 5 5 Denmark 9.22 10.00 9.29 8.33 9.38 9.12
## 6 =6 Ireland 9.15 9.58 7.86 8.33 10.00 10.00
## nivelDemocracia
## 1 Full democracy
## 2 Full democracy
## 3 Full democracy
## 4 Full democracy
## 5 Full democracy
## 6 Full democracy
# The earlier merge moved the country names into a "Row.names" column;
# restore them as row names and drop that column.
rownames(datasub) <- datasub[["Row.names"]]
datasub <- datasub[, -1]
head(datasub)
## gdp social lifeexp freedom generosity corruption PC1
## Afghanistan 0.332 0.537 0.255 0.085 0.191 0.036 -2.0959561
## Albania 0.916 0.817 0.790 0.419 0.149 0.032 -0.3440926
## Algeria 0.979 1.154 0.687 0.077 0.055 0.135 -0.4409254
## Angola 0.730 1.125 0.269 0.000 0.079 0.061 -1.3917248
## Argentina 1.073 1.468 0.744 0.570 0.062 0.054 0.4974521
## Armenia 0.816 0.990 0.666 0.260 0.077 0.028 -0.6877787
# Join the happiness data (keyed by row name) with the democracy data (keyed
# by the "pais" column). Only countries present in both tables are kept, and
# the two "score" columns are disambiguated as score.x / score.y.
grandata <- merge(data, data2, by.x = "row.names", by.y = "pais")
head(grandata)
## Row.names score.x gdp social lifeexp freedom generosity corruption
## 1 Afghanistan 3.632 0.332 0.537 0.255 0.085 0.191 0.036
## 2 Albania 4.586 0.916 0.817 0.790 0.419 0.149 0.032
## 3 Algeria 5.295 0.979 1.154 0.687 0.077 0.055 0.135
## 4 Angola 3.795 0.730 1.125 0.269 0.000 0.079 0.061
## 5 Argentina 6.388 1.073 1.468 0.744 0.570 0.062 0.054
## 6 Armenia 4.321 0.816 0.990 0.666 0.260 0.077 0.028
## rank score.y electo gobierno participacion cultura libertades
## 1 149 2.55 2.50 1.14 2.78 2.50 3.82
## 2 77 5.98 7.00 4.71 5.56 5.00 7.65
## 3 128 3.56 2.58 2.21 3.89 5.00 4.12
## 4 125 3.62 1.75 2.86 5.56 5.00 2.94
## 5 48 6.96 9.17 5.00 6.11 6.88 7.65
## 6 111 4.11 5.25 2.86 5.00 1.88 5.59
## nivelDemocracia
## 1 Authoritarian
## 2 Hybrid regime
## 3 Authoritarian
## 4 Authoritarian
## 5 Flawed democracy
## 6 Hybrid regime
# Pearson correlation test between the happiness score (score.x) and the
# democracy score (score.y).
cor.test(grandata$score.x, grandata$score.y)
##
## Pearson's product-moment correlation
##
## data: grandata$score.x and grandata$score.y
## t = 9.7281, df = 145, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5193690 0.7173363
## sample estimates:
## cor
## 0.628423
# Keep only the eleven numeric indicators: six happiness components and five
# democracy sub-indices. The columns to discard are named explicitly instead
# of being addressed by position (1, 2, 9, 10, 16), so the selection does not
# silently shift if the column order of the merge ever changes.
grandatasub <- grandata[, setdiff(
  names(grandata),
  c("Row.names", "score.x", "rank", "score.y", "nivelDemocracia")
)]
rownames(grandatasub) <- grandata[["Row.names"]]
head(grandatasub)
## gdp social lifeexp freedom generosity corruption electo
## Afghanistan 0.332 0.537 0.255 0.085 0.191 0.036 2.50
## Albania 0.916 0.817 0.790 0.419 0.149 0.032 7.00
## Algeria 0.979 1.154 0.687 0.077 0.055 0.135 2.58
## Angola 0.730 1.125 0.269 0.000 0.079 0.061 1.75
## Argentina 1.073 1.468 0.744 0.570 0.062 0.054 9.17
## Armenia 0.816 0.990 0.666 0.260 0.077 0.028 5.25
## gobierno participacion cultura libertades
## Afghanistan 1.14 2.78 2.50 3.82
## Albania 4.71 5.56 5.00 7.65
## Algeria 2.21 3.89 5.00 4.12
## Angola 2.86 5.56 5.00 2.94
## Argentina 5.00 6.11 6.88 7.65
## Armenia 2.86 5.00 1.88 5.59
str(grandatasub)
## 'data.frame': 147 obs. of 11 variables:
## $ gdp : num 0.332 0.916 0.979 0.73 1.073 ...
## $ social : num 0.537 0.817 1.154 1.125 1.468 ...
## $ lifeexp : num 0.255 0.79 0.687 0.269 0.744 0.666 0.91 0.891 0.603 0.698 ...
## $ freedom : num 0.085 0.419 0.077 0 0.57 0.26 0.647 0.617 0.43 0.594 ...
## $ generosity : num 0.191 0.149 0.055 0.079 0.062 0.077 0.361 0.242 0.031 0.243 ...
## $ corruption : num 0.036 0.032 0.135 0.061 0.054 0.028 0.302 0.224 0.176 0.123 ...
## $ electo : num 2.5 7 2.58 1.75 9.17 5.25 10 9.58 0.5 0.83 ...
## $ gobierno : num 1.14 4.71 2.21 2.86 5 2.86 8.93 8.21 2.14 3.21 ...
## $ participacion: num 2.78 5.56 3.89 5.56 6.11 5 7.78 8.33 3.33 2.78 ...
## $ cultura : num 2.5 5 5 5 6.88 1.88 8.75 6.88 3.75 4.38 ...
## $ libertades : num 3.82 7.65 4.12 2.94 7.65 5.59 10 9.12 3.53 2.35 ...
summary(grandatasub)
## gdp social lifeexp freedom
## Min. :0.0240 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6360 1st Qu.:1.075 1st Qu.:0.4310 1st Qu.:0.3575
## Median :0.9650 Median :1.279 Median :0.6690 Median :0.4810
## Mean :0.9063 Mean :1.223 Mean :0.6069 Mean :0.4552
## 3rd Qu.:1.1985 3rd Qu.:1.467 3rd Qu.:0.7830 3rd Qu.:0.5790
## Max. :2.0960 Max. :1.644 Max. :1.0300 Max. :0.7240
##
## generosity corruption electo gobierno
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. :0.000
## 1st Qu.:0.1090 1st Qu.:0.0510 1st Qu.: 3.500 1st Qu.:3.210
## Median :0.1720 Median :0.0820 Median : 7.000 Median :5.360
## Mean :0.1808 Mean :0.1129 Mean : 6.161 Mean :5.061
## 3rd Qu.:0.2425 3rd Qu.:0.1375 3rd Qu.: 9.170 3rd Qu.:7.055
## Max. :0.5980 Max. :0.4570 Max. :10.000 Max. :9.640
## NA's :1
## participacion cultura libertades
## Min. : 1.110 Min. : 1.880 Min. : 0.000
## 1st Qu.: 3.890 1st Qu.: 4.380 1st Qu.: 3.820
## Median : 5.560 Median : 5.630 Median : 6.180
## Mean : 5.283 Mean : 5.721 Mean : 6.019
## 3rd Qu.: 6.670 3rd Qu.: 6.880 3rd Qu.: 8.240
## Max. :10.000 Max. :10.000 Max. :10.000
##
# Mean imputation: for every column, replace its missing values with that
# column's mean (computed ignoring the NAs). Columns without NAs are left
# unchanged. seq_along() replaces 1:ncol(), which would iterate over c(1, 0)
# on a data frame with zero columns.
for (i in seq_along(grandatasub)) {
  MEDIA <- Mean(grandatasub[, i], na.rm = TRUE)
  grandatasub[is.na(grandatasub[, i]), i] <- MEDIA
}
# Check that no NA remains.
summary(grandatasub)
## gdp social lifeexp freedom
## Min. :0.0240 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6360 1st Qu.:1.075 1st Qu.:0.4310 1st Qu.:0.3575
## Median :0.9650 Median :1.279 Median :0.6690 Median :0.4810
## Mean :0.9063 Mean :1.223 Mean :0.6069 Mean :0.4552
## 3rd Qu.:1.1985 3rd Qu.:1.467 3rd Qu.:0.7830 3rd Qu.:0.5790
## Max. :2.0960 Max. :1.644 Max. :1.0300 Max. :0.7240
## generosity corruption electo gobierno
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. :0.000
## 1st Qu.:0.1090 1st Qu.:0.0510 1st Qu.: 3.500 1st Qu.:3.210
## Median :0.1720 Median :0.0820 Median : 7.000 Median :5.360
## Mean :0.1808 Mean :0.1129 Mean : 6.161 Mean :5.061
## 3rd Qu.:0.2425 3rd Qu.:0.1370 3rd Qu.: 9.170 3rd Qu.:7.055
## Max. :0.5980 Max. :0.4570 Max. :10.000 Max. :9.640
## participacion cultura libertades
## Min. : 1.110 Min. : 1.880 Min. : 0.000
## 1st Qu.: 3.890 1st Qu.: 4.380 1st Qu.: 3.820
## Median : 5.560 Median : 5.630 Median : 6.180
## Mean : 5.283 Mean : 5.721 Mean : 6.019
## 3rd Qu.: 6.670 3rd Qu.: 6.880 3rd Qu.: 8.240
## Max. :10.000 Max. :10.000 Max. :10.000
# Pairwise Pearson correlations between the eleven indicators.
cor(grandatasub, method = "pearson")
## gdp social lifeexp freedom generosity
## gdp 1.000000000 0.66552882 0.83800846 0.3278154 -0.005575958
## social 0.665528820 1.00000000 0.65829343 0.4244563 0.023068484
## lifeexp 0.838008459 0.65829343 1.00000000 0.3638654 0.028594316
## freedom 0.327815360 0.42445627 0.36386540 1.0000000 0.307574777
## generosity -0.005575958 0.02306848 0.02859432 0.3075748 1.000000000
## corruption 0.338800708 0.24852155 0.33697880 0.4676990 0.365945968
## electo 0.351863856 0.42250848 0.44891417 0.3228176 -0.007702789
## gobierno 0.605558181 0.58744061 0.65744729 0.5485786 0.169111516
## participacion 0.454287202 0.48755343 0.53777861 0.2584781 0.069957402
## cultura 0.456853938 0.48112521 0.50091925 0.4282252 0.278326334
## libertades 0.486068072 0.52188520 0.56553237 0.3615164 -0.001970897
## corruption electo gobierno participacion cultura
## gdp 0.3388007 0.351863856 0.6055582 0.4542872 0.4568539
## social 0.2485215 0.422508481 0.5874406 0.4875534 0.4811252
## lifeexp 0.3369788 0.448914171 0.6574473 0.5377786 0.5009192
## freedom 0.4676990 0.322817572 0.5485786 0.2584781 0.4282252
## generosity 0.3659460 -0.007702789 0.1691115 0.0699574 0.2783263
## corruption 1.0000000 0.070511796 0.3843728 0.2459468 0.4809135
## electo 0.0705118 1.000000000 0.7863587 0.6989020 0.5090414
## gobierno 0.3843728 0.786358743 1.0000000 0.6906340 0.6842663
## participacion 0.2459468 0.698901986 0.6906340 1.0000000 0.5856477
## cultura 0.4809135 0.509041443 0.6842663 0.5856477 1.0000000
## libertades 0.1885590 0.900643290 0.8304343 0.7345479 0.6155471
## libertades
## gdp 0.486068072
## social 0.521885201
## lifeexp 0.565532370
## freedom 0.361516367
## generosity -0.001970897
## corruption 0.188558959
## electo 0.900643290
## gobierno 0.830434271
## participacion 0.734547869
## cultura 0.615547145
## libertades 1.000000000
# Kaiser-Meyer-Olkin sampling adequacy for the combined indicator set.
KMO(r = cor(grandatasub))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(grandatasub))
## Overall MSA = 0.87
## MSA for each item =
## gdp social lifeexp freedom generosity
## 0.81 0.93 0.87 0.84 0.70
## corruption electo gobierno participacion cultura
## 0.80 0.80 0.91 0.95 0.92
## libertades
## 0.86
# Bartlett's test on the correlation matrix; n is the number of countries.
cortest.bartlett(cor(grandatasub), n = nrow(grandatasub))
## $chisq
## [1] 1195.888
##
## $p.value
## [1] 3.962941e-214
##
## $df
## [1] 55
# Eigenvalues of the combined correlation matrix, used to decide how many
# components to retain.
eigenf <- eigen(cor(grandatasub))
eigenf[["values"]]
## [1] 5.67443754 1.58080581 1.15675205 0.64709382 0.56949856 0.40677485
## [7] 0.33748823 0.26957468 0.15620530 0.12248755 0.07888163
# Principal components analysis on the combined correlation matrix, keeping
# two varimax-rotated components. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
resultadoPr <- principal(
  cor(grandatasub),
  nfactors = 2,
  rotate = "varimax",
  scores = TRUE
)
# Show loadings with three decimals, hiding those below 0.40 in absolute value.
print(resultadoPr, digits = 3, cut = 0.40)
## Principal Components Analysis
## Call: principal(r = cor(grandatasub), nfactors = 2, rotate = "varimax",
## scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## gdp 0.698 0.547 0.453 1.24
## social 0.704 0.545 0.455 1.20
## lifeexp 0.759 0.634 0.366 1.20
## freedom 0.642 0.552 0.448 1.61
## generosity 0.752 0.577 0.423 1.04
## corruption 0.801 0.677 0.323 1.11
## electo 0.855 0.738 0.262 1.02
## gobierno 0.872 0.851 0.149 1.23
## participacion 0.810 0.661 0.339 1.02
## cultura 0.645 0.474 0.641 0.359 1.84
## libertades 0.913 0.833 0.167 1.00
##
## RC1 RC2
## SS loadings 5.142 2.113
## Proportion Var 0.467 0.192
## Cumulative Var 0.467 0.660
## Proportion Explained 0.709 0.291
## Cumulative Proportion 0.709 1.000
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.099
##
## Fit based upon off diagonal values = 0.959
PARTE 3:
# Read the first sheet of data/gdi.xlsx, skipping empty rows and columns.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
folder <- "data"
fileName <- "gdi.xlsx"
fileToRead <- file.path(folder, fileName)
library(openxlsx)
gdi <- openxlsx::read.xlsx(
  fileToRead,
  sheet = 1,
  skipEmptyRows = TRUE,
  skipEmptyCols = TRUE
)
head(gdi)
## X1 Table.4..Gender.Development.Index X3
## 1 <NA> <NA> Gender Development Index
## 2 <NA> <NA> Value
## 3 HDI rank Country <NA>
## 4 <NA> <NA> 2015
## 5 <NA> VERY HIGH HUMAN DEVELOPMENT <NA>
## 6 1 Norway 0.99346052864650269
## X4 X5 X6 X7
## 1 <NA> <NA> Human Development Index (HDI) <NA>
## 2 Group b Value <NA>
## 3 <NA> <NA> Female Male
## 4 2015 <NA> 2015 2015
## 5 <NA> <NA> <NA> <NA>
## 6 1 <NA> 0.94442473100258917 0.95064142335808732
## X8 X9 X10
## 1 Life expectancy at birth <NA> Expected years of schooling
## 2 (years) <NA> (years)
## 3 Female Male Female
## 4 2015 2015 2015
## 5 <NA> <NA> <NA>
## 6 83.691000000000003 79.715999999999994 18.306629999999998
## X11 X12 X13 X14 X15 X16 X17
## 1 <NA> <NA> <NA> Mean years of schooling <NA> <NA> <NA>
## 2 <NA> <NA> <NA> (years) <NA> <NA> <NA>
## 3 <NA> Male <NA> Female <NA> Male <NA>
## 4 c 2015 c 2015 c 2015 c
## 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 <NA> 17.07009 <NA> 12.79505 <NA> 12.701599999999999 <NA>
## X18 X19 X20
## 1 Estimated gross national income per capita <NA> a
## 2 (2011 PPP $) <NA> <NA>
## 3 Female Male <NA>
## 4 2015 2015 <NA>
## 5 <NA> <NA> <NA>
## 6 59799.658740179249 75314.004346281479 <NA>
Elimino columnas innecesarias:
# Keep the first three columns: HDI rank, country and the GDI value.
gdi <- gdi[, 1:3]
head(gdi)
## X1 Table.4..Gender.Development.Index X3
## 1 <NA> <NA> Gender Development Index
## 2 <NA> <NA> Value
## 3 HDI rank Country <NA>
## 4 <NA> <NA> 2015
## 5 <NA> VERY HIGH HUMAN DEVELOPMENT <NA>
## 6 1 Norway 0.99346052864650269
Elimino filas innecesarias:
# Drop the five leading rows, which hold the spreadsheet's header text and a
# section title rather than country data.
gdi <- gdi[-(1:5), ]
head(gdi)
## X1 Table.4..Gender.Development.Index X3
## 6 1 Norway 0.99346052864650269
## 7 2 Australia 0.97787279010186912
## 8 2 Switzerland 0.9737969809733199
## 9 4 Germany 0.9637034038409088
## 10 5 Denmark 0.97037586022448263
## 11 5 Singapore 0.98468003632222867
# Remove the HDI rank column and renumber the rows from 1.
gdi <- gdi[, -1]
rownames(gdi) <- NULL
head(gdi)
## Table.4..Gender.Development.Index X3
## 1 Norway 0.99346052864650269
## 2 Australia 0.97787279010186912
## 3 Switzerland 0.9737969809733199
## 4 Germany 0.9637034038409088
## 5 Denmark 0.97037586022448263
## 6 Singapore 0.98468003632222867
# Short column names. "coutry" (sic) is kept as spelled because the later
# merge with grandata refers to the column by this exact name.
names(gdi) <- c("coutry", "gdi2015")
head(gdi)
## coutry gdi2015
## 1 Norway 0.99346052864650269
## 2 Australia 0.97787279010186912
## 3 Switzerland 0.9737969809733199
## 4 Germany 0.9637034038409088
## 5 Denmark 0.97037586022448263
## 6 Singapore 0.98468003632222867
# Cells holding ".." are turned into NA.
is.na(gdi) <- gdi == ".."
Elimino filas innecesarias al final:
# Remove rows 192 to 244, which follow the last country row, and check the
# new tail of the table.
gdi <- gdi[-(192:244), ]
tail(gdi, n = 10)
## coutry gdi2015
## 182 Eritrea <NA>
## 183 Sierra Leone 0.87075650936199367
## 184 Mozambique 0.87935751460598066
## 185 South Sudan <NA>
## 186 Guinea 0.78401993273328008
## 187 Burundi 0.9187330348227295
## 188 Burkina Faso 0.87433676945691585
## 189 Chad 0.76496475019421595
## 190 Niger 0.73159473547481912
## 191 Central African Republic 0.77631694317566446
Me quedo con los casos completos:
# Keep only the rows without any NA (countries with a reported GDI value).
gdi <- gdi[complete.cases(gdi), ]
head(gdi)
## coutry gdi2015
## 1 Norway 0.99346052864650269
## 2 Australia 0.97787279010186912
## 3 Switzerland 0.9737969809733199
## 4 Germany 0.9637034038409088
## 5 Denmark 0.97037586022448263
## 6 Singapore 0.98468003632222867
str(gdi)
## 'data.frame': 160 obs. of 2 variables:
## $ coutry : chr "Norway" "Australia" "Switzerland" "Germany" ...
## $ gdi2015: chr "0.99346052864650269" "0.97787279010186912" "0.9737969809733199" "0.9637034038409088" ...
# The GDI values were read as text; convert them to numeric.
gdi[["gdi2015"]] <- as.numeric(gdi[["gdi2015"]])
str(gdi)
## 'data.frame': 160 obs. of 2 variables:
## $ coutry : chr "Norway" "Australia" "Switzerland" "Germany" ...
## $ gdi2015: num 0.993 0.978 0.974 0.964 0.97 ...
# Add the GDI value to the combined table, matching the country name held in
# "Row.names" against gdi's "coutry" column. Unmatched countries are dropped.
grandata2 <- merge(grandata, gdi, by.x = "Row.names", by.y = "coutry")
head(grandata2)
## Row.names score.x gdp social lifeexp freedom generosity corruption
## 1 Afghanistan 3.632 0.332 0.537 0.255 0.085 0.191 0.036
## 2 Albania 4.586 0.916 0.817 0.790 0.419 0.149 0.032
## 3 Algeria 5.295 0.979 1.154 0.687 0.077 0.055 0.135
## 4 Argentina 6.388 1.073 1.468 0.744 0.570 0.062 0.054
## 5 Armenia 4.321 0.816 0.990 0.666 0.260 0.077 0.028
## 6 Australia 7.272 1.340 1.573 0.910 0.647 0.361 0.302
## rank score.y electo gobierno participacion cultura libertades
## 1 149 2.55 2.50 1.14 2.78 2.50 3.82
## 2 77 5.98 7.00 4.71 5.56 5.00 7.65
## 3 128 3.56 2.58 2.21 3.89 5.00 4.12
## 4 48 6.96 9.17 5.00 6.11 6.88 7.65
## 5 111 4.11 5.25 2.86 5.00 1.88 5.59
## 6 8 9.09 10.00 8.93 7.78 8.75 10.00
## nivelDemocracia gdi2015
## 1 Authoritarian 0.6089179
## 2 Hybrid regime 0.9593885
## 3 Authoritarian 0.8544616
## 4 Flawed democracy 0.9818548
## 5 Hybrid regime 0.9931695
## 6 Full democracy 0.9778728
# Keep the three summary measures: happiness score, democracy score and GDI.
# Columns are selected by name rather than by position (2, 10, 17), so the
# selection stays correct if the merged column order ever changes.
grandata2sub <- grandata2[, c("score.x", "score.y", "gdi2015")]
rownames(grandata2sub) <- grandata2[["Row.names"]]
head(grandata2sub)
## score.x score.y gdi2015
## Afghanistan 3.632 2.55 0.6089179
## Albania 4.586 5.98 0.9593885
## Algeria 5.295 3.56 0.8544616
## Argentina 6.388 6.96 0.9818548
## Armenia 4.321 4.11 0.9931695
## Australia 7.272 9.09 0.9778728
str(grandata2sub)
## 'data.frame': 128 obs. of 3 variables:
## $ score.x: num 3.63 4.59 5.29 6.39 4.32 ...
## $ score.y: num 2.55 5.98 3.56 6.96 4.11 9.09 8.42 2.65 2.71 5.43 ...
## $ gdi2015: num 0.609 0.959 0.854 0.982 0.993 ...
summary(grandata2sub)
## score.x score.y gdi2015
## Min. :2.905 Min. :1.500 Min. :0.6089
## 1st Qu.:4.454 1st Qu.:4.072 1st Qu.:0.9032
## Median :5.441 Median :6.230 Median :0.9614
## Mean :5.459 Mean :5.829 Mean :0.9351
## 3rd Qu.:6.350 3rd Qu.:7.522 3rd Qu.:0.9836
## Max. :7.632 Max. :9.870 Max. :1.0316
# Standardize the three measures (centered on the mean, divided by the
# standard deviation) before computing distances.
grandata2sub_s <- scale(grandata2sub, center = TRUE, scale = TRUE)
summary(grandata2sub_s)
## score.x score.y gdi2015
## Min. :-2.23696 Min. :-2.0380 Min. :-4.4790
## 1st Qu.:-0.88051 1st Qu.:-0.8270 1st Qu.:-0.4375
## Median :-0.01585 Median : 0.1885 Median : 0.3611
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.78029 3rd Qu.: 0.7970 3rd Qu.: 0.6669
## Max. : 1.90311 Max. : 1.9020 Max. : 1.3258
# Distances between countries, then classical multidimensional scaling down
# to k = 2 dimensions (one pair of coordinates per country).
grandata2sub_d <- dist(grandata2sub_s)
grandata2sub_r <- cmdscale(grandata2sub_d, k = 2, eig = TRUE)
# Goodness of fit of the 2-D representation: the closer to 1, the better.
grandata2sub_r[["GOF"]]
## [1] 0.8766841 0.8766841
# First a plain scatter plot of the two MDS coordinates.
titulo <- "Mapa de Similitudes entre países"
x <- grandata2sub_r$points[, 1]
y <- grandata2sub_r$points[, 2]
plot(x, y, main = titulo)
# Then the same map as an empty frame (type = "n" suppresses the point
# symbols), with each country's name written at its coordinates.
plot(
  x, y,
  type = "n", main = titulo,
  xlab = "Dimension 1", ylab = "Dimension 2"
)
# Labels are the row names of the coordinate matrix (the country names).
columnForLabels <- rownames(grandata2sub_r$points)
text(x, y, labels = columnForLabels, cex = 0.6)  # cex sets the label size
¡Singapur no está en los nórdicos!