Cargo la data:
# instalen paquetes antes de activarlos!
library(XML)
library(RCurl)
## Loading required package: bitops
# URL
link = "https://en.wikipedia.org/wiki/World_Happiness_Report"
# Data
wikiLinkContents=getURL(link)
wikiTables=readHTMLTable(wikiLinkContents,stringsAsFactors=FALSE)
Selecciono mi tabla de interés:
data=as.data.frame(wikiTables[5])
head(data)
## NULL.Overall.Rank NULL.Country NULL.Score NULL.GDP.per.capita
## 1 1 Finland 7.632 1.305
## 2 2 Norway 7.594 1.456
## 3 3 Denmark 7.555 1.351
## 4 4 Iceland 7.495 1.343
## 5 5 Switzerland 7.487 1.420
## 6 6 Netherlands 7.441 1.361
## NULL.Social.support NULL.Healthy.life.expectancy
## 1 1.592 0.874
## 2 1.582 0.861
## 3 1.590 0.868
## 4 1.644 0.914
## 5 1.549 0.927
## 6 1.488 0.878
## NULL.Freedom.to.make.life.choices NULL.Generosity
## 1 0.681 0.192
## 2 0.686 0.286
## 3 0.683 0.284
## 4 0.677 0.353
## 5 0.660 0.256
## 6 0.638 0.333
## NULL.Perceptions.of.corruption
## 1 0.393
## 2 0.340
## 3 0.408
## 4 0.138
## 5 0.357
## 6 0.295
Veo estructura:
str(data)
## 'data.frame': 156 obs. of 9 variables:
## $ NULL.Overall.Rank : chr "1" "2" "3" "4" ...
## $ NULL.Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ NULL.Score : chr "7.632" "7.594" "7.555" "7.495" ...
## $ NULL.GDP.per.capita : chr "1.305" "1.456" "1.351" "1.343" ...
## $ NULL.Social.support : chr "1.592" "1.582" "1.590" "1.644" ...
## $ NULL.Healthy.life.expectancy : chr "0.874" "0.861" "0.868" "0.914" ...
## $ NULL.Freedom.to.make.life.choices: chr "0.681" "0.686" "0.683" "0.677" ...
## $ NULL.Generosity : chr "0.192" "0.286" "0.284" "0.353" ...
## $ NULL.Perceptions.of.corruption : chr "0.393" "0.340" "0.408" "0.138" ...
Formateo:
data[,c(3:9)]=lapply(data[,c(3:9)],as.numeric)
## Warning in lapply(data[, c(3:9)], as.numeric): NAs introducidos por
## coerción
str(data)
## 'data.frame': 156 obs. of 9 variables:
## $ NULL.Overall.Rank : chr "1" "2" "3" "4" ...
## $ NULL.Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ NULL.Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ NULL.GDP.per.capita : num 1.3 1.46 1.35 1.34 1.42 ...
## $ NULL.Social.support : num 1.59 1.58 1.59 1.64 1.55 ...
## $ NULL.Healthy.life.expectancy : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ NULL.Freedom.to.make.life.choices: num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ NULL.Generosity : num 0.192 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ NULL.Perceptions.of.corruption : num 0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
head(data)
## NULL.Overall.Rank NULL.Country NULL.Score NULL.GDP.per.capita
## 1 1 Finland 7.632 1.305
## 2 2 Norway 7.594 1.456
## 3 3 Denmark 7.555 1.351
## 4 4 Iceland 7.495 1.343
## 5 5 Switzerland 7.487 1.420
## 6 6 Netherlands 7.441 1.361
## NULL.Social.support NULL.Healthy.life.expectancy
## 1 1.592 0.874
## 2 1.582 0.861
## 3 1.590 0.868
## 4 1.644 0.914
## 5 1.549 0.927
## 6 1.488 0.878
## NULL.Freedom.to.make.life.choices NULL.Generosity
## 1 0.681 0.192
## 2 0.686 0.286
## 3 0.683 0.284
## 4 0.677 0.353
## 5 0.660 0.256
## 6 0.638 0.333
## NULL.Perceptions.of.corruption
## 1 0.393
## 2 0.340
## 3 0.408
## 4 0.138
## 5 0.357
## 6 0.295
Limpieza:
data=data[,-c(1)]
head(data)
## NULL.Country NULL.Score NULL.GDP.per.capita NULL.Social.support
## 1 Finland 7.632 1.305 1.592
## 2 Norway 7.594 1.456 1.582
## 3 Denmark 7.555 1.351 1.590
## 4 Iceland 7.495 1.343 1.644
## 5 Switzerland 7.487 1.420 1.549
## 6 Netherlands 7.441 1.361 1.488
## NULL.Healthy.life.expectancy NULL.Freedom.to.make.life.choices
## 1 0.874 0.681
## 2 0.861 0.686
## 3 0.868 0.683
## 4 0.914 0.677
## 5 0.927 0.660
## 6 0.878 0.638
## NULL.Generosity NULL.Perceptions.of.corruption
## 1 0.192 0.393
## 2 0.286 0.340
## 3 0.284 0.408
## 4 0.353 0.138
## 5 0.256 0.357
## 6 0.333 0.295
Pongo los países como índice:
row.names(data)=data$NULL.Country
head(data)
## NULL.Country NULL.Score NULL.GDP.per.capita
## Finland Finland 7.632 1.305
## Norway Norway 7.594 1.456
## Denmark Denmark 7.555 1.351
## Iceland Iceland 7.495 1.343
## Switzerland Switzerland 7.487 1.420
## Netherlands Netherlands 7.441 1.361
## NULL.Social.support NULL.Healthy.life.expectancy
## Finland 1.592 0.874
## Norway 1.582 0.861
## Denmark 1.590 0.868
## Iceland 1.644 0.914
## Switzerland 1.549 0.927
## Netherlands 1.488 0.878
## NULL.Freedom.to.make.life.choices NULL.Generosity
## Finland 0.681 0.192
## Norway 0.686 0.286
## Denmark 0.683 0.284
## Iceland 0.677 0.353
## Switzerland 0.660 0.256
## Netherlands 0.638 0.333
## NULL.Perceptions.of.corruption
## Finland 0.393
## Norway 0.340
## Denmark 0.408
## Iceland 0.138
## Switzerland 0.357
## Netherlands 0.295
data=data[,-c(1)]
head(data)
## NULL.Score NULL.GDP.per.capita NULL.Social.support
## Finland 7.632 1.305 1.592
## Norway 7.594 1.456 1.582
## Denmark 7.555 1.351 1.590
## Iceland 7.495 1.343 1.644
## Switzerland 7.487 1.420 1.549
## Netherlands 7.441 1.361 1.488
## NULL.Healthy.life.expectancy NULL.Freedom.to.make.life.choices
## Finland 0.874 0.681
## Norway 0.861 0.686
## Denmark 0.868 0.683
## Iceland 0.914 0.677
## Switzerland 0.927 0.660
## Netherlands 0.878 0.638
## NULL.Generosity NULL.Perceptions.of.corruption
## Finland 0.192 0.393
## Norway 0.286 0.340
## Denmark 0.284 0.408
## Iceland 0.353 0.138
## Switzerland 0.256 0.357
## Netherlands 0.333 0.295
names(data)=c("score","gdp","social","lifeexp","freedom","generosity","corruption")
head(data)
## score gdp social lifeexp freedom generosity corruption
## Finland 7.632 1.305 1.592 0.874 0.681 0.192 0.393
## Norway 7.594 1.456 1.582 0.861 0.686 0.286 0.340
## Denmark 7.555 1.351 1.590 0.868 0.683 0.284 0.408
## Iceland 7.495 1.343 1.644 0.914 0.677 0.353 0.138
## Switzerland 7.487 1.420 1.549 0.927 0.660 0.256 0.357
## Netherlands 7.441 1.361 1.488 0.878 0.638 0.333 0.295
¡Ya tengo la data limpia!
Creo subset sin considerar score
datasub=data[,c(2:7)]
head(datasub)
## gdp social lifeexp freedom generosity corruption
## Finland 1.305 1.592 0.874 0.681 0.192 0.393
## Norway 1.456 1.582 0.861 0.686 0.286 0.340
## Denmark 1.351 1.590 0.868 0.683 0.284 0.408
## Iceland 1.343 1.644 0.914 0.677 0.353 0.138
## Switzerland 1.420 1.549 0.927 0.660 0.256 0.357
## Netherlands 1.361 1.488 0.878 0.638 0.333 0.295
Veo si hay perdidos:
summary(datasub)
## gdp social lifeexp freedom
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6162 1st Qu.:1.067 1st Qu.:0.4223 1st Qu.:0.3560
## Median :0.9495 Median :1.255 Median :0.6440 Median :0.4870
## Mean :0.8914 Mean :1.213 Mean :0.5973 Mean :0.4545
## 3rd Qu.:1.1978 3rd Qu.:1.463 3rd Qu.:0.7772 3rd Qu.:0.5785
## Max. :2.0960 Max. :1.644 Max. :1.0300 Max. :0.7240
##
## generosity corruption
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.1095 1st Qu.:0.051
## Median :0.1740 Median :0.082
## Mean :0.1809 Mean :0.112
## 3rd Qu.:0.2390 3rd Qu.:0.137
## Max. :0.5980 Max. :0.457
## NA's :1
Imputo:
library(DescTools)
for(i in 1:ncol(datasub)){
MEDIA=Mean(datasub[,i], na.rm = TRUE)
datasub[is.na(datasub[,i]), i] = MEDIA
}
summary(datasub)
## gdp social lifeexp freedom
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6162 1st Qu.:1.067 1st Qu.:0.4223 1st Qu.:0.3560
## Median :0.9495 Median :1.255 Median :0.6440 Median :0.4870
## Mean :0.8914 Mean :1.213 Mean :0.5973 Mean :0.4545
## 3rd Qu.:1.1978 3rd Qu.:1.463 3rd Qu.:0.7772 3rd Qu.:0.5785
## Max. :2.0960 Max. :1.644 Max. :1.0300 Max. :0.7240
## generosity corruption
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1095 1st Qu.:0.0510
## Median :0.1740 Median :0.0820
## Mean :0.1809 Mean :0.1120
## 3rd Qu.:0.2390 3rd Qu.:0.1365
## Max. :0.5980 Max. :0.4570
¿Qué variable tiene correlación mas baja con las demás?
cor(datasub)
## gdp social lifeexp freedom generosity
## gdp 1.00000000 0.67173252 0.84427323 0.3217754 -0.01414976
## social 0.67173252 1.00000000 0.66707865 0.4110874 0.01740677
## lifeexp 0.84427323 0.66707865 1.00000000 0.3491445 0.01868503
## freedom 0.32177543 0.41108744 0.34914449 1.0000000 0.29710627
## generosity -0.01414976 0.01740677 0.01868503 0.2971063 1.00000000
## corruption 0.30105386 0.21687701 0.31031301 0.4607881 0.36036993
## corruption
## gdp 0.3010539
## social 0.2168770
## lifeexp 0.3103130
## freedom 0.4607881
## generosity 0.3603699
## corruption 1.0000000
Respuesta: generosity
Mapa de similitudes:
PASO 1: Estandarizo con scale:
datasub_s=scale(datasub)
summary(datasub_s)
## gdp social lifeexp freedom
## Min. :-2.2746 Min. :-4.0124 Min. :-2.4128 Min. :-2.7983
## 1st Qu.:-0.7022 1st Qu.:-0.4845 1st Qu.:-0.7072 1st Qu.:-0.6065
## Median : 0.1481 Median : 0.1381 Median : 0.1884 Median : 0.2001
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7815 3rd Qu.: 0.8260 3rd Qu.: 0.7267 3rd Qu.: 0.7634
## Max. : 3.0735 Max. : 1.4246 Max. : 1.7475 Max. : 1.6592
## generosity corruption
## Min. :-1.83772 Min. :-1.1645
## 1st Qu.:-0.72560 1st Qu.:-0.6342
## Median :-0.07051 Median :-0.3119
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.58966 3rd Qu.: 0.2547
## Max. : 4.23580 Max. : 3.5870
PASO 2: Saco las distancias con dist
datasub_d=dist(datasub_s)
PASO 3: Hago escalamiento multidimensional con cmdscale
datasub_r <- cmdscale(datasub_d,eig=TRUE, k=2) # k sugiere dimensiones. El resultado muestra las coordenadas (puntos) de cada provincia. DEJA EL k en 2.
datasub_r$GOF # GOF indica la bondad de ajuste, mientras mas cerca a 1 mejor.
## [1] 0.721218 0.721218
PASO 4: Dibujo el mapa
titulo="Mapa de Similitudes entre países basado en el Índice de Felicidad"
x <- datasub_r$points[,1]
y <- datasub_r$points[,2]
plot(x, y, main=titulo)
Pongo nombre de los países:
plot(x, y, xlab="Dimension 1", ylab="Dimension 2", main=titulo, type="n") # 'n' evita que se pongan los puntos.
columnForLabels=dimnames(datasub_r[[1]])[[1]] # etiquetas y colores de los puntos
text(x, y,labels = columnForLabels , cex = 0.5) #con cex indicamos el tamaño
Clusters:
PASO 1: Estandarizo. ¡Ya esta! ¡Sigue así, campeón!
PASO 2: Veo cuántos clusters debo sacar, según el algoritmo.
Método complete:
library(NbClust)
nb <- NbClust(datasub_s, method="complete")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 2 as the best number of clusters
## * 13 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Según el método complete, debes sacar tres clusters.
¿Y no habrá otro método? ¿Será K-means otro método? No lo sé.
Voy a ver:
library(NbClust)
nb <- NbClust(datasub_s, method="kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 15 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
Hay otro método en kmeans que me dice cuántos cluster sacar.
Según kmeans también debo sacar 3 clusters.
PCA:
PASO 1: sacar matriz de correlación
matriz=cor(datasub)
matriz
## gdp social lifeexp freedom generosity
## gdp 1.00000000 0.67173252 0.84427323 0.3217754 -0.01414976
## social 0.67173252 1.00000000 0.66707865 0.4110874 0.01740677
## lifeexp 0.84427323 0.66707865 1.00000000 0.3491445 0.01868503
## freedom 0.32177543 0.41108744 0.34914449 1.0000000 0.29710627
## generosity -0.01414976 0.01740677 0.01868503 0.2971063 1.00000000
## corruption 0.30105386 0.21687701 0.31031301 0.4607881 0.36036993
## corruption
## gdp 0.3010539
## social 0.2168770
## lifeexp 0.3103130
## freedom 0.4607881
## generosity 0.3603699
## corruption 1.0000000
PASO 2: KMO
Nota: KMO no es un una prueba de significancia
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
##
## Attaching package: 'psych'
## The following objects are masked from 'package:DescTools':
##
## AUC, ICC, SD
KMO(matriz)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = matriz)
## Overall MSA = 0.73
## MSA for each item =
## gdp social lifeexp freedom generosity corruption
## 0.70 0.84 0.71 0.76 0.63 0.72
El overall es de 0.73, entonces es adecuado hacer PCA.
PASO 3: Bartlett [esta SÍ ES UNA PRUEBA DE SIGNIFICANCIA]
cortest.bartlett(matriz,n=nrow(datasub))
## $chisq
## [1] 395.2955
##
## $p.value
## [1] 6.734106e-75
##
## $df
## [1] 15
La matriz de correlación NO ES UNA MATRIZ DE IDENTIDAD. Entonces, puedo hacer PCA.
PASO 4: Criterio de la varianza (eigenvalues)
eigenf=eigen(matriz)
eigenf$values
## [1] 2.9180016 1.4093064 0.6050568 0.5785018 0.3345613 0.1545721
Esta técnica me dice que podría reducir mis 6 variables originales en dos dimensiones.
PASO 5: Reduzco las 6 variables en un sola dimensión [“nuevo score”]
resultadoPr=principal(matriz,1,rotate="varimax",scores=T)
print(resultadoPr,digits=3,cut=0.40)
## Principal Components Analysis
## Call: principal(r = matriz, nfactors = 1, rotate = "varimax", scores = T)
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 h2 u2 com
## gdp 0.859 0.738 0.262 1
## social 0.806 0.650 0.350 1
## lifeexp 0.869 0.755 0.245 1
## freedom 0.643 0.414 0.586 1
## generosity 0.046 0.954 1
## corruption 0.561 0.315 0.685 1
##
## PC1
## SS loadings 2.918
## Proportion Var 0.486
##
## Mean item complexity = 1
## Test of the hypothesis that 1 component is sufficient.
##
## The root mean square of the residuals (RMSR) is 0.168
##
## Fit based upon off diagonal values = 0.841
PASO 6: Agregar a la data original
regresFactors=factor.scores(datasub,resultadoPr)$scores
datasub=merge(datasub,regresFactors,by.x = 0,by.y = 0)
head(datasub)
## Row.names gdp social lifeexp freedom generosity corruption
## 1 Afghanistan 0.332 0.537 0.255 0.085 0.191 0.036
## 2 Albania 0.916 0.817 0.790 0.419 0.149 0.032
## 3 Algeria 0.979 1.154 0.687 0.077 0.055 0.135
## 4 Angola 0.730 1.125 0.269 0.000 0.079 0.061
## 5 Argentina 1.073 1.468 0.744 0.570 0.062 0.054
## 6 Armenia 0.816 0.990 0.666 0.260 0.077 0.028
## PC1
## 1 -2.0959561
## 2 -0.3440926
## 3 -0.4409254
## 4 -1.3917248
## 5 0.4974521
## 6 -0.6877787