1. Scraping

library(htmltab)

# Page to scrape and the XPath expression used to locate the table in it.
linkPage <- "https://en.wikipedia.org/wiki/World_Happiness_Report"
linkPath <- '//*[@id="mw-content-text"]/div/table'

# Download the table into a data frame and inspect its structure.
feliz <- htmltab(doc = linkPage, which = linkPath)
str(feliz)

## 'data.frame':    156 obs. of  9 variables:
##  $ Overall rank                : chr  "1" "2" "3" "4" ...
##  $ Country or region           : chr  " Finland" " Denmark" " Norway" " Iceland" ...
##  $ Score                       : chr  "7.769" "7.600" "7.554" "7.494" ...
##  $ GDP per capita              : chr  "1.340" "1.383" "1.488" "1.380" ...
##  $ Social support              : chr  "1.587" "1.573" "1.582" "1.624" ...
##  $ Healthy life expectancy     : chr  "0.986" "0.996" "1.028" "1.026" ...
##  $ Freedom to make life choices: chr  "0.596" "0.592" "0.603" "0.591" ...
##  $ Generosity                  : chr  "0.153" "0.252" "0.271" "0.354" ...
##  $ Perceptions of corruption   : chr  "0.393" "0.410" "0.341" "0.118" ...

2. Limpieza

library(stringr)
library(magrittr)

# Drop the first column (the overall rank).
feliz[, 1] <- NULL
# Trim horizontal/vertical whitespace from both ends of every value.
feliz[] <- lapply(feliz, trimws, whitespace = "[\\h\\v]")
# Label each row with its country name.
row.names(feliz) <- feliz[["Country or region"]]
# Convert every column except the country name to numeric.
feliz[, -1] <- lapply(feliz[, -1], as.numeric)
str(feliz)

## 'data.frame':    156 obs. of  8 variables:
##  $ Country or region           : chr  "Finland" "Denmark" "Norway" "Iceland" ...
##  $ Score                       : num  7.77 7.6 7.55 7.49 7.49 ...
##  $ GDP per capita              : num  1.34 1.38 1.49 1.38 1.4 ...
##  $ Social support              : num  1.59 1.57 1.58 1.62 1.52 ...
##  $ Healthy life expectancy     : num  0.986 0.996 1.028 1.026 0.999 ...
##  $ Freedom to make life choices: num  0.596 0.592 0.603 0.591 0.557 0.572 0.574 0.585 0.584 0.532 ...
##  $ Generosity                  : num  0.153 0.252 0.271 0.354 0.322 0.263 0.267 0.33 0.285 0.244 ...
##  $ Perceptions of corruption   : num  0.393 0.41 0.341 0.118 0.298 0.343 0.373 0.38 0.308 0.226 ...

# List the rows that contain at least one missing value.
feliz[!complete.cases(feliz), ]

## [1] Country or region            Score                       
## [3] GDP per capita               Social support              
## [5] Healthy life expectancy      Freedom to make life choices
## [7] Generosity                   Perceptions of corruption   
## <0 rows> (or 0-length row.names)

Ninguna fila tiene valores perdidos: el filtro anterior devuelve 0 filas.

3. Análisis de conglomerados por partición

Primero calculamos distancias

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)

# Gower dissimilarities between countries, computed on columns 3 to 8
# (GDP per capita through Perceptions of corruption).
g.dist <- daisy(feliz[, 3:8], metric = "gower")

3 CLUSTERS

A. Proponer cantidad de clusters:

# Partition the countries into 3 clusters with PAM, using the precomputed
# dissimilarities in g.dist. cluster.only = FALSE keeps the full pam object
# instead of only the vector of labels.
pam.resultado1 <- pam(g.dist, 3, cluster.only = FALSE)
# Store each country's cluster label. The component name `clustering` is
# spelled out in full so the lookup does not depend on `$` partial matching.
feliz$clusterPT1 <- pam.resultado1$clustering

B. Exploramos resultados

library(plyr)  # provides each(), which applies several summaries at once

# Median (MD) and mean (Media) of columns 3 to 8 within each clusterPT1 group.
# Note that the left-hand side of the formula is built with as.matrix(cbind()).
agg1 <- aggregate(
  as.matrix(cbind(feliz[, 3:8])) ~ clusterPT1,
  data = feliz,
  FUN = plyr::each(MD = median, Media = mean)
)
# Convert to a data frame and transpose, so each cluster becomes a column.
tablaResumen1 <- t(as.data.frame(agg1))
tablaResumen1

##                                         [,1]       [,2]      [,3]
## clusterPT1                         1.0000000 2.00000000 3.0000000
## GDP per capita.MD                  1.3815000 1.04300000 0.3800000
## GDP per capita.Media               1.3937083 1.03589412 0.4096383
## Social support.MD                  1.5125000 1.35100000 0.9160000
## Social support.Media               1.4896250 1.31363529 0.8777660
## Healthy life expectancy.MD         0.9990000 0.81800000 0.4400000
## Healthy life expectancy.Media      0.9855000 0.81600000 0.4286383
## Freedom to make life choices.MD    0.5570000 0.41000000 0.3520000
## Freedom to make life choices.Media 0.5473750 0.38450588 0.3294681
## Generosity.MD                      0.2690000 0.12600000 0.2090000
## Generosity.Media                   0.2740417 0.14434118 0.2171702
## Perceptions of corruption.MD       0.2885000 0.05700000 0.0890000
## Perceptions of corruption.Media    0.2740417 0.06875294 0.1013617

C. Graficamos

# Plot the 3-cluster solution: countries are drawn as text labels and each
# cluster is outlined with its convex hull.
fviz_cluster(
  list(data = g.dist, cluster = feliz$clusterPT1),
  geom = "text",
  ellipse.type = "convex"
)

5 CLUSTERS

B. Proponer cantidad de clusters:

# Partition the countries into 5 clusters with PAM, using the precomputed
# dissimilarities in g.dist. cluster.only = FALSE keeps the full pam object
# instead of only the vector of labels.
pam.resultado2 <- pam(g.dist, 5, cluster.only = FALSE)
# Store each country's cluster label. The component name `clustering` is
# spelled out in full so the lookup does not depend on `$` partial matching.
feliz$clusterPT2 <- pam.resultado2$clustering

C. Exploramos resultados

library(plyr)  # provides each(), which applies several summaries at once

# Median (MD) and mean (Media) of columns 3 to 8 within each clusterPT2 group.
# Note that the left-hand side of the formula is built with as.matrix(cbind()).
agg2 <- aggregate(
  as.matrix(cbind(feliz[, 3:8])) ~ clusterPT2,
  data = feliz,
  FUN = plyr::each(MD = median, Media = mean)
)
# Convert to a data frame and transpose, so each cluster becomes a column.
tablaResumen2 <- t(as.data.frame(agg2))
tablaResumen2

##                                       [,1]       [,2]      [,3]       [,4]
## clusterPT2                         1.00000 2.00000000 3.0000000 4.00000000
## GDP per capita.MD                  1.38150 1.22100000 1.0430000 0.79700000
## GDP per capita.Media               1.38070 1.20222857 1.0302571 0.77126923
## Social support.MD                  1.52400 1.43100000 1.3280000 1.23400000
## Social support.Media               1.50545 1.40125714 1.2663714 1.20323077
## Healthy life expectancy.MD         1.01050 0.88100000 0.8080000 0.72050000
## Healthy life expectancy.Media      1.00250 0.89665714 0.7903143 0.70642308
## Freedom to make life choices.MD    0.56800 0.47300000 0.2640000 0.49450000
## Freedom to make life choices.Media 0.55595 0.47145714 0.2623143 0.48280769
## Generosity.MD                      0.27100 0.12700000 0.1030000 0.19200000
## Generosity.Media                   0.28310 0.15362857 0.1238286 0.20838462
## Perceptions of corruption.MD       0.30300 0.07300000 0.0550000 0.08250000
## Perceptions of corruption.Media    0.29810 0.08117143 0.0590000 0.09276923
##                                        [,5]
## clusterPT2                         5.000000
## GDP per capita.MD                  0.350000
## GDP per capita.Media               0.373725
## Social support.MD                  0.885500
## Social support.Media               0.847625
## Healthy life expectancy.MD         0.418000
## Healthy life expectancy.Media      0.392425
## Freedom to make life choices.MD    0.329500
## Freedom to make life choices.Media 0.298775
## Generosity.MD                      0.204500
## Generosity.Media                   0.206550
## Perceptions of corruption.MD       0.087500
## Perceptions of corruption.Media    0.097625

D. Graficamos

# Plot the 5-cluster solution: countries are drawn as text labels and each
# cluster is outlined with its convex hull.
fviz_cluster(
  list(data = g.dist, cluster = feliz$clusterPT2),
  geom = "text",
  ellipse.type = "convex"
)

E. Comparamos

Usemos una tabla de contingencia para verificar la asignación:

# Cross-tabulate the 3-cluster labels (rows) against the 5-cluster labels
# (columns).
table(clusterPT1 = feliz$clusterPT1, clusterPT2 = feliz$clusterPT2)

##           clusterPT2
## clusterPT1  1  2  3  4  5
##          1 20  4  0  0  0
##          2  0 31 35 19  0
##          3  0  0  0  7 40

# Names of the countries assigned to cluster 1 in both partitions.
with(feliz, `Country or region`[clusterPT1 == 1 & clusterPT2 == 1])

##  [1] "Finland"              "Denmark"              "Norway"              
##  [4] "Iceland"              "Netherlands"          "Switzerland"         
##  [7] "Sweden"               "New Zealand"          "Canada"              
## [10] "Austria"              "Australia"            "Luxembourg"          
## [13] "United Kingdom"       "Ireland"              "Germany"             
## [16] "United Arab Emirates" "Malta"                "Singapore"           
## [19] "Uzbekistan"           "Hong Kong"

EJERCICIO

1. Scraping

2. Limpieza

3. Análisis de conglomerados por partición

Primero calculamos distancias

3 CLUSTERS

A. Proponer cantidad de clusters:

B. Exploramos resultados

C. Graficamos

5 CLUSTERS

B. Proponer cantidad de clusters:

C. Exploramos resultados

D. Graficamos

E. Comparamos