library(readr)
library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 3.6.3

library(factoextra)

## Warning: package 'factoextra' was built under R version 3.6.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.6.3

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Efectue un ACP usando la funcion PCA(….) del paquete FactoMineR y de una interpretacion siguiendo los siguientes pasos:

1) Cargue la tabla de datos y ejecute un str(…), summary(…) y un dim(…),

verifque la correcta lectura de los datos.

datos <- read.csv("Datos_tarea/Vehiculos.csv",
header = T,
sep = ",",
dec = ".",
)

str(datos)

## 'data.frame':    100 obs. of  11 variables:
##  $ car.type  : Factor w/ 100 levels "alfa-romero giulia",..: 1 2 4 3 5 6 9 7 8 14 ...
##  $ fueltype  : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
##  $ doornumber: Factor w/ 2 levels "four","two": 2 2 2 1 2 1 1 1 2 2 ...
##  $ carlength : num  169 169 177 193 177 ...
##  $ carheight : num  48.8 48.8 53.1 55.7 54.3 54.3 55.7 55.7 53.7 52 ...
##  $ enginesize: int  130 130 136 136 108 164 164 209 209 90 ...
##  $ boreratio : num  3.47 3.47 3.19 3.19 3.5 3.31 3.31 3.62 3.62 3.03 ...
##  $ horsepower: int  111 111 110 110 101 121 121 182 182 70 ...
##  $ citympg   : int  21 21 19 19 23 21 20 16 16 38 ...
##  $ highwaympg: int  27 27 25 25 29 28 25 22 22 43 ...
##  $ price     : num  13495 16500 15250 18920 16430 ...

summary(datos)

##                 car.type    fueltype  doornumber   carlength    
##  alfa-romero giulia : 1   diesel: 6   four:51    Min.   :144.6  
##  alfa-romero stelvio: 1   gas   :94   two :49    1st Qu.:159.1  
##  audi 5000          : 1                          Median :171.2  
##  audi fox           : 1                          Mean   :172.3  
##  bmw 320i           : 1                          3rd Qu.:180.6  
##  bmw x3             : 1                          Max.   :208.1  
##  (Other)            :94                                         
##    carheight       enginesize      boreratio       horsepower   
##  Min.   :47.80   Min.   : 70.0   Min.   :2.910   Min.   : 52.0  
##  1st Qu.:51.60   1st Qu.: 92.0   1st Qu.:3.030   1st Qu.: 69.0  
##  Median :53.90   Median :108.5   Median :3.290   Median : 92.0  
##  Mean   :53.66   Mean   :126.7   Mean   :3.312   Mean   :102.1  
##  3rd Qu.:55.62   3rd Qu.:141.0   3rd Qu.:3.598   3rd Qu.:114.0  
##  Max.   :59.80   Max.   :326.0   Max.   :3.940   Max.   :262.0  
##                                                                 
##     citympg        highwaympg        price      
##  Min.   :13.00   Min.   :16.00   Min.   : 5118  
##  1st Qu.:19.00   1st Qu.:25.00   1st Qu.: 7295  
##  Median :25.50   Median :31.00   Median : 9418  
##  Mean   :25.63   Mean   :31.01   Mean   :12865  
##  3rd Qu.:31.00   3rd Qu.:37.00   3rd Qu.:15544  
##  Max.   :49.00   Max.   :54.00   Max.   :45400  
##

dim(datos)

## [1] 100  11

#2 Elimine de la tabla de datos las variables categoricas

datos$fueltype <- NULL
datos$car.type <- NULL
datos$doornumber <- NULL
str(datos)

## 'data.frame':    100 obs. of  8 variables:
##  $ carlength : num  169 169 177 193 177 ...
##  $ carheight : num  48.8 48.8 53.1 55.7 54.3 54.3 55.7 55.7 53.7 52 ...
##  $ enginesize: int  130 130 136 136 108 164 164 209 209 90 ...
##  $ boreratio : num  3.47 3.47 3.19 3.19 3.5 3.31 3.31 3.62 3.62 3.03 ...
##  $ horsepower: int  111 111 110 110 101 121 121 182 182 70 ...
##  $ citympg   : int  21 21 19 19 23 21 20 16 16 38 ...
##  $ highwaympg: int  27 27 25 25 29 28 25 22 22 43 ...
##  $ price     : num  13495 16500 15250 18920 16430 ...

#3 Grafique el plano principal con plot(…) y con fviz pca ind(…)

library(FactoMineR)
modelo <- PCA(datos, scale.unit = TRUE, ncp = 5, graph = FALSE)

plot(modelo)

library(factoextra)
fviz_pca_ind(X = modelo,
pointsize = 5,
pointshape = 21,
fill = "#E7B800",
repel = TRUE)

4 Grafique el circulo de correlacion con plot(…) y con fviz pca var(…).

library(FactoMineR)
plot(x = modelo,
axes = c(1, 2),
choix = "var",
col.var = "blue",
new.plot = TRUE)

fviz_pca_var(X = modelo,
col.var = "steelblue")

#5 Repita los ejercicios anteriores pero esta vez elimine individuos y variables mal representados (coseno cuadrado menor al 5 %). Cuales individuos y variables se eliminaron?

cos2.ind <- (modelo$ind$cos2[,1] + modelo$ind$cos2[,2])*100
round(cos2.ind, 2)

##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 72.56 76.74 43.80 63.73 46.89 78.65 81.96 94.83 88.80 91.44 90.01 94.80 97.82 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## 71.55 97.79 97.67 97.22 73.56 85.10 79.78 93.68 78.74 66.30 87.50 89.25 68.72 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
## 23.31 88.41 88.41 95.08 93.28 79.30 95.77 95.23 26.45 47.44 28.48 90.16 90.93 
##    40    41    42    43    44    45    46    47    48    49    50    51    52 
## 92.37 90.57 94.64 97.75 97.62 34.22 98.75 98.71 93.11 98.44 98.05 74.23 81.15 
##    53    54    55    56    57    58    59    60    61    62    63    64    65 
## 84.37 82.72 96.13 91.72 79.65 89.11 62.91 92.20 94.80 71.55 78.29 64.46 84.50 
##    66    67    68    69    70    71    72    73    74    75    76    77    78 
## 65.86 88.87 42.36 14.98  9.45  1.10 15.54 19.25  8.88  0.93 94.94 95.05 94.92 
##    79    80    81    82    83    84    85    86    87    88    89    90    91 
## 81.30 80.42 89.14 35.75 36.00 33.15 38.80 84.07 74.81 78.71 90.28 78.38 67.91 
##    92    93    94    95    96    97    98    99   100 
## 21.81 46.75 57.48 76.13 84.20 78.27 85.16 85.58 86.48

eliminados <- list(cos2.ind <= 5)
eliminados

## [[1]]
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    40    41    42    43    44    45    46    47    48    49    50    51    52 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    53    54    55    56    57    58    59    60    61    62    63    64    65 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    66    67    68    69    70    71    72    73    74    75    76    77    78 
## FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE 
##    79    80    81    82    83    84    85    86    87    88    89    90    91 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    92    93    94    95    96    97    98    99   100 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Se eliminan los casos 71 y 75

plot(x = modelo,
axes = c(1, 2),
choix = "ind",
col.ind = "green",
new.plot = TRUE,
select = "cos2 0.05")

cos2.var <- (modelo$var$cos2[,1] + modelo$var$cos2[,2])*100
round(cos2.var,2)

##  carlength  carheight enginesize  boreratio horsepower    citympg highwaympg 
##      89.32      94.54      82.04      63.07      91.77      83.24      85.86 
##      price 
##      81.37

Como se puede ver, no se elimina ninguna variable

6) En el circulo de correlacion interprete la correlacion entre las variables.

plot(modelo,
axes = c(1, 2),
choix = "var",
col.var = "brown",
new.plot = TRUE,
select = "cos2 0.1")

Podemos ver que hay una alta correlacion entre carlength y boreratio, price, enginesize y horsepower, así como entre citympg y highwaympg.

7) Interprete la formacion de los clusteres basado en la sobreposicion del circulo y el plano

plot(modelo,
axes = c(1, 2),
choix = "ind",
col.var = "red",
new.plot = TRUE,
select = "cos2 0.1")

b)Ejecute un Clustering Jerarquico con la agregacion del Salto Maximo y Ward. Guarde la tabla de datos en el archivo Vehiculos2.csv con el cluster al que pertenece cada individuo para el caso de la agregacion de Ward, esto con 4 clusteres

matriz <- dist(datos)
modelo.SM <- hclust(matriz, method = "complete")
plot(modelo.SM)

matriz <- dist(datos)
modelo.ward <- hclust(matriz, method = "ward.D")
plot(modelo.ward, hang = -1)
rect.hclust(modelo.ward, k = 4, border = "blue")

datos$grupos <- cutree(modelo.ward, k = 4)
datos

##     carlength carheight enginesize boreratio horsepower citympg highwaympg
## 1       168.8      48.8        130      3.47        111      21         27
## 2       168.8      48.8        130      3.47        111      21         27
## 3       177.3      53.1        136      3.19        110      19         25
## 4       192.7      55.7        136      3.19        110      19         25
## 5       176.8      54.3        108      3.50        101      23         29
## 6       176.8      54.3        164      3.31        121      21         28
## 7       189.0      55.7        164      3.31        121      20         25
## 8       189.0      55.7        209      3.62        182      16         22
## 9       193.8      53.7        209      3.62        182      16         22
## 10      155.9      52.0         90      3.03         70      38         43
## 11      158.8      52.0         90      3.03         70      38         43
## 12      157.3      50.8         90      2.97         68      37         41
## 13      157.3      50.8         90      2.97         68      31         38
## 14      157.3      50.8         98      3.03        102      24         30
## 15      157.3      50.6         90      2.97         68      31         38
## 16      157.3      50.6         90      2.97         68      31         38
## 17      157.3      50.6         90      2.97         68      31         38
## 18      157.3      50.6         98      3.03        102      24         30
## 19      174.6      59.8        122      3.34         88      24         30
## 20      144.6      50.8         92      2.91         58      49         54
## 21      144.6      50.8         92      2.91         76      31         38
## 22      163.4      54.5         92      2.91         76      30         34
## 23      157.1      58.3         92      2.92         76      30         34
## 24      167.5      53.3        110      3.15         86      27         33
## 25      167.5      53.3        110      3.15         86      27         33
## 26      175.4      54.1        110      3.15         86      27         33
## 27      170.7      53.5        111      3.31         78      24         29
## 28      155.9      52.0         90      3.03         70      38         43
## 29      155.9      52.0         90      3.03         70      38         43
## 30      199.6      52.8        258      3.63        176      15         19
## 31      191.7      47.8        326      3.54        262      13         17
## 32      159.1      54.1         91      3.03         68      30         31
## 33      159.1      54.1         91      3.03         68      31         38
## 34      159.1      54.1         91      3.03         68      31         38
## 35      169.0      49.6         70      3.33        101      17         23
## 36      169.0      49.6         80      3.33        135      16         23
## 37      177.8      53.7        122      3.39         84      26         32
## 38      202.6      56.5        234      3.46        155      16         18
## 39      180.3      50.8        234      3.46        155      16         18
## 40      208.1      56.7        308      3.80        184      14         16
## 41      199.2      55.4        304      3.80        184      14         16
## 42      157.3      50.8         92      2.97         68      37         41
## 43      157.3      50.8         92      2.97         68      31         38
## 44      157.3      50.8         92      2.97         68      31         38
## 45      172.4      51.6        122      3.35         88      25         32
## 46      165.3      54.5         97      3.15         69      31         37
## 47      165.3      54.5         97      3.15         69      31         37
## 48      170.2      53.5         97      3.15         69      31         37
## 49      165.3      54.5         97      3.15         69      31         37
## 50      162.4      53.3         97      3.15         69      31         37
## 51      173.4      54.7        120      3.33         97      27         34
## 52      181.7      55.1        181      3.43        152      17         22
## 53      184.6      56.1        181      3.43        152      17         22
## 54      184.6      55.1        181      3.43        152      19         25
## 55      170.7      49.7        181      3.43        160      19         25
## 56      170.7      49.7        181      3.43        200      17         23
## 57      186.7      56.7        120      3.46         97      19         24
## 58      198.9      58.7        120      3.46         97      19         24
## 59      186.7      56.7        152      3.70         95      28         33
## 60      198.9      58.7        152      3.70         95      25         25
## 61      157.3      50.8         90      2.97         68      37         41
## 62      157.3      50.8         98      3.03        102      24         30
## 63      173.2      50.2        156      3.59        145      19         24
## 64      168.9      50.2        151      3.94        143      19         27
## 65      168.9      51.6        194      3.74        207      17         25
## 66      181.5      55.2        132      3.46         90      23         31
## 67      186.6      56.1        121      3.54        110      21         28
## 68      156.9      53.7         97      3.62         69      31         36
## 69      157.9      53.7        108      3.62         73      26         31
## 70      172.0      52.5        108      3.62         82      28         33
## 71      172.0      52.5        108      3.62         94      26         32
## 72      172.0      54.3        108      3.62         82      24         25
## 73      172.0      54.3        108      3.62        111      24         29
## 74      173.5      53.0        108      3.62         82      28         32
## 75      173.5      53.0        108      3.62         94      25         31
## 76      158.7      54.5         92      3.05         62      35         39
## 77      158.7      54.5         92      3.05         62      31         38
## 78      158.7      54.5         92      3.05         62      31         38
## 79      169.7      59.1         92      3.05         62      27         32
## 80      169.7      59.1         92      3.05         62      27         32
## 81      168.7      52.6         98      3.19         70      29         34
## 82      168.7      52.6         98      3.24        112      26         29
## 83      168.7      52.6         98      3.24        112      26         29
## 84      176.2      52.0        146      3.62        116      24         30
## 85      176.2      52.0        146      3.62        116      24         30
## 86      175.6      54.9        110      3.27         73      30         33
## 87      187.8      54.1        161      3.27        156      19         24
## 88      171.7      55.7         97      3.01         52      37         46
## 89      171.7      55.7        109      3.19         85      27         34
## 90      171.7      55.7         97      3.01         52      37         46
## 91      171.7      55.7        109      3.19        100      26         32
## 92      159.3      55.6        109      3.19         90      24         29
## 93      180.2      55.1        136      3.19        110      19         24
## 94      180.2      55.1         97      3.01         68      33         38
## 95      188.8      56.2        141      3.78        114      23         28
## 96      188.8      57.5        141      3.78        114      23         28
## 97      188.8      56.2        141      3.78        114      24         28
## 98      188.8      57.5        141      3.78        114      24         28
## 99      188.8      56.2        130      3.62        162      17         22
## 100     188.8      57.5        130      3.62        162      17         22
##       price grupos
## 1   13495.0      1
## 2   16500.0      1
## 3   15250.0      1
## 4   18920.0      1
## 5   16430.0      1
## 6   21105.0      1
## 7   24565.0      1
## 8   30760.0      2
## 9   41315.0      2
## 10   6295.0      3
## 11   6575.0      3
## 12   5572.0      3
## 13   6377.0      3
## 14   7957.0      3
## 15   6229.0      3
## 16   6692.0      3
## 17   7609.0      3
## 18   8558.0      3
## 19   8921.0      4
## 20   6479.0      3
## 21   6855.0      3
## 22   7295.0      3
## 23   7295.0      3
## 24   7895.0      3
## 25   9095.0      4
## 26   8845.0      4
## 27   6785.0      3
## 28   8916.5      4
## 29   8916.5      4
## 30  32250.0      2
## 31  36000.0      2
## 32   5195.0      3
## 33   6095.0      3
## 34   6795.0      3
## 35  10945.0      4
## 36  15645.0      1
## 37  10595.0      4
## 38  34184.0      2
## 39  35056.0      2
## 40  40960.0      2
## 41  45400.0      2
## 42   5389.0      3
## 43   6189.0      3
## 44   6669.0      3
## 45   6989.0      3
## 46   5499.0      3
## 47   6849.0      3
## 48   7349.0      3
## 49   7299.0      3
## 50   8249.0      3
## 51   9549.0      4
## 52  13499.0      1
## 53  14399.0      1
## 54  13499.0      1
## 55  17199.0      1
## 56  19699.0      1
## 57  11900.0      4
## 58  12440.0      1
## 59  16900.0      1
## 60  17075.0      1
## 61   5572.0      3
## 62   7957.0      3
## 63  12764.0      1
## 64  22018.0      1
## 65  34028.0      2
## 66   9295.0      4
## 67  15510.0      1
## 68   5118.0      3
## 69   7053.0      3
## 70   7775.0      3
## 71   9960.0      4
## 72   9233.0      4
## 73  11259.0      4
## 74   7463.0      3
## 75  10198.0      4
## 76   5348.0      3
## 77   6338.0      3
## 78   6488.0      3
## 79   7898.0      3
## 80   8778.0      4
## 81   8058.0      3
## 82   9298.0      4
## 83   9538.0      4
## 84   9989.0      4
## 85  11199.0      4
## 86  10698.0      4
## 87  15750.0      1
## 88   7775.0      3
## 89   7975.0      3
## 90   7995.0      3
## 91   9995.0      4
## 92  11595.0      4
## 93  13295.0      1
## 94  13845.0      1
## 95  12940.0      1
## 96  13415.0      1
## 97  15985.0      1
## 98  16515.0      1
## 99  18420.0      1
## 100 18950.0      1

#c) Interprete los resultados del ejercicio anterior para el caso de agregacion de Ward usando graficos tipo radar, use 4 clusteres.

write.csv(datos,'vehiculos2.csv')

library(cluster)
centroide <- function(num.cluster, datos, clusters) {
ind <- (clusters == num.cluster)
return(colMeans(datos[ind,]))
}


centro.cluster1 <- centroide(1, datos, datos$grupos)
centro.cluster2 <- centroide(2, datos, datos$grupos)
centro.cluster3 <- centroide(3, datos, datos$grupos)
centro.cluster4 <- centroide(4, datos, datos$grupos)

centros <- rbind(centro.cluster1,
centro.cluster2,
centro.cluster3, centro.cluster4)
centros

##                 carlength carheight enginesize boreratio horsepower  citympg
## centro.cluster1  182.3464  54.35714  143.67857  3.488571  126.25000 20.67857
## centro.cluster2  192.5778  53.44444  252.88889  3.630000  187.44444 15.22222
## centro.cluster3  161.4220  53.00976   96.31707  3.103902   72.14634 31.09756
## centro.cluster4  171.5136  54.07727  110.27273  3.344091   92.31818 26.00000
##                 highwaympg     price grupos
## centro.cluster1   26.03571 16286.679      1
## centro.cluster2   19.22222 36661.444      2
## centro.cluster3   36.87805  6874.317      3
## centro.cluster4   31.22727  9941.727      4

centros <- as.data.frame(centros)
maximos <- apply(centros, 2, max)
minimos <- apply(centros, 2, min)
centros <- rbind(minimos, centros)
centros <- rbind(maximos, centros)
centros

##                 carlength carheight enginesize boreratio horsepower  citympg
## 1                192.5778  54.35714  252.88889  3.630000  187.44444 31.09756
## 11               161.4220  53.00976   96.31707  3.103902   72.14634 15.22222
## centro.cluster1  182.3464  54.35714  143.67857  3.488571  126.25000 20.67857
## centro.cluster2  192.5778  53.44444  252.88889  3.630000  187.44444 15.22222
## centro.cluster3  161.4220  53.00976   96.31707  3.103902   72.14634 31.09756
## centro.cluster4  171.5136  54.07727  110.27273  3.344091   92.31818 26.00000
##                 highwaympg     price grupos
## 1                 36.87805 36661.444      4
## 11                19.22222  6874.317      1
## centro.cluster1   26.03571 16286.679      1
## centro.cluster2   19.22222 36661.444      2
## centro.cluster3   36.87805  6874.317      3
## centro.cluster4   31.22727  9941.727      4

library(fmsb) 
colores <- c("blue","brown","green", "red")
radarchart(as.data.frame(centros),maxmin=TRUE,
axistype=4, axislabcol="slategray4",
centerzero=FALSE, seg=8,
cglcol="gray67", pcol=colores,
plty=1, plwd=5,
title="Comparación de clústeres")
legenda <-legend(1.5,1,
legend=c("Cluster 1",
"Cluster 2",
"Cluster 3","Cluster 4" ),
seg.len=-1.4,
title="Clústeres",
pch=21, bty="n" ,
lwd=3, y.intersp=1,
horiz=FALSE, col=colores)

d) Construya y grafique un clustering jerarquico sobre las componentes principales del ACP

res <- PCA(datos , scale.unit=TRUE, ncp=5, graph = FALSE)
res.hcpc <- HCPC(res, nb.clust = -1, consol = TRUE, min = 3, max = 3, graph = FALSE)
plot.HCPC(res.hcpc, choice="bar")

#2 a) Cargue la tabla de datos y ejecute un str(…), summary(…) y un dim(…), verifique la correcta lectura de los datos.

datos_beijing <- read_csv("Datos_tarea/DatosBeijing.csv")

## Parsed with column specification:
## cols(
##   ID = col_double(),
##   Anno = col_double(),
##   Mes = col_double(),
##   Dia = col_double(),
##   Hora = col_double(),
##   ConcetracionParticula_pm2.5 = col_double(),
##   PuntoRocio = col_double(),
##   Temperatura = col_double(),
##   Presion = col_double(),
##   DireccionViento = col_character(),
##   VelocidadViento = col_double(),
##   HorasNieve = col_double(),
##   HorasLluvia = col_double()
## )

str(datos_beijing)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 43824 obs. of  13 variables:
##  $ ID                         : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Anno                       : num  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Mes                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Dia                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Hora                       : num  0 1 2 3 4 5 6 7 8 9 ...
##  $ ConcetracionParticula_pm2.5: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PuntoRocio                 : num  -21 -21 -21 -21 -20 -19 -19 -19 -19 -20 ...
##  $ Temperatura                : num  -11 -12 -11 -14 -12 -10 -9 -9 -9 -8 ...
##  $ Presion                    : num  1021 1020 1019 1019 1018 ...
##  $ DireccionViento            : chr  "NW" "NW" "NW" "NW" ...
##  $ VelocidadViento            : num  1.79 4.92 6.71 9.84 12.97 ...
##  $ HorasNieve                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HorasLluvia                : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   Anno = col_double(),
##   ..   Mes = col_double(),
##   ..   Dia = col_double(),
##   ..   Hora = col_double(),
##   ..   ConcetracionParticula_pm2.5 = col_double(),
##   ..   PuntoRocio = col_double(),
##   ..   Temperatura = col_double(),
##   ..   Presion = col_double(),
##   ..   DireccionViento = col_character(),
##   ..   VelocidadViento = col_double(),
##   ..   HorasNieve = col_double(),
##   ..   HorasLluvia = col_double()
##   .. )

summary(datos_beijing)

##        ID             Anno           Mes              Dia       
##  Min.   :    1   Min.   :2010   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:10957   1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00  
##  Median :21913   Median :2012   Median : 7.000   Median :16.00  
##  Mean   :21913   Mean   :2012   Mean   : 6.524   Mean   :15.73  
##  3rd Qu.:32868   3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :43824   Max.   :2014   Max.   :12.000   Max.   :31.00  
##                                                                 
##       Hora       ConcetracionParticula_pm2.5   PuntoRocio       Temperatura    
##  Min.   : 0.00   Min.   :  0.00              Min.   :-40.000   Min.   :-19.00  
##  1st Qu.: 5.75   1st Qu.: 29.00              1st Qu.:-10.000   1st Qu.:  2.00  
##  Median :11.50   Median : 72.00              Median :  2.000   Median : 14.00  
##  Mean   :11.50   Mean   : 98.61              Mean   :  1.817   Mean   : 12.45  
##  3rd Qu.:17.25   3rd Qu.:137.00              3rd Qu.: 15.000   3rd Qu.: 23.00  
##  Max.   :23.00   Max.   :994.00              Max.   : 28.000   Max.   : 42.00  
##                  NA's   :2067                                                  
##     Presion     DireccionViento    VelocidadViento    HorasNieve      
##  Min.   : 991   Length:43824       Min.   :  0.45   Min.   : 0.00000  
##  1st Qu.:1008   Class :character   1st Qu.:  1.79   1st Qu.: 0.00000  
##  Median :1016   Mode  :character   Median :  5.37   Median : 0.00000  
##  Mean   :1016                      Mean   : 23.89   Mean   : 0.05273  
##  3rd Qu.:1025                      3rd Qu.: 21.91   3rd Qu.: 0.00000  
##  Max.   :1046                      Max.   :585.60   Max.   :27.00000  
##                                                                       
##   HorasLluvia     
##  Min.   : 0.0000  
##  1st Qu.: 0.0000  
##  Median : 0.0000  
##  Mean   : 0.1949  
##  3rd Qu.: 0.0000  
##  Max.   :36.0000  
##

dim(datos_beijing)

## [1] 43824    13

b) Elimine las filas con NA usando el comando na.omit(…). Cuantas filas eliminaron?

data <- na.omit(datos_beijing)

41757-43824=2067 filas eliminadas

#c) Eimine de la tabla de datos la variable DireccionViento. Por que se debe eliminar? Que otra aternativa se tiene en lugar de eliminarla?

data$DireccionViento <- NULL

Se debe eliminar porque es categórica

#d) Que pasa si ejecutamos un clustering jerarquico con hclust(…). Por que sucede esto?

No se puede ejecutar (error) por el gran tamaño del archivo

#e) Ejecute un k-medias con k = 3, iter.max=1000 y nstart=50.

grupos <- kmeans(datos, centers= 3, iter.max = 1000, nstart = 50)
grupos$cluster

##   [1] 2 2 2 2 2 2 2 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 2 1
##  [38] 3 3 3 3 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 2 2 1 1 2 2 3 1 2 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2

#f) De una interpretacion de los resultados usando un grafco tipo radar

centros<-grupos$centers
rownames(centros)<-c("Cluster 1","Cluster 2","Cluster 3")
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros

##           carlength carheight enginesize boreratio horsepower  citympg
## 1          192.5778  54.35714   252.8889  3.630000  187.44444 29.31746
## 11         164.9460  53.38254   101.1905  3.187778   79.19048 15.22222
## Cluster 1  164.9460  53.38254   101.1905  3.187778   79.19048 29.31746
## Cluster 2  182.3464  54.35714   143.6786  3.488571  126.25000 20.67857
## Cluster 3  192.5778  53.44444   252.8889  3.630000  187.44444 15.22222
##           highwaympg     price   grupos
## 1           34.90476 36661.444 3.349206
## 11          19.22222  7945.476 1.000000
## Cluster 1   34.90476  7945.476 3.349206
## Cluster 2   26.03571 16286.679 1.000000
## Cluster 3   19.22222 36661.444 2.000000

color <- c("blue", "red", "brown")
radarchart(as.data.frame(centros),
maxmin=TRUE,axistype=4,
axislabcol="slategray4",
centerzero=FALSE,seg=8,
cglcol="gray67",
pcol=color,plty=1,plwd=5,
title="Comp de Clusteres")
legenda <-legend(1.5,1,
legend=c("Cluster 1",
"Cluster 2",
"Cluster 3"),
seg.len=-1.4,title="Clusteres",
pch=21,bty="n" ,lwd=3,
y.intersp=1,
horiz=FALSE,col=color)

g) Construya el Codo de Jambu usando iter.max=100 y nstart=5, cuantos conglomerados (clusteres) sugiere el codo? Utilice tambien el metodo silhouette de la funcion fviz nbclust, cuantos conglomerados (clusteres) sugiere este metodo?

InercIa<-rep(0,9)
for(k in 1:9) {
grupos<-kmeans(data, centers=k,iter.max = 100, nstart=5)
InercIa[k]<-grupos$tot.withinss
}

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)

plot(InercIa,col="blue",type="b")

#3 a) Ejecute las siguientes instrucciones para calcular y graficarla matriz de correlaciones yluego interprete todas las correlaciones:

library(corrplot)

## corrplot 0.84 loaded

datos <- read.table("Datos_tarea/ExpectativaVida.csv",sep = ",",dec='.',header=TRUE)
datos2 <-datos[,-c(1)] # Se quita la variable categorica Status.
correlaciones <- cor(datos2)
corrplot(correlaciones)

Hay correlaciones importantes entre schooling y income composition of resources, entre estas dos y life expectancy

#b Realice un ACP usando el paquete FactoMineR tomando en cuenta solo las variables numericas, grafiqe el plano principal y el circulo de correlaciones de manera que se le eliminen los individuos y variables con menos de 5% de representacion, como se muestra en la figra siguiente. Basado en la sobreposicicon del crculo de correlaciones y el plano principal interprete el Cluster 1 y Cluster 2 segun se indican en el siguiente grafico

datos$Status<- NULL
str(datos)

## 'data.frame':    131 obs. of  7 variables:
##  $ Life.expectancy                : num  59.9 77.5 75.4 51.7 76.2 74.6 82.7 81.4 72.5 71.4 ...
##  $ Adult.Mortality                : int  271 8 11 348 118 12 6 66 119 132 ...
##  $ Alcohol                        : num  0.01 4.51 0.01 8.33 7.93 ...
##  $ BMI                            : num  18.6 57.2 58.4 22.7 62.2 54.1 66.1 57.1 51.5 17.7 ...
##  $ HIV.AIDS                       : num  0.1 0.1 0.1 2 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ Income.composition.of.resources: num  0.476 0.761 0.741 0.527 0.825 0.739 0.936 0.892 0.752 0.57 ...
##  $ Schooling                      : num  10 14.2 14.4 11.4 17.3 12.7 20.4 15.9 12.2 10 ...

modelo <- PCA(datos, scale.unit = TRUE, ncp = 4, graph = FALSE)
cos2.var <- (modelo$var$cos2[,1] + modelo$var$cos2[,2])*100
round(cos2.var,2)

##                 Life.expectancy                 Adult.Mortality 
##                           89.62                           82.46 
##                         Alcohol                             BMI 
##                           68.89                           50.45 
##                        HIV.AIDS Income.composition.of.resources 
##                           79.20                           92.58 
##                       Schooling 
##                           83.99

plot(modelo,
axes = c(1, 2),
choix = "var",
col.var = "red",
new.plot = TRUE,
select = "cos2 0.05")

plot(modelo,
axes = c(1, 2),
choix = "ind",
col.var = "red",
new.plot = TRUE,
select = "cos2 0.05")

c) Interprete en el plano principal y circulo de correlaciones dimensiones 1-3 los individuos(paises) que quedaron mal representados en el plano principal y circulo de correlaciones dimensiones 1-2.

plot(modelo,
axes = c(1, 3),
choix = "ind",
col.var = "brown",
new.plot = TRUE,
select = "cos2 0.05")

plot(modelo,
axes = c(1, 3),
choix = "var",
col.var = "blue",
new.plot = TRUE,
select = "cos2 0.05")

matriz <- (dist(datos))
res <- cmdscale(d = matriz, eig = TRUE, k = 2)
head(res$points, 20)

##                              [,1]       [,2]
## Afghanistan             112.46656  13.740334
## Albania                -153.36690  -4.468283
## Algeria                -150.30445  -5.359244
## Angola                  189.16856   3.538990
## Argentina               -44.30762 -19.045071
## Armenia                -148.94455  -1.271414
## Australia              -156.50183 -14.389393
## Austria                 -95.99096 -10.435648
## Azerbaijan              -42.10665  -7.339436
## Bangladesh              -26.45939  25.082985
## Belarus                  36.53840 -25.267898
## Belgium                 -86.99548 -18.501016
## Belize                   19.26632  33.232935
## Benin                    93.06269   8.864273
## Bhutan                   56.75077  12.194140
## Bosnia and Herzegovina  -72.60365  -9.473217
## Botswana                107.70429  -5.203226
## Brazil                  -17.78351 -14.150696
## Bulgaria                -24.52427 -23.505631
## Burkina Faso            109.54004  14.002722

plot(modelo,
axes = c(1, 2),
choix = "ind",
col.var = "blue",
new.plot = TRUE,
select = "cos2 0.05")

d) Realice un ACP usando todas las variables mediante un Escalamiento Multidimensional. Compare el plano principal respecto al plano principal del ACP Es el mismo plano? R/No lo es

x <- res$points[,1]
y <- res$points[,2]
plot(x, y, xlab = "Comp 1",
ylab = "Comp 2",
main = "MDS",
pch = 19)
text(x, y,
labels = row.names(datos),
cex = 0.85 , pos = 1)

#4 Para la tabla de datos eurodist que viene con el paquete datasets el cual contiene las distancias entre algunas de las ciudades mas importantes de Europa ejecute un Escalamiento Multidimensional y luego compare el resultado con el mapa de Europa.

str(eurodist)

##  'dist' num [1:210] 3313 2963 3175 3339 2762 ...
##  - attr(*, "Size")= num 21
##  - attr(*, "Labels")= chr [1:21] "Athens" "Barcelona" "Brussels" "Calais" ...

print(eurodist)

##                 Athens Barcelona Brussels Calais Cherbourg Cologne Copenhagen
## Barcelona         3313                                                       
## Brussels          2963      1318                                             
## Calais            3175      1326      204                                    
## Cherbourg         3339      1294      583    460                             
## Cologne           2762      1498      206    409       785                   
## Copenhagen        3276      2218      966   1136      1545     760           
## Geneva            2610       803      677    747       853    1662       1418
## Gibraltar         4485      1172     2256   2224      2047    2436       3196
## Hamburg           2977      2018      597    714      1115     460        460
## Hook of Holland   3030      1490      172    330       731     269        269
## Lisbon            4532      1305     2084   2052      1827    2290       2971
## Lyons             2753       645      690    739       789     714       1458
## Madrid            3949       636     1558   1550      1347    1764       2498
## Marseilles        2865       521     1011   1059      1101    1035       1778
## Milan             2282      1014      925   1077      1209     911       1537
## Munich            2179      1365      747    977      1160     583       1104
## Paris             3000      1033      285    280       340     465       1176
## Rome               817      1460     1511   1662      1794    1497       2050
## Stockholm         3927      2868     1616   1786      2196    1403        650
## Vienna            1991      1802     1175   1381      1588     937       1455
##                 Geneva Gibraltar Hamburg Hook of Holland Lisbon Lyons Madrid
## Barcelona                                                                   
## Brussels                                                                    
## Calais                                                                      
## Cherbourg                                                                   
## Cologne                                                                     
## Copenhagen                                                                  
## Geneva                                                                      
## Gibraltar         1975                                                      
## Hamburg           1118      2897                                            
## Hook of Holland    895      2428     550                                    
## Lisbon            1936       676    2671            2280                    
## Lyons              158      1817    1159             863   1178             
## Madrid            1439       698    2198            1730    668  1281       
## Marseilles         425      1693    1479            1183   1762   320   1157
## Milan              328      2185    1238            1098   2250   328   1724
## Munich             591      2565     805             851   2507   724   2010
## Paris              513      1971     877             457   1799   471   1273
## Rome               995      2631    1751            1683   2700  1048   2097
## Stockholm         2068      3886     949            1500   3231  2108   3188
## Vienna            1019      2974    1155            1205   2937  1157   2409
##                 Marseilles Milan Munich Paris Rome Stockholm
## Barcelona                                                   
## Brussels                                                    
## Calais                                                      
## Cherbourg                                                   
## Cologne                                                     
## Copenhagen                                                  
## Geneva                                                      
## Gibraltar                                                   
## Hamburg                                                     
## Hook of Holland                                             
## Lisbon                                                      
## Lyons                                                       
## Madrid                                                      
## Marseilles                                                  
## Milan                  618                                  
## Munich                1109   331                            
## Paris                  792   856    821                     
## Rome                  1011   586    946  1476               
## Stockholm             2428  2187   1754  1827 2707          
## Vienna                1363   898    428  1249 1209      2105

library(datasets)
res <- cmdscale(eurodist,eig=TRUE, k=2)
# k es el número de componentes a usar
# Plotear la solución
x <- res$points[,1]
y <- res$points[,2]

euro.matriz <- as.matrix(eurodist)


x <- x
y <- -y
plot(x,y,
xlab="Comp 1",
ylab="Comp 2",
main="MDS",
pch = 19)
text(x, y,
labels = row.names(euro.matriz),
cex=0.75,
pos = 1)

knitr::include_graphics('Mapa_europa.png')

Tarea1_luisp_lopez