library(readr)
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.6.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
verifque la correcta lectura de los datos.
datos <- read.csv("Datos_tarea/Vehiculos.csv",
header = T,
sep = ",",
dec = ".",
)
str(datos)
## 'data.frame': 100 obs. of 11 variables:
## $ car.type : Factor w/ 100 levels "alfa-romero giulia",..: 1 2 4 3 5 6 9 7 8 14 ...
## $ fueltype : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
## $ doornumber: Factor w/ 2 levels "four","two": 2 2 2 1 2 1 1 1 2 2 ...
## $ carlength : num 169 169 177 193 177 ...
## $ carheight : num 48.8 48.8 53.1 55.7 54.3 54.3 55.7 55.7 53.7 52 ...
## $ enginesize: int 130 130 136 136 108 164 164 209 209 90 ...
## $ boreratio : num 3.47 3.47 3.19 3.19 3.5 3.31 3.31 3.62 3.62 3.03 ...
## $ horsepower: int 111 111 110 110 101 121 121 182 182 70 ...
## $ citympg : int 21 21 19 19 23 21 20 16 16 38 ...
## $ highwaympg: int 27 27 25 25 29 28 25 22 22 43 ...
## $ price : num 13495 16500 15250 18920 16430 ...
summary(datos)
## car.type fueltype doornumber carlength
## alfa-romero giulia : 1 diesel: 6 four:51 Min. :144.6
## alfa-romero stelvio: 1 gas :94 two :49 1st Qu.:159.1
## audi 5000 : 1 Median :171.2
## audi fox : 1 Mean :172.3
## bmw 320i : 1 3rd Qu.:180.6
## bmw x3 : 1 Max. :208.1
## (Other) :94
## carheight enginesize boreratio horsepower
## Min. :47.80 Min. : 70.0 Min. :2.910 Min. : 52.0
## 1st Qu.:51.60 1st Qu.: 92.0 1st Qu.:3.030 1st Qu.: 69.0
## Median :53.90 Median :108.5 Median :3.290 Median : 92.0
## Mean :53.66 Mean :126.7 Mean :3.312 Mean :102.1
## 3rd Qu.:55.62 3rd Qu.:141.0 3rd Qu.:3.598 3rd Qu.:114.0
## Max. :59.80 Max. :326.0 Max. :3.940 Max. :262.0
##
## citympg highwaympg price
## Min. :13.00 Min. :16.00 Min. : 5118
## 1st Qu.:19.00 1st Qu.:25.00 1st Qu.: 7295
## Median :25.50 Median :31.00 Median : 9418
## Mean :25.63 Mean :31.01 Mean :12865
## 3rd Qu.:31.00 3rd Qu.:37.00 3rd Qu.:15544
## Max. :49.00 Max. :54.00 Max. :45400
##
dim(datos)
## [1] 100 11
#2 Elimine de la tabla de datos las variables categoricas
datos$fueltype <- NULL
datos$car.type <- NULL
datos$doornumber <- NULL
str(datos)
## 'data.frame': 100 obs. of 8 variables:
## $ carlength : num 169 169 177 193 177 ...
## $ carheight : num 48.8 48.8 53.1 55.7 54.3 54.3 55.7 55.7 53.7 52 ...
## $ enginesize: int 130 130 136 136 108 164 164 209 209 90 ...
## $ boreratio : num 3.47 3.47 3.19 3.19 3.5 3.31 3.31 3.62 3.62 3.03 ...
## $ horsepower: int 111 111 110 110 101 121 121 182 182 70 ...
## $ citympg : int 21 21 19 19 23 21 20 16 16 38 ...
## $ highwaympg: int 27 27 25 25 29 28 25 22 22 43 ...
## $ price : num 13495 16500 15250 18920 16430 ...
#3 Grafique el plano principal con plot(…) y con fviz pca ind(…)
library(FactoMineR)
modelo <- PCA(datos, scale.unit = TRUE, ncp = 5, graph = FALSE)
plot(modelo)
library(factoextra)
fviz_pca_ind(X = modelo,
pointsize = 5,
pointshape = 21,
fill = "#E7B800",
repel = TRUE)
library(FactoMineR)
plot(x = modelo,
axes = c(1, 2),
choix = "var",
col.var = "blue",
new.plot = TRUE)
fviz_pca_var(X = modelo,
col.var = "steelblue")
#5 Repita los ejercicios anteriores pero esta vez elimine individuos y variables mal representados (coseno cuadrado menor al 5 %). Cuales individuos y variables se eliminaron?
cos2.ind <- (modelo$ind$cos2[,1] + modelo$ind$cos2[,2])*100
round(cos2.ind, 2)
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 72.56 76.74 43.80 63.73 46.89 78.65 81.96 94.83 88.80 91.44 90.01 94.80 97.82
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 71.55 97.79 97.67 97.22 73.56 85.10 79.78 93.68 78.74 66.30 87.50 89.25 68.72
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## 23.31 88.41 88.41 95.08 93.28 79.30 95.77 95.23 26.45 47.44 28.48 90.16 90.93
## 40 41 42 43 44 45 46 47 48 49 50 51 52
## 92.37 90.57 94.64 97.75 97.62 34.22 98.75 98.71 93.11 98.44 98.05 74.23 81.15
## 53 54 55 56 57 58 59 60 61 62 63 64 65
## 84.37 82.72 96.13 91.72 79.65 89.11 62.91 92.20 94.80 71.55 78.29 64.46 84.50
## 66 67 68 69 70 71 72 73 74 75 76 77 78
## 65.86 88.87 42.36 14.98 9.45 1.10 15.54 19.25 8.88 0.93 94.94 95.05 94.92
## 79 80 81 82 83 84 85 86 87 88 89 90 91
## 81.30 80.42 89.14 35.75 36.00 33.15 38.80 84.07 74.81 78.71 90.28 78.38 67.91
## 92 93 94 95 96 97 98 99 100
## 21.81 46.75 57.48 76.13 84.20 78.27 85.16 85.58 86.48
eliminados <- list(cos2.ind <= 5)
eliminados
## [[1]]
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 40 41 42 43 44 45 46 47 48 49 50 51 52
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 53 54 55 56 57 58 59 60 61 62 63 64 65
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 66 67 68 69 70 71 72 73 74 75 76 77 78
## FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 79 80 81 82 83 84 85 86 87 88 89 90 91
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 92 93 94 95 96 97 98 99 100
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Se eliminan los casos 71 y 75
plot(x = modelo,
axes = c(1, 2),
choix = "ind",
col.ind = "green",
new.plot = TRUE,
select = "cos2 0.05")
cos2.var <- (modelo$var$cos2[,1] + modelo$var$cos2[,2])*100
round(cos2.var,2)
## carlength carheight enginesize boreratio horsepower citympg highwaympg
## 89.32 94.54 82.04 63.07 91.77 83.24 85.86
## price
## 81.37
Como se puede ver, no se elimina ninguna variable
plot(modelo,
axes = c(1, 2),
choix = "var",
col.var = "brown",
new.plot = TRUE,
select = "cos2 0.1")
Podemos ver que hay una alta correlacion entre carlength y boreratio, price, enginesize y horsepower, asà como entre citympg y highwaympg.
plot(modelo,
axes = c(1, 2),
choix = "ind",
col.var = "red",
new.plot = TRUE,
select = "cos2 0.1")
matriz <- dist(datos)
modelo.SM <- hclust(matriz, method = "complete")
plot(modelo.SM)
matriz <- dist(datos)
modelo.ward <- hclust(matriz, method = "ward.D")
plot(modelo.ward, hang = -1)
rect.hclust(modelo.ward, k = 4, border = "blue")
datos$grupos <- cutree(modelo.ward, k = 4)
datos
## carlength carheight enginesize boreratio horsepower citympg highwaympg
## 1 168.8 48.8 130 3.47 111 21 27
## 2 168.8 48.8 130 3.47 111 21 27
## 3 177.3 53.1 136 3.19 110 19 25
## 4 192.7 55.7 136 3.19 110 19 25
## 5 176.8 54.3 108 3.50 101 23 29
## 6 176.8 54.3 164 3.31 121 21 28
## 7 189.0 55.7 164 3.31 121 20 25
## 8 189.0 55.7 209 3.62 182 16 22
## 9 193.8 53.7 209 3.62 182 16 22
## 10 155.9 52.0 90 3.03 70 38 43
## 11 158.8 52.0 90 3.03 70 38 43
## 12 157.3 50.8 90 2.97 68 37 41
## 13 157.3 50.8 90 2.97 68 31 38
## 14 157.3 50.8 98 3.03 102 24 30
## 15 157.3 50.6 90 2.97 68 31 38
## 16 157.3 50.6 90 2.97 68 31 38
## 17 157.3 50.6 90 2.97 68 31 38
## 18 157.3 50.6 98 3.03 102 24 30
## 19 174.6 59.8 122 3.34 88 24 30
## 20 144.6 50.8 92 2.91 58 49 54
## 21 144.6 50.8 92 2.91 76 31 38
## 22 163.4 54.5 92 2.91 76 30 34
## 23 157.1 58.3 92 2.92 76 30 34
## 24 167.5 53.3 110 3.15 86 27 33
## 25 167.5 53.3 110 3.15 86 27 33
## 26 175.4 54.1 110 3.15 86 27 33
## 27 170.7 53.5 111 3.31 78 24 29
## 28 155.9 52.0 90 3.03 70 38 43
## 29 155.9 52.0 90 3.03 70 38 43
## 30 199.6 52.8 258 3.63 176 15 19
## 31 191.7 47.8 326 3.54 262 13 17
## 32 159.1 54.1 91 3.03 68 30 31
## 33 159.1 54.1 91 3.03 68 31 38
## 34 159.1 54.1 91 3.03 68 31 38
## 35 169.0 49.6 70 3.33 101 17 23
## 36 169.0 49.6 80 3.33 135 16 23
## 37 177.8 53.7 122 3.39 84 26 32
## 38 202.6 56.5 234 3.46 155 16 18
## 39 180.3 50.8 234 3.46 155 16 18
## 40 208.1 56.7 308 3.80 184 14 16
## 41 199.2 55.4 304 3.80 184 14 16
## 42 157.3 50.8 92 2.97 68 37 41
## 43 157.3 50.8 92 2.97 68 31 38
## 44 157.3 50.8 92 2.97 68 31 38
## 45 172.4 51.6 122 3.35 88 25 32
## 46 165.3 54.5 97 3.15 69 31 37
## 47 165.3 54.5 97 3.15 69 31 37
## 48 170.2 53.5 97 3.15 69 31 37
## 49 165.3 54.5 97 3.15 69 31 37
## 50 162.4 53.3 97 3.15 69 31 37
## 51 173.4 54.7 120 3.33 97 27 34
## 52 181.7 55.1 181 3.43 152 17 22
## 53 184.6 56.1 181 3.43 152 17 22
## 54 184.6 55.1 181 3.43 152 19 25
## 55 170.7 49.7 181 3.43 160 19 25
## 56 170.7 49.7 181 3.43 200 17 23
## 57 186.7 56.7 120 3.46 97 19 24
## 58 198.9 58.7 120 3.46 97 19 24
## 59 186.7 56.7 152 3.70 95 28 33
## 60 198.9 58.7 152 3.70 95 25 25
## 61 157.3 50.8 90 2.97 68 37 41
## 62 157.3 50.8 98 3.03 102 24 30
## 63 173.2 50.2 156 3.59 145 19 24
## 64 168.9 50.2 151 3.94 143 19 27
## 65 168.9 51.6 194 3.74 207 17 25
## 66 181.5 55.2 132 3.46 90 23 31
## 67 186.6 56.1 121 3.54 110 21 28
## 68 156.9 53.7 97 3.62 69 31 36
## 69 157.9 53.7 108 3.62 73 26 31
## 70 172.0 52.5 108 3.62 82 28 33
## 71 172.0 52.5 108 3.62 94 26 32
## 72 172.0 54.3 108 3.62 82 24 25
## 73 172.0 54.3 108 3.62 111 24 29
## 74 173.5 53.0 108 3.62 82 28 32
## 75 173.5 53.0 108 3.62 94 25 31
## 76 158.7 54.5 92 3.05 62 35 39
## 77 158.7 54.5 92 3.05 62 31 38
## 78 158.7 54.5 92 3.05 62 31 38
## 79 169.7 59.1 92 3.05 62 27 32
## 80 169.7 59.1 92 3.05 62 27 32
## 81 168.7 52.6 98 3.19 70 29 34
## 82 168.7 52.6 98 3.24 112 26 29
## 83 168.7 52.6 98 3.24 112 26 29
## 84 176.2 52.0 146 3.62 116 24 30
## 85 176.2 52.0 146 3.62 116 24 30
## 86 175.6 54.9 110 3.27 73 30 33
## 87 187.8 54.1 161 3.27 156 19 24
## 88 171.7 55.7 97 3.01 52 37 46
## 89 171.7 55.7 109 3.19 85 27 34
## 90 171.7 55.7 97 3.01 52 37 46
## 91 171.7 55.7 109 3.19 100 26 32
## 92 159.3 55.6 109 3.19 90 24 29
## 93 180.2 55.1 136 3.19 110 19 24
## 94 180.2 55.1 97 3.01 68 33 38
## 95 188.8 56.2 141 3.78 114 23 28
## 96 188.8 57.5 141 3.78 114 23 28
## 97 188.8 56.2 141 3.78 114 24 28
## 98 188.8 57.5 141 3.78 114 24 28
## 99 188.8 56.2 130 3.62 162 17 22
## 100 188.8 57.5 130 3.62 162 17 22
## price grupos
## 1 13495.0 1
## 2 16500.0 1
## 3 15250.0 1
## 4 18920.0 1
## 5 16430.0 1
## 6 21105.0 1
## 7 24565.0 1
## 8 30760.0 2
## 9 41315.0 2
## 10 6295.0 3
## 11 6575.0 3
## 12 5572.0 3
## 13 6377.0 3
## 14 7957.0 3
## 15 6229.0 3
## 16 6692.0 3
## 17 7609.0 3
## 18 8558.0 3
## 19 8921.0 4
## 20 6479.0 3
## 21 6855.0 3
## 22 7295.0 3
## 23 7295.0 3
## 24 7895.0 3
## 25 9095.0 4
## 26 8845.0 4
## 27 6785.0 3
## 28 8916.5 4
## 29 8916.5 4
## 30 32250.0 2
## 31 36000.0 2
## 32 5195.0 3
## 33 6095.0 3
## 34 6795.0 3
## 35 10945.0 4
## 36 15645.0 1
## 37 10595.0 4
## 38 34184.0 2
## 39 35056.0 2
## 40 40960.0 2
## 41 45400.0 2
## 42 5389.0 3
## 43 6189.0 3
## 44 6669.0 3
## 45 6989.0 3
## 46 5499.0 3
## 47 6849.0 3
## 48 7349.0 3
## 49 7299.0 3
## 50 8249.0 3
## 51 9549.0 4
## 52 13499.0 1
## 53 14399.0 1
## 54 13499.0 1
## 55 17199.0 1
## 56 19699.0 1
## 57 11900.0 4
## 58 12440.0 1
## 59 16900.0 1
## 60 17075.0 1
## 61 5572.0 3
## 62 7957.0 3
## 63 12764.0 1
## 64 22018.0 1
## 65 34028.0 2
## 66 9295.0 4
## 67 15510.0 1
## 68 5118.0 3
## 69 7053.0 3
## 70 7775.0 3
## 71 9960.0 4
## 72 9233.0 4
## 73 11259.0 4
## 74 7463.0 3
## 75 10198.0 4
## 76 5348.0 3
## 77 6338.0 3
## 78 6488.0 3
## 79 7898.0 3
## 80 8778.0 4
## 81 8058.0 3
## 82 9298.0 4
## 83 9538.0 4
## 84 9989.0 4
## 85 11199.0 4
## 86 10698.0 4
## 87 15750.0 1
## 88 7775.0 3
## 89 7975.0 3
## 90 7995.0 3
## 91 9995.0 4
## 92 11595.0 4
## 93 13295.0 1
## 94 13845.0 1
## 95 12940.0 1
## 96 13415.0 1
## 97 15985.0 1
## 98 16515.0 1
## 99 18420.0 1
## 100 18950.0 1
#c) Interprete los resultados del ejercicio anterior para el caso de agregacion de Ward usando graficos tipo radar, use 4 clusteres.
write.csv(datos,'vehiculos2.csv')
library(cluster)
centroide <- function(num.cluster, datos, clusters) {
ind <- (clusters == num.cluster)
return(colMeans(datos[ind,]))
}
centro.cluster1 <- centroide(1, datos, datos$grupos)
centro.cluster2 <- centroide(2, datos, datos$grupos)
centro.cluster3 <- centroide(3, datos, datos$grupos)
centro.cluster4 <- centroide(4, datos, datos$grupos)
centros <- rbind(centro.cluster1,
centro.cluster2,
centro.cluster3, centro.cluster4)
centros
## carlength carheight enginesize boreratio horsepower citympg
## centro.cluster1 182.3464 54.35714 143.67857 3.488571 126.25000 20.67857
## centro.cluster2 192.5778 53.44444 252.88889 3.630000 187.44444 15.22222
## centro.cluster3 161.4220 53.00976 96.31707 3.103902 72.14634 31.09756
## centro.cluster4 171.5136 54.07727 110.27273 3.344091 92.31818 26.00000
## highwaympg price grupos
## centro.cluster1 26.03571 16286.679 1
## centro.cluster2 19.22222 36661.444 2
## centro.cluster3 36.87805 6874.317 3
## centro.cluster4 31.22727 9941.727 4
centros <- as.data.frame(centros)
maximos <- apply(centros, 2, max)
minimos <- apply(centros, 2, min)
centros <- rbind(minimos, centros)
centros <- rbind(maximos, centros)
centros
## carlength carheight enginesize boreratio horsepower citympg
## 1 192.5778 54.35714 252.88889 3.630000 187.44444 31.09756
## 11 161.4220 53.00976 96.31707 3.103902 72.14634 15.22222
## centro.cluster1 182.3464 54.35714 143.67857 3.488571 126.25000 20.67857
## centro.cluster2 192.5778 53.44444 252.88889 3.630000 187.44444 15.22222
## centro.cluster3 161.4220 53.00976 96.31707 3.103902 72.14634 31.09756
## centro.cluster4 171.5136 54.07727 110.27273 3.344091 92.31818 26.00000
## highwaympg price grupos
## 1 36.87805 36661.444 4
## 11 19.22222 6874.317 1
## centro.cluster1 26.03571 16286.679 1
## centro.cluster2 19.22222 36661.444 2
## centro.cluster3 36.87805 6874.317 3
## centro.cluster4 31.22727 9941.727 4
library(fmsb)
colores <- c("blue","brown","green", "red")
radarchart(as.data.frame(centros),maxmin=TRUE,
axistype=4, axislabcol="slategray4",
centerzero=FALSE, seg=8,
cglcol="gray67", pcol=colores,
plty=1, plwd=5,
title="Comparación de clústeres")
legenda <-legend(1.5,1,
legend=c("Cluster 1",
"Cluster 2",
"Cluster 3","Cluster 4" ),
seg.len=-1.4,
title="Clústeres",
pch=21, bty="n" ,
lwd=3, y.intersp=1,
horiz=FALSE, col=colores)
res <- PCA(datos , scale.unit=TRUE, ncp=5, graph = FALSE)
res.hcpc <- HCPC(res, nb.clust = -1, consol = TRUE, min = 3, max = 3, graph = FALSE)
plot.HCPC(res.hcpc, choice="bar")
#2 a) Cargue la tabla de datos y ejecute un str(…), summary(…) y un dim(…), verifique la correcta lectura de los datos.
datos_beijing <- read_csv("Datos_tarea/DatosBeijing.csv")
## Parsed with column specification:
## cols(
## ID = col_double(),
## Anno = col_double(),
## Mes = col_double(),
## Dia = col_double(),
## Hora = col_double(),
## ConcetracionParticula_pm2.5 = col_double(),
## PuntoRocio = col_double(),
## Temperatura = col_double(),
## Presion = col_double(),
## DireccionViento = col_character(),
## VelocidadViento = col_double(),
## HorasNieve = col_double(),
## HorasLluvia = col_double()
## )
str(datos_beijing)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 43824 obs. of 13 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Anno : num 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ Mes : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Dia : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Hora : num 0 1 2 3 4 5 6 7 8 9 ...
## $ ConcetracionParticula_pm2.5: num NA NA NA NA NA NA NA NA NA NA ...
## $ PuntoRocio : num -21 -21 -21 -21 -20 -19 -19 -19 -19 -20 ...
## $ Temperatura : num -11 -12 -11 -14 -12 -10 -9 -9 -9 -8 ...
## $ Presion : num 1021 1020 1019 1019 1018 ...
## $ DireccionViento : chr "NW" "NW" "NW" "NW" ...
## $ VelocidadViento : num 1.79 4.92 6.71 9.84 12.97 ...
## $ HorasNieve : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HorasLluvia : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. Anno = col_double(),
## .. Mes = col_double(),
## .. Dia = col_double(),
## .. Hora = col_double(),
## .. ConcetracionParticula_pm2.5 = col_double(),
## .. PuntoRocio = col_double(),
## .. Temperatura = col_double(),
## .. Presion = col_double(),
## .. DireccionViento = col_character(),
## .. VelocidadViento = col_double(),
## .. HorasNieve = col_double(),
## .. HorasLluvia = col_double()
## .. )
summary(datos_beijing)
## ID Anno Mes Dia
## Min. : 1 Min. :2010 Min. : 1.000 Min. : 1.00
## 1st Qu.:10957 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00
## Median :21913 Median :2012 Median : 7.000 Median :16.00
## Mean :21913 Mean :2012 Mean : 6.524 Mean :15.73
## 3rd Qu.:32868 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :43824 Max. :2014 Max. :12.000 Max. :31.00
##
## Hora ConcetracionParticula_pm2.5 PuntoRocio Temperatura
## Min. : 0.00 Min. : 0.00 Min. :-40.000 Min. :-19.00
## 1st Qu.: 5.75 1st Qu.: 29.00 1st Qu.:-10.000 1st Qu.: 2.00
## Median :11.50 Median : 72.00 Median : 2.000 Median : 14.00
## Mean :11.50 Mean : 98.61 Mean : 1.817 Mean : 12.45
## 3rd Qu.:17.25 3rd Qu.:137.00 3rd Qu.: 15.000 3rd Qu.: 23.00
## Max. :23.00 Max. :994.00 Max. : 28.000 Max. : 42.00
## NA's :2067
## Presion DireccionViento VelocidadViento HorasNieve
## Min. : 991 Length:43824 Min. : 0.45 Min. : 0.00000
## 1st Qu.:1008 Class :character 1st Qu.: 1.79 1st Qu.: 0.00000
## Median :1016 Mode :character Median : 5.37 Median : 0.00000
## Mean :1016 Mean : 23.89 Mean : 0.05273
## 3rd Qu.:1025 3rd Qu.: 21.91 3rd Qu.: 0.00000
## Max. :1046 Max. :585.60 Max. :27.00000
##
## HorasLluvia
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.1949
## 3rd Qu.: 0.0000
## Max. :36.0000
##
dim(datos_beijing)
## [1] 43824 13
data <- na.omit(datos_beijing)
41757-43824=2067 filas eliminadas
#c) Eimine de la tabla de datos la variable DireccionViento. Por que se debe eliminar? Que otra aternativa se tiene en lugar de eliminarla?
data$DireccionViento <- NULL
Se debe eliminar porque es categórica
#d) Que pasa si ejecutamos un clustering jerarquico con hclust(…). Por que sucede esto?
No se puede ejecutar (error) por el gran tamaño del archivo
#e) Ejecute un k-medias con k = 3, iter.max=1000 y nstart=50.
grupos <- kmeans(datos, centers= 3, iter.max = 1000, nstart = 50)
grupos$cluster
## [1] 2 2 2 2 2 2 2 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 2 1
## [38] 3 3 3 3 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 2 2 1 1 2 2 3 1 2 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2
#f) De una interpretacion de los resultados usando un grafco tipo radar
centros<-grupos$centers
rownames(centros)<-c("Cluster 1","Cluster 2","Cluster 3")
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros
## carlength carheight enginesize boreratio horsepower citympg
## 1 192.5778 54.35714 252.8889 3.630000 187.44444 29.31746
## 11 164.9460 53.38254 101.1905 3.187778 79.19048 15.22222
## Cluster 1 164.9460 53.38254 101.1905 3.187778 79.19048 29.31746
## Cluster 2 182.3464 54.35714 143.6786 3.488571 126.25000 20.67857
## Cluster 3 192.5778 53.44444 252.8889 3.630000 187.44444 15.22222
## highwaympg price grupos
## 1 34.90476 36661.444 3.349206
## 11 19.22222 7945.476 1.000000
## Cluster 1 34.90476 7945.476 3.349206
## Cluster 2 26.03571 16286.679 1.000000
## Cluster 3 19.22222 36661.444 2.000000
color <- c("blue", "red", "brown")
radarchart(as.data.frame(centros),
maxmin=TRUE,axistype=4,
axislabcol="slategray4",
centerzero=FALSE,seg=8,
cglcol="gray67",
pcol=color,plty=1,plwd=5,
title="Comp de Clusteres")
legenda <-legend(1.5,1,
legend=c("Cluster 1",
"Cluster 2",
"Cluster 3"),
seg.len=-1.4,title="Clusteres",
pch=21,bty="n" ,lwd=3,
y.intersp=1,
horiz=FALSE,col=color)
InercIa<-rep(0,9)
for(k in 1:9) {
grupos<-kmeans(data, centers=k,iter.max = 100, nstart=5)
InercIa[k]<-grupos$tot.withinss
}
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2087850)
plot(InercIa,col="blue",type="b")
#3 a) Ejecute las siguientes instrucciones para calcular y graficarla matriz de correlaciones yluego interprete todas las correlaciones:
library(corrplot)
## corrplot 0.84 loaded
datos <- read.table("Datos_tarea/ExpectativaVida.csv",sep = ",",dec='.',header=TRUE)
datos2 <-datos[,-c(1)] # Se quita la variable categorica Status.
correlaciones <- cor(datos2)
corrplot(correlaciones)
Hay correlaciones importantes entre schooling y income composition of resources, entre estas dos y life expectancy
#b Realice un ACP usando el paquete FactoMineR tomando en cuenta solo las variables numericas, grafiqe el plano principal y el circulo de correlaciones de manera que se le eliminen los individuos y variables con menos de 5% de representacion, como se muestra en la figra siguiente. Basado en la sobreposicicon del crculo de correlaciones y el plano principal interprete el Cluster 1 y Cluster 2 segun se indican en el siguiente grafico
datos$Status<- NULL
str(datos)
## 'data.frame': 131 obs. of 7 variables:
## $ Life.expectancy : num 59.9 77.5 75.4 51.7 76.2 74.6 82.7 81.4 72.5 71.4 ...
## $ Adult.Mortality : int 271 8 11 348 118 12 6 66 119 132 ...
## $ Alcohol : num 0.01 4.51 0.01 8.33 7.93 ...
## $ BMI : num 18.6 57.2 58.4 22.7 62.2 54.1 66.1 57.1 51.5 17.7 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 2 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ Income.composition.of.resources: num 0.476 0.761 0.741 0.527 0.825 0.739 0.936 0.892 0.752 0.57 ...
## $ Schooling : num 10 14.2 14.4 11.4 17.3 12.7 20.4 15.9 12.2 10 ...
modelo <- PCA(datos, scale.unit = TRUE, ncp = 4, graph = FALSE)
cos2.var <- (modelo$var$cos2[,1] + modelo$var$cos2[,2])*100
round(cos2.var,2)
## Life.expectancy Adult.Mortality
## 89.62 82.46
## Alcohol BMI
## 68.89 50.45
## HIV.AIDS Income.composition.of.resources
## 79.20 92.58
## Schooling
## 83.99
plot(modelo,
axes = c(1, 2),
choix = "var",
col.var = "red",
new.plot = TRUE,
select = "cos2 0.05")
plot(modelo,
axes = c(1, 2),
choix = "ind",
col.var = "red",
new.plot = TRUE,
select = "cos2 0.05")
plot(modelo,
axes = c(1, 3),
choix = "ind",
col.var = "brown",
new.plot = TRUE,
select = "cos2 0.05")
plot(modelo,
axes = c(1, 3),
choix = "var",
col.var = "blue",
new.plot = TRUE,
select = "cos2 0.05")
matriz <- (dist(datos))
res <- cmdscale(d = matriz, eig = TRUE, k = 2)
head(res$points, 20)
## [,1] [,2]
## Afghanistan 112.46656 13.740334
## Albania -153.36690 -4.468283
## Algeria -150.30445 -5.359244
## Angola 189.16856 3.538990
## Argentina -44.30762 -19.045071
## Armenia -148.94455 -1.271414
## Australia -156.50183 -14.389393
## Austria -95.99096 -10.435648
## Azerbaijan -42.10665 -7.339436
## Bangladesh -26.45939 25.082985
## Belarus 36.53840 -25.267898
## Belgium -86.99548 -18.501016
## Belize 19.26632 33.232935
## Benin 93.06269 8.864273
## Bhutan 56.75077 12.194140
## Bosnia and Herzegovina -72.60365 -9.473217
## Botswana 107.70429 -5.203226
## Brazil -17.78351 -14.150696
## Bulgaria -24.52427 -23.505631
## Burkina Faso 109.54004 14.002722
plot(modelo,
axes = c(1, 2),
choix = "ind",
col.var = "blue",
new.plot = TRUE,
select = "cos2 0.05")
x <- res$points[,1]
y <- res$points[,2]
plot(x, y, xlab = "Comp 1",
ylab = "Comp 2",
main = "MDS",
pch = 19)
text(x, y,
labels = row.names(datos),
cex = 0.85 , pos = 1)
#4 Para la tabla de datos eurodist que viene con el paquete datasets el cual contiene las distancias entre algunas de las ciudades mas importantes de Europa ejecute un Escalamiento Multidimensional y luego compare el resultado con el mapa de Europa.
str(eurodist)
## 'dist' num [1:210] 3313 2963 3175 3339 2762 ...
## - attr(*, "Size")= num 21
## - attr(*, "Labels")= chr [1:21] "Athens" "Barcelona" "Brussels" "Calais" ...
print(eurodist)
## Athens Barcelona Brussels Calais Cherbourg Cologne Copenhagen
## Barcelona 3313
## Brussels 2963 1318
## Calais 3175 1326 204
## Cherbourg 3339 1294 583 460
## Cologne 2762 1498 206 409 785
## Copenhagen 3276 2218 966 1136 1545 760
## Geneva 2610 803 677 747 853 1662 1418
## Gibraltar 4485 1172 2256 2224 2047 2436 3196
## Hamburg 2977 2018 597 714 1115 460 460
## Hook of Holland 3030 1490 172 330 731 269 269
## Lisbon 4532 1305 2084 2052 1827 2290 2971
## Lyons 2753 645 690 739 789 714 1458
## Madrid 3949 636 1558 1550 1347 1764 2498
## Marseilles 2865 521 1011 1059 1101 1035 1778
## Milan 2282 1014 925 1077 1209 911 1537
## Munich 2179 1365 747 977 1160 583 1104
## Paris 3000 1033 285 280 340 465 1176
## Rome 817 1460 1511 1662 1794 1497 2050
## Stockholm 3927 2868 1616 1786 2196 1403 650
## Vienna 1991 1802 1175 1381 1588 937 1455
## Geneva Gibraltar Hamburg Hook of Holland Lisbon Lyons Madrid
## Barcelona
## Brussels
## Calais
## Cherbourg
## Cologne
## Copenhagen
## Geneva
## Gibraltar 1975
## Hamburg 1118 2897
## Hook of Holland 895 2428 550
## Lisbon 1936 676 2671 2280
## Lyons 158 1817 1159 863 1178
## Madrid 1439 698 2198 1730 668 1281
## Marseilles 425 1693 1479 1183 1762 320 1157
## Milan 328 2185 1238 1098 2250 328 1724
## Munich 591 2565 805 851 2507 724 2010
## Paris 513 1971 877 457 1799 471 1273
## Rome 995 2631 1751 1683 2700 1048 2097
## Stockholm 2068 3886 949 1500 3231 2108 3188
## Vienna 1019 2974 1155 1205 2937 1157 2409
## Marseilles Milan Munich Paris Rome Stockholm
## Barcelona
## Brussels
## Calais
## Cherbourg
## Cologne
## Copenhagen
## Geneva
## Gibraltar
## Hamburg
## Hook of Holland
## Lisbon
## Lyons
## Madrid
## Marseilles
## Milan 618
## Munich 1109 331
## Paris 792 856 821
## Rome 1011 586 946 1476
## Stockholm 2428 2187 1754 1827 2707
## Vienna 1363 898 428 1249 1209 2105
library(datasets)
res <- cmdscale(eurodist,eig=TRUE, k=2)
# k es el número de componentes a usar
# Plotear la solución
x <- res$points[,1]
y <- res$points[,2]
euro.matriz <- as.matrix(eurodist)
x <- x
y <- -y
plot(x,y,
xlab="Comp 1",
ylab="Comp 2",
main="MDS",
pch = 19)
text(x, y,
labels = row.names(euro.matriz),
cex=0.75,
pos = 1)
knitr::include_graphics('Mapa_europa.png')