Parte 1: Clustering
#Cambiar a factores las siguientes columnas:
#mfr: A = 0; G = 1; K = 2; N = 3; P = 4; Q = 5; R = 6
#type: C = 0; H = 1
db$mfr <- ifelse(db$mfr=="A", 0,
ifelse(db$mfr=="G", 1,
ifelse(db$mfr=="K", 2,
ifelse(db$mfr=="N", 3,
ifelse(db$mfr=="P", 4,
ifelse(db$mfr=="Q", 5,
ifelse(db$mfr=="R", 6,
NA)))))))
db$type <- ifelse(db$type=="C", 0,
ifelse(db$type=="H", 1, NA))
#Escalamos los datos
cereales = scale(db, center = TRUE, scale = TRUE)
summary(cereales)
## mfr type calories protein
## Min. :-1.5771 Min. :-0.200 Min. :-2.9195 Min. :-1.4116
## 1st Qu.:-0.9988 1st Qu.:-0.200 1st Qu.:-0.3533 1st Qu.:-0.4982
## Median :-0.4206 Median :-0.200 Median : 0.1600 Median : 0.4152
## Mean : 0.0000 Mean : 0.000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7360 3rd Qu.:-0.200 3rd Qu.: 0.1600 3rd Qu.: 0.4152
## Max. : 1.8925 Max. : 4.934 Max. : 2.7262 Max. : 3.1554
## fat sodium fiber carbo
## Min. :-1.0065 Min. :-1.9047 Min. :-0.90290 Min. :-3.6451
## 1st Qu.:-1.0065 1st Qu.:-0.3540 1st Qu.:-0.48333 1st Qu.:-0.6070
## Median :-0.0129 Median : 0.2424 Median :-0.06375 Median :-0.1396
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.9807 3rd Qu.: 0.6003 3rd Qu.: 0.35582 3rd Qu.: 0.5615
## Max. : 3.9614 Max. : 1.9124 Max. : 4.97115 Max. : 1.9637
## sugars potass vitamins shelf
## Min. :-1.78229 Min. :-1.36179 Min. :-1.2643 Min. :-1.4508
## 1st Qu.:-0.88238 1st Qu.:-0.78665 1st Qu.:-0.1453 1st Qu.:-1.4508
## Median : 0.01753 Median :-0.08526 Median :-0.1453 Median :-0.2496
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.91744 3rd Qu.: 0.33558 3rd Qu.:-0.1453 3rd Qu.: 0.9516
## Max. : 1.81735 Max. : 3.28142 Max. : 3.2115 Max. : 0.9516
## weight cups rating
## Min. :-3.5195 Min. :-2.4538 Min. :-1.7529
## 1st Qu.:-0.1968 1st Qu.:-0.6490 1st Qu.:-0.6757
## Median :-0.1968 Median :-0.3053 Median :-0.1613
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.1968 3rd Qu.: 0.7690 3rd Qu.: 0.5811
## Max. : 3.1260 Max. : 2.9176 Max. : 3.6334
cereales = as.data.frame(cereales)
Creamos los clusters en función a “calories” y “sugars”
kmcluster = kmeans(cereales,centers=4,nstart = 50)
#Utilizamos 4 ya que es lo "estandar", màs adelante veremos cuál es el número más optimo
kmcluster
## K-means clustering with 4 clusters of sizes 23, 17, 3, 34
##
## Cluster means:
## mfr type calories protein fat sodium
## 1 0.43427639 0.4696476 -0.7103035 0.5343311 -0.4880887 -0.65479200
## 2 -0.01236948 -0.2000351 1.0958759 0.5226506 1.0391108 -0.02770273
## 3 -0.22780463 -0.2000351 -2.2351425 1.3286071 -0.3440932 0.20268253
## 4 -0.26749005 -0.2000351 0.1297799 -0.7400146 -0.1590166 0.43891514
## fiber carbo sugars potass vitamins shelf
## 1 0.09495521 0.11440966 -0.8921616 -0.02182989 -0.4858650 -0.2495930
## 2 0.47922584 -0.08462542 0.6262939 0.88431118 0.1837835 0.8809166
## 3 3.71242164 -1.85342766 -0.7323949 3.00086468 -0.1453172 0.9515734
## 4 -0.63141394 0.12845568 0.3549972 -0.69217049 0.2496037 -0.3555783
## weight cups rating
## 1 -0.5348330 -0.0493036 0.9693597
## 2 1.1518771 -0.5125311 -0.3814882
## 3 -0.1967771 -1.8665328 2.2195570
## 4 -0.1967771 0.4543120 -0.6608425
##
## Clustering vector:
## 100% Bran 100% Natural Bran
## 3 2
## All-Bran All-Bran with Extra Fiber
## 3 3
## Almond Delight Apple Cinnamon Cheerios
## 4 4
## Apple Jacks Basic 4
## 4 2
## Bran Chex Bran Flakes
## 1 1
## Cap'n'Crunch Cheerios
## 4 1
## Cinnamon Toast Crunch Clusters
## 4 2
## Cocoa Puffs Corn Chex
## 4 4
## Corn Flakes Corn Pops
## 4 4
## Count Chocula Cracklin' Oat Bran
## 4 2
## Cream of Wheat (Quick) Crispix
## 1 4
## Crispy Wheat & Raisins Double Chex
## 4 1
## Froot Loops Frosted Flakes
## 4 4
## Frosted Mini-Wheats Fruit & Fibre Dates; Walnuts; and Oats
## 1 2
## Fruitful Bran Fruity Pebbles
## 2 4
## Golden Crisp Golden Grahams
## 4 4
## Grape Nuts Flakes Grape-Nuts
## 1 1
## Great Grains Pecan Honey Graham Ohs
## 2 4
## Honey Nut Cheerios Honey-comb
## 4 4
## Just Right Crunchy Nuggets Just Right Fruit & Nut
## 4 2
## Kix Life
## 4 1
## Lucky Charms Maypo
## 4 1
## Muesli Raisins; Dates; & Almonds Muesli Raisins; Peaches; & Pecans
## 2 2
## Mueslix Crispy Blend Multi-Grain Cheerios
## 2 4
## Nut&Honey Crunch Nutri-Grain Almond-Raisin
## 4 2
## Nutri-grain Wheat Oatmeal Raisin Crisp
## 1 2
## Post Nat. Raisin Bran Product 19
## 2 4
## Puffed Rice Puffed Wheat
## 1 1
## Quaker Oat Squares Quaker Oatmeal
## 1 1
## Raisin Bran Raisin Nut Bran
## 2 2
## Raisin Squares Rice Chex
## 1 4
## Rice Krispies Shredded Wheat
## 4 1
## Shredded Wheat 'n'Bran Shredded Wheat spoon size
## 1 1
## Smacks Special K
## 4 1
## Strawberry Fruit Wheats Total Corn Flakes
## 1 4
## Total Raisin Bran Total Whole Grain
## 2 4
## Triples Trix
## 4 4
## Wheat Chex Wheaties
## 1 1
## Wheaties Honey Gold
## 4
##
## Within cluster sum of squares by cluster:
## [1] 301.11311 149.60089 10.67216 246.26831
## (between_SS / total_SS = 37.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Gráficamos en función a “calories” y “sugars”
cereales = cereales %>% mutate(cluster = kmcluster$cluster)
(g1=ggplot(cereales, aes(x = calories , y = sugars)) +
geom_point(aes(color=as.factor(cluster)), size=10)+
geom_text(aes(label = cluster), size = 5) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "Kmenas con k=4")
)

rownames(cereales)=name
fviz_cluster(kmcluster, cereales, show.clust.cent = T,
ellipse.type = "euclid", star.plot = T, repel = T) +
labs(title = "Resultados clustering K-means") +
theme_bw()

#Ahora creamos un cluster con k = 2
kmcluster2 = kmeans(cereales, centers=2, nstart = 50)
cereales = cereales %>% mutate(cluster2 = kmcluster2$cluster)
(g2=ggplot(cereales, aes(x = calories, y = sugars)) +
geom_point(aes(color=as.factor(cluster2)), size=10)+
geom_text(aes(label = cluster2), size = 5) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "Kmenas con k=2")
)

rownames(cereales)=name
fviz_cluster(kmcluster2, cereales, show.clust.cent = T,
ellipse.type = "euclid", star.plot = T, repel = T) +
labs(title = "Resultados clustering K-means") +
theme_bw()

Comparamos cluster1 y cluster2
plot_grid(g1,g2)

Buscamos el número optimo de clusters
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
cluster_means = kmeans(data,centers = n_clusters,
iter.max = iter.max,
nstart = nstart)
return(cluster_means$tot.withinss)
}
# Se aplica esta funci?n con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
.f = total_within,
data = cereales)
total_withinss
## [1] 1291.0649 987.1462 817.1465 707.6545 625.8225 544.2581 491.1711
## [8] 442.2914 402.7844 369.9506 343.5181 314.0512 287.0797 271.6476
## [15] 249.8472
#graficamos la varianza total
data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:15) +
labs(title = "Suma total de cuadrados intra-cluster") +
theme_bw()

Dendograma
#Matriz de distancias
d <- dist(cereales)
# Clúster jerarquico
hc <- hclust(d, method = "ward.D")
#Plot
plot(as.dendrogram(hc), main = "ward.D")

Parte 2: Árbol
#Entrenamos y seleccionamos una muestra del 70%
train=sample(seq(length(db$rating)),length(db$rating)*0.7,replace=FALSE)
Creamos el árbol
cereal.tree = tree(db$rating~.,db,subset=train)
summary(cereal.tree)
##
## Regression tree:
## tree(formula = db$rating ~ ., data = db, subset = train)
## Variables actually used in tree construction:
## [1] "calories" "sugars" "sodium" "protein"
## Number of terminal nodes: 6
## Residual mean deviance: 31.45 = 1478 / 47
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12.1900 -2.8590 -0.1862 0.0000 3.1910 20.7600
Graficamos el arbol
plot(cereal.tree);text(cereal.tree,pretty=0)

#Valores del arbol
cereal.tree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 53 11710.00 41.99
## 2) calories < 95 10 1475.00 64.56
## 4) sugars < 1 5 680.70 72.95 *
## 5) sugars > 1 5 90.72 56.17 *
## 3) calories > 95 43 3955.00 36.74
## 6) sugars < 7.5 18 979.50 44.83
## 12) sodium < 137.5 5 215.90 54.61 *
## 13) sodium > 137.5 13 101.10 41.07 *
## 7) sugars > 7.5 25 951.00 30.92
## 14) protein < 1.5 8 147.20 24.01 *
## 15) protein > 1.5 17 242.30 34.17 *