Cargar las librerías

library(tree)
library(dplyr)
library(tidyverse)
library(factoextra)
library(cowplot)
library(ggpubr)
library(cluster)
library(purrr)
library(ggrepel)
library(rpart)
library(rpart.plot)
library(caret)

Cargar la base de datos

Esta base de datos es sobre distintos tipos de cereales y sus valores nutricionales, así como un rating otorgado por usuarios.

db <- read.csv("C:\\Users\\alfon\\OneDrive\\Desktop\\cereal.csv")
name = row.names(db) <- db$name
db <- db[, -which(names(db) == "name")]
summary(db)
##      mfr                type              calories        protein     
##  Length:77          Length:77          Min.   : 50.0   Min.   :1.000  
##  Class :character   Class :character   1st Qu.:100.0   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Median :110.0   Median :3.000  
##                                        Mean   :106.9   Mean   :2.545  
##                                        3rd Qu.:110.0   3rd Qu.:3.000  
##                                        Max.   :160.0   Max.   :6.000  
##       fat            sodium          fiber            carbo     
##  Min.   :0.000   Min.   :  0.0   Min.   : 0.000   Min.   :-1.0  
##  1st Qu.:0.000   1st Qu.:130.0   1st Qu.: 1.000   1st Qu.:12.0  
##  Median :1.000   Median :180.0   Median : 2.000   Median :14.0  
##  Mean   :1.013   Mean   :159.7   Mean   : 2.152   Mean   :14.6  
##  3rd Qu.:2.000   3rd Qu.:210.0   3rd Qu.: 3.000   3rd Qu.:17.0  
##  Max.   :5.000   Max.   :320.0   Max.   :14.000   Max.   :23.0  
##      sugars           potass          vitamins          shelf      
##  Min.   :-1.000   Min.   : -1.00   Min.   :  0.00   Min.   :1.000  
##  1st Qu.: 3.000   1st Qu.: 40.00   1st Qu.: 25.00   1st Qu.:1.000  
##  Median : 7.000   Median : 90.00   Median : 25.00   Median :2.000  
##  Mean   : 6.922   Mean   : 96.08   Mean   : 28.25   Mean   :2.208  
##  3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00   3rd Qu.:3.000  
##  Max.   :15.000   Max.   :330.00   Max.   :100.00   Max.   :3.000  
##      weight          cups           rating     
##  Min.   :0.50   Min.   :0.250   Min.   :18.04  
##  1st Qu.:1.00   1st Qu.:0.670   1st Qu.:33.17  
##  Median :1.00   Median :0.750   Median :40.40  
##  Mean   :1.03   Mean   :0.821   Mean   :42.67  
##  3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:50.83  
##  Max.   :1.50   Max.   :1.500   Max.   :93.70

Parte 1: Clustering

#Cambiar a factores las siguientes columnas:
  #mfr: A = 0; G = 1; K = 2; N = 3; P = 4; Q = 5; R = 6
  #type: C = 0; H = 1

db$mfr <- ifelse(db$mfr=="A", 0,
                 ifelse(db$mfr=="G", 1,
                 ifelse(db$mfr=="K", 2,
                 ifelse(db$mfr=="N", 3,
                 ifelse(db$mfr=="P", 4,
                 ifelse(db$mfr=="Q", 5,
                 ifelse(db$mfr=="R", 6,
                        NA)))))))

db$type <- ifelse(db$type=="C", 0,
                  ifelse(db$type=="H", 1, NA))

#Escalamos los datos
cereales = scale(db, center = TRUE, scale = TRUE)
summary(cereales)
##       mfr               type           calories          protein       
##  Min.   :-1.5771   Min.   :-0.200   Min.   :-2.9195   Min.   :-1.4116  
##  1st Qu.:-0.9988   1st Qu.:-0.200   1st Qu.:-0.3533   1st Qu.:-0.4982  
##  Median :-0.4206   Median :-0.200   Median : 0.1600   Median : 0.4152  
##  Mean   : 0.0000   Mean   : 0.000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7360   3rd Qu.:-0.200   3rd Qu.: 0.1600   3rd Qu.: 0.4152  
##  Max.   : 1.8925   Max.   : 4.934   Max.   : 2.7262   Max.   : 3.1554  
##       fat              sodium            fiber              carbo        
##  Min.   :-1.0065   Min.   :-1.9047   Min.   :-0.90290   Min.   :-3.6451  
##  1st Qu.:-1.0065   1st Qu.:-0.3540   1st Qu.:-0.48333   1st Qu.:-0.6070  
##  Median :-0.0129   Median : 0.2424   Median :-0.06375   Median :-0.1396  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.9807   3rd Qu.: 0.6003   3rd Qu.: 0.35582   3rd Qu.: 0.5615  
##  Max.   : 3.9614   Max.   : 1.9124   Max.   : 4.97115   Max.   : 1.9637  
##      sugars             potass            vitamins           shelf        
##  Min.   :-1.78229   Min.   :-1.36179   Min.   :-1.2643   Min.   :-1.4508  
##  1st Qu.:-0.88238   1st Qu.:-0.78665   1st Qu.:-0.1453   1st Qu.:-1.4508  
##  Median : 0.01753   Median :-0.08526   Median :-0.1453   Median :-0.2496  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.91744   3rd Qu.: 0.33558   3rd Qu.:-0.1453   3rd Qu.: 0.9516  
##  Max.   : 1.81735   Max.   : 3.28142   Max.   : 3.2115   Max.   : 0.9516  
##      weight             cups             rating       
##  Min.   :-3.5195   Min.   :-2.4538   Min.   :-1.7529  
##  1st Qu.:-0.1968   1st Qu.:-0.6490   1st Qu.:-0.6757  
##  Median :-0.1968   Median :-0.3053   Median :-0.1613  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.1968   3rd Qu.: 0.7690   3rd Qu.: 0.5811  
##  Max.   : 3.1260   Max.   : 2.9176   Max.   : 3.6334
cereales = as.data.frame(cereales)

Creamos los clusters en función a “calories” y “sugars”

kmcluster = kmeans(cereales,centers=4,nstart = 50)
#Utilizamos 4 ya que es lo "estandar", màs adelante veremos cuál es el número más optimo
kmcluster
## K-means clustering with 4 clusters of sizes 23, 17, 3, 34
## 
## Cluster means:
##           mfr       type   calories    protein        fat      sodium
## 1  0.43427639  0.4696476 -0.7103035  0.5343311 -0.4880887 -0.65479200
## 2 -0.01236948 -0.2000351  1.0958759  0.5226506  1.0391108 -0.02770273
## 3 -0.22780463 -0.2000351 -2.2351425  1.3286071 -0.3440932  0.20268253
## 4 -0.26749005 -0.2000351  0.1297799 -0.7400146 -0.1590166  0.43891514
##         fiber       carbo     sugars      potass   vitamins      shelf
## 1  0.09495521  0.11440966 -0.8921616 -0.02182989 -0.4858650 -0.2495930
## 2  0.47922584 -0.08462542  0.6262939  0.88431118  0.1837835  0.8809166
## 3  3.71242164 -1.85342766 -0.7323949  3.00086468 -0.1453172  0.9515734
## 4 -0.63141394  0.12845568  0.3549972 -0.69217049  0.2496037 -0.3555783
##       weight       cups     rating
## 1 -0.5348330 -0.0493036  0.9693597
## 2  1.1518771 -0.5125311 -0.3814882
## 3 -0.1967771 -1.8665328  2.2195570
## 4 -0.1967771  0.4543120 -0.6608425
## 
## Clustering vector:
##                              100% Bran                      100% Natural Bran 
##                                      3                                      2 
##                               All-Bran              All-Bran with Extra Fiber 
##                                      3                                      3 
##                         Almond Delight                Apple Cinnamon Cheerios 
##                                      4                                      4 
##                            Apple Jacks                                Basic 4 
##                                      4                                      2 
##                              Bran Chex                            Bran Flakes 
##                                      1                                      1 
##                           Cap'n'Crunch                               Cheerios 
##                                      4                                      1 
##                  Cinnamon Toast Crunch                               Clusters 
##                                      4                                      2 
##                            Cocoa Puffs                              Corn Chex 
##                                      4                                      4 
##                            Corn Flakes                              Corn Pops 
##                                      4                                      4 
##                          Count Chocula                     Cracklin' Oat Bran 
##                                      4                                      2 
##                 Cream of Wheat (Quick)                                Crispix 
##                                      1                                      4 
##                 Crispy Wheat & Raisins                            Double Chex 
##                                      4                                      1 
##                            Froot Loops                         Frosted Flakes 
##                                      4                                      4 
##                    Frosted Mini-Wheats Fruit & Fibre Dates; Walnuts; and Oats 
##                                      1                                      2 
##                          Fruitful Bran                         Fruity Pebbles 
##                                      2                                      4 
##                           Golden Crisp                         Golden Grahams 
##                                      4                                      4 
##                      Grape Nuts Flakes                             Grape-Nuts 
##                                      1                                      1 
##                     Great Grains Pecan                       Honey Graham Ohs 
##                                      2                                      4 
##                     Honey Nut Cheerios                             Honey-comb 
##                                      4                                      4 
##            Just Right Crunchy  Nuggets                 Just Right Fruit & Nut 
##                                      4                                      2 
##                                    Kix                                   Life 
##                                      4                                      1 
##                           Lucky Charms                                  Maypo 
##                                      4                                      1 
##       Muesli Raisins; Dates; & Almonds      Muesli Raisins; Peaches; & Pecans 
##                                      2                                      2 
##                   Mueslix Crispy Blend                   Multi-Grain Cheerios 
##                                      2                                      4 
##                       Nut&Honey Crunch              Nutri-Grain Almond-Raisin 
##                                      4                                      2 
##                      Nutri-grain Wheat                   Oatmeal Raisin Crisp 
##                                      1                                      2 
##                  Post Nat. Raisin Bran                             Product 19 
##                                      2                                      4 
##                            Puffed Rice                           Puffed Wheat 
##                                      1                                      1 
##                     Quaker Oat Squares                         Quaker Oatmeal 
##                                      1                                      1 
##                            Raisin Bran                        Raisin Nut Bran 
##                                      2                                      2 
##                         Raisin Squares                              Rice Chex 
##                                      1                                      4 
##                          Rice Krispies                         Shredded Wheat 
##                                      4                                      1 
##                 Shredded Wheat 'n'Bran              Shredded Wheat spoon size 
##                                      1                                      1 
##                                 Smacks                              Special K 
##                                      4                                      1 
##                Strawberry Fruit Wheats                      Total Corn Flakes 
##                                      1                                      4 
##                      Total Raisin Bran                      Total Whole Grain 
##                                      2                                      4 
##                                Triples                                   Trix 
##                                      4                                      4 
##                             Wheat Chex                               Wheaties 
##                                      1                                      1 
##                    Wheaties Honey Gold 
##                                      4 
## 
## Within cluster sum of squares by cluster:
## [1] 301.11311 149.60089  10.67216 246.26831
##  (between_SS / total_SS =  37.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Gráficamos en función a “calories” y “sugars”

cereales = cereales %>% mutate(cluster = kmcluster$cluster)

(g1=ggplot(cereales, aes(x = calories , y = sugars)) +
    geom_point(aes(color=as.factor(cluster)), size=10)+
    geom_text(aes(label = cluster), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=4") 
)

rownames(cereales)=name

fviz_cluster(kmcluster, cereales, show.clust.cent = T,
             ellipse.type = "euclid", star.plot = T, repel = T) +
  labs(title = "Resultados clustering K-means") +
  theme_bw()

#Ahora creamos un cluster con k = 2
kmcluster2 = kmeans(cereales, centers=2, nstart = 50)
cereales = cereales %>% mutate(cluster2 = kmcluster2$cluster)

(g2=ggplot(cereales, aes(x = calories, y = sugars)) +
    geom_point(aes(color=as.factor(cluster2)), size=10)+
    geom_text(aes(label = cluster2), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=2") 
)

rownames(cereales)=name

fviz_cluster(kmcluster2, cereales, show.clust.cent = T,
             ellipse.type = "euclid", star.plot = T, repel = T) +
  labs(title = "Resultados clustering K-means") +
  theme_bw()

Comparamos cluster1 y cluster2

plot_grid(g1,g2)

Buscamos el número optimo de clusters

total_within = function(n_clusters, data, iter.max=1000, nstart=50){
  
  cluster_means = kmeans(data,centers = n_clusters,
                         iter.max = iter.max,
                         nstart = nstart)
  return(cluster_means$tot.withinss)
}


# Se aplica esta funci?n con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
                          .f = total_within,
                          data = cereales)
total_withinss
##  [1] 1291.0649  987.1462  817.1465  707.6545  625.8225  544.2581  491.1711
##  [8]  442.2914  402.7844  369.9506  343.5181  314.0512  287.0797  271.6476
## [15]  249.8472
#graficamos la varianza total

data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
  ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:15) +
  labs(title = "Suma total de cuadrados intra-cluster") +
  theme_bw()

Metodo “factoextra”

matriz_dist=get_dist(cereales, method = "euclidean")

fviz_nbclust(cereales, FUNcluster = kmeans, 
             method = "wss", k.max = 15, 
             diss = matriz_dist, nstart = 50)

Nos damos cuenta que se empieza a quebrar en “2”, por lo que “2” es el número optimo de clusters

Dendograma

#Matriz de distancias
d <- dist(cereales)

# Clúster jerarquico
hc <- hclust(d, method = "ward.D")

#Plot
plot(as.dendrogram(hc), main = "ward.D")

Parte 2: Árbol

#Entrenamos y seleccionamos una muestra del 70%
train=sample(seq(length(db$rating)),length(db$rating)*0.7,replace=FALSE)

Creamos el árbol

cereal.tree = tree(db$rating~.,db,subset=train)
summary(cereal.tree)
## 
## Regression tree:
## tree(formula = db$rating ~ ., data = db, subset = train)
## Variables actually used in tree construction:
## [1] "calories" "sugars"   "sodium"   "protein" 
## Number of terminal nodes:  6 
## Residual mean deviance:  31.45 = 1478 / 47 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -12.1900  -2.8590  -0.1862   0.0000   3.1910  20.7600

Graficamos el arbol

plot(cereal.tree);text(cereal.tree,pretty=0)

#Valores del arbol
cereal.tree
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 53 11710.00 41.99  
##    2) calories < 95 10  1475.00 64.56  
##      4) sugars < 1 5   680.70 72.95 *
##      5) sugars > 1 5    90.72 56.17 *
##    3) calories > 95 43  3955.00 36.74  
##      6) sugars < 7.5 18   979.50 44.83  
##       12) sodium < 137.5 5   215.90 54.61 *
##       13) sodium > 137.5 13   101.10 41.07 *
##      7) sugars > 7.5 25   951.00 30.92  
##       14) protein < 1.5 8   147.20 24.01 *
##       15) protein > 1.5 17   242.30 34.17 *