Evidencia 2: Clustering y árbol de decisiones

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cowplot)

## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp

library(ggpubr)

## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:cowplot':
## 
##     get_legend

library(cluster)
library(purrr)
library(dplyr)

# Load the wine data set from a CSV file into a data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that exact file exists; consider a project-relative path.
data3 <- read.csv(
  file = 'C:/Users/rusoc/OneDrive/Escritorio/TEC/Mineria de datos/Wine.csv'
)

Clustering

# Drop the 12th column, which is not needed for the analysis.
data3 <- data3[, -12]

# Standardize every remaining column (centered and scaled by scale()) and
# convert the matrix that scale() returns back into a data frame.
# NOTE(review): the k-means output printed in this report lists a `Class`
# column among the cluster means, so the class label appears to be part of the
# clustering features - confirm that this is intended.
dataestand <- as.data.frame(scale(data3, center = TRUE, scale = TRUE))

# Look for the optimal number of clusters with the elbow method: plot the
# total within-cluster sum of squares ("wss") for a range of k.
# The RNG seed is fixed first because kmeans() chooses its initial centers at
# random; without a seed the elbow plot and the clustering differ between runs.
set.seed(123)
optimo <- fviz_nbclust(dataestand, kmeans, method = "wss")
optimo

# The elbow suggests k = 3. Run k-means with several random starts (nstart)
# and keep the best solution, rather than trusting a single initialization
# that may stop in a poor local optimum.
kmdata <- kmeans(dataestand, centers = 3, nstart = 25)
kmdata

## K-means clustering with 3 clusters of sizes 66, 48, 64
## 
## Cluster means:
##         Class    Alcohol Malic.acid        Ash Alcalinity.of.ash   Magnesium
## 1 -1.07368267  0.7769463 -0.3473270  0.2954645        -0.5737743  0.47744029
## 2  1.34311926  0.1935024  0.9396205  0.2572190         0.6035138 -0.08401245
## 3  0.09989582 -0.9463527 -0.3465344 -0.4976119         0.1390694 -0.42935096
##   Total.phenols  Flavanoids Nonflavanoid.phenols Proanthocyanins
## 1     0.9205062  0.98837353          -0.59318319      0.58610914
## 2    -0.9801784 -1.24214568           0.75513763     -0.73937973
## 3    -0.2141382 -0.08765095           0.04536694     -0.04989025
##   Color.intensity
## 1       0.1432358
## 2       1.0148634
## 3      -0.9088595
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 3 3 1 3 3 3 3 1 3 1
##  [75] 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 370.4979 265.3089 451.0259
##  (between_SS / total_SS =  44.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Visualize the k-means clustering.
# Cluster assignment of every observation, in row order.
clusterdata <- kmdata$cluster

# Scatter plot of standardized color intensity against standardized alcohol.
# Each point is colored by, and labeled with, its cluster number. The outer
# parentheses make the assignment also print the plot.
(dataplot <- ggplot(dataestand, aes(x = Color.intensity, y = Alcohol)) +
    geom_point(aes(color = as.factor(clusterdata)), size = 5) +
    geom_text(aes(label = clusterdata), size = 3) +
    theme_bw() +
    theme(legend.position = "none") +
    labs(title = "Kmeans con k=3"))

Árbol de decisiones

# Selección muestra entrenamiento de un 70%
library(tree)

## Warning: package 'tree' was built under R version 4.3.2

library(dplyr)
# Draw a 70% training sample of row indices.
# The seed makes the train/test split (and therefore the fitted tree)
# repeatable; floor() makes the truncation of the fractional sample size
# explicit, and seq_len() stays correct even for an empty data frame.
set.seed(123)
n_obs <- nrow(data3)
train <- sample(seq_len(n_obs), size = floor(0.7 * n_obs), replace = FALSE)

# Fit the tree on the training rows. The fitted values printed for each node
# are numeric alcohol levels, i.e. this is a regression tree, not a
# classification tree.
# The response is referenced by column name (not data3$Alcohol) so that it is
# resolved inside `data`, exactly like the predictor.
data3.tree <- tree(Alcohol ~ Color.intensity, data = data3, subset = train)

# Plot the tree and label its splits and leaves.
plot(data3.tree)
text(data3.tree, pretty = 0)

# Print the node table (split, n, deviance, fitted value).
data3.tree

## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 124 84.090 13.00  
##    2) Color.intensity < 4.15 52 19.340 12.37  
##      4) Color.intensity < 3.285 34  7.633 12.13 *
##      5) Color.intensity > 3.285 18  6.083 12.82 *
##    3) Color.intensity > 4.15 72 29.450 13.45  
##      6) Color.intensity < 5.045 18  4.117 13.06 *
##      7) Color.intensity > 5.045 54 21.560 13.58  
##       14) Color.intensity < 7 31 11.440 13.76  
##         28) Color.intensity < 5.265 7  1.239 14.12 *
##         29) Color.intensity > 5.265 24  9.067 13.66 *
##       15) Color.intensity > 7 23  7.751 13.34 *

# PREDICTIONS
# Test rows: every row index that was not drawn for training.
# seq_len(nrow()) replaces seq(length()), which would yield c(1, 0) on an
# empty data frame.
test <- setdiff(seq_len(nrow(data3)), train)

# Predict alcohol for the held-out rows and summarize the predictions.
tree.pred <- predict(data3.tree, newdata = data3[test, ])
summary(tree.pred)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.13   12.30   13.05   12.99   13.34   13.66

# Cross-tabulate predicted against observed alcohol on the test rows.
result_table <- with(data3[test, ], table(tree.pred, Alcohol))
print(result_table)

# Both variables are continuous, so the table above is not a confusion matrix
# and yields no accuracy figure. Report regression error metrics on the test
# rows as well: root mean squared error and mean absolute error.
test_error <- data3$Alcohol[test] - tree.pred
c(RMSE = sqrt(mean(test_error^2)), MAE = mean(abs(test_error)))

##                   Alcohol
## tree.pred          11.62 11.66 11.81 11.84 11.87 12 12.04 12.08 12.16 12.2
##   12.1308823529412     1     0     1     1     1  1     1     2     1    0
##   12.8222222222222     0     1     0     0     0  0     0     1     0    0
##   13.055               0     0     0     0     0  0     0     0     0    0
##   13.3408695652174     0     0     0     0     0  0     0     0     0    0
##   13.6604166666667     0     0     0     0     0  0     0     0     0    1
##                   Alcohol
## tree.pred          12.22 12.34 12.36 12.37 12.47 12.51 12.64 12.7 12.77 12.81
##   12.1308823529412     1     1     0     0     1     1     0    1     0     0
##   12.8222222222222     0     0     0     0     0     0     0    0     0     0
##   13.055               0     0     0     1     0     0     0    0     0     0
##   13.3408695652174     0     0     1     0     0     0     0    0     1     0
##   13.6604166666667     0     0     0     0     0     1     1    0     0     1
##                   Alcohol
## tree.pred          12.87 12.88 13.05 13.16 13.17 13.2 13.23 13.27 13.28 13.3
##   12.1308823529412     0     0     0     0     0    0     0     0     0    0
##   12.8222222222222     0     0     0     1     0    0     0     0     0    1
##   13.055               0     0     1     0     0    1     0     0     1    0
##   13.3408695652174     1     0     0     0     1    0     1     1     0    0
##   13.6604166666667     0     1     0     1     0    0     0     0     0    0
##                   Alcohol
## tree.pred          13.39 13.4 13.49 13.56 13.58 13.62 13.67 13.69 13.73 13.82
##   12.1308823529412     0    0     0     0     0     0     0     0     0     0
##   12.8222222222222     0    0     1     0     0     0     1     0     0     0
##   13.055               1    0     0     0     0     1     0     0     0     0
##   13.3408695652174     0    1     0     0     1     0     0     0     0     1
##   13.6604166666667     0    0     0     1     0     0     0     1     2     0
##                   Alcohol
## tree.pred          13.83 13.84 13.87 14.12 14.13 14.19 14.22 14.38
##   12.1308823529412     0     0     0     0     0     0     0     0
##   12.8222222222222     0     0     0     0     0     0     0     0
##   13.055               0     0     1     1     0     0     0     1
##   13.3408695652174     0     1     0     0     1     1     0     1
##   13.6604166666667     1     0     0     0     0     0     1     0

Evidencia 2: Clustering y árbol de decisiones

Adrian Israel Castillo Lara y Constantino Millet

2023-11-27