library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(cluster)
library(purrr)
library(dplyr)
# Load the wine data set from its CSV file on the local disk.
data3 <- read.csv(
  file = "C:/Users/rusoc/OneDrive/Escritorio/TEC/Mineria de datos/Wine.csv"
)
# Clustering
# Drop the 12th column, which is not used in this analysis.
data3 <- data3[, -12]
# Standardise every remaining column to mean 0 and standard deviation 1 so
# that all variables enter the k-means distance computation on the same scale.
# NOTE(review): the printed cluster means include a `Class` column; if that is
# the known wine label it probably should not take part in the clustering --
# confirm before relying on these clusters.
dataestand <- as.data.frame(scale(data3, center = TRUE, scale = TRUE))
# k-means starts from randomly chosen centres; fix the RNG seed so the elbow
# plot and the cluster assignments can be reproduced from run to run.
set.seed(123)
# Elbow plot of the total within-cluster sum of squares, used to choose k.
optimo <- fviz_nbclust(dataestand, kmeans, method = "wss")
optimo
# The elbow suggests k = 3. Try several random starts and keep the best
# solution, because a single start can converge to a poor local optimum.
kmdata <- kmeans(dataestand, centers = 3, nstart = 25)
kmdata
## K-means clustering with 3 clusters of sizes 66, 48, 64
##
## Cluster means:
## Class Alcohol Malic.acid Ash Alcalinity.of.ash Magnesium
## 1 -1.07368267 0.7769463 -0.3473270 0.2954645 -0.5737743 0.47744029
## 2 1.34311926 0.1935024 0.9396205 0.2572190 0.6035138 -0.08401245
## 3 0.09989582 -0.9463527 -0.3465344 -0.4976119 0.1390694 -0.42935096
## Total.phenols Flavanoids Nonflavanoid.phenols Proanthocyanins
## 1 0.9205062 0.98837353 -0.59318319 0.58610914
## 2 -0.9801784 -1.24214568 0.75513763 -0.73937973
## 3 -0.2141382 -0.08765095 0.04536694 -0.04989025
## Color.intensity
## 1 0.1432358
## 2 1.0148634
## 3 -0.9088595
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 3 3 1 3 3 3 3 1 3 1
## [75] 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 370.4979 265.3089 451.0259
## (between_SS / total_SS = 44.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualise the k-means clustering on two of the standardised variables.
clusterdata <- kmdata$cluster
# Store the assignments as a column of the plotted data so the aesthetics are
# resolved from the data frame itself rather than from a free-standing vector.
plot_data <- dataestand
plot_data$cluster <- factor(clusterdata)
# The outer parentheses both assign the plot and print it.
(dataplot <- ggplot(plot_data, aes(x = Color.intensity, y = Alcohol)) +
  geom_point(aes(color = cluster), size = 5) +
  geom_text(aes(label = cluster), size = 3) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(title = "K-means con k=3"))
# Árbol de decisiones
# Draw a random 70% of the rows as the training sample.
library(tree)
## Warning: package 'tree' was built under R version 4.3.2
library(dplyr)
# Fix the RNG seed so the train/test split (and therefore the fitted tree)
# is reproducible.
set.seed(123)
n_obs <- nrow(data3)
# floor() makes the sample size an explicit whole number; n_obs * 0.7 is
# generally fractional and would otherwise be truncated implicitly.
train <- sample(seq_len(n_obs), size = floor(n_obs * 0.7), replace = FALSE)
# Fit a regression tree (Alcohol is a numeric response) on the training rows.
# The response is named as a bare column so that every term of the formula is
# resolved inside `data`, instead of reaching out to the global `data3`.
data3.tree <- tree(Alcohol ~ Color.intensity, data = data3, subset = train)
# Plot the tree and label its splits.
plot(data3.tree)
text(data3.tree, pretty = 0)
# Print the nodes of the fitted tree.
data3.tree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 124 84.090 13.00
## 2) Color.intensity < 4.15 52 19.340 12.37
## 4) Color.intensity < 3.285 34 7.633 12.13 *
## 5) Color.intensity > 3.285 18 6.083 12.82 *
## 3) Color.intensity > 4.15 72 29.450 13.45
## 6) Color.intensity < 5.045 18 4.117 13.06 *
## 7) Color.intensity > 5.045 54 21.560 13.58
## 14) Color.intensity < 7 31 11.440 13.76
## 28) Color.intensity < 5.265 7 1.239 14.12 *
## 29) Color.intensity > 5.265 24 9.067 13.66 *
## 15) Color.intensity > 7 23 7.751 13.34 *
# PREDICTIONS
# The test rows are those that were not drawn into the training sample.
test <- setdiff(seq_len(nrow(data3)), train)
tree.pred <- predict(data3.tree, data3[test, ])
summary(tree.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.13 12.30 13.05 12.99 13.34 13.66
# Predicted and observed Alcohol are both continuous, so summarise the
# held-out error numerically: root mean squared error and mean absolute error.
test_error <- data3$Alcohol[test] - tree.pred
test_metrics <- c(
  rmse = sqrt(mean(test_error^2)),
  mae = mean(abs(test_error))
)
test_metrics
# Cross-tabulation of predicted vs. observed values (kept for reference; with
# continuous values nearly every observed value gets its own column).
result_table <- with(data3[test, ], table(tree.pred, Alcohol))
print(result_table)
## Alcohol
## tree.pred 11.62 11.66 11.81 11.84 11.87 12 12.04 12.08 12.16 12.2
## 12.1308823529412 1 0 1 1 1 1 1 2 1 0
## 12.8222222222222 0 1 0 0 0 0 0 1 0 0
## 13.055 0 0 0 0 0 0 0 0 0 0
## 13.3408695652174 0 0 0 0 0 0 0 0 0 0
## 13.6604166666667 0 0 0 0 0 0 0 0 0 1
## Alcohol
## tree.pred 12.22 12.34 12.36 12.37 12.47 12.51 12.64 12.7 12.77 12.81
## 12.1308823529412 1 1 0 0 1 1 0 1 0 0
## 12.8222222222222 0 0 0 0 0 0 0 0 0 0
## 13.055 0 0 0 1 0 0 0 0 0 0
## 13.3408695652174 0 0 1 0 0 0 0 0 1 0
## 13.6604166666667 0 0 0 0 0 1 1 0 0 1
## Alcohol
## tree.pred 12.87 12.88 13.05 13.16 13.17 13.2 13.23 13.27 13.28 13.3
## 12.1308823529412 0 0 0 0 0 0 0 0 0 0
## 12.8222222222222 0 0 0 1 0 0 0 0 0 1
## 13.055 0 0 1 0 0 1 0 0 1 0
## 13.3408695652174 1 0 0 0 1 0 1 1 0 0
## 13.6604166666667 0 1 0 1 0 0 0 0 0 0
## Alcohol
## tree.pred 13.39 13.4 13.49 13.56 13.58 13.62 13.67 13.69 13.73 13.82
## 12.1308823529412 0 0 0 0 0 0 0 0 0 0
## 12.8222222222222 0 0 1 0 0 0 1 0 0 0
## 13.055 1 0 0 0 0 1 0 0 0 0
## 13.3408695652174 0 1 0 0 1 0 0 0 0 1
## 13.6604166666667 0 0 0 1 0 0 0 1 2 0
## Alcohol
## tree.pred 13.83 13.84 13.87 14.12 14.13 14.19 14.22 14.38
## 12.1308823529412 0 0 0 0 0 0 0 0
## 12.8222222222222 0 0 0 0 0 0 0 0
## 13.055 0 0 1 1 0 0 0 1
## 13.3408695652174 0 1 0 0 1 1 0 1
## 13.6604166666667 1 0 0 0 0 0 1 0