# Download the red-wine quality dataset. The download is skipped when a local
# copy already exists, so re-running the script does not require network access.
wine_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine_file <- "winequality-red.csv"
if (!file.exists(wine_file)) {
  download.file(wine_url, wine_file)
}
# Read the dataset; the file uses ";" as the field separator
wine <- read.csv(wine_file, sep = ";")
# Show the first rows of the dataset
print(head(wine))
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Show the structure of the dataset. str() prints its summary itself and
# returns NULL invisibly, so wrapping it in print() only adds a stray "NULL".
str(wine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Cargar las librerías necesarias
library(class)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
# Print the first six rows again as a sanity check that the data is loaded
print(head(x = wine, n = 6L))
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Derive a two-class label from the numeric score:
# "bueno" when quality is 6 or higher, "malo" otherwise, stored as a factor
wine[["quality_label"]] <- factor(
  ifelse(wine[["quality"]] >= 6, "bueno", "malo")
)
# Print how many wines fall into each class
print(table(wine[["quality_label"]]))
##
## bueno malo
## 855 744
# Keep every column except the raw score and the derived label as predictors
predictors <- wine[, setdiff(names(wine), c("quality", "quality_label"))]
# Rescale a numeric vector to the [0, 1] interval (min-max normalization).
#
# x:     numeric vector to rescale.
# na.rm: if TRUE (default), missing values are ignored when computing the
#        range and remain NA in the result; if FALSE, any NA makes the whole
#        result NA.
#
# Returns a numeric vector of the same length as x. A vector whose values are
# all equal maps to 0 instead of the NaN that a 0 / 0 division would produce.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoid the warnings min()/max() raise on empty input
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Zero range (constant vector): every non-missing value sits at the minimum
  if (!is.na(span) && span == 0) {
    return(as.numeric(x - rng[1]))
  }
  (x - rng[1]) / span
}
# Apply the min-max scaling to every predictor column
# NOTE(review): the scaling range is computed on the full dataset, before the
# train/test split below — confirm this is acceptable for the evaluation
predictors_norm <- as.data.frame(Map(normalize, predictors))
# Stratified 70/30 split on the class label; the seed makes it reproducible
set.seed(123)
trainIndex <- createDataPartition(
  y = wine$quality_label,
  p = 0.7,
  list = FALSE,
  times = 1
)
# Labels for each partition
trainLabels <- wine$quality_label[trainIndex]
testLabels <- wine$quality_label[-trainIndex]
# Scaled predictors for each partition
trainData <- predictors_norm[trainIndex, ]
testData <- predictors_norm[-trainIndex, ]
# Classify the test rows with k-nearest neighbours, using k = 5
k <- 5
predicted_labels <- knn(trainData, testData, trainLabels, k = k)
# Compare the predictions against the true test labels and print the summary
conf_matrix <- confusionMatrix(
  data = predicted_labels,
  reference = testLabels
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bueno malo
## bueno 190 77
## malo 66 146
##
## Accuracy : 0.7015
## 95% CI : (0.6583, 0.7421)
## No Information Rate : 0.5344
## P-Value [Acc > NIR] : 6.301e-14
##
## Kappa : 0.3982
##
## Mcnemar's Test P-Value : 0.403
##
## Sensitivity : 0.7422
## Specificity : 0.6547
## Pos Pred Value : 0.7116
## Neg Pred Value : 0.6887
## Prevalence : 0.5344
## Detection Rate : 0.3967
## Detection Prevalence : 0.5574
## Balanced Accuracy : 0.6984
##
## 'Positive' Class : bueno
##
# Plot the distribution of the raw quality scores as a histogram
p <- ggplot(data = wine, mapping = aes(x = quality)) +
  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
  labs(title = "Distribución de la Calidad del Vino") +
  theme_minimal()
print(p)
