Problema 2: Calidad del vino

# Fetch the UCI red-wine quality dataset. The file is downloaded only when no
# local copy exists, so repeated runs (or offline runs with a cached file) do
# not hit the network again.
wine_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine_file <- "winequality-red.csv"
if (!file.exists(wine_file)) {
  # download.file() returns 0 on success and a non-zero code on failure.
  status <- download.file(wine_url, wine_file)
  if (status != 0) {
    # Remove any partial file so the next run retries the download.
    unlink(wine_file)
    stop("Failed to download ", wine_url, call. = FALSE)
  }
}

# Read the dataset; the file uses ';' as the field separator.
wine <- read.csv(wine_file, sep = ";")

# Preview the first rows of the dataset.
print(head(wine))

##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

# Show the structure of the dataset. str() prints its report itself and
# returns NULL, so it must not be wrapped in print() (that would emit an
# extra "NULL" line after the report).
str(wine)

## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

# Cargar las librerías necesarias
library(class)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(ggplot2)

# Show the first rows again to verify the data is loaded (same call as the
# earlier head() preview).
print(head(wine))

##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

# Derive a binary target from the numeric score: scores below 6 become
# "malo", scores of 6 or more become "bueno". The result is stored as a
# factor (levels in alphabetical order: "bueno", "malo").
wine$quality_label <- factor(
  ifelse(wine$quality < 6, "malo", "bueno")
)

# Print how many wines fall into each class.
class_counts <- table(wine$quality_label)
print(class_counts)

## 
## bueno  malo 
##   855   744

# Build the predictor table: every column of `wine` except the raw score and
# the derived binary label.
predictors <- wine[, setdiff(names(wine), c("quality", "quality_label"))]

# Min-max normalisation: rescale a numeric vector linearly to [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE (default), missing values are ignored when computing the
#          range and stay NA in the result. If FALSE, any NA in `x` makes the
#          whole result NA.
#
# Returns a numeric vector of the same length as `x`. A vector with zero
# range (all non-missing values equal) maps to 0 instead of NaN from 0/0.
# Empty or all-NA input is returned unchanged (as numeric) without warnings.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoids min()/max() warnings on empty input.
  if (!any(!is.na(x))) {
    return(as.numeric(x))
  }
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  # Constant vector: dividing by a zero range would give NaN, return zeros.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
# Apply normalize() to every predictor column and rebuild a data frame.
# NOTE(review): the scaling here uses the min/max of ALL rows, before the
# train/test split below, so the test rows influence the scaling of the
# training data (data leakage). Consider computing the ranges on the training
# rows only and applying them to both sets.
predictors_norm <- as.data.frame(lapply(predictors, normalize))

# Split into training and test sets.
# Fix the RNG seed so the random partition is reproducible.
set.seed(123)
# Draw a single partition (times = 1) holding 70% of the rows for training;
# list = FALSE returns the row indices as a matrix instead of a list.
trainIndex <- createDataPartition(wine$quality_label, p = .7, list = FALSE, times = 1)
# Rows selected by trainIndex form the training set; all others the test set.
trainData <- predictors_norm[trainIndex, ]
testData  <- predictors_norm[-trainIndex, ]
# Class labels aligned row-for-row with trainData / testData.
trainLabels <- wine$quality_label[trainIndex]
testLabels  <- wine$quality_label[-trainIndex]

# Classify each test row with k-nearest neighbours (k = 5), using the
# training rows and their labels as the reference set.
k <- 5
predicted_labels <- knn(
  train = trainData,
  test = testData,
  cl = trainLabels,
  k = k
)

# Compare the predictions against the true test labels and print the
# confusion matrix with its summary statistics.
conf_matrix <- confusionMatrix(
  data = predicted_labels,
  reference = testLabels
)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bueno malo
##      bueno   190   77
##      malo     66  146
##                                           
##                Accuracy : 0.7015          
##                  95% CI : (0.6583, 0.7421)
##     No Information Rate : 0.5344          
##     P-Value [Acc > NIR] : 6.301e-14       
##                                           
##                   Kappa : 0.3982          
##                                           
##  Mcnemar's Test P-Value : 0.403           
##                                           
##             Sensitivity : 0.7422          
##             Specificity : 0.6547          
##          Pos Pred Value : 0.7116          
##          Neg Pred Value : 0.6887          
##              Prevalence : 0.5344          
##          Detection Rate : 0.3967          
##    Detection Prevalence : 0.5574          
##       Balanced Accuracy : 0.6984          
##                                           
##        'Positive' Class : bueno           
##

# Plot the distribution of the raw quality scores as a histogram.
p <- ggplot(data = wine, mapping = aes(x = quality)) +
  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
  ggtitle("Distribución de la Calidad del Vino") +
  theme_minimal()
print(p)

Problema 2: Calidad del vino

Gabriela I. Padilla Maymó

2025-03-21