The UCI Wine Quality dataset (red wine subset) is a tabular dataset of 1,599 red wine samples, each described by 11 physicochemical attributes and an integer quality score. The goal is to predict the quality score from the physicochemical attributes.
# Location of the red-wine file of the UCI Wine Quality data.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# The file is semicolon-delimited and its first line holds the column names.
wine <- read.csv(file = url, header = TRUE, sep = ";")

# Preview the first six rows.
head(wine, n = 6L)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Show the type and the first few values of every column in `wine`.
str(wine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Report the number of rows and columns of `wine`.
dim(wine)
## [1] 1599 12
# Histogram of the quality scores (the response variable).
with(
  wine,
  hist(
    quality,
    col = "lightblue",
    border = "black",
    xlab = "Quality Score",
    main = "Distribution of Wine Quality"
  )
)
# Scatter plot of quality (y) against alcohol content (x).
plot(
  x = wine$alcohol,
  y = wine$quality,
  main = "Alcohol vs Wine Quality",
  xlab = "Alcohol",
  ylab = "Quality"
)
# Pairwise correlations between all columns of `wine`.
cor_matrix <- cor(wine)

# Draw the correlation matrix as a heatmap.
# symm = TRUE treats the (square, symmetric) matrix symmetrically so rows and
# columns share one ordering; scale = "none" stops heatmap() from centring and
# scaling each row (its default when symm is FALSE), which would otherwise
# make the colours stop reflecting the actual correlation values.
heatmap(cor_matrix, symm = TRUE, scale = "none")
# One box of alcohol content per distinct quality score.
boxplot(
  formula = alcohol ~ quality,
  data = wine,
  xlab = "Quality",
  ylab = "Alcohol",
  main = "Alcohol Content by Wine Quality"
)
# Kernel density estimate of alcohol content, plotted with an explicit title.
alcohol_density <- density(wine$alcohol)
plot(alcohol_density, main = "Density of Alcohol Content")
# Schema check: the columns the rest of the analysis relies on.
expected_cols <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol", "quality"
)
# TRUE when no expected column is absent from `wine`.
length(setdiff(expected_cols, colnames(wine))) == 0
## [1] TRUE
# Number of missing values in each column.
colSums(is.na(wine))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
This is a supervised regression problem where wine quality is predicted based on physicochemical features.
# Reproducible 80/20 train/test split.
set.seed(123)

# seq_len() is safe for zero-row data (1:0 would yield c(1, 0)), and floor()
# makes the training-set size an explicit whole number instead of relying on
# sample() to truncate the fractional 0.8 * nrow(wine).
index <- sample(seq_len(nrow(wine)), size = floor(0.8 * nrow(wine)))

# Rows drawn by `index` form the training set; all remaining rows are held out.
train <- wine[index, ]
test <- wine[-index, ]
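Decision tree regression is chosen for its interpretability and because it fits easily into a reproducible pipeline: it needs no feature scaling and, given a fixed random seed, produces the same results on every run.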
# Fit a regression tree (method = "anova") on the training rows, using every
# other column as a predictor of quality.
library(rpart)
model <- rpart(formula = quality ~ ., data = train, method = "anova")

# Predict quality for the held-out rows and compute the root-mean-squared
# error between predictions and the observed scores.
pred <- predict(object = model, newdata = test)
rmse <- sqrt(mean((test$quality - pred)^2))
rmse
## [1] 0.6290354
# Build a one-row example from the first test observation. The response is
# dropped by name rather than by column position, so the code keeps working
# if columns are added or reordered; drop = FALSE guarantees a data frame.
new_data <- test[1, setdiff(names(test), "quality"), drop = FALSE]

# Predict the quality score for that single observation.
predict(model, new_data)
## 3
## 5.520124