## Homework 5: Random Forests
# load packages: dplyr supplies glimpse(); randomForest supplies
# randomForest() and importance(). Without these calls the script stops with
# "could not find function" when run in a fresh session.
library(dplyr)
library(randomForest)

# load data: the white-wine quality table is fetched from the UCI repository;
# the file is semicolon-delimited, hence sep = ";"
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine_data <- read.csv(data_url, sep = ";")
# inspect the data
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…
# count the NA entries in each column (all zeros means nothing is missing)
vapply(wine_data, function(column) sum(is.na(column)), numeric(1))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
# recode quality from integer scores to a categorical variable, so the forest
# fitted on it is a classifier
wine_data[["quality"]] <- factor(wine_data[["quality"]])
# confirm the new column type
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…
# fix the RNG seed so that the fitted forest is reproducible
set.seed(123)
# fit a forest predicting quality from every other column;
# importance = TRUE makes the fit also assess predictor importance
rf_model <- randomForest(
  quality ~ .,
  data = wine_data,
  importance = TRUE
)
# show the fitted model summary (OOB error rate and confusion matrix)
print(rf_model)
##
## Call:
## randomForest(formula = quality ~ ., data = wine_data, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 28.38%
## Confusion matrix:
## 3 4 5 6 7 8 9 class.error
## 3 0 0 8 12 0 0 0 1.0000000
## 4 0 39 74 48 2 0 0 0.7607362
## 5 0 6 1041 400 10 0 0 0.2855182
## 6 0 3 254 1830 108 3 0 0.1674249
## 7 0 0 13 343 519 5 0 0.4102273
## 8 0 0 1 50 45 79 0 0.5485714
## 9 0 0 0 3 2 0 0 1.0000000
# predict the quality with out-of-bag (OOB) predictions: predict() without
# newdata returns, for each wine, the vote of only those trees that did not
# see that wine during training. Predicting on wine_data itself scores the
# forest on its own training rows and reports a meaningless accuracy of 1.
predictions <- predict(rf_model)
# confusion matrix (the transpose of the OOB matrix shown by print(rf_model),
# because rows here are predictions and columns are the actual classes)
confusion_matrix <- table(Predicted = predictions, Actual = wine_data$quality)
confusion_matrix
## Actual
## Predicted 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 39 6 3 0 0 0
## 5 8 74 1041 254 13 1 0
## 6 12 48 400 1830 343 50 3
## 7 0 2 10 108 519 45 2
## 8 0 0 0 3 5 79 0
## 9 0 0 0 0 0 0 0
# calculate accuracy: correctly classified wines (the diagonal) over all
# wines; this is 1 minus the OOB error rate
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.7162107
# variable-importance scores of the fitted forest: one column per quality
# class plus the MeanDecreaseAccuracy and MeanDecreaseGini summaries.
# The result is stored under the same name as the function, so the call is
# namespace-qualified to make clear which object is being invoked.
importance <- randomForest::importance(rf_model)
importance
## 3 4 5 6 7 8
## fixed.acidity -1.5500691 19.47876 51.95281 51.58469 46.68156 40.92930
## volatile.acidity -1.4099276 36.11990 77.65881 72.28454 74.75365 56.13492
## citric.acid 1.0825740 29.07191 47.37555 58.71825 53.80246 39.24552
## residual.sugar 0.1231600 20.25384 46.86846 55.40574 50.36578 38.28303
## chlorides -0.7754203 25.43895 53.65610 44.35629 56.89210 41.55486
## free.sulfur.dioxide 3.1763284 43.07423 55.80675 63.86693 56.25406 45.39101
## total.sulfur.dioxide 1.3767097 25.38358 48.03239 44.78567 55.32534 39.37846
## density -1.7347369 20.79561 41.50219 52.80461 53.94913 42.69886
## pH -0.5710255 17.15741 56.25886 56.65801 63.66741 41.37330
## sulphates -1.1241078 22.06626 52.48398 52.39505 54.36232 42.78307
## alcohol -3.9525721 29.97031 88.39088 57.65117 70.45969 70.37398
## 9 MeanDecreaseAccuracy MeanDecreaseGini
## fixed.acidity 1.4170505 79.37281 248.7857
## volatile.acidity 1.0010015 98.01926 328.3941
## citric.acid -0.5775428 73.91267 267.0190
## residual.sugar 0.2773714 68.72509 293.2068
## chlorides 2.0080483 65.02855 283.2314
## free.sulfur.dioxide -1.2144654 92.44620 312.7140
## total.sulfur.dioxide -1.0010015 64.93598 302.6403
## density 0.6745065 67.70083 346.8309
## pH 1.3020693 83.95930 283.9198
## sulphates -1.0010015 78.04405 265.7748
## alcohol 1.3894250 101.45836 374.2046