## Homework 5: Random Forests

Loading the Data

# Fetch the white-wine quality table from the UCI repository.
# The file is semicolon-delimited, hence the explicit separator.
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine_data <- read.csv(file = data_url, sep = ";")

# Quick structural overview: one line per column with its type and first values.
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity        <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity     <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid          <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar       <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides            <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide  <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density              <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH                   <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates            <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol              <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality              <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…

Data Preprocessing (2 points)

# Count the NA entries in each column (a named vector, one element per column).
vapply(wine_data, function(column) sum(is.na(column)), numeric(1))
##        fixed.acidity     volatile.acidity          citric.acid 
##                    0                    0                    0 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##                    0                    0                    0 
## total.sulfur.dioxide              density                   pH 
##                    0                    0                    0 
##            sulphates              alcohol              quality 
##                    0                    0                    0
# Recode the integer quality score as a factor so it is treated as a class
# label rather than as a number.
wine_data[["quality"]] <- as.factor(wine_data[["quality"]])

# Re-inspect the data to confirm that quality is now a factor column.
glimpse(wine_data)
## Rows: 4,898
## Columns: 12
## $ fixed.acidity        <dbl> 7.0, 6.3, 8.1, 7.2, 7.2, 8.1, 6.2, 7.0, 6.3, 8.1,…
## $ volatile.acidity     <dbl> 0.27, 0.30, 0.28, 0.23, 0.23, 0.28, 0.32, 0.27, 0…
## $ citric.acid          <dbl> 0.36, 0.34, 0.40, 0.32, 0.32, 0.40, 0.16, 0.36, 0…
## $ residual.sugar       <dbl> 20.70, 1.60, 6.90, 8.50, 8.50, 6.90, 7.00, 20.70,…
## $ chlorides            <dbl> 0.045, 0.049, 0.050, 0.058, 0.058, 0.050, 0.045, …
## $ free.sulfur.dioxide  <dbl> 45, 14, 30, 47, 47, 30, 30, 45, 14, 28, 11, 17, 1…
## $ total.sulfur.dioxide <dbl> 170, 132, 97, 186, 186, 97, 136, 170, 132, 129, 6…
## $ density              <dbl> 1.0010, 0.9940, 0.9951, 0.9956, 0.9956, 0.9951, 0…
## $ pH                   <dbl> 3.00, 3.30, 3.26, 3.19, 3.19, 3.26, 3.18, 3.00, 3…
## $ sulphates            <dbl> 0.45, 0.49, 0.44, 0.40, 0.40, 0.44, 0.47, 0.45, 0…
## $ alcohol              <dbl> 8.8, 9.5, 10.1, 9.9, 9.9, 10.1, 9.6, 8.8, 9.5, 11…
## $ quality              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6…

Building the Random Forest Model (2 points)

# set seed
# Fixing the RNG state before fitting makes the randomly grown forest (and the
# numbers printed below) repeatable from run to run.
set.seed(123)

# build the random forest model
# `quality ~ .` uses quality as the response and every other column of
# wine_data as a predictor. quality was converted to a factor above, so the
# fit is a classification forest (see "Type of random forest" in the summary).
# The number of trees and variables per split are not set here, so the values
# shown in the summary are the package defaults.
# importance = TRUE asks the forest to record variable-importance measures,
# which are read back later with importance(rf_model).
rf_model <- randomForest(quality ~ ., data = wine_data, importance = TRUE)

# print the model
# The summary reports the out-of-bag (OOB) error estimate and the OOB
# confusion matrix (rows = actual class, columns = predicted class).
print(rf_model)
## 
## Call:
##  randomForest(formula = quality ~ ., data = wine_data, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 28.38%
## Confusion matrix:
##   3  4    5    6   7  8 9 class.error
## 3 0  0    8   12   0  0 0   1.0000000
## 4 0 39   74   48   2  0 0   0.7607362
## 5 0  6 1041  400  10  0 0   0.2855182
## 6 0  3  254 1830 108  3 0   0.1674249
## 7 0  0   13  343 519  5 0   0.4102273
## 8 0  0    1   50  45 79 0   0.5485714
## 9 0  0    0    3   2  0 0   1.0000000

Model Evaluation (3 points)

# Evaluate the model with out-of-bag (OOB) predictions.
# Re-predicting the rows the forest was trained on, i.e.
# predict(rf_model, wine_data), reproduces the training labels exactly
# (accuracy = 1) and says nothing about how the model generalises.
# Calling predict() without new data returns the OOB prediction for each row,
# i.e. the vote of only those trees that did not see that row during training.
predictions <- predict(rf_model)

# confusion matrix (rows = OOB-predicted class, columns = actual class)
confusion_matrix <- table(Predicted = predictions, Actual = wine_data$quality)
confusion_matrix
##          Actual
## Predicted    3    4    5    6    7    8    9
##         3    0    0    0    0    0    0    0
##         4    0   39    6    3    0    0    0
##         5    8   74 1041  254   13    1    0
##         6   12   48  400 1830  343   50    3
##         7    0    2   10  108  519   45    2
##         8    0    0    0    3    5   79    0
##         9    0    0    0    0    0    0    0
# calculate accuracy: share of rows on the diagonal (correctly classified).
# This is the complement of the 28.38% OOB error rate reported by the model.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
accuracy
## [1] 0.7162107
# Extract the variable-importance table from the fitted forest: one row per
# predictor, with per-class columns plus the MeanDecreaseAccuracy and
# MeanDecreaseGini summaries.
# The result reuses the name `importance`; later calls to importance() still
# work because R skips non-function objects when looking up a function.
importance <- randomForest::importance(rf_model)
importance
##                               3        4        5        6        7        8
## fixed.acidity        -1.5500691 19.47876 51.95281 51.58469 46.68156 40.92930
## volatile.acidity     -1.4099276 36.11990 77.65881 72.28454 74.75365 56.13492
## citric.acid           1.0825740 29.07191 47.37555 58.71825 53.80246 39.24552
## residual.sugar        0.1231600 20.25384 46.86846 55.40574 50.36578 38.28303
## chlorides            -0.7754203 25.43895 53.65610 44.35629 56.89210 41.55486
## free.sulfur.dioxide   3.1763284 43.07423 55.80675 63.86693 56.25406 45.39101
## total.sulfur.dioxide  1.3767097 25.38358 48.03239 44.78567 55.32534 39.37846
## density              -1.7347369 20.79561 41.50219 52.80461 53.94913 42.69886
## pH                   -0.5710255 17.15741 56.25886 56.65801 63.66741 41.37330
## sulphates            -1.1241078 22.06626 52.48398 52.39505 54.36232 42.78307
## alcohol              -3.9525721 29.97031 88.39088 57.65117 70.45969 70.37398
##                               9 MeanDecreaseAccuracy MeanDecreaseGini
## fixed.acidity         1.4170505             79.37281         248.7857
## volatile.acidity      1.0010015             98.01926         328.3941
## citric.acid          -0.5775428             73.91267         267.0190
## residual.sugar        0.2773714             68.72509         293.2068
## chlorides             2.0080483             65.02855         283.2314
## free.sulfur.dioxide  -1.2144654             92.44620         312.7140
## total.sulfur.dioxide -1.0010015             64.93598         302.6403
## density               0.6745065             67.70083         346.8309
## pH                    1.3020693             83.95930         283.9198
## sulphates            -1.0010015             78.04405         265.7748
## alcohol               1.3894250            101.45836         374.2046