library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(class)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3

Red Wine

redwine = read.csv("winequality-red.csv", sep=';', header=T)
names(redwine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"

Add high-quality feature

# optional binary indicator for high quality; left unused for now
#redwine$high = ifelse(redwine$quality > 6, 1, 0)

Train/Test Split

set.seed(1)
index = sample(1:nrow(redwine), 0.8 * nrow(redwine))
train = redwine[index, ]
test = redwine[-index, ]

Analysis

A quick exploratory look: boxplots of each predictor against quality.

for (name in setdiff(names(redwine), c("quality"))) {
  #par(mfrow = c(1,2))
  formula = as.formula(paste(name, "~ quality"))
  boxplot(formula, data = redwine)
}

Most predictors show some separation across quality levels, but the differences are often small. A few, like alcohol, separate strongly.
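
To put a rough number on that impression, we can compare the spread of each predictor's per-quality means against its overall standard deviation; a quick sketch (not part of the original analysis; larger values mean stronger separation):

# spread of per-quality group means, in SD units, for each predictor
for (name in setdiff(names(redwine), "quality")) {
  group.means = tapply(redwine[[name]], redwine$quality, mean)
  cat(name, ":", round(diff(range(group.means)) / sd(redwine[[name]]), 2), "\n")
}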

Regression

Test with all data

red.model = lm(quality ~ ., data = train)
summary(red.model)
## 
## Call:
## lm(formula = quality ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.52213 -0.35627 -0.04738  0.44215  2.00517 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.342e+01  2.323e+01   1.008 0.313441    
## fixed.acidity         2.725e-02  2.862e-02   0.952 0.341210    
## volatile.acidity     -1.032e+00  1.353e-01  -7.625 4.77e-14 ***
## citric.acid          -1.857e-01  1.626e-01  -1.142 0.253523    
## residual.sugar        2.223e-02  1.623e-02   1.370 0.170964    
## chlorides            -1.456e+00  4.849e-01  -3.002 0.002731 ** 
## free.sulfur.dioxide   4.483e-03  2.452e-03   1.828 0.067725 .  
## total.sulfur.dioxide -3.086e-03  8.191e-04  -3.767 0.000173 ***
## density              -1.976e+01  2.372e+01  -0.833 0.404986    
## pH                   -3.457e-01  2.165e-01  -1.597 0.110515    
## sulphates             9.192e-01  1.257e-01   7.311 4.69e-13 ***
## alcohol               2.834e-01  2.909e-02   9.741  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6474 on 1267 degrees of freedom
## Multiple R-squared:  0.3536, Adjusted R-squared:  0.348 
## F-statistic: 63.02 on 11 and 1267 DF,  p-value: < 2.2e-16
plot(red.model)

pred = predict(red.model, newdata = test, type='response')
pred.round = round(pred)
actual = test$quality
RMSE = sqrt(mean((pred - actual)^2))
RMSE
## [1] 0.6530574
red.lm.table = table(Actual = actual, Predicted = pred.round)
red.lm.table
##       Predicted
## Actual  4  5  6  7
##      3  1  2  1  0
##      4  0  6  5  0
##      5  0 97 28  0
##      6  0 47 84  4
##      7  0  2 28 12
##      8  0  0  2  1
exact = sum(pred.round == actual) / nrow(test)
within.1 = sum(abs(pred.round - actual) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.603125
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9625

Test with selected polynomial terms

red.model.2 = lm(quality ~ poly(fixed.acidity,2) + volatile.acidity
               + residual.sugar + chlorides +
                 total.sulfur.dioxide + density +
                 poly(sulphates,2) + poly(alcohol,3), data = train)
summary(red.model.2)
## 
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + volatile.acidity + 
##     residual.sugar + chlorides + total.sulfur.dioxide + density + 
##     poly(sulphates, 2) + poly(alcohol, 3), data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40440 -0.37319 -0.05082  0.44524  2.01917 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.273e+01  1.961e+01   3.709 0.000217 ***
## poly(fixed.acidity, 2)1  3.593e+00  1.045e+00   3.437 0.000606 ***
## poly(fixed.acidity, 2)2 -1.524e+00  6.884e-01  -2.214 0.027005 *  
## volatile.acidity        -8.275e-01  1.149e-01  -7.201 1.02e-12 ***
## residual.sugar           3.795e-02  1.504e-02   2.522 0.011786 *  
## chlorides               -1.531e+00  4.581e-01  -3.342 0.000855 ***
## total.sulfur.dioxide    -1.770e-03  5.858e-04  -3.021 0.002571 ** 
## density                 -6.676e+01  1.971e+01  -3.388 0.000726 ***
## poly(sulphates, 2)1      6.297e+00  7.511e-01   8.383  < 2e-16 ***
## poly(sulphates, 2)2     -3.969e+00  6.827e-01  -5.813 7.74e-09 ***
## poly(alcohol, 3)1        8.908e+00  9.742e-01   9.145  < 2e-16 ***
## poly(alcohol, 3)2       -4.895e-01  6.658e-01  -0.735 0.462380    
## poly(alcohol, 3)3       -1.518e+00  6.594e-01  -2.301 0.021529 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6376 on 1266 degrees of freedom
## Multiple R-squared:  0.3735, Adjusted R-squared:  0.3676 
## F-statistic:  62.9 on 12 and 1266 DF,  p-value: < 2.2e-16
plot(red.model.2)

pred2 = predict(red.model.2, newdata = test, type='response')
pred2.round = round(pred2)
actual = test$quality
RMSE2 = sqrt(mean((pred2 - actual)^2))
RMSE2
## [1] 0.6384957
red.lm.table.2 = table(Actual = actual, Predicted = pred2.round)
red.lm.table.2
##       Predicted
## Actual  4  5  6  7
##      3  1  2  1  0
##      4  2  5  4  0
##      5  0 98 27  0
##      6  0 47 81  7
##      7  0  3 22 17
##      8  0  0  2  1
lm.2.results = data.frame(actual = test$quality, predicted = pred2.round)
for (i in 3:8){
  rating.i = subset(lm.2.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 0.8181818 
## Error when rating =  5 : 0.216 
## Error when rating =  6 : 0.4 
## Error when rating =  7 : 0.5952381 
## Error when rating =  8 : 1
exact = sum(pred2.round == actual) / nrow(test)
within.1 = sum(abs(pred2.round - actual) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.61875
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9625

Trees

Regression Tree

red.tree = tree(quality ~ ., data = train)
summary(red.tree)
## 
## Regression tree:
## tree(formula = quality ~ ., data = train)
## Variables actually used in tree construction:
## [1] "alcohol"              "sulphates"            "volatile.acidity"    
## [4] "total.sulfur.dioxide"
## Number of terminal nodes:  11 
## Residual mean deviance:  0.3927 = 497.9 / 1268 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.1900 -0.2933 -0.1903  0.0000  0.4043  1.9390
plot(red.tree)
text(red.tree, pretty = 0, cex = 0.7)

red.cv.tree = cv.tree(red.tree)
plot(red.cv.tree$size, red.cv.tree$dev, type='b')
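
If the CV curve favors a smaller tree, it can be pruned before predicting; a minimal sketch (the unpruned tree is kept for the predictions below):

# prune to the size with the lowest cross-validated deviance
best.size = red.cv.tree$size[which.min(red.cv.tree$dev)]
red.tree.pruned = prune.tree(red.tree, best = best.size)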

red.tree.pred = predict(red.tree, newdata = test)
red.tree.pred.r = round(red.tree.pred)
tree.RMSE = sqrt(mean((red.tree.pred - test$quality)^2))
tree.RMSE
## [1] 0.6929522
red.tree.table = table(Actual = test$quality, Predicted = red.tree.pred.r)
red.tree.table
##       Predicted
## Actual  4  5  6  7
##      3  2  1  1  0
##      4  0  6  5  0
##      5  0 96 29  0
##      6  0 57 68 10
##      7  0  2 25 15
##      8  0  0  2  1
exact = sum(red.tree.pred.r == test$quality) / nrow(test)
within.1 = sum(abs(red.tree.pred.r - test$quality) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.559375
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.965625
tree.results = data.frame(actual = test$quality, predicted = red.tree.pred.r)
for (i in 3:8){
  rating.i = subset(tree.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.232 
## Error when rating =  6 : 0.4962963 
## Error when rating =  7 : 0.6428571 
## Error when rating =  8 : 1

Random Forest

Default Trees

We still need to cross-validate and try different numbers of trees; a cheap check based on the out-of-bag error is sketched at the end of this subsection, and a fuller sweep follows in the next.

rf.red = randomForest(quality ~ ., data = train, importance=TRUE)
varImpPlot(rf.red)

rf.red.pred = predict(rf.red, newdata = test)
rf.red.pred.r = round(rf.red.pred)
rf.red.rmse = sqrt(mean((rf.red.pred - test$quality)^2))
rf.red.rmse
## [1] 0.5865801
red.rf.table = table(Actual = test$quality, Predicted = rf.red.pred.r)
red.rf.table
##       Predicted
## Actual   5   6   7
##      3   4   0   0
##      4   8   3   0
##      5 105  20   0
##      6  34  96   5
##      7   2  21  19
##      8   0   1   2
exact = sum(rf.red.pred.r == test$quality) / nrow(test)
within.1 = sum(abs(rf.red.pred.r - test$quality) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.6875
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.96875
rf.red.results = data.frame(actual = test$quality, predicted = rf.red.pred.r)
for (i in 3:8){
  rating.i = subset(rf.red.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.16 
## Error when rating =  6 : 0.2888889 
## Error when rating =  7 : 0.547619 
## Error when rating =  8 : 1
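
As the cheap check mentioned above: a regression forest records its out-of-bag MSE as trees are added, so plotting it shows how many trees are enough without refitting anything.

# OOB MSE per number of trees grown, stored by randomForest as $mse
plot(rf.red$mse, type = "l", xlab = "number of trees", ylab = "OOB MSE")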

Different values

# predefining variables
p = ncol(redwine) - 1
m = c(p, floor(p / 2), floor(sqrt(p)))
names(m) = c("p", "p/2", "sqrt(p)")
ntrees = seq(1, 150, by = 5)

# to store the errors
results <- data.frame()

for (j in 1:3) {
  for (i in seq_along(ntrees)) {
    model = randomForest(quality ~ ., data = train, mtry = m[j], 
                          ntree = ntrees[i], keep.forest = TRUE)
    
    preds = predict(model, newdata = test)
    mse = mean((preds - test$quality)^2)
    
    results = rbind(results, data.frame(
      ntree = ntrees[i],
      MSE = mse,
      mtry = m[j]
    ))
  }
}
plot(x = results[results$mtry == 11, ]$ntree, 
     y = results[results$mtry == 11, ]$MSE, 
     col='blue', type = 'l',
     xlab = 'number of trees', ylab = 'MSE',
     ylim = c(0.2,0.8))
lines(x = results[results$mtry == 5, ]$ntree, 
     y = results[results$mtry == 5, ]$MSE, 
     col='red', type = 'l')
lines(x = results[results$mtry == 3, ]$ntree, 
     y = results[results$mtry == 3, ]$MSE, 
     col='darkgreen', type = 'l')
legend("topright", legend = names(m),
       col = c("blue", "red", "darkgreen"), 
       lwd = 2, title = "mtry")

The choice of mtry makes little difference here, and past roughly 50 trees the test MSE stops improving.
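
To confirm that reading of the plot, the sweep results can be queried directly:

# (ntree, mtry) combination with the lowest test MSE in the sweep
results[which.min(results$MSE), ]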

rf.red.best = randomForest(quality ~ ., data = train, mtry = 4, 
                          ntree = 50, keep.forest = TRUE)
rf.red.pred.best = predict(rf.red.best, newdata = test)
rf.red.pred.best.r = round(rf.red.pred.best)
rf.red.rmse = sqrt(mean((rf.red.pred.best - test$quality)^2))
rf.red.rmse
## [1] 0.5896732
red.rf.table.2 = table(Actual = test$quality, Predicted = rf.red.pred.best.r)
red.rf.table.2
##       Predicted
## Actual   5   6   7
##      3   4   0   0
##      4   8   3   0
##      5 104  21   0
##      6  32  96   7
##      7   1  19  22
##      8   0   1   2
exact = sum(rf.red.pred.best.r == test$quality) / nrow(test)
within.1 = sum(abs(rf.red.pred.best.r - test$quality) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.69375
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.971875
rf.red.best.res = data.frame(actual = test$quality, predicted = rf.red.pred.best.r)
for (i in 3:8){
  rating.i = subset(rf.red.best.res, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.168 
## Error when rating =  6 : 0.2888889 
## Error when rating =  7 : 0.4761905 
## Error when rating =  8 : 1

KNN Classification

names(redwine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"
# excluded variables
excluded.var = c("quality")#, "citric.acid", "free.sulfur.dioxide", "pH")  

# set up variables 
x.train = train[, -which(names(train) %in% excluded.var)] 
y.train = train$quality 
x.test = test[, -which(names(test) %in% excluded.var)] 
y.test = test$quality
kval = c(1,2,3,5,10,15,20,25) 
for (val in kval){   
  set.seed(1)   
  knn.pred = knn(x.train, x.test, y.train, k = val)   
  cat("For k = ", val,":")   
  print(table(Actual = y.test, Predicted = knn.pred))   
  cat("Correct %:", mean(knn.pred == y.test), "\n")   
  cat("Correct % +/- 1:", mean(abs(as.integer(knn.pred) - as.integer(y.test)) == 1)
      + mean(knn.pred == y.test), "\n") 
}
## For k =  1 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  0  4  7  0  0
##      5  0  5 97 19  4  0
##      6  0  4 52 67 11  1
##      7  0  1  3 15 22  1
##      8  0  0  0  1  1  1
## Correct %: 0.584375 
## Correct % +/- 1: 0.703125 
## For k =  2 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  0  1  0
##      4  0  0  5  6  0  0
##      5  0  4 86 32  3  0
##      6  0  4 56 58 17  0
##      7  0  2  8 13 17  2
##      8  0  0  1  1  0  1
## Correct %: 0.50625 
## Correct % +/- 1: 0.68125 
## For k =  3 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  5  5  1  0
##      5  0  1 83 37  4  0
##      6  0  2 62 52 19  0
##      7  0  1  5 16 18  2
##      8  0  0  0  1  1  1
## Correct %: 0.48125 
## Correct % +/- 1: 0.684375 
## For k =  5 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  8  3  0  0
##      5  0  0 76 44  5  0
##      6  0  0 65 52 18  0
##      7  0  0  4 25 12  1
##      8  0  0  0  1  2  0
## Correct %: 0.4375 
## Correct % +/- 1: 0.6625 
## For k =  10 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  2  1  1  0
##      4  0  0  6  5  0  0
##      5  0  1 77 44  3  0
##      6  0  0 65 59 11  0
##      7  0  1  7 28  6  0
##      8  0  0  0  3  0  0
## Correct %: 0.44375 
## Correct % +/- 1: 0.6375 
## For k =  15 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 85 39  1  0
##      6  0  0 65 66  4  0
##      7  0  0 11 25  6  0
##      8  0  0  0  2  1  0
## Correct %: 0.490625 
## Correct % +/- 1: 0.65 
## For k =  20 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  2  2  0  0
##      4  0  0  7  4  0  0
##      5  0  0 88 36  1  0
##      6  0  0 63 69  3  0
##      7  0  0  9 29  4  0
##      8  0  0  1  1  1  0
## Correct %: 0.503125 
## Correct % +/- 1: 0.653125 
## For k =  25 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  1  1  2  0
##      4  0  0  8  3  0  0
##      5  0  0 90 35  0  0
##      6  0  0 63 69  3  0
##      7  0  0  8 32  2  0
##      8  0  0  1  1  1  0
## Correct %: 0.503125 
## Correct % +/- 1: 0.65
set.seed(1)
knn.pred = knn(x.train, x.test, y.train, k = 1)  # k = 1 scored highest above
red.knn.table = table(Actual = y.test, Predicted = knn.pred)
red.knn.table
##       Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  0  4  7  0  0
##      5  0  5 97 19  4  0
##      6  0  4 52 67 11  1
##      7  0  1  3 15 22  1
##      8  0  0  0  1  1  1

KNN Standardized

KNN is distance-based, so predictors on larger scales dominate the distance calculation; standardizing puts them on a comparable footing.

# make a standardized version: scale all predictors, leave quality untouched
redwine.standard = redwine
redwine.standard[ , -which(names(redwine) == "quality")] = scale(redwine[ , -which(names(redwine) == "quality")])

train.knn = redwine.standard[index, ]
test.knn = redwine.standard[-index, ]
x.train = train.knn[, -which(names(train.knn) %in% excluded.var)] 
y.train = train.knn$quality
x.test = test.knn[, -which(names(test.knn) %in% excluded.var)] 
y.test = test.knn$quality
kval = c(1,2,3,5,10,15,20,25,30,40,50) 
for (val in kval){   
  set.seed(1)   
  knn.pred = knn(x.train, x.test, y.train, k = val)   
  cat("For k = ", val,":")   
  print(table(Actual = y.test, Predicted = knn.pred))   
  correct = mean(knn.pred == y.test)
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)
  
  cat("Correct %:", correct, "\n")   
  cat("Correct % +/- 1:", correct + pm1, "\n") 
}
## For k =  1 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  1  6  4  0  0
##      5  0  2 94 26  1  2
##      6  0  2 35 85 11  2
##      7  0  0  6  8 26  2
##      8  0  0  0  1  1  1
## Correct %: 0.646875 
## Correct % +/- 1: 0.803125 
## For k =  2 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  1  1  2  0  0
##      4  0  1  5  5  0  0
##      5  0  0 94 27  3  1
##      6  0  2 39 79 14  1
##      7  0  0  7  8 25  2
##      8  0  0  0  1  1  1
## Correct %: 0.625 
## Correct % +/- 1: 0.7875 
## For k =  3 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  1 92 29  2  1
##      6  0  0 42 74 16  3
##      7  0  0  5 12 24  1
##      8  0  0  0  1  2  0
## Correct %: 0.59375 
## Correct % +/- 1: 0.765625 
## For k =  5 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  6  4  1  0
##      5  0  0 90 32  3  0
##      6  0  0 45 72 18  0
##      7  0  0  4 18 20  0
##      8  0  0  0  1  2  0
## Correct %: 0.56875 
## Correct % +/- 1: 0.75 
## For k =  10 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  0  6  5  0  0
##      5  0  0 94 28  3  0
##      6  0  0 48 73 14  0
##      7  0  0  5 17 20  0
##      8  0  0  0  1  2  0
## Correct %: 0.584375 
## Correct % +/- 1: 0.74375 
## For k =  15 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  6  5  0  0
##      5  0  0 96 27  2  0
##      6  0  0 50 72 13  0
##      7  0  0  3 22 17  0
##      8  0  0  0  1  2  0
## Correct %: 0.578125 
## Correct % +/- 1: 0.725 
## For k =  20 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 97 26  2  0
##      6  0  0 49 74 12  0
##      7  0  0  2 24 16  0
##      8  0  0  0  2  1  0
## Correct %: 0.584375 
## Correct % +/- 1: 0.728125 
## For k =  25 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 96 27  2  0
##      6  0  0 52 74  9  0
##      7  0  0  3 20 19  0
##      8  0  0  0  2  1  0
## Correct %: 0.590625 
## Correct % +/- 1: 0.728125 
## For k =  30 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  8  3  0  0
##      5  0  0 94 29  2  0
##      6  0  0 52 74  9  0
##      7  0  0  2 23 17  0
##      8  0  0  0  2  1  0
## Correct %: 0.578125 
## Correct % +/- 1: 0.725 
## For k =  40 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 98 25  2  0
##      6  0  0 49 81  5  0
##      7  0  0  3 23 16  0
##      8  0  0  0  2  1  0
## Correct %: 0.609375 
## Correct % +/- 1: 0.728125 
## For k =  50 :      Predicted
## Actual   3   4   5   6   7   8
##      3   0   0   3   1   0   0
##      4   0   0   7   4   0   0
##      5   0   0 100  25   0   0
##      6   0   0  52  78   5   0
##      7   0   0   4  26  12   0
##      8   0   0   0   2   1   0
## Correct %: 0.59375 
## Correct % +/- 1: 0.7125
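
To compare k values at a glance, the exact-match accuracies can be collected and plotted; a minimal sketch reusing the standardized variables above:

# exact accuracy for each k, then an elbow-style plot
acc = sapply(kval, function(val) {
  set.seed(1)
  mean(knn(x.train, x.test, y.train, k = val) == y.test)
})
plot(kval, acc, type = "b", xlab = "k", ylab = "exact accuracy")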

Naive Bayes

nb.red = naiveBayes(quality ~ . - citric.acid - free.sulfur.dioxide -
                      pH, data = train) 
nb.red.class = predict(nb.red, test) 
red.nb.table = table(Actual = test$quality, Predicted = nb.red.class) 
red.nb.table
##       Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  2  0  0  0
##      4  0  1  5  4  0  1
##      5  0  2 91 30  1  1
##      6  0  2 48 73  9  3
##      7  0  0  2 20 18  2
##      8  0  0  0  1  2  0
print(mean(test$quality == nb.red.class))
## [1] 0.571875
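
For comparability with the other models, the same exact and within-one-point summaries can be computed for Naive Bayes; a short sketch (the factor predictions go through character before integer):

nb.class.num = as.integer(as.character(nb.red.class))
exact = mean(nb.class.num == test$quality)
within.1 = mean(abs(nb.class.num - test$quality) == 1)
cat("Proportion of test data correctly labeled:", exact, '\n')
cat("Proportion of test data within +/- 1 point:", exact + within.1, '\n')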

Results

Here we can make a plot that shows the error for each rating value by model; the example below sketches the layout with placeholder error percentages.

model.error = data.frame(
  response = 3:8,
  lm = c(100, 80, 20, 40, 60, 100),
  svm = c(100, 80, 40, 50, 60, 80),
  rf = c(100, 100, 10, 30, 50, 100)
)
bar_matrix = t(as.matrix(model.error[ , -1]))  # transpose for barplot
colnames(bar_matrix) = model.error$response
barplot(
  bar_matrix,
  beside = TRUE,
  col = c("pink", "lightblue", "lightgreen"),
  legend = rownames(bar_matrix)
)

White Wine

whitewine = read.csv("winequality-white.csv", sep=';', header=T)
names(whitewine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"

Train/Test Split

First, a quick look at the quality-9 wines: the rating is rare enough that a random split may leave few of them in the training set.

whitewine[whitewine$quality == "9",]
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 775            9.1             0.27        0.45           10.6     0.035
## 821            6.6             0.36        0.29            1.6     0.021
## 828            7.4             0.24        0.36            2.0     0.031
## 877            6.9             0.36        0.34            4.2     0.018
## 1606           7.1             0.26        0.49            2.2     0.032
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 775                   28                  124 0.99700 3.20      0.46    10.4
## 821                   24                   85 0.98965 3.41      0.61    12.4
## 828                   27                  139 0.99055 3.28      0.48    12.5
## 877                   57                  119 0.98980 3.28      0.36    12.7
## 1606                  31                  113 0.99030 3.37      0.42    12.9
##      quality
## 775        9
## 821        9
## 828        9
## 877        9
## 1606       9
set.seed(1)
index.w = sample(1:nrow(whitewine), 0.8 * nrow(whitewine))
train.w = whitewine[index.w, ]
test.w = whitewine[-index.w, ]

Analysis

for (name in setdiff(names(whitewine), c("quality"))) {
  #par(mfrow = c(1,2))
  formula = as.formula(paste(name, "~ quality"))
  boxplot(formula, data = whitewine)
}

The predictors separate the quality levels much less clearly here than they did for red wine.

Regression

Test with all data

white.model = lm(quality ~ ., data = train.w) 
summary(white.model)
## 
## Call:
## lm(formula = quality ~ ., data = train.w)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8815 -0.4852 -0.0325  0.4635  2.9742 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.578e+02  2.024e+01   7.796 8.16e-15 ***
## fixed.acidity         6.680e-02  2.290e-02   2.918  0.00355 ** 
## volatile.acidity     -1.812e+00  1.271e-01 -14.254  < 2e-16 ***
## citric.acid           3.447e-03  1.072e-01   0.032  0.97436    
## residual.sugar        8.519e-02  8.203e-03  10.386  < 2e-16 ***
## chlorides            -2.723e-01  6.017e-01  -0.452  0.65095    
## free.sulfur.dioxide   4.067e-03  9.319e-04   4.364 1.31e-05 ***
## total.sulfur.dioxide -4.189e-04  4.230e-04  -0.990  0.32208    
## density              -1.579e+02  2.053e+01  -7.691 1.83e-14 ***
## pH                    7.043e-01  1.170e-01   6.020 1.91e-09 ***
## sulphates             6.986e-01  1.127e-01   6.197 6.33e-10 ***
## alcohol               1.843e-01  2.622e-02   7.031 2.40e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.755 on 3906 degrees of freedom
## Multiple R-squared:  0.283,  Adjusted R-squared:  0.281 
## F-statistic: 140.1 on 11 and 3906 DF,  p-value: < 2.2e-16
#plot(white.model)
pred.w = predict(white.model, newdata = test.w, type='response') 
pred.round.w = round(pred.w) 
actual.w = test.w$quality 
RMSE.w = sqrt(mean((pred.w - actual.w)^2)) 
RMSE.w
## [1] 0.7374623
white.lm.table = table(Actual = actual.w, Predicted = pred.round.w)
white.lm.table
##       Predicted
## Actual   4   5   6   7
##      3   0   0   2   0
##      4   0  14   6   1
##      5   1 130 168   2
##      6   0  66 326  42
##      7   0   9 130  47
##      8   0   1  20  13
##      9   0   0   1   1
exact = sum(pred.round.w == actual.w) / nrow(test.w) 
within.1 = sum(abs(pred.round.w - actual.w) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.5132653
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9561224
lm.results.w = data.frame(actual = test.w$quality, predicted = pred.round.w)

for (i in 3:8){
  rating.i = subset(lm.results.w, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.5681063 
## Error when rating =  6 : 0.2488479 
## Error when rating =  7 : 0.7473118 
## Error when rating =  8 : 1

Test with selected polynomial terms

white.model.2 = lm(quality ~ poly(fixed.acidity,2) + poly(volatile.acidity,3) + 
            residual.sugar + free.sulfur.dioxide + poly(density,2) + 
            pH + sulphates + poly(alcohol,3), data = train.w)
summary(white.model.2)
## 
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + poly(volatile.acidity, 
##     3) + residual.sugar + free.sulfur.dioxide + poly(density, 
##     2) + pH + sulphates + poly(alcohol, 3), data = train.w)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7865 -0.4886 -0.0201  0.4791  2.4358 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.407e+00  4.426e-01   3.178 0.001492 ** 
## poly(fixed.acidity, 2)1     7.410e+00  1.333e+00   5.558 2.92e-08 ***
## poly(fixed.acidity, 2)2    -3.622e+00  7.664e-01  -4.726 2.37e-06 ***
## poly(volatile.acidity, 3)1 -1.187e+01  7.876e-01 -15.076  < 2e-16 ***
## poly(volatile.acidity, 3)2  1.979e+00  7.793e-01   2.540 0.011136 *  
## poly(volatile.acidity, 3)3 -2.554e+00  7.684e-01  -3.324 0.000897 ***
## residual.sugar              1.058e-01  9.056e-03  11.681  < 2e-16 ***
## free.sulfur.dioxide         3.517e-03  7.442e-04   4.726 2.38e-06 ***
## poly(density, 2)1          -4.332e+01  4.438e+00  -9.760  < 2e-16 ***
## poly(density, 2)2           5.109e+00  9.811e-01   5.207 2.02e-07 ***
## pH                          1.030e+00  1.232e-01   8.364  < 2e-16 ***
## sulphates                   7.766e-01  1.125e-01   6.904 5.88e-12 ***
## poly(alcohol, 3)1           6.064e+00  2.487e+00   2.438 0.014796 *  
## poly(alcohol, 3)2           2.954e+00  8.091e-01   3.651 0.000265 ***
## poly(alcohol, 3)3          -1.658e+00  7.702e-01  -2.153 0.031387 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7468 on 3903 degrees of freedom
## Multiple R-squared:  0.299,  Adjusted R-squared:  0.2965 
## F-statistic: 118.9 on 14 and 3903 DF,  p-value: < 2.2e-16
#plot(white.model.2)
pred2.w = predict(white.model.2, newdata = test.w, type='response') 
pred2.round.w = round(pred2.w) 
actual.w = test.w$quality 
RMSE2.w = sqrt(mean((pred2.w - actual.w)^2)) 
RMSE2.w
## [1] 0.7286815
white.lm.table.2 = table(Actual = actual.w, Predicted = pred2.round.w)
white.lm.table.2
##       Predicted
## Actual   5   6   7
##      3   0   2   0
##      4  15   5   1
##      5 146 154   1
##      6  67 318  49
##      7   6 123  57
##      8   1  19  14
##      9   0   1   1
lm.2.results.w = data.frame(actual = test.w$quality, predicted = pred2.round.w)

for (i in 4:8){
  rating.i = subset(lm.2.results.w, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.5149502 
## Error when rating =  6 : 0.2672811 
## Error when rating =  7 : 0.6935484 
## Error when rating =  8 : 1
exact = sum(pred2.round.w == actual.w) / nrow(test.w) 
within.1 = sum(abs(pred2.round.w - actual.w) == 1) / nrow(test.w) 

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.5316327
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9622449

Those results are only a modest improvement over the plain linear model.

Trees

Regression Tree

white.tree = tree(quality ~ .,data = train.w) 
summary(white.tree)
## 
## Regression tree:
## tree(formula = quality ~ ., data = train.w)
## Variables actually used in tree construction:
## [1] "alcohol"             "volatile.acidity"    "free.sulfur.dioxide"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.5906 = 2311 / 3913 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.5910 -0.3539 -0.1841  0.0000  0.6461  2.6460
plot(white.tree) 
text(white.tree, pretty = 0, cex = 0.7)

white.cv.tree = cv.tree(white.tree)
plot(white.cv.tree$size, white.cv.tree$dev, type='b')

white.tree.pred = predict(white.tree, newdata = test.w) 
white.tree.pred.r = round(white.tree.pred) 
tree.RMSE = sqrt(mean((white.tree.pred - test.w$quality)^2)) 
tree.RMSE
## [1] 0.7402891
white.tree.table = table(Actual = test.w$quality, Predicted = white.tree.pred.r)
white.tree.table
##       Predicted
## Actual   5   6   7
##      3   0   2   0
##      4  14   5   2
##      5 190 107   4
##      6 112 244  78
##      7  14  96  76
##      8   0  13  21
##      9   1   0   1
exact = sum(white.tree.pred.r == test.w$quality) / nrow(test.w) 
within.1 = sum(abs(white.tree.pred.r - test.w$quality) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.5204082
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9571429
tree.results.w = data.frame(actual = test.w$quality, 
                            predicted = white.tree.pred.r) 

for (i in 3:8){
  rating.i = subset(tree.results.w, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.3687708 
## Error when rating =  6 : 0.437788 
## Error when rating =  7 : 0.5913978 
## Error when rating =  8 : 1

Random Forest

Default Trees

As with red wine, we should try different numbers of trees and mtry values; the sweep in the next subsection does that.

rf.white = randomForest(quality ~ .- citric.acid - chlorides -
                        total.sulfur.dioxide, data = train.w, importance=TRUE) 
varImpPlot(rf.white)

rf.white.pred = predict(rf.white, newdata = test.w) 
rf.white.pred.r = round(rf.white.pred) 
rf.white.rmse = sqrt(mean((rf.white.pred - test.w$quality)^2)) 
rf.white.rmse
## [1] 0.5888631
white.rf.table = table(Actual = test.w$quality, Predicted = rf.white.pred.r)
white.rf.table
##       Predicted
## Actual   4   5   6   7   8
##      3   0   0   2   0   0
##      4   1  15   4   1   0
##      5   0 201  99   1   0
##      6   0  43 362  29   0
##      7   0   1  83 102   0
##      8   0   0   7  24   3
##      9   0   1   0   1   0
exact = sum(rf.white.pred.r == test.w$quality) / nrow(test.w) 
within.1 = sum(abs(rf.white.pred.r - test.w$quality) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.6826531
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9816327
rf.white.results = data.frame(actual = test.w$quality, 
                              predicted = rf.white.pred.r) 

for (i in 3:8){
  rating.i = subset(rf.white.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 0.952381 
## Error when rating =  5 : 0.3322259 
## Error when rating =  6 : 0.1658986 
## Error when rating =  7 : 0.4516129 
## Error when rating =  8 : 0.9117647

Different values

# predefining variables 
p = ncol(whitewine) - 1 
m = c(p, floor(p / 2), floor(sqrt(p))) 
names(m) = c("p", "p/2", "sqrt(p)") 
ntrees = seq(1, 150, by = 5)  

# to store the errors 
results <- data.frame()  
for (j in 1:3) {   
  for (i in seq_along(ntrees)) {     
    model = randomForest(quality ~ ., data = train.w, mtry = m[j],
                         ntree = ntrees[i], keep.forest = TRUE)          
    preds = predict(model, newdata = test.w)     
    mse = mean((preds - test.w$quality)^2)          
    results = rbind(results, data.frame(ntree = ntrees[i],MSE = mse,mtry = m[j])) 
  } 
}
plot(x = results[results$mtry == 11, ]$ntree,       
     y = results[results$mtry == 11, ]$MSE,       
     col='blue', type = 'l',      
     xlab = 'number of trees', ylab = 'MSE',      
     #ylim = c(0.2,0.8)
     ) 
lines(x = results[results$mtry == 5, ]$ntree,       
      y = results[results$mtry == 5, ]$MSE,       
      col='red', type = 'l') 
lines(x = results[results$mtry == 3, ]$ntree,       
      y = results[results$mtry == 3, ]$MSE,       
      col='darkgreen', type = 'l') 
legend("topright", legend = names(m),        
       col = c("blue", "red", "darkgreen"),         
       lwd = 2, title = "mtry")

Here mtry = sqrt(p) gives the lowest test MSE, and a larger number of trees continues to help.

rf.white.best = randomForest(quality ~ ., data = train.w, mtry = 3,
                             ntree = 150, keep.forest = TRUE) 
rf.white.pred.best = predict(rf.white.best, newdata = test.w) 
rf.white.pred.best.r = round(rf.white.pred.best) 
rf.white.rmse = sqrt(mean((rf.white.pred.best - test.w$quality)^2)) 
rf.white.rmse
## [1] 0.5885784
white.rf.table.2 = table(Actual = test.w$quality, Predicted = rf.white.pred.best.r)
white.rf.table.2
##       Predicted
## Actual   4   5   6   7   8
##      3   0   0   2   0   0
##      4   1  14   6   0   0
##      5   0 207  93   1   0
##      6   0  40 369  25   0
##      7   0   2  77 107   0
##      8   0   0  13  17   4
##      9   0   0   1   1   0
exact = sum(rf.white.pred.best.r == test.w$quality) / nrow(test.w) 
within.1 = sum(abs(rf.white.pred.best.r - test.w$quality) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.7020408
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9734694
rf.white.best.res = data.frame(actual = test.w$quality, 
                               predicted = rf.white.pred.best.r) 

for (i in 3:8){
  rating.i = subset(rf.white.best.res, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 0.952381 
## Error when rating =  5 : 0.3122924 
## Error when rating =  6 : 0.1497696 
## Error when rating =  7 : 0.4247312 
## Error when rating =  8 : 0.8823529

KNN Classification

# excluded variables
excluded.var = c("quality", "citric.acid", "chlorides", "total.sulfur.dioxide")

# set up variables
x.train.w = train.w[, -which(names(train.w) %in% excluded.var)]
y.train.w = train.w$quality
x.test.w = test.w[, -which(names(test.w) %in% excluded.var)]
y.test.w = test.w$quality
kval = c(1,2,3,5,10,15,20,25)
for (val in kval){
  set.seed(1)
  knn.pred = knn(x.train.w, x.test.w, y.train.w, k = val)
  cat("For k = ", val,":")
  print(table(Actual = y.test.w, Predicted = knn.pred))
  cat("Correct %:", mean(knn.pred == y.test.w), "\n")
  cat("Correct % +/- 1:", mean(abs(as.integer(knn.pred) - as.integer(y.test.w)) == 1)
      + mean(knn.pred == y.test.w), "\n")
}
## For k =  1 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   2   0   0   0   0
##      4   0   7   9   4   1   0   0
##      5   0  14 175  91  18   3   0
##      6   0   5  75 292  53   9   0
##      7   0   3  15  56 100  12   0
##      8   0   0   4  12   7  11   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.5969388 
## Correct % +/- 1: 0.7693878 
## For k =  2 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   1   1   0   0   0   0
##      4   0   0  13   4   4   0   0
##      5   0  12 145 113  26   5   0
##      6   0   6 109 247  61  11   0
##      7   0   2  21  76  80   7   0
##      8   0   0   9   7   8  10   0
##      9   0   1   0   1   0   0   0
## Correct %: 0.4918367 
## Correct % +/- 1: 0.7 
## For k =  3 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   1   0   1   0   0   0   0
##      4   0   4   8   8   1   0   0
##      5   0  11 155 113  20   2   0
##      6   0   6 105 246  70   7   0
##      7   0   1  18  79  75  13   0
##      8   0   0   5  12  10   7   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.4979592 
## Correct % +/- 1: 0.7091837 
## For k =  5 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   2   0   0   0   0
##      4   0   2   9   9   1   0   0
##      5   0   8 157 113  19   4   0
##      6   0   4 108 247  66   9   0
##      7   0   0  14 100  65   7   0
##      8   0   0   4  15   9   6   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.4867347 
## Correct % +/- 1: 0.6908163 
## For k =  10 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   3  11   6   1   0   0
##      5   0   4 137 143  12   5   0
##      6   0   3  98 278  51   4   0
##      7   0   0  17  96  67   6   0
##      8   0   0   4  17  10   3   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.4979592 
## Correct % +/- 1: 0.7214286 
## For k =  15 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   1   1   0   0   0
##      4   0   1  13   7   0   0   0
##      5   0   5 146 136   9   5   0
##      6   0   2  97 298  35   2   0
##      7   0   0  16 112  55   3   0
##      8   0   0   3  19  10   2   0
##      9   0   0   0   1   1   0   0
## Correct %: 0.5122449 
## Correct % +/- 1: 0.7091837 
## For k =  20 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   2  11   8   0   0   0
##      5   0   2 141 149   9   0   0
##      6   0   3  89 314  27   1   0
##      7   0   0  14 121  48   3   0
##      8   0   0   4  22   6   2   0
##      9   0   0   0   1   1   0   0
## Correct %: 0.5173469 
## Correct % +/- 1: 0.7132653 
## For k =  25 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   3  11   7   0   0   0
##      5   0   4 139 153   5   0   0
##      6   0   1 101 306  25   1   0
##      7   0   0  16 129  41   0   0
##      8   0   0   7  20   6   1   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.5 
## Correct % +/- 1: 0.694898
knn.pred = knn(x.train.w, x.test.w, y.train.w, k = 25)  # make the k explicit; note k = 1 scored highest above
white.knn.table = table(Actual = y.test.w, Predicted = knn.pred)
white.knn.table
##       Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   2  12   7   0   0   0
##      5   0   4 136 156   5   0   0
##      6   0   1  98 310  25   0   0
##      7   0   0  18 127  39   2   0
##      8   0   0   8  20   5   1   0
##      9   0   0   0   2   0   0   0

KNN Standardized

# make a standardized version: scale all predictors, leave quality untouched
whitewine.standard = whitewine 
whitewine.standard[ , -which(names(whitewine) == "quality")] = scale(whitewine[ , -which(names(whitewine) == "quality")])  

train.knn = whitewine.standard[index.w, ] 
test.knn = whitewine.standard[-index.w, ]
x.train = train.knn[, -which(names(train.knn) %in% excluded.var)]  
y.train = train.knn$quality 
x.test = test.knn[, -which(names(test.knn) %in% excluded.var)]  
y.test = test.knn$quality
kval = c(1,2,3,5,10,15,20,25,30,40,50)  
for (val in kval){      
  set.seed(1)      
  knn.pred = knn(x.train, x.test, y.train, k = val)      
  cat("For k = ", val,":")      
  print(table(Actual = y.test, Predicted = knn.pred))      
  correct = mean(knn.pred == y.test)   
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)      
  
  cat("Correct %:", correct, "\n")      
  cat("Correct % +/- 1:", correct + pm1, "\n")  
}
## For k =  1 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   2   6   3   0   0   0
##      4   1  15  32  48  10   2   0
##      5   1  35 452 453 112  19   0
##      6   4  51 353 760 398  87   1
##      7   1   6  60 237 282  59   1
##      8   0   2  10  41  58  14   0
##      9   0   0   0   1   1   0   1
## Correct %: 0.4211108 
## Correct % +/- 1: 0.6910749 
## For k =  2 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   3   5   3   0   0   0
##      4   0  14  32  49   8   5   0
##      5   2  47 427 456 124  16   0
##      6   4  28 394 734 410  82   2
##      7   1   5  64 248 279  49   0
##      8   0   3   9  44  51  18   0
##      9   0   0   0   2   1   0   0
## Correct %: 0.4067422 
## Correct % +/- 1: 0.6772589 
## For k =  3 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   7   4   0   0   0
##      4   0  11  40  44   8   5   0
##      5   0  28 416 485 130  13   0
##      6   3  27 369 765 423  66   1
##      7   2   2  57 240 287  56   2
##      8   0   1   5  42  63  14   0
##      9   0   1   0   0   1   0   1
## Correct %: 0.4128212 
## Correct % +/- 1: 0.6974302 
## For k =  5 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   7   4   0   0   0
##      4   0   7  46  46   6   3   0
##      5   0  15 403 511 132  11   0
##      6   0   7 322 843 440  42   0
##      7   0   3  41 270 294  38   0
##      8   0   0   4  46  61  14   0
##      9   0   0   0   2   1   0   0
## Correct %: 0.4313346 
## Correct % +/- 1: 0.7231279 
## For k =  10 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   6   5   0   0   0
##      4   0   2  50  46   9   1   0
##      5   0   3 427 527 112   3   0
##      6   0   1 294 936 416   7   0
##      7   0   0  24 308 297  17   0
##      8   0   0   6  53  61   5   0
##      9   0   0   0   1   2   0   0
## Correct %: 0.4606245 
## Correct % +/- 1: 0.7444045 
## For k =  15 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   8   3   0   0   0
##      4   0   1  51  45  11   0   0
##      5   0   3 444 544  79   2   0
##      6   0   0 307 970 376   1   0
##      7   0   0  31 323 290   2   0
##      8   0   0   1  60  64   0   0
##      9   0   0   0   2   1   0   0
## Correct %: 0.4711246 
## Correct % +/- 1: 0.7444045 
## For k =  20 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   6   5   0   0   0
##      4   0   0  53  43  12   0   0
##      5   0   3 440 562  67   0   0
##      6   0   0 297 984 372   1   0
##      7   0   0  30 326 289   1   0
##      8   0   0   4  59  62   0   0
##      9   0   0   0   1   2   0   0
## Correct %: 0.4733352 
## Correct % +/- 1: 0.7510362 
## For k =  25 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    5    6    0    0    0
##      4    0    0   52   44   12    0    0
##      5    0    2  471  543   56    0    0
##      6    0    2  275 1005  372    0    0
##      7    0    0   26  325  295    0    0
##      8    0    0    2   59   64    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4893617 
## Correct % +/- 1: 0.7615363 
## For k =  30 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    6    5    0    0    0
##      4    0    0   51   44   13    0    0
##      5    0    0  470  551   51    0    0
##      6    0    0  289 1017  347    1    0
##      7    0    0   30  327  289    0    0
##      8    0    0    3   62   60    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4907433 
## Correct % +/- 1: 0.7579442 
## For k =  40 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    6    5    0    0    0
##      4    0    0   54   42   12    0    0
##      5    0    0  456  579   37    0    0
##      6    0    0  289 1049  316    0    0
##      7    0    0   29  344  273    0    0
##      8    0    0    2   60   63    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4912959 
## Correct % +/- 1: 0.7582205 
## For k =  50 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    5    5    1    0    0
##      4    0    0   56   41   11    0    0
##      5    0    0  456  578   38    0    0
##      6    0    0  286 1063  305    0    0
##      7    0    0   30  358  258    0    0
##      8    0    0    2   64   59    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4910196 
## Correct % +/- 1: 0.7549047

Naive Bayes

nb.white = naiveBayes(quality ~ . - citric.acid - chlorides -
                        total.sulfur.dioxide, data = train.w)
nb.white.class = predict(nb.white, test.w)
white.nb.table = table(Actual = test.w$quality, Predicted = nb.white.class)
white.nb.table
##       Predicted
## Actual   3   4   5   6   7   8   9
##      3   1   0   0   1   0   0   0
##      4   0   3   7   9   2   0   0
##      5   1  13 175 105   6   1   0
##      6   0   3 148 191  89   0   3
##      7   0   0  26  64  88   5   3
##      8   0   1   4   9  20   0   0
##      9   0   0   1   0   1   0   0
print(mean(test.w$quality == nb.white.class))
## [1] 0.4673469

Results

Aggregate Results

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
test.table = as.data.frame(red.lm.table)
test.table$Correct = ifelse(as.character(test.table$Actual) == 
                              as.character(test.table$Predicted),
                          "Correct", "Incorrect")
names(test.table)
## [1] "Actual"    "Predicted" "Freq"      "Correct"
t1 = ggplot(test.table, aes(x = factor(Actual), y = Freq, 
                       fill = factor(Correct, levels = c("Correct", "Incorrect")))) +
  geom_bar(stat = "identity", position = "stack") +  # 'dodge' for side-by-side bars
  labs(x = "Actual Label", y = "Count", fill = "Prediction Status") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink")) +
  theme(legend.position = "none")

t2 = ggplot(test.table, aes(x = factor(Predicted), y = Freq, 
                       fill = factor(Correct, levels = c("Correct", "Incorrect")))) +
  geom_bar(stat = "identity", position = "stack") +  # 'dodge' for side-by-side bars
  labs(x = "Predicted Label", y = "Count", fill = "Prediction Status") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

grid.arrange(t1, t2, ncol = 2,
             top = "LM Precision and Recall")

Attempt to put the models next to each other

# values
# red.lm.table, red.tree.table, red.rf.table.2, red.knn.table
lm.table = as.data.frame(red.lm.table)
lm.table$model = "Regression"
tree.table = as.data.frame(red.tree.table)
tree.table$model = "Tree"
rf.table = as.data.frame(red.rf.table.2)
rf.table$model = "Random Forest"
knn.table = as.data.frame(red.knn.table)
knn.table$model = "KNN"
list.df = list(lm.table, tree.table, rf.table, knn.table)
red.df = do.call(rbind, list.df)
red.df$Correct = ifelse(as.character(red.df$Actual) == 
                              as.character(red.df$Predicted),
                          "Correct", "Incorrect")
ggplot(red.df, aes(x = factor(Actual), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Actual Red Wine Rating", y = "Count", fill = "Prediction Status",
       title = "How Well Each Model Predicts Rating by Rating") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

ggplot(red.df, aes(x = factor(Predicted, level = c(3:8)), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Predicted Red Wine Rating", y = "Count", fill = "Prediction Status",
       title = "Per-Class Recall for Red Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

White

# values
# white.lm.table, white.tree.table, white.rf.table.2, white.knn.table
lm.table = as.data.frame(white.lm.table)
lm.table$model = "Regression"
tree.table = as.data.frame(white.tree.table)
tree.table$model = "Tree"
rf.table = as.data.frame(white.rf.table.2)
rf.table$model = "Random Forest"
knn.table = as.data.frame(white.knn.table)
knn.table$model = "KNN"
list.df = list(lm.table, tree.table, rf.table, knn.table)
white.df = do.call(rbind, list.df)
white.df$Correct = ifelse(as.character(white.df$Actual) == 
                              as.character(white.df$Predicted),
                          "Correct", "Incorrect")
ggplot(white.df, aes(x = factor(Actual), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Actual White Wine Rating", y = "Count", fill = "Prediction Status",
       title = "Per-Class Recall for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

white.df
##     Actual Predicted Freq         model   Correct
## 1        3         4    0    Regression Incorrect
## 2        4         4    0    Regression   Correct
## 3        5         4    1    Regression Incorrect
## 4        6         4    0    Regression Incorrect
## 5        7         4    0    Regression Incorrect
## 6        8         4    0    Regression Incorrect
## 7        9         4    0    Regression Incorrect
## 8        3         5    0    Regression Incorrect
## 9        4         5   14    Regression Incorrect
## 10       5         5  130    Regression   Correct
## 11       6         5   66    Regression Incorrect
## 12       7         5    9    Regression Incorrect
## 13       8         5    1    Regression Incorrect
## 14       9         5    0    Regression Incorrect
## 15       3         6    2    Regression Incorrect
## 16       4         6    6    Regression Incorrect
## 17       5         6  168    Regression Incorrect
## 18       6         6  326    Regression   Correct
## 19       7         6  130    Regression Incorrect
## 20       8         6   20    Regression Incorrect
## 21       9         6    1    Regression Incorrect
## 22       3         7    0    Regression Incorrect
## 23       4         7    1    Regression Incorrect
## 24       5         7    2    Regression Incorrect
## 25       6         7   42    Regression Incorrect
## 26       7         7   47    Regression   Correct
## 27       8         7   13    Regression Incorrect
## 28       9         7    1    Regression Incorrect
## 29       3         5    0          Tree Incorrect
## 30       4         5   14          Tree Incorrect
## 31       5         5  190          Tree   Correct
## 32       6         5  112          Tree Incorrect
## 33       7         5   14          Tree Incorrect
## 34       8         5    0          Tree Incorrect
## 35       9         5    1          Tree Incorrect
## 36       3         6    2          Tree Incorrect
## 37       4         6    5          Tree Incorrect
## 38       5         6  107          Tree Incorrect
## 39       6         6  244          Tree   Correct
## 40       7         6   96          Tree Incorrect
## 41       8         6   13          Tree Incorrect
## 42       9         6    0          Tree Incorrect
## 43       3         7    0          Tree Incorrect
## 44       4         7    2          Tree Incorrect
## 45       5         7    4          Tree Incorrect
## 46       6         7   78          Tree Incorrect
## 47       7         7   76          Tree   Correct
## 48       8         7   21          Tree Incorrect
## 49       9         7    1          Tree Incorrect
## 50       3         4    0 Random Forest Incorrect
## 51       4         4    1 Random Forest   Correct
## 52       5         4    0 Random Forest Incorrect
## 53       6         4    0 Random Forest Incorrect
## 54       7         4    0 Random Forest Incorrect
## 55       8         4    0 Random Forest Incorrect
## 56       9         4    0 Random Forest Incorrect
## 57       3         5    0 Random Forest Incorrect
## 58       4         5   14 Random Forest Incorrect
## 59       5         5  207 Random Forest   Correct
## 60       6         5   40 Random Forest Incorrect
## 61       7         5    2 Random Forest Incorrect
## 62       8         5    0 Random Forest Incorrect
## 63       9         5    0 Random Forest Incorrect
## 64       3         6    2 Random Forest Incorrect
## 65       4         6    6 Random Forest Incorrect
## 66       5         6   93 Random Forest Incorrect
## 67       6         6  369 Random Forest   Correct
## 68       7         6   77 Random Forest Incorrect
## 69       8         6   13 Random Forest Incorrect
## 70       9         6    1 Random Forest Incorrect
## 71       3         7    0 Random Forest Incorrect
## 72       4         7    0 Random Forest Incorrect
## 73       5         7    1 Random Forest Incorrect
## 74       6         7   25 Random Forest Incorrect
## 75       7         7  107 Random Forest   Correct
## 76       8         7   17 Random Forest Incorrect
## 77       9         7    1 Random Forest Incorrect
## 78       3         8    0 Random Forest Incorrect
## 79       4         8    0 Random Forest Incorrect
## 80       5         8    0 Random Forest Incorrect
## 81       6         8    0 Random Forest Incorrect
## 82       7         8    0 Random Forest Incorrect
## 83       8         8    4 Random Forest   Correct
## 84       9         8    0 Random Forest Incorrect
## 85       3         3    0           KNN   Correct
## 86       4         3    0           KNN Incorrect
## 87       5         3    0           KNN Incorrect
## 88       6         3    0           KNN Incorrect
## 89       7         3    0           KNN Incorrect
## 90       8         3    0           KNN Incorrect
## 91       9         3    0           KNN Incorrect
## 92       3         4    0           KNN Incorrect
## 93       4         4    2           KNN   Correct
## 94       5         4    4           KNN Incorrect
## 95       6         4    1           KNN Incorrect
## 96       7         4    0           KNN Incorrect
## 97       8         4    0           KNN Incorrect
## 98       9         4    0           KNN Incorrect
## 99       3         5    0           KNN Incorrect
## 100      4         5   12           KNN Incorrect
## 101      5         5  136           KNN   Correct
## 102      6         5   98           KNN Incorrect
## 103      7         5   18           KNN Incorrect
## 104      8         5    8           KNN Incorrect
## 105      9         5    0           KNN Incorrect
## 106      3         6    2           KNN Incorrect
## 107      4         6    7           KNN Incorrect
## 108      5         6  156           KNN Incorrect
## 109      6         6  310           KNN   Correct
## 110      7         6  127           KNN Incorrect
## 111      8         6   20           KNN Incorrect
## 112      9         6    2           KNN Incorrect
## 113      3         7    0           KNN Incorrect
## 114      4         7    0           KNN Incorrect
## 115      5         7    5           KNN Incorrect
## 116      6         7   25           KNN Incorrect
## 117      7         7   39           KNN   Correct
## 118      8         7    5           KNN Incorrect
## 119      9         7    0           KNN Incorrect
## 120      3         8    0           KNN Incorrect
## 121      4         8    0           KNN Incorrect
## 122      5         8    0           KNN Incorrect
## 123      6         8    0           KNN Incorrect
## 124      7         8    2           KNN Incorrect
## 125      8         8    1           KNN   Correct
## 126      9         8    0           KNN Incorrect
## 127      3         9    0           KNN Incorrect
## 128      4         9    0           KNN Incorrect
## 129      5         9    0           KNN Incorrect
## 130      6         9    0           KNN Incorrect
## 131      7         9    0           KNN Incorrect
## 132      8         9    0           KNN Incorrect
## 133      9         9    0           KNN   Correct
ggplot(white.df, aes(x = factor(Actual), y = Freq, fill=model)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Correct, ncol = 4) +
  labs(x = "Actual White Wine Rating", y = "Count", fill = "Model",
       title = "Per-Class Recall for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("pink", "lightgreen", "skyblue", "purple"))

ggplot(white.df, aes(x = factor(Predicted, level = c(4:8)), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Predicted White Wine Rating", y = "Count", fill = "Prediction Status",
       title = "Per-Class Precision for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

ggplot(white.df, aes(x = factor(Predicted, level = c(4:8)), y = Freq, fill=model)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Correct, ncol = 2) +
  labs(x = "Predicted White Wine Rating", y = "Count", fill = "Model",
       title = "Per-Class Precision for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("pink", "lightgreen", "skyblue", "purple"))

9 shows up as NA in the precision plots because factor(Predicted, level = c(4:8)) maps any predicted rating outside 4 to 8 (here 3 and 9) to NA.
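
A sketch of the fix, widening the level set so every observed rating keeps its own bar:

# include every rating that occurs in either Actual or Predicted
white.df$Predicted = factor(white.df$Predicted, levels = 3:9)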

nrow(whitewine)
## [1] 4898
nrow(redwine)
## [1] 1599