library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(class)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3

Red Wine

redwine = read.csv("winequality-red.csv", sep=';', header=T)
names(redwine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"

Add high-quality feature

# optional binary indicator for high quality; left unused for now
#redwine$high = ifelse(redwine$quality > 6, 1, 0)

Train/Test Split

set.seed(1)
index = sample(1:nrow(redwine), 0.8 * nrow(redwine))
train = redwine[index, ]
test = redwine[-index, ]

Analysis

A quick exploratory look: boxplots of each predictor against quality.

for (name in setdiff(names(redwine), c("quality"))) {
  #par(mfrow = c(1,2))
  formula = as.formula(paste(name, "~ quality"))
  boxplot(formula, data = redwine)
}

Most predictors show some separation across quality levels, but the differences are often small. A few, like alcohol, separate strongly.
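
To put a rough number on that impression, we can compare the spread of each predictor's per-quality means against its overall standard deviation; a quick sketch (not part of the original analysis; larger values mean stronger separation):

# spread of per-quality group means, in SD units, for each predictor
for (name in setdiff(names(redwine), "quality")) {
  group.means = tapply(redwine[[name]], redwine$quality, mean)
  cat(name, ":", round(diff(range(group.means)) / sd(redwine[[name]]), 2), "\n")
}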

Regression

Test with all data

red.model = lm(quality ~ ., data = train)
summary(red.model)
## 
## Call:
## lm(formula = quality ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.52213 -0.35627 -0.04738  0.44215  2.00517 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.342e+01  2.323e+01   1.008 0.313441    
## fixed.acidity         2.725e-02  2.862e-02   0.952 0.341210    
## volatile.acidity     -1.032e+00  1.353e-01  -7.625 4.77e-14 ***
## citric.acid          -1.857e-01  1.626e-01  -1.142 0.253523    
## residual.sugar        2.223e-02  1.623e-02   1.370 0.170964    
## chlorides            -1.456e+00  4.849e-01  -3.002 0.002731 ** 
## free.sulfur.dioxide   4.483e-03  2.452e-03   1.828 0.067725 .  
## total.sulfur.dioxide -3.086e-03  8.191e-04  -3.767 0.000173 ***
## density              -1.976e+01  2.372e+01  -0.833 0.404986    
## pH                   -3.457e-01  2.165e-01  -1.597 0.110515    
## sulphates             9.192e-01  1.257e-01   7.311 4.69e-13 ***
## alcohol               2.834e-01  2.909e-02   9.741  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6474 on 1267 degrees of freedom
## Multiple R-squared:  0.3536, Adjusted R-squared:  0.348 
## F-statistic: 63.02 on 11 and 1267 DF,  p-value: < 2.2e-16
plot(red.model)

pred = predict(red.model, newdata = test, type='response')
pred.round = round(pred)
actual = test$quality
RMSE = sqrt(mean((pred - actual)^2))
RMSE
## [1] 0.6530574
red.lm.table = table(Actual = actual, Predicted = pred.round)
red.lm.table
##       Predicted
## Actual  4  5  6  7
##      3  1  2  1  0
##      4  0  6  5  0
##      5  0 97 28  0
##      6  0 47 84  4
##      7  0  2 28 12
##      8  0  0  2  1
exact = sum(pred.round == actual) / nrow(test)
within.1 = sum(abs(pred.round - actual) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.603125
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9625

Test with selected polynomial terms

red.model.2 = lm(quality ~ poly(fixed.acidity,2) + volatile.acidity
               + residual.sugar + chlorides +
                 total.sulfur.dioxide + density +
                 poly(sulphates,2) + poly(alcohol,3), data = train)
summary(red.model.2)
## 
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + volatile.acidity + 
##     residual.sugar + chlorides + total.sulfur.dioxide + density + 
##     poly(sulphates, 2) + poly(alcohol, 3), data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40440 -0.37319 -0.05082  0.44524  2.01917 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.273e+01  1.961e+01   3.709 0.000217 ***
## poly(fixed.acidity, 2)1  3.593e+00  1.045e+00   3.437 0.000606 ***
## poly(fixed.acidity, 2)2 -1.524e+00  6.884e-01  -2.214 0.027005 *  
## volatile.acidity        -8.275e-01  1.149e-01  -7.201 1.02e-12 ***
## residual.sugar           3.795e-02  1.504e-02   2.522 0.011786 *  
## chlorides               -1.531e+00  4.581e-01  -3.342 0.000855 ***
## total.sulfur.dioxide    -1.770e-03  5.858e-04  -3.021 0.002571 ** 
## density                 -6.676e+01  1.971e+01  -3.388 0.000726 ***
## poly(sulphates, 2)1      6.297e+00  7.511e-01   8.383  < 2e-16 ***
## poly(sulphates, 2)2     -3.969e+00  6.827e-01  -5.813 7.74e-09 ***
## poly(alcohol, 3)1        8.908e+00  9.742e-01   9.145  < 2e-16 ***
## poly(alcohol, 3)2       -4.895e-01  6.658e-01  -0.735 0.462380    
## poly(alcohol, 3)3       -1.518e+00  6.594e-01  -2.301 0.021529 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6376 on 1266 degrees of freedom
## Multiple R-squared:  0.3735, Adjusted R-squared:  0.3676 
## F-statistic:  62.9 on 12 and 1266 DF,  p-value: < 2.2e-16
plot(red.model.2)

pred2 = predict(red.model.2, newdata = test, type='response')
pred2.round = round(pred2)
actual = test$quality
RMSE2 = sqrt(mean((pred2 - actual)^2))
RMSE2
## [1] 0.6384957
red.lm.table.2 = table(Actual = actual, Predicted = pred2.round)
red.lm.table.2
##       Predicted
## Actual  4  5  6  7
##      3  1  2  1  0
##      4  2  5  4  0
##      5  0 98 27  0
##      6  0 47 81  7
##      7  0  3 22 17
##      8  0  0  2  1
lm.2.results = data.frame(actual = test$quality, predicted = pred2.round)
for (i in 3:8){
  rating.i = subset(lm.2.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 0.8181818 
## Error when rating =  5 : 0.216 
## Error when rating =  6 : 0.4 
## Error when rating =  7 : 0.5952381 
## Error when rating =  8 : 1
exact = sum(pred2.round == actual) / nrow(test)
within.1 = sum(abs(pred2.round - actual) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.61875
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9625

Trees

Regression Tree

red.tree = tree(quality ~ ., data = train)
summary(red.tree)
## 
## Regression tree:
## tree(formula = quality ~ ., data = train)
## Variables actually used in tree construction:
## [1] "alcohol"              "sulphates"            "volatile.acidity"    
## [4] "total.sulfur.dioxide"
## Number of terminal nodes:  11 
## Residual mean deviance:  0.3927 = 497.9 / 1268 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.1900 -0.2933 -0.1903  0.0000  0.4043  1.9390
plot(red.tree)
text(red.tree, pretty = 0, cex = 0.7)

red.cv.tree = cv.tree(red.tree)
plot(red.cv.tree$size, red.cv.tree$dev, type='b')
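
If the CV curve favors a smaller tree, it can be pruned before predicting; a minimal sketch (the unpruned tree is kept for the predictions below):

# prune to the size with the lowest cross-validated deviance
best.size = red.cv.tree$size[which.min(red.cv.tree$dev)]
red.tree.pruned = prune.tree(red.tree, best = best.size)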

red.tree.pred = predict(red.tree, newdata = test)
red.tree.pred.r = round(red.tree.pred)
tree.RMSE = sqrt(mean((red.tree.pred - test$quality)^2))
tree.RMSE
## [1] 0.6929522
red.tree.table = table(Actual = test$quality, Predicted = red.tree.pred.r)
red.tree.table
##       Predicted
## Actual  4  5  6  7
##      3  2  1  1  0
##      4  0  6  5  0
##      5  0 96 29  0
##      6  0 57 68 10
##      7  0  2 25 15
##      8  0  0  2  1
exact = sum(red.tree.pred.r == test$quality) / nrow(test)
within.1 = sum(abs(red.tree.pred.r - test$quality) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.559375
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.965625
tree.results = data.frame(actual = test$quality, predicted = red.tree.pred.r)
for (i in 3:8){
  rating.i = subset(tree.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.232 
## Error when rating =  6 : 0.4962963 
## Error when rating =  7 : 0.6428571 
## Error when rating =  8 : 1

Random Forest

Default Trees

We still need to cross-validate and try different numbers of trees; a cheap check based on the out-of-bag error is sketched at the end of this subsection, and a fuller sweep follows in the next.

rf.red = randomForest(quality ~ ., data = train, importance=TRUE)
varImpPlot(rf.red)

rf.red.pred = predict(rf.red, newdata = test)
rf.red.pred.r = round(rf.red.pred)
rf.red.rmse = sqrt(mean((rf.red.pred - test$quality)^2))
rf.red.rmse
## [1] 0.5865801
red.rf.table = table(Actual = test$quality, Predicted = rf.red.pred.r)
red.rf.table
##       Predicted
## Actual   5   6   7
##      3   4   0   0
##      4   8   3   0
##      5 105  20   0
##      6  34  96   5
##      7   2  21  19
##      8   0   1   2
exact = sum(rf.red.pred.r == test$quality) / nrow(test)
within.1 = sum(abs(rf.red.pred.r - test$quality) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.6875
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.96875
rf.red.results = data.frame(actual = test$quality, predicted = rf.red.pred.r)
for (i in 3:8){
  rating.i = subset(rf.red.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.16 
## Error when rating =  6 : 0.2888889 
## Error when rating =  7 : 0.547619 
## Error when rating =  8 : 1
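
As the cheap check mentioned above: a regression forest records its out-of-bag MSE as trees are added, so plotting it shows how many trees are enough without refitting anything.

# OOB MSE per number of trees grown, stored by randomForest as $mse
plot(rf.red$mse, type = "l", xlab = "number of trees", ylab = "OOB MSE")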

Different values

# predefining variables
p = ncol(redwine) - 1
m = c(p, floor(p / 2), floor(sqrt(p)))
names(m) = c("p", "p/2", "sqrt(p)")
ntrees = seq(1, 150, by = 5)

# to store the errors
results <- data.frame()

for (j in 1:3) {
  for (i in seq_along(ntrees)) {
    model = randomForest(quality ~ ., data = train, mtry = m[j], 
                          ntree = ntrees[i], keep.forest = TRUE)
    
    preds = predict(model, newdata = test)
    mse = mean((preds - test$quality)^2)
    
    results = rbind(results, data.frame(
      ntree = ntrees[i],
      MSE = mse,
      mtry = m[j]
    ))
  }
}
plot(x = results[results$mtry == 11, ]$ntree, 
     y = results[results$mtry == 11, ]$MSE, 
     col='blue', type = 'l',
     xlab = 'number of trees', ylab = 'MSE',
     ylim = c(0.2,0.8))
lines(x = results[results$mtry == 5, ]$ntree, 
     y = results[results$mtry == 5, ]$MSE, 
     col='red', type = 'l')
lines(x = results[results$mtry == 3, ]$ntree, 
     y = results[results$mtry == 3, ]$MSE, 
     col='darkgreen', type = 'l')
legend("topright", legend = names(m),
       col = c("blue", "red", "darkgreen"), 
       lwd = 2, title = "mtry")

The choice of mtry makes little difference here, and past roughly 50 trees the test MSE stops improving.
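
To confirm that reading of the plot, the sweep results can be queried directly:

# (ntree, mtry) combination with the lowest test MSE in the sweep
results[which.min(results$MSE), ]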

rf.red.best = randomForest(quality ~ ., data = train, mtry = 4, 
                          ntree = 50, keep.forest = TRUE)
rf.red.pred.best = predict(rf.red.best, newdata = test)
rf.red.pred.best.r = round(rf.red.pred.best)
rf.red.rmse = sqrt(mean((rf.red.pred.best - test$quality)^2))
rf.red.rmse
## [1] 0.5896732
red.rf.table.2 = table(Actual = test$quality, Predicted = rf.red.pred.best.r)
red.rf.table.2
##       Predicted
## Actual   5   6   7
##      3   4   0   0
##      4   8   3   0
##      5 104  21   0
##      6  32  96   7
##      7   1  19  22
##      8   0   1   2
exact = sum(rf.red.pred.best.r == test$quality) / nrow(test)
within.1 = sum(abs(rf.red.pred.best.r - test$quality) == 1) / nrow(test)

cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.69375
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.971875
rf.red.best.res = data.frame(actual = test$quality, predicted = rf.red.pred.best.r)
for (i in 3:8){
  rating.i = subset(rf.red.best.res, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.168 
## Error when rating =  6 : 0.2888889 
## Error when rating =  7 : 0.4761905 
## Error when rating =  8 : 1

KNN Classification

names(redwine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"
# excluded variables
excluded.var = c("quality")#, "citric.acid", "free.sulfur.dioxide", "pH")  

# set up variables 
x.train = train[, -which(names(train) %in% excluded.var)] 
y.train = train$quality 
x.test = test[, -which(names(test) %in% excluded.var)] 
y.test = test$quality
kval = c(1,2,3,5,10,15,20,25) 
for (val in kval){   
  set.seed(1)   
  knn.pred = knn(x.train, x.test, y.train, k = val)   
  cat("For k = ", val,":")   
  print(table(Actual = y.test, Predicted = knn.pred))   
  cat("Correct %:", mean(knn.pred == y.test), "\n")   
  cat("Correct % +/- 1:", mean(abs(as.integer(knn.pred) - as.integer(y.test)) == 1)
      + mean(knn.pred == y.test), "\n") 
}
## For k =  1 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  0  4  7  0  0
##      5  0  5 97 19  4  0
##      6  0  4 52 67 11  1
##      7  0  1  3 15 22  1
##      8  0  0  0  1  1  1
## Correct %: 0.584375 
## Correct % +/- 1: 0.703125 
## For k =  2 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  0  1  0
##      4  0  0  5  6  0  0
##      5  0  4 86 32  3  0
##      6  0  4 56 58 17  0
##      7  0  2  8 13 17  2
##      8  0  0  1  1  0  1
## Correct %: 0.50625 
## Correct % +/- 1: 0.68125 
## For k =  3 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  5  5  1  0
##      5  0  1 83 37  4  0
##      6  0  2 62 52 19  0
##      7  0  1  5 16 18  2
##      8  0  0  0  1  1  1
## Correct %: 0.48125 
## Correct % +/- 1: 0.684375 
## For k =  5 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  8  3  0  0
##      5  0  0 76 44  5  0
##      6  0  0 65 52 18  0
##      7  0  0  4 25 12  1
##      8  0  0  0  1  2  0
## Correct %: 0.4375 
## Correct % +/- 1: 0.6625 
## For k =  10 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  2  1  1  0
##      4  0  0  6  5  0  0
##      5  0  1 77 44  3  0
##      6  0  0 65 59 11  0
##      7  0  1  7 28  6  0
##      8  0  0  0  3  0  0
## Correct %: 0.44375 
## Correct % +/- 1: 0.6375 
## For k =  15 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 85 39  1  0
##      6  0  0 65 66  4  0
##      7  0  0 11 25  6  0
##      8  0  0  0  2  1  0
## Correct %: 0.490625 
## Correct % +/- 1: 0.65 
## For k =  20 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  2  2  0  0
##      4  0  0  7  4  0  0
##      5  0  0 88 36  1  0
##      6  0  0 63 69  3  0
##      7  0  0  9 29  4  0
##      8  0  0  1  1  1  0
## Correct %: 0.503125 
## Correct % +/- 1: 0.653125 
## For k =  25 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  1  1  2  0
##      4  0  0  8  3  0  0
##      5  0  0 90 35  0  0
##      6  0  0 63 69  3  0
##      7  0  0  8 32  2  0
##      8  0  0  1  1  1  0
## Correct %: 0.503125 
## Correct % +/- 1: 0.65
set.seed(1)
knn.pred = knn(x.train, x.test, y.train, k = 1)  # k = 1 scored highest above
red.knn.table = table(Actual = y.test, Predicted = knn.pred)
red.knn.table
##       Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  0  4  7  0  0
##      5  0  5 97 19  4  0
##      6  0  4 52 67 11  1
##      7  0  1  3 15 22  1
##      8  0  0  0  1  1  1

KNN Standardized

KNN is distance-based, so predictors on larger scales dominate the distance calculation; standardizing puts them on a comparable footing.

# make a standardized version: scale all predictors, leave quality untouched
redwine.standard = redwine
redwine.standard[ , -which(names(redwine) == "quality")] = scale(redwine[ , -which(names(redwine) == "quality")])

train.knn = redwine.standard[index, ]
test.knn = redwine.standard[-index, ]
x.train = train.knn[, -which(names(train.knn) %in% excluded.var)] 
y.train = train.knn$quality
x.test = test.knn[, -which(names(test.knn) %in% excluded.var)] 
y.test = test.knn$quality
kval = c(1,2,3,5,10,15,20,25,30,40,50) 
for (val in kval){   
  set.seed(1)   
  knn.pred = knn(x.train, x.test, y.train, k = val)   
  cat("For k = ", val,":")   
  print(table(Actual = y.test, Predicted = knn.pred))   
  correct = mean(knn.pred == y.test)
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)
  
  cat("Correct %:", correct, "\n")   
  cat("Correct % +/- 1:", correct + pm1, "\n") 
}
## For k =  1 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  1  6  4  0  0
##      5  0  2 94 26  1  2
##      6  0  2 35 85 11  2
##      7  0  0  6  8 26  2
##      8  0  0  0  1  1  1
## Correct %: 0.646875 
## Correct % +/- 1: 0.803125 
## For k =  2 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  1  1  2  0  0
##      4  0  1  5  5  0  0
##      5  0  0 94 27  3  1
##      6  0  2 39 79 14  1
##      7  0  0  7  8 25  2
##      8  0  0  0  1  1  1
## Correct %: 0.625 
## Correct % +/- 1: 0.7875 
## For k =  3 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  1 92 29  2  1
##      6  0  0 42 74 16  3
##      7  0  0  5 12 24  1
##      8  0  0  0  1  2  0
## Correct %: 0.59375 
## Correct % +/- 1: 0.765625 
## For k =  5 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  6  4  1  0
##      5  0  0 90 32  3  0
##      6  0  0 45 72 18  0
##      7  0  0  4 18 20  0
##      8  0  0  0  1  2  0
## Correct %: 0.56875 
## Correct % +/- 1: 0.75 
## For k =  10 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  1  1  0  0
##      4  0  0  6  5  0  0
##      5  0  0 94 28  3  0
##      6  0  0 48 73 14  0
##      7  0  0  5 17 20  0
##      8  0  0  0  1  2  0
## Correct %: 0.584375 
## Correct % +/- 1: 0.74375 
## For k =  15 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  6  5  0  0
##      5  0  0 96 27  2  0
##      6  0  0 50 72 13  0
##      7  0  0  3 22 17  0
##      8  0  0  0  1  2  0
## Correct %: 0.578125 
## Correct % +/- 1: 0.725 
## For k =  20 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 97 26  2  0
##      6  0  0 49 74 12  0
##      7  0  0  2 24 16  0
##      8  0  0  0  2  1  0
## Correct %: 0.584375 
## Correct % +/- 1: 0.728125 
## For k =  25 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 96 27  2  0
##      6  0  0 52 74  9  0
##      7  0  0  3 20 19  0
##      8  0  0  0  2  1  0
## Correct %: 0.590625 
## Correct % +/- 1: 0.728125 
## For k =  30 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  8  3  0  0
##      5  0  0 94 29  2  0
##      6  0  0 52 74  9  0
##      7  0  0  2 23 17  0
##      8  0  0  0  2  1  0
## Correct %: 0.578125 
## Correct % +/- 1: 0.725 
## For k =  40 :      Predicted
## Actual  3  4  5  6  7  8
##      3  0  0  3  1  0  0
##      4  0  0  7  4  0  0
##      5  0  0 98 25  2  0
##      6  0  0 49 81  5  0
##      7  0  0  3 23 16  0
##      8  0  0  0  2  1  0
## Correct %: 0.609375 
## Correct % +/- 1: 0.728125 
## For k =  50 :      Predicted
## Actual   3   4   5   6   7   8
##      3   0   0   3   1   0   0
##      4   0   0   7   4   0   0
##      5   0   0 100  25   0   0
##      6   0   0  52  78   5   0
##      7   0   0   4  26  12   0
##      8   0   0   0   2   1   0
## Correct %: 0.59375 
## Correct % +/- 1: 0.7125
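
To compare k values at a glance, the exact-match accuracies can be collected and plotted; a minimal sketch reusing the standardized variables above:

# exact accuracy for each k, then an elbow-style plot
acc = sapply(kval, function(val) {
  set.seed(1)
  mean(knn(x.train, x.test, y.train, k = val) == y.test)
})
plot(kval, acc, type = "b", xlab = "k", ylab = "exact accuracy")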

Naive Bayes

nb.red = naiveBayes(quality ~ . - citric.acid - free.sulfur.dioxide -
                      pH, data = train) 
nb.red.class = predict(nb.red, test) 
red.nb.table = table(Actual = test$quality, Predicted = nb.red.class) 
red.nb.table
##       Predicted
## Actual  3  4  5  6  7  8
##      3  0  2  2  0  0  0
##      4  0  1  5  4  0  1
##      5  0  2 91 30  1  1
##      6  0  2 48 73  9  3
##      7  0  0  2 20 18  2
##      8  0  0  0  1  2  0
print(mean(test$quality == nb.red.class))
## [1] 0.571875
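
For comparability with the other models, the same exact and within-one-point summaries can be computed for Naive Bayes; a short sketch (the factor predictions go through character before integer):

nb.class.num = as.integer(as.character(nb.red.class))
exact = mean(nb.class.num == test$quality)
within.1 = mean(abs(nb.class.num - test$quality) == 1)
cat("Proportion of test data correctly labeled:", exact, '\n')
cat("Proportion of test data within +/- 1 point:", exact + within.1, '\n')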

Results

Here we can make a plot that shows the error for each rating value by model; the example below sketches the layout with placeholder error percentages.

model.error = data.frame(
  response = 3:8,
  lm = c(100, 80, 20, 40, 60, 100),
  svm = c(100, 80, 40, 50, 60, 80),
  rf = c(100, 100, 10, 30, 50, 100)
)
bar_matrix = t(as.matrix(model.error[ , -1]))  # transpose for barplot
colnames(bar_matrix) = model.error$response
barplot(
  bar_matrix,
  beside = TRUE,
  col = c("pink", "lightblue", "lightgreen"),
  legend = rownames(bar_matrix)
)

White Wine

whitewine = read.csv("winequality-white.csv", sep=';', header=T)
names(whitewine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"

Train/Test Split

First, a quick look at the quality-9 wines: the rating is rare enough that a random split may leave few of them in the training set.

whitewine[whitewine$quality == "9",]
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 775            9.1             0.27        0.45           10.6     0.035
## 821            6.6             0.36        0.29            1.6     0.021
## 828            7.4             0.24        0.36            2.0     0.031
## 877            6.9             0.36        0.34            4.2     0.018
## 1606           7.1             0.26        0.49            2.2     0.032
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 775                   28                  124 0.99700 3.20      0.46    10.4
## 821                   24                   85 0.98965 3.41      0.61    12.4
## 828                   27                  139 0.99055 3.28      0.48    12.5
## 877                   57                  119 0.98980 3.28      0.36    12.7
## 1606                  31                  113 0.99030 3.37      0.42    12.9
##      quality
## 775        9
## 821        9
## 828        9
## 877        9
## 1606       9
set.seed(1)
index.w = sample(1:nrow(whitewine), 0.8 * nrow(whitewine))
train.w = whitewine[index.w, ]
test.w = whitewine[-index.w, ]

Analysis

for (name in setdiff(names(whitewine), c("quality"))) {
  #par(mfrow = c(1,2))
  formula = as.formula(paste(name, "~ quality"))
  boxplot(formula, data = whitewine)
}

The predictors separate the quality levels much less clearly here than they did for red wine.

Regression

Test with all data

white.model = lm(quality ~ ., data = train.w) 
summary(white.model)
## 
## Call:
## lm(formula = quality ~ ., data = train.w)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8815 -0.4852 -0.0325  0.4635  2.9742 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.578e+02  2.024e+01   7.796 8.16e-15 ***
## fixed.acidity         6.680e-02  2.290e-02   2.918  0.00355 ** 
## volatile.acidity     -1.812e+00  1.271e-01 -14.254  < 2e-16 ***
## citric.acid           3.447e-03  1.072e-01   0.032  0.97436    
## residual.sugar        8.519e-02  8.203e-03  10.386  < 2e-16 ***
## chlorides            -2.723e-01  6.017e-01  -0.452  0.65095    
## free.sulfur.dioxide   4.067e-03  9.319e-04   4.364 1.31e-05 ***
## total.sulfur.dioxide -4.189e-04  4.230e-04  -0.990  0.32208    
## density              -1.579e+02  2.053e+01  -7.691 1.83e-14 ***
## pH                    7.043e-01  1.170e-01   6.020 1.91e-09 ***
## sulphates             6.986e-01  1.127e-01   6.197 6.33e-10 ***
## alcohol               1.843e-01  2.622e-02   7.031 2.40e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.755 on 3906 degrees of freedom
## Multiple R-squared:  0.283,  Adjusted R-squared:  0.281 
## F-statistic: 140.1 on 11 and 3906 DF,  p-value: < 2.2e-16
#plot(white.model)
pred.w = predict(white.model, newdata = test.w, type='response') 
pred.round.w = round(pred.w) 
actual.w = test.w$quality 
RMSE.w = sqrt(mean((pred.w - actual.w)^2)) 
RMSE.w
## [1] 0.7374623
white.lm.table = table(Actual = actual.w, Predicted = pred.round.w)
white.lm.table
##       Predicted
## Actual   4   5   6   7
##      3   0   0   2   0
##      4   0  14   6   1
##      5   1 130 168   2
##      6   0  66 326  42
##      7   0   9 130  47
##      8   0   1  20  13
##      9   0   0   1   1
exact = sum(pred.round.w == actual.w) / nrow(test.w) 
within.1 = sum(abs(pred.round.w - actual.w) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.5132653
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9561224
lm.results.w = data.frame(actual = test.w$quality, predicted = pred.round.w)

for (i in 3:8){
  rating.i = subset(lm.results.w, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.5681063 
## Error when rating =  6 : 0.2488479 
## Error when rating =  7 : 0.7473118 
## Error when rating =  8 : 1

Test with selected polynomial terms

white.model.2 = lm(quality ~ poly(fixed.acidity,2) + poly(volatile.acidity,3) + 
            residual.sugar + free.sulfur.dioxide + poly(density,2) + 
            pH + sulphates + poly(alcohol,3), data = train.w)
summary(white.model.2)
## 
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + poly(volatile.acidity, 
##     3) + residual.sugar + free.sulfur.dioxide + poly(density, 
##     2) + pH + sulphates + poly(alcohol, 3), data = train.w)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7865 -0.4886 -0.0201  0.4791  2.4358 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.407e+00  4.426e-01   3.178 0.001492 ** 
## poly(fixed.acidity, 2)1     7.410e+00  1.333e+00   5.558 2.92e-08 ***
## poly(fixed.acidity, 2)2    -3.622e+00  7.664e-01  -4.726 2.37e-06 ***
## poly(volatile.acidity, 3)1 -1.187e+01  7.876e-01 -15.076  < 2e-16 ***
## poly(volatile.acidity, 3)2  1.979e+00  7.793e-01   2.540 0.011136 *  
## poly(volatile.acidity, 3)3 -2.554e+00  7.684e-01  -3.324 0.000897 ***
## residual.sugar              1.058e-01  9.056e-03  11.681  < 2e-16 ***
## free.sulfur.dioxide         3.517e-03  7.442e-04   4.726 2.38e-06 ***
## poly(density, 2)1          -4.332e+01  4.438e+00  -9.760  < 2e-16 ***
## poly(density, 2)2           5.109e+00  9.811e-01   5.207 2.02e-07 ***
## pH                          1.030e+00  1.232e-01   8.364  < 2e-16 ***
## sulphates                   7.766e-01  1.125e-01   6.904 5.88e-12 ***
## poly(alcohol, 3)1           6.064e+00  2.487e+00   2.438 0.014796 *  
## poly(alcohol, 3)2           2.954e+00  8.091e-01   3.651 0.000265 ***
## poly(alcohol, 3)3          -1.658e+00  7.702e-01  -2.153 0.031387 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7468 on 3903 degrees of freedom
## Multiple R-squared:  0.299,  Adjusted R-squared:  0.2965 
## F-statistic: 118.9 on 14 and 3903 DF,  p-value: < 2.2e-16
#plot(white.model.2)
pred2.w = predict(white.model.2, newdata = test.w, type='response') 
pred2.round.w = round(pred2.w) 
actual.w = test.w$quality 
RMSE2.w = sqrt(mean((pred2.w - actual.w)^2)) 
RMSE2.w
## [1] 0.7286815
white.lm.table.2 = table(Actual = actual.w, Predicted = pred2.round.w)
white.lm.table.2
##       Predicted
## Actual   5   6   7
##      3   0   2   0
##      4  15   5   1
##      5 146 154   1
##      6  67 318  49
##      7   6 123  57
##      8   1  19  14
##      9   0   1   1
lm.2.results.w = data.frame(actual = test.w$quality, predicted = pred2.round.w)

for (i in 4:8){
  rating.i = subset(lm.2.results.w, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.5149502 
## Error when rating =  6 : 0.2672811 
## Error when rating =  7 : 0.6935484 
## Error when rating =  8 : 1
exact = sum(pred2.round.w == actual.w) / nrow(test.w) 
within.1 = sum(abs(pred2.round.w - actual.w) == 1) / nrow(test.w) 

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.5316327
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9622449

Those results are only a modest improvement over the plain linear model.

Trees

Regression Tree

white.tree = tree(quality ~ .,data = train.w) 
summary(white.tree)
## 
## Regression tree:
## tree(formula = quality ~ ., data = train.w)
## Variables actually used in tree construction:
## [1] "alcohol"             "volatile.acidity"    "free.sulfur.dioxide"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.5906 = 2311 / 3913 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.5910 -0.3539 -0.1841  0.0000  0.6461  2.6460
plot(white.tree) 
text(white.tree, pretty = 0, cex = 0.7)

white.cv.tree = cv.tree(white.tree)
plot(white.cv.tree$size, white.cv.tree$dev, type='b')

white.tree.pred = predict(white.tree, newdata = test.w) 
white.tree.pred.r = round(white.tree.pred) 
tree.RMSE = sqrt(mean((white.tree.pred - test.w$quality)^2)) 
tree.RMSE
## [1] 0.7402891
white.tree.table = table(Actual = test.w$quality, Predicted = white.tree.pred.r)
white.tree.table
##       Predicted
## Actual   5   6   7
##      3   0   2   0
##      4  14   5   2
##      5 190 107   4
##      6 112 244  78
##      7  14  96  76
##      8   0  13  21
##      9   1   0   1
exact = sum(white.tree.pred.r == test.w$quality) / nrow(test.w) 
within.1 = sum(abs(white.tree.pred.r - test.w$quality) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.5204082
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9571429
tree.results.w = data.frame(actual = test.w$quality, 
                            predicted = white.tree.pred.r) 

for (i in 3:8){
  rating.i = subset(tree.results.w, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 1 
## Error when rating =  5 : 0.3687708 
## Error when rating =  6 : 0.437788 
## Error when rating =  7 : 0.5913978 
## Error when rating =  8 : 1

Random Forest

Default Trees

As with red wine, we should try different numbers of trees and mtry values; the sweep in the next subsection does that.

rf.white = randomForest(quality ~ .- citric.acid - chlorides -
                        total.sulfur.dioxide, data = train.w, importance=TRUE) 
varImpPlot(rf.white)

rf.white.pred = predict(rf.white, newdata = test.w) 
rf.white.pred.r = round(rf.white.pred) 
rf.white.rmse = sqrt(mean((rf.white.pred - test.w$quality)^2)) 
rf.white.rmse
## [1] 0.5888631
white.rf.table = table(Actual = test.w$quality, Predicted = rf.white.pred.r)
white.rf.table
##       Predicted
## Actual   4   5   6   7   8
##      3   0   0   2   0   0
##      4   1  15   4   1   0
##      5   0 201  99   1   0
##      6   0  43 362  29   0
##      7   0   1  83 102   0
##      8   0   0   7  24   3
##      9   0   1   0   1   0
exact = sum(rf.white.pred.r == test.w$quality) / nrow(test.w) 
within.1 = sum(abs(rf.white.pred.r - test.w$quality) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.6826531
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9816327
rf.white.results = data.frame(actual = test.w$quality, 
                              predicted = rf.white.pred.r) 

for (i in 3:8){
  rating.i = subset(rf.white.results, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 0.952381 
## Error when rating =  5 : 0.3322259 
## Error when rating =  6 : 0.1658986 
## Error when rating =  7 : 0.4516129 
## Error when rating =  8 : 0.9117647

Different values

# predefining variables 
p = ncol(whitewine) - 1 
m = c(p, floor(p / 2), floor(sqrt(p))) 
names(m) = c("p", "p/2", "sqrt(p)") 
ntrees = seq(1, 150, by = 5)  

# to store the errors 
results <- data.frame()  
for (j in 1:3) {   
  for (i in seq_along(ntrees)) {     
    model = randomForest(quality ~ ., data = train.w, mtry = m[j],
                         ntree = ntrees[i], keep.forest = TRUE)          
    preds = predict(model, newdata = test.w)     
    mse = mean((preds - test.w$quality)^2)          
    results = rbind(results, data.frame(ntree = ntrees[i],MSE = mse,mtry = m[j])) 
  } 
}
plot(x = results[results$mtry == 11, ]$ntree,       
     y = results[results$mtry == 11, ]$MSE,       
     col='blue', type = 'l',      
     xlab = 'number of trees', ylab = 'MSE',      
     #ylim = c(0.2,0.8)
     ) 
lines(x = results[results$mtry == 5, ]$ntree,       
      y = results[results$mtry == 5, ]$MSE,       
      col='red', type = 'l') 
lines(x = results[results$mtry == 3, ]$ntree,       
      y = results[results$mtry == 3, ]$MSE,       
      col='darkgreen', type = 'l') 
legend("topright", legend = names(m),        
       col = c("blue", "red", "darkgreen"),         
       lwd = 2, title = "mtry")

Here mtry = sqrt(p) gives the lowest test MSE, and a larger number of trees continues to help.

rf.white.best = randomForest(quality ~ ., data = train.w, mtry = 3,
                             ntree = 150, keep.forest = TRUE) 
rf.white.pred.best = predict(rf.white.best, newdata = test.w) 
rf.white.pred.best.r = round(rf.white.pred.best) 
rf.white.rmse = sqrt(mean((rf.white.pred.best - test.w$quality)^2)) 
rf.white.rmse
## [1] 0.5885784
white.rf.table.2 = table(Actual = test.w$quality, Predicted = rf.white.pred.best.r)
white.rf.table.2
##       Predicted
## Actual   4   5   6   7   8
##      3   0   0   2   0   0
##      4   1  14   6   0   0
##      5   0 207  93   1   0
##      6   0  40 369  25   0
##      7   0   2  77 107   0
##      8   0   0  13  17   4
##      9   0   0   1   1   0
exact = sum(rf.white.pred.best.r == test.w$quality) / nrow(test.w) 
within.1 = sum(abs(rf.white.pred.best.r - test.w$quality) == 1) / nrow(test.w)  

cat("Percent of test data correclty labeled:", exact, '\n') 
## Percent of test data correclty labeled: 0.7020408
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9734694
rf.white.best.res = data.frame(actual = test.w$quality, 
                               predicted = rf.white.pred.best.r) 

for (i in 3:8){
  rating.i = subset(rf.white.best.res, actual == i)
  n = nrow(rating.i)
  error = (n - sum(rating.i$actual == rating.i$predicted)) / n
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating =  3 : 1 
## Error when rating =  4 : 0.952381 
## Error when rating =  5 : 0.3122924 
## Error when rating =  6 : 0.1497696 
## Error when rating =  7 : 0.4247312 
## Error when rating =  8 : 0.8823529

KNN Classification

# excluded variables
excluded.var = c("quality", "citric.acid", "chlorides", "total.sulfur.dioxide")

# set up variables
x.train.w = train.w[, -which(names(train.w) %in% excluded.var)]
y.train.w = train.w$quality
x.test.w = test.w[, -which(names(test.w) %in% excluded.var)]
y.test.w = test.w$quality
kval = c(1,2,3,5,10,15,20,25)
for (val in kval){
  set.seed(1)
  knn.pred = knn(x.train.w, x.test.w, y.train.w, k = val)
  cat("For k = ", val,":")
  print(table(Actual = y.test.w, Predicted = knn.pred))
  cat("Correct %:", mean(knn.pred == y.test.w), "\n")
  cat("Correct % +/- 1:", mean(abs(as.integer(knn.pred) - as.integer(y.test.w)) == 1)
      + mean(knn.pred == y.test.w), "\n")
}
## For k =  1 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   2   0   0   0   0
##      4   0   7   9   4   1   0   0
##      5   0  14 175  91  18   3   0
##      6   0   5  75 292  53   9   0
##      7   0   3  15  56 100  12   0
##      8   0   0   4  12   7  11   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.5969388 
## Correct % +/- 1: 0.7693878 
## For k =  2 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   1   1   0   0   0   0
##      4   0   0  13   4   4   0   0
##      5   0  12 145 113  26   5   0
##      6   0   6 109 247  61  11   0
##      7   0   2  21  76  80   7   0
##      8   0   0   9   7   8  10   0
##      9   0   1   0   1   0   0   0
## Correct %: 0.4918367 
## Correct % +/- 1: 0.7 
## For k =  3 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   1   0   1   0   0   0   0
##      4   0   4   8   8   1   0   0
##      5   0  11 155 113  20   2   0
##      6   0   6 105 246  70   7   0
##      7   0   1  18  79  75  13   0
##      8   0   0   5  12  10   7   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.4979592 
## Correct % +/- 1: 0.7091837 
## For k =  5 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   2   0   0   0   0
##      4   0   2   9   9   1   0   0
##      5   0   8 157 113  19   4   0
##      6   0   4 108 247  66   9   0
##      7   0   0  14 100  65   7   0
##      8   0   0   4  15   9   6   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.4867347 
## Correct % +/- 1: 0.6908163 
## For k =  10 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   3  11   6   1   0   0
##      5   0   4 137 143  12   5   0
##      6   0   3  98 278  51   4   0
##      7   0   0  17  96  67   6   0
##      8   0   0   4  17  10   3   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.4979592 
## Correct % +/- 1: 0.7214286 
## For k =  15 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   1   1   0   0   0
##      4   0   1  13   7   0   0   0
##      5   0   5 146 136   9   5   0
##      6   0   2  97 298  35   2   0
##      7   0   0  16 112  55   3   0
##      8   0   0   3  19  10   2   0
##      9   0   0   0   1   1   0   0
## Correct %: 0.5122449 
## Correct % +/- 1: 0.7091837 
## For k =  20 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   2  11   8   0   0   0
##      5   0   2 141 149   9   0   0
##      6   0   3  89 314  27   1   0
##      7   0   0  14 121  48   3   0
##      8   0   0   4  22   6   2   0
##      9   0   0   0   1   1   0   0
## Correct %: 0.5173469 
## Correct % +/- 1: 0.7132653 
## For k =  25 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   3  11   7   0   0   0
##      5   0   4 139 153   5   0   0
##      6   0   1 101 306  25   1   0
##      7   0   0  16 129  41   0   0
##      8   0   0   7  20   6   1   0
##      9   0   0   0   2   0   0   0
## Correct %: 0.5 
## Correct % +/- 1: 0.694898
knn.pred = knn(x.train.w, x.test.w, y.train.w, k = 25)  # make the k explicit; note k = 1 scored highest above
white.knn.table = table(Actual = y.test.w, Predicted = knn.pred)
white.knn.table
##       Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   0   2   0   0   0
##      4   0   2  12   7   0   0   0
##      5   0   4 136 156   5   0   0
##      6   0   1  98 310  25   0   0
##      7   0   0  18 127  39   2   0
##      8   0   0   8  20   5   1   0
##      9   0   0   0   2   0   0   0

KNN Standardized

# make a standardized version: scale all predictors, leave quality untouched
whitewine.standard = whitewine 
whitewine.standard[ , -which(names(whitewine) == "quality")] = scale(whitewine[ , -which(names(whitewine) == "quality")])  

train.knn = whitewine.standard[index.w, ] 
test.knn = whitewine.standard[-index.w, ]
x.train = train.knn[, -which(names(train.knn) %in% excluded.var)]  
y.train = train.knn$quality 
x.test = test.knn[, -which(names(test.knn) %in% excluded.var)]  
y.test = test.knn$quality
kval = c(1,2,3,5,10,15,20,25,30,40,50)  
for (val in kval){      
  set.seed(1)      
  knn.pred = knn(x.train, x.test, y.train, k = val)      
  cat("For k = ", val,":")      
  print(table(Actual = y.test, Predicted = knn.pred))      
  correct = mean(knn.pred == y.test)   
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)      
  
  cat("Correct %:", correct, "\n")      
  cat("Correct % +/- 1:", correct + pm1, "\n")  
}
## For k =  1 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   2   6   3   0   0   0
##      4   1  15  32  48  10   2   0
##      5   1  35 452 453 112  19   0
##      6   4  51 353 760 398  87   1
##      7   1   6  60 237 282  59   1
##      8   0   2  10  41  58  14   0
##      9   0   0   0   1   1   0   1
## Correct %: 0.4211108 
## Correct % +/- 1: 0.6910749 
## For k =  2 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   3   5   3   0   0   0
##      4   0  14  32  49   8   5   0
##      5   2  47 427 456 124  16   0
##      6   4  28 394 734 410  82   2
##      7   1   5  64 248 279  49   0
##      8   0   3   9  44  51  18   0
##      9   0   0   0   2   1   0   0
## Correct %: 0.4067422 
## Correct % +/- 1: 0.6772589 
## For k =  3 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   7   4   0   0   0
##      4   0  11  40  44   8   5   0
##      5   0  28 416 485 130  13   0
##      6   3  27 369 765 423  66   1
##      7   2   2  57 240 287  56   2
##      8   0   1   5  42  63  14   0
##      9   0   1   0   0   1   0   1
## Correct %: 0.4128212 
## Correct % +/- 1: 0.6974302 
## For k =  5 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   7   4   0   0   0
##      4   0   7  46  46   6   3   0
##      5   0  15 403 511 132  11   0
##      6   0   7 322 843 440  42   0
##      7   0   3  41 270 294  38   0
##      8   0   0   4  46  61  14   0
##      9   0   0   0   2   1   0   0
## Correct %: 0.4313346 
## Correct % +/- 1: 0.7231279 
## For k =  10 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   6   5   0   0   0
##      4   0   2  50  46   9   1   0
##      5   0   3 427 527 112   3   0
##      6   0   1 294 936 416   7   0
##      7   0   0  24 308 297  17   0
##      8   0   0   6  53  61   5   0
##      9   0   0   0   1   2   0   0
## Correct %: 0.4606245 
## Correct % +/- 1: 0.7444045 
## For k =  15 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   8   3   0   0   0
##      4   0   1  51  45  11   0   0
##      5   0   3 444 544  79   2   0
##      6   0   0 307 970 376   1   0
##      7   0   0  31 323 290   2   0
##      8   0   0   1  60  64   0   0
##      9   0   0   0   2   1   0   0
## Correct %: 0.4711246 
## Correct % +/- 1: 0.7444045 
## For k =  20 :      Predicted
## Actual   3   4   5   6   7   8   9
##      3   0   0   6   5   0   0   0
##      4   0   0  53  43  12   0   0
##      5   0   3 440 562  67   0   0
##      6   0   0 297 984 372   1   0
##      7   0   0  30 326 289   1   0
##      8   0   0   4  59  62   0   0
##      9   0   0   0   1   2   0   0
## Correct %: 0.4733352 
## Correct % +/- 1: 0.7510362 
## For k =  25 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    5    6    0    0    0
##      4    0    0   52   44   12    0    0
##      5    0    2  471  543   56    0    0
##      6    0    2  275 1005  372    0    0
##      7    0    0   26  325  295    0    0
##      8    0    0    2   59   64    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4893617 
## Correct % +/- 1: 0.7615363 
## For k =  30 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    6    5    0    0    0
##      4    0    0   51   44   13    0    0
##      5    0    0  470  551   51    0    0
##      6    0    0  289 1017  347    1    0
##      7    0    0   30  327  289    0    0
##      8    0    0    3   62   60    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4907433 
## Correct % +/- 1: 0.7579442 
## For k =  40 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    6    5    0    0    0
##      4    0    0   54   42   12    0    0
##      5    0    0  456  579   37    0    0
##      6    0    0  289 1049  316    0    0
##      7    0    0   29  344  273    0    0
##      8    0    0    2   60   63    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4912959 
## Correct % +/- 1: 0.7582205 
## For k =  50 :      Predicted
## Actual    3    4    5    6    7    8    9
##      3    0    0    5    5    1    0    0
##      4    0    0   56   41   11    0    0
##      5    0    0  456  578   38    0    0
##      6    0    0  286 1063  305    0    0
##      7    0    0   30  358  258    0    0
##      8    0    0    2   64   59    0    0
##      9    0    0    0    1    2    0    0
## Correct %: 0.4910196 
## Correct % +/- 1: 0.7549047

Naive Bayes

nb.white = naiveBayes(quality ~ . - citric.acid - chlorides -
                        total.sulfur.dioxide, data = train.w)
nb.white.class = predict(nb.white, test.w)
white.nb.table = table(Actual = test.w$quality, Predicted = nb.white.class)
white.nb.table
##       Predicted
## Actual   3   4   5   6   7   8   9
##      3   1   0   0   1   0   0   0
##      4   0   3   7   9   2   0   0
##      5   1  13 175 105   6   1   0
##      6   0   3 148 191  89   0   3
##      7   0   0  26  64  88   5   3
##      8   0   1   4   9  20   0   0
##      9   0   0   1   0   1   0   0
print(mean(test.w$quality == nb.white.class))
## [1] 0.4673469

Results

Aggregate Results

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
test.table = as.data.frame(red.lm.table)
test.table$Correct = ifelse(as.character(test.table$Actual) == 
                              as.character(test.table$Predicted),
                          "Correct", "Incorrect")
names(test.table)
## [1] "Actual"    "Predicted" "Freq"      "Correct"
t1 = ggplot(test.table, aes(x = factor(Actual), y = Freq, 
                       fill = factor(Correct, levels = c("Correct", "Incorrect")))) +
  geom_bar(stat = "identity", position = "stack") +  # 'dodge' for side-by-side bars
  labs(x = "Actual Label", y = "Count", fill = "Prediction Status") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink")) +
  theme(legend.position = "none")

t2 = ggplot(test.table, aes(x = factor(Predicted), y = Freq, 
                       fill = factor(Correct, levels = c("Correct", "Incorrect")))) +
  geom_bar(stat = "identity", position = "stack") +  # 'dodge' for side-by-side bars
  labs(x = "Predicted Label", y = "Count", fill = "Prediction Status") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

grid.arrange(t1, t2, ncol = 2,
             top = "LM Precision and Recall")

Attempt to put the models next to each other

# values
# red.lm.table, red.tree.table, red.rf.table.2, red.knn.table
lm.table = as.data.frame(red.lm.table)
lm.table$model = "Regression"
tree.table = as.data.frame(red.tree.table)
tree.table$model = "Tree"
rf.table = as.data.frame(red.rf.table.2)
rf.table$model = "Random Forest"
knn.table = as.data.frame(red.knn.table)
knn.table$model = "KNN"
list.df = list(lm.table, tree.table, rf.table, knn.table)
red.df = do.call(rbind, list.df)
red.df$Correct = ifelse(as.character(red.df$Actual) == 
                              as.character(red.df$Predicted),
                          "Correct", "Incorrect")
ggplot(red.df, aes(x = factor(Actual), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Actual Red Wine Rating", y = "Count", fill = "Prediction Status",
       title = "How Well Each Model Predicts Rating by Rating") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

ggplot(red.df, aes(x = factor(Predicted, level = c(3:8)), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Predicted Red Wine Rating", y = "Count", fill = "Prediction Status",
       title = "Per-Class Recall for Red Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

White

# values
# white.lm.table, white.tree.table, white.rf.table.2, white.knn.table
lm.table = as.data.frame(white.lm.table)
lm.table$model = "Regression"
tree.table = as.data.frame(white.tree.table)
tree.table$model = "Tree"
rf.table = as.data.frame(white.rf.table.2)
rf.table$model = "Random Forest"
knn.table = as.data.frame(white.knn.table)
knn.table$model = "KNN"
list.df = list(lm.table, tree.table, rf.table, knn.table)
white.df = do.call(rbind, list.df)
white.df$Correct = ifelse(as.character(white.df$Actual) == 
                              as.character(white.df$Predicted),
                          "Correct", "Incorrect")
ggplot(white.df, aes(x = factor(Actual), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Actual White Wine Rating", y = "Count", fill = "Prediction Status",
       title = "Per-Class Recall for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

white.df
##     Actual Predicted Freq         model   Correct
## 1        3         4    0    Regression Incorrect
## 2        4         4    0    Regression   Correct
## 3        5         4    1    Regression Incorrect
## 4        6         4    0    Regression Incorrect
## 5        7         4    0    Regression Incorrect
## 6        8         4    0    Regression Incorrect
## 7        9         4    0    Regression Incorrect
## 8        3         5    0    Regression Incorrect
## 9        4         5   14    Regression Incorrect
## 10       5         5  130    Regression   Correct
## 11       6         5   66    Regression Incorrect
## 12       7         5    9    Regression Incorrect
## 13       8         5    1    Regression Incorrect
## 14       9         5    0    Regression Incorrect
## 15       3         6    2    Regression Incorrect
## 16       4         6    6    Regression Incorrect
## 17       5         6  168    Regression Incorrect
## 18       6         6  326    Regression   Correct
## 19       7         6  130    Regression Incorrect
## 20       8         6   20    Regression Incorrect
## 21       9         6    1    Regression Incorrect
## 22       3         7    0    Regression Incorrect
## 23       4         7    1    Regression Incorrect
## 24       5         7    2    Regression Incorrect
## 25       6         7   42    Regression Incorrect
## 26       7         7   47    Regression   Correct
## 27       8         7   13    Regression Incorrect
## 28       9         7    1    Regression Incorrect
## 29       3         5    0          Tree Incorrect
## 30       4         5   14          Tree Incorrect
## 31       5         5  190          Tree   Correct
## 32       6         5  112          Tree Incorrect
## 33       7         5   14          Tree Incorrect
## 34       8         5    0          Tree Incorrect
## 35       9         5    1          Tree Incorrect
## 36       3         6    2          Tree Incorrect
## 37       4         6    5          Tree Incorrect
## 38       5         6  107          Tree Incorrect
## 39       6         6  244          Tree   Correct
## 40       7         6   96          Tree Incorrect
## 41       8         6   13          Tree Incorrect
## 42       9         6    0          Tree Incorrect
## 43       3         7    0          Tree Incorrect
## 44       4         7    2          Tree Incorrect
## 45       5         7    4          Tree Incorrect
## 46       6         7   78          Tree Incorrect
## 47       7         7   76          Tree   Correct
## 48       8         7   21          Tree Incorrect
## 49       9         7    1          Tree Incorrect
## 50       3         4    0 Random Forest Incorrect
## 51       4         4    1 Random Forest   Correct
## 52       5         4    0 Random Forest Incorrect
## 53       6         4    0 Random Forest Incorrect
## 54       7         4    0 Random Forest Incorrect
## 55       8         4    0 Random Forest Incorrect
## 56       9         4    0 Random Forest Incorrect
## 57       3         5    0 Random Forest Incorrect
## 58       4         5   14 Random Forest Incorrect
## 59       5         5  207 Random Forest   Correct
## 60       6         5   40 Random Forest Incorrect
## 61       7         5    2 Random Forest Incorrect
## 62       8         5    0 Random Forest Incorrect
## 63       9         5    0 Random Forest Incorrect
## 64       3         6    2 Random Forest Incorrect
## 65       4         6    6 Random Forest Incorrect
## 66       5         6   93 Random Forest Incorrect
## 67       6         6  369 Random Forest   Correct
## 68       7         6   77 Random Forest Incorrect
## 69       8         6   13 Random Forest Incorrect
## 70       9         6    1 Random Forest Incorrect
## 71       3         7    0 Random Forest Incorrect
## 72       4         7    0 Random Forest Incorrect
## 73       5         7    1 Random Forest Incorrect
## 74       6         7   25 Random Forest Incorrect
## 75       7         7  107 Random Forest   Correct
## 76       8         7   17 Random Forest Incorrect
## 77       9         7    1 Random Forest Incorrect
## 78       3         8    0 Random Forest Incorrect
## 79       4         8    0 Random Forest Incorrect
## 80       5         8    0 Random Forest Incorrect
## 81       6         8    0 Random Forest Incorrect
## 82       7         8    0 Random Forest Incorrect
## 83       8         8    4 Random Forest   Correct
## 84       9         8    0 Random Forest Incorrect
## 85       3         3    0           KNN   Correct
## 86       4         3    0           KNN Incorrect
## 87       5         3    0           KNN Incorrect
## 88       6         3    0           KNN Incorrect
## 89       7         3    0           KNN Incorrect
## 90       8         3    0           KNN Incorrect
## 91       9         3    0           KNN Incorrect
## 92       3         4    0           KNN Incorrect
## 93       4         4    2           KNN   Correct
## 94       5         4    4           KNN Incorrect
## 95       6         4    1           KNN Incorrect
## 96       7         4    0           KNN Incorrect
## 97       8         4    0           KNN Incorrect
## 98       9         4    0           KNN Incorrect
## 99       3         5    0           KNN Incorrect
## 100      4         5   12           KNN Incorrect
## 101      5         5  136           KNN   Correct
## 102      6         5   98           KNN Incorrect
## 103      7         5   18           KNN Incorrect
## 104      8         5    8           KNN Incorrect
## 105      9         5    0           KNN Incorrect
## 106      3         6    2           KNN Incorrect
## 107      4         6    7           KNN Incorrect
## 108      5         6  156           KNN Incorrect
## 109      6         6  310           KNN   Correct
## 110      7         6  127           KNN Incorrect
## 111      8         6   20           KNN Incorrect
## 112      9         6    2           KNN Incorrect
## 113      3         7    0           KNN Incorrect
## 114      4         7    0           KNN Incorrect
## 115      5         7    5           KNN Incorrect
## 116      6         7   25           KNN Incorrect
## 117      7         7   39           KNN   Correct
## 118      8         7    5           KNN Incorrect
## 119      9         7    0           KNN Incorrect
## 120      3         8    0           KNN Incorrect
## 121      4         8    0           KNN Incorrect
## 122      5         8    0           KNN Incorrect
## 123      6         8    0           KNN Incorrect
## 124      7         8    2           KNN Incorrect
## 125      8         8    1           KNN   Correct
## 126      9         8    0           KNN Incorrect
## 127      3         9    0           KNN Incorrect
## 128      4         9    0           KNN Incorrect
## 129      5         9    0           KNN Incorrect
## 130      6         9    0           KNN Incorrect
## 131      7         9    0           KNN Incorrect
## 132      8         9    0           KNN Incorrect
## 133      9         9    0           KNN   Correct
ggplot(white.df, aes(x = factor(Actual), y = Freq, fill=model)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Correct, ncol = 4) +
  labs(x = "Actual White Wine Rating", y = "Count", fill = "Model",
       title = "Per-Class Recall for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("pink", "lightgreen", "skyblue", "purple"))

ggplot(white.df, aes(x = factor(Predicted, level = c(4:8)), y = Freq, fill=Correct)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ model, ncol = 4) +
  labs(x = "Predicted White Wine Rating", y = "Count", fill = "Prediction Status",
       title = "Per-Class Precision for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "pink"))

ggplot(white.df, aes(x = factor(Predicted, level = c(4:8)), y = Freq, fill=model)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Correct, ncol = 2) +
  labs(x = "Predicted White Wine Rating", y = "Count", fill = "Model",
       title = "Per-Class Precision for White Wine") +
  theme_minimal() +
  scale_fill_manual(values = c("pink", "lightgreen", "skyblue", "purple"))

9 shows up as NA in the precision plots because factor(Predicted, level = c(4:8)) maps any predicted rating outside 4 to 8 (here 3 and 9) to NA.
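
A sketch of the fix, widening the level set so every observed rating keeps its own bar:

# include every rating that occurs in either Actual or Predicted
white.df$Predicted = factor(white.df$Predicted, levels = 3:9)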

nrow(whitewine)
## [1] 4898
nrow(redwine)
## [1] 1599