library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(class)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
redwine = read.csv("winequality-red.csv", sep=';', header=T)
names(redwine)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
# optional: a binary high-quality indicator; left unused for now
#redwine$high = ifelse(redwine$quality > 6, 1, 0)
set.seed(1)
index = sample(1:nrow(redwine), 0.8 * nrow(redwine))
train = redwine[index, ]
test = redwine[-index, ]
We could summarize the data further here, but the boxplots below already explore each predictor against quality.
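One quick check worth adding first is how imbalanced the ratings are, since rare ratings will be hard for any model to predict. A minimal base-R tabulation:
# distribution of quality ratings; the extremes (3, 4, 8) are rare
table(redwine$quality)
prop.table(table(redwine$quality))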
for (name in setdiff(names(redwine), c("quality"))) {
#par(mfrow = c(1,2))
formula = as.formula(paste(name, "~ quality"))
boxplot(formula, data = redwine)
}
Most predictors show some separation across quality levels, though often only slight; a few, like ‘alcohol’, differ substantially.
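To put a number on that impression, we could compare group means (a quick sketch using base R):
# mean alcohol by quality rating; the upward trend matches the boxplots
aggregate(alcohol ~ quality, data = redwine, FUN = mean)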
red.model = lm(quality ~ ., data = train)
summary(red.model)
##
## Call:
## lm(formula = quality ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.52213 -0.35627 -0.04738 0.44215 2.00517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.342e+01 2.323e+01 1.008 0.313441
## fixed.acidity 2.725e-02 2.862e-02 0.952 0.341210
## volatile.acidity -1.032e+00 1.353e-01 -7.625 4.77e-14 ***
## citric.acid -1.857e-01 1.626e-01 -1.142 0.253523
## residual.sugar 2.223e-02 1.623e-02 1.370 0.170964
## chlorides -1.456e+00 4.849e-01 -3.002 0.002731 **
## free.sulfur.dioxide 4.483e-03 2.452e-03 1.828 0.067725 .
## total.sulfur.dioxide -3.086e-03 8.191e-04 -3.767 0.000173 ***
## density -1.976e+01 2.372e+01 -0.833 0.404986
## pH -3.457e-01 2.165e-01 -1.597 0.110515
## sulphates 9.192e-01 1.257e-01 7.311 4.69e-13 ***
## alcohol 2.834e-01 2.909e-02 9.741 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6474 on 1267 degrees of freedom
## Multiple R-squared: 0.3536, Adjusted R-squared: 0.348
## F-statistic: 63.02 on 11 and 1267 DF, p-value: < 2.2e-16
plot(red.model)
pred = predict(red.model, newdata = test, type='response')
pred.round = round(pred)
actual = test$quality
RMSE = sqrt(mean((pred - actual)^2))
RMSE
## [1] 0.6530574
red.lm.table = table(Actual = actual, Predicted = pred.round)
red.lm.table
## Predicted
## Actual 4 5 6 7
## 3 1 2 1 0
## 4 0 6 5 0
## 5 0 97 28 0
## 6 0 47 84 4
## 7 0 2 28 12
## 8 0 0 2 1
exact = sum(pred.round == actual) / nrow(test)
within.1 = sum(abs(pred.round - actual) == 1) / nrow(test)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.603125
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9625
red.model.2 = lm(quality ~ poly(fixed.acidity,2) + volatile.acidity
+ residual.sugar + chlorides +
total.sulfur.dioxide + density +
poly(sulphates,2) + poly(alcohol,3), data = train)
summary(red.model.2)
##
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + volatile.acidity +
## residual.sugar + chlorides + total.sulfur.dioxide + density +
## poly(sulphates, 2) + poly(alcohol, 3), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40440 -0.37319 -0.05082 0.44524 2.01917
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.273e+01 1.961e+01 3.709 0.000217 ***
## poly(fixed.acidity, 2)1 3.593e+00 1.045e+00 3.437 0.000606 ***
## poly(fixed.acidity, 2)2 -1.524e+00 6.884e-01 -2.214 0.027005 *
## volatile.acidity -8.275e-01 1.149e-01 -7.201 1.02e-12 ***
## residual.sugar 3.795e-02 1.504e-02 2.522 0.011786 *
## chlorides -1.531e+00 4.581e-01 -3.342 0.000855 ***
## total.sulfur.dioxide -1.770e-03 5.858e-04 -3.021 0.002571 **
## density -6.676e+01 1.971e+01 -3.388 0.000726 ***
## poly(sulphates, 2)1 6.297e+00 7.511e-01 8.383 < 2e-16 ***
## poly(sulphates, 2)2 -3.969e+00 6.827e-01 -5.813 7.74e-09 ***
## poly(alcohol, 3)1 8.908e+00 9.742e-01 9.145 < 2e-16 ***
## poly(alcohol, 3)2 -4.895e-01 6.658e-01 -0.735 0.462380
## poly(alcohol, 3)3 -1.518e+00 6.594e-01 -2.301 0.021529 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6376 on 1266 degrees of freedom
## Multiple R-squared: 0.3735, Adjusted R-squared: 0.3676
## F-statistic: 62.9 on 12 and 1266 DF, p-value: < 2.2e-16
plot(red.model.2)
pred2 = predict(red.model.2, newdata = test, type='response')
pred2.round = round(pred2)
actual = test$quality
RMSE2 = sqrt(mean((pred2 - actual)^2))
RMSE2
## [1] 0.6384957
red.lm.table.2 = table(Actual = actual, Predicted = pred2.round)
red.lm.table.2
## Predicted
## Actual 4 5 6 7
## 3 1 2 1 0
## 4 2 5 4 0
## 5 0 98 27 0
## 6 0 47 81 7
## 7 0 3 22 17
## 8 0 0 2 1
lm.2.results = data.frame(actual = test$quality, predicted = pred2.round)
for (i in 3:8){
  sub = subset(lm.2.results, actual == i)
  error = mean(sub$actual != sub$predicted)  # per-rating error rate
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 0.8181818
## Error when rating = 5 : 0.216
## Error when rating = 6 : 0.4
## Error when rating = 7 : 0.5952381
## Error when rating = 8 : 1
exact = sum(pred2.round == actual) / nrow(test)
within.1 = sum(abs(pred2.round - actual) == 1) / nrow(test)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.61875
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9625
red.tree = tree(quality ~ ., data = train)
summary(red.tree)
##
## Regression tree:
## tree(formula = quality ~ ., data = train)
## Variables actually used in tree construction:
## [1] "alcohol" "sulphates" "volatile.acidity"
## [4] "total.sulfur.dioxide"
## Number of terminal nodes: 11
## Residual mean deviance: 0.3927 = 497.9 / 1268
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.1900 -0.2933 -0.1903 0.0000 0.4043 1.9390
plot(red.tree)
text(red.tree, pretty = 0, cex = 0.7)
red.cv.tree = cv.tree(red.tree)
plot(red.cv.tree$size, red.cv.tree$dev, type='b')
red.tree.pred = predict(red.tree, newdata = test)
red.tree.pred.r = round(red.tree.pred)
tree.RMSE = sqrt(mean((red.tree.pred - test$quality)^2))
tree.RMSE
## [1] 0.6929522
red.tree.table = table(Actual = test$quality, Predicted = red.tree.pred.r)
red.tree.table
## Predicted
## Actual 4 5 6 7
## 3 2 1 1 0
## 4 0 6 5 0
## 5 0 96 29 0
## 6 0 57 68 10
## 7 0 2 25 15
## 8 0 0 2 1
exact = sum(red.tree.pred.r == test$quality) / nrow(test)
within.1 = sum(abs(red.tree.pred.r - test$quality) == 1) / nrow(test)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.559375
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.965625
tree.results = data.frame(actual = test$quality, predicted = red.tree.pred.r)
for (i in 3:8){
  sub = subset(tree.results, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 1
## Error when rating = 5 : 0.232
## Error when rating = 6 : 0.4962963
## Error when rating = 7 : 0.6428571
## Error when rating = 8 : 1
The CV curve above suggests which tree sizes are worth trying; pruning accordingly and comparing is a natural next step, sketched below.
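A minimal sketch of that pruning step, reusing the red.cv.tree fit above (illustrative, not tuned):
# prune to the size with the lowest CV deviance
best.size = red.cv.tree$size[which.min(red.cv.tree$dev)]
red.tree.pruned = prune.tree(red.tree, best = best.size)
summary(red.tree.pruned)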
rf.red = randomForest(quality ~ ., data = train, importance=TRUE)
varImpPlot(rf.red)
rf.red.pred = predict(rf.red, newdata = test)
rf.red.pred.r = round(rf.red.pred)
rf.red.rmse = sqrt(mean((rf.red.pred - test$quality)^2))
rf.red.rmse
## [1] 0.5865801
red.rf.table = table(Actual = test$quality, Predicted = rf.red.pred.r)
red.rf.table
## Predicted
## Actual 5 6 7
## 3 4 0 0
## 4 8 3 0
## 5 105 20 0
## 6 34 96 5
## 7 2 21 19
## 8 0 1 2
exact = sum(rf.red.pred.r == test$quality) / nrow(test)
within.1 = sum(abs(rf.red.pred.r - test$quality) == 1) / nrow(test)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.6875
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.96875
rf.red.results = data.frame(actual = test$quality, predicted = rf.red.pred.r)
for (i in 3:8){
  sub = subset(rf.red.results, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 1
## Error when rating = 5 : 0.16
## Error when rating = 6 : 0.2888889
## Error when rating = 7 : 0.547619
## Error when rating = 8 : 1
# predefining variables
p = ncol(redwine) - 1
m = c(p, floor(p / 2), floor(sqrt(p)))
names(m) = c("p", "p/2", "sqrt(p)")
ntrees = seq(1, 150, by = 5)
# to store the errors
results <- data.frame()
for (j in 1:3) {
for (i in seq_along(ntrees)) {
model = randomForest(quality ~ ., data = train, mtry = m[j],
ntree = ntrees[i], keep.forest = TRUE)
preds = predict(model, newdata = test)
mse = mean((preds - test$quality)^2)
results = rbind(results, data.frame(
ntree = ntrees[i],
MSE = mse,
mtry = m[j]
))
}
}
plot(x = results[results$mtry == 11, ]$ntree,
y = results[results$mtry == 11, ]$MSE,
col='blue', type = 'l',
xlab = 'number of trees', ylab = 'MSE',
ylim = c(0.2,0.8))
lines(x = results[results$mtry == 5, ]$ntree,
y = results[results$mtry == 5, ]$MSE,
col='red', type = 'l')
lines(x = results[results$mtry == 3, ]$ntree,
y = results[results$mtry == 3, ]$MSE,
col='darkgreen', type = 'l')
legend("topright", legend = names(m),
col = c("blue", "red", "darkgreen"),
lwd = 2, title = "mtry")
The choice of mtry makes little difference here, and past roughly 50 trees the test MSE stops improving.
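The forest's own out-of-bag error curve tells the same story; plot() on a randomForest object draws error against the number of trees:
# OOB MSE flattens out well before the default 500 trees
plot(rf.red)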
rf.red.best = randomForest(quality ~ ., data = train, mtry = 4,
ntree = 50, keep.forest = TRUE)
rf.red.pred.best = predict(rf.red.best, newdata = test)
rf.red.pred.best.r = round(rf.red.pred.best)
rf.red.rmse = sqrt(mean((rf.red.pred.best - test$quality)^2))
rf.red.rmse
## [1] 0.5896732
red.rf.table.2 = table(Actual = test$quality, Predicted = rf.red.pred.best.r)
red.rf.table.2
## Predicted
## Actual 5 6 7
## 3 4 0 0
## 4 8 3 0
## 5 104 21 0
## 6 32 96 7
## 7 1 19 22
## 8 0 1 2
exact = sum(rf.red.pred.best.r == test$quality) / nrow(test)
within.1 = sum(abs(rf.red.pred.best.r - test$quality) == 1) / nrow(test)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.69375
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.971875
rf.red.best.res = data.frame(actual = test$quality, predicted = rf.red.pred.best.r)
for (i in 3:8){
  sub = subset(rf.red.best.res, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 1
## Error when rating = 5 : 0.168
## Error when rating = 6 : 0.2888889
## Error when rating = 7 : 0.4761905
## Error when rating = 8 : 1
names(redwine)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
# excluded variables
excluded.var = c("quality")#, "citric.acid", "free.sulfur.dioxide", "pH")
# set up variables
x.train = train[, -which(names(train) %in% excluded.var)]
y.train = train$quality
x.test = test[, -which(names(test) %in% excluded.var)]
y.test = test$quality
kval = c(1,2,3,5,10,15,20,25)
for (val in kval){
  set.seed(1)
  knn.pred = knn(x.train, x.test, y.train, k = val)
  cat("For k = ", val, ":")
  print(table(Actual = y.test, Predicted = knn.pred))
  correct = mean(knn.pred == y.test)
  # convert the factor predictions back to numeric ratings before differencing;
  # as.integer() on a factor returns level codes, not the ratings themselves
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)
  cat("Correct %:", correct, "\n")
  cat("Correct % +/- 1:", correct + pm1, "\n")
}
## For k = 1 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 2 1 1 0 0
## 4 0 0 4 7 0 0
## 5 0 5 97 19 4 0
## 6 0 4 52 67 11 1
## 7 0 1 3 15 22 1
## 8 0 0 0 1 1 1
## Correct %: 0.584375
## Correct % +/- 1: 0.703125
## For k = 2 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 0 1 0
## 4 0 0 5 6 0 0
## 5 0 4 86 32 3 0
## 6 0 4 56 58 17 0
## 7 0 2 8 13 17 2
## 8 0 0 1 1 0 1
## Correct %: 0.50625
## Correct % +/- 1: 0.68125
## For k = 3 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 5 5 1 0
## 5 0 1 83 37 4 0
## 6 0 2 62 52 19 0
## 7 0 1 5 16 18 2
## 8 0 0 0 1 1 1
## Correct %: 0.48125
## Correct % +/- 1: 0.684375
## For k = 5 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 8 3 0 0
## 5 0 0 76 44 5 0
## 6 0 0 65 52 18 0
## 7 0 0 4 25 12 1
## 8 0 0 0 1 2 0
## Correct %: 0.4375
## Correct % +/- 1: 0.6625
## For k = 10 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 2 1 1 0
## 4 0 0 6 5 0 0
## 5 0 1 77 44 3 0
## 6 0 0 65 59 11 0
## 7 0 1 7 28 6 0
## 8 0 0 0 3 0 0
## Correct %: 0.44375
## Correct % +/- 1: 0.6375
## For k = 15 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 7 4 0 0
## 5 0 0 85 39 1 0
## 6 0 0 65 66 4 0
## 7 0 0 11 25 6 0
## 8 0 0 0 2 1 0
## Correct %: 0.490625
## Correct % +/- 1: 0.65
## For k = 20 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 2 2 0 0
## 4 0 0 7 4 0 0
## 5 0 0 88 36 1 0
## 6 0 0 63 69 3 0
## 7 0 0 9 29 4 0
## 8 0 0 1 1 1 0
## Correct %: 0.503125
## Correct % +/- 1: 0.653125
## For k = 25 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 1 1 2 0
## 4 0 0 8 3 0 0
## 5 0 0 90 35 0 0
## 6 0 0 63 69 3 0
## 7 0 0 8 32 2 0
## 8 0 0 1 1 1 0
## Correct %: 0.503125
## Correct % +/- 1: 0.65
set.seed(1)
knn.pred = knn(x.train, x.test, y.train, k = 1)
red.knn.table = table(Actual = y.test, Predicted = knn.pred)
red.knn.table
## Predicted
## Actual 3 4 5 6 7 8
## 3 0 2 1 1 0 0
## 4 0 0 4 7 0 0
## 5 0 5 97 19 4 0
## 6 0 4 52 67 11 1
## 7 0 1 3 15 22 1
## 8 0 0 0 1 1 1
# make a standardized version
redwine.standard = redwine
redwine.standard[ , -which(names(redwine) == "quality")] = scale(redwine[ , -which(names(redwine) == "quality")])
train.knn = redwine.standard[index, ]
test.knn = redwine.standard[-index, ]
x.train = train.knn[, -which(names(train.knn) %in% excluded.var)]
y.train = train.knn$quality
x.test = test.knn[, -which(names(test.knn) %in% excluded.var)]
y.test = test.knn$quality
kval = c(1,2,3,5,10,15,20,25,30,40,50)
for (val in kval){
  set.seed(1)
  knn.pred = knn(x.train, x.test, y.train, k = val)
  cat("For k = ", val, ":")
  print(table(Actual = y.test, Predicted = knn.pred))
  correct = mean(knn.pred == y.test)
  # as.character() first so we difference ratings, not factor level codes
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)
  cat("Correct %:", correct, "\n")
  cat("Correct % +/- 1:", correct + pm1, "\n")
}
## For k = 1 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 2 1 1 0 0
## 4 0 1 6 4 0 0
## 5 0 2 94 26 1 2
## 6 0 2 35 85 11 2
## 7 0 0 6 8 26 2
## 8 0 0 0 1 1 1
## Correct %: 0.646875
## Correct % +/- 1: 0.803125
## For k = 2 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 1 1 2 0 0
## 4 0 1 5 5 0 0
## 5 0 0 94 27 3 1
## 6 0 2 39 79 14 1
## 7 0 0 7 8 25 2
## 8 0 0 0 1 1 1
## Correct %: 0.625
## Correct % +/- 1: 0.7875
## For k = 3 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 7 4 0 0
## 5 0 1 92 29 2 1
## 6 0 0 42 74 16 3
## 7 0 0 5 12 24 1
## 8 0 0 0 1 2 0
## Correct %: 0.59375
## Correct % +/- 1: 0.765625
## For k = 5 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 6 4 1 0
## 5 0 0 90 32 3 0
## 6 0 0 45 72 18 0
## 7 0 0 4 18 20 0
## 8 0 0 0 1 2 0
## Correct %: 0.56875
## Correct % +/- 1: 0.75
## For k = 10 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 2 1 1 0 0
## 4 0 0 6 5 0 0
## 5 0 0 94 28 3 0
## 6 0 0 48 73 14 0
## 7 0 0 5 17 20 0
## 8 0 0 0 1 2 0
## Correct %: 0.584375
## Correct % +/- 1: 0.74375
## For k = 15 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 6 5 0 0
## 5 0 0 96 27 2 0
## 6 0 0 50 72 13 0
## 7 0 0 3 22 17 0
## 8 0 0 0 1 2 0
## Correct %: 0.578125
## Correct % +/- 1: 0.725
## For k = 20 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 7 4 0 0
## 5 0 0 97 26 2 0
## 6 0 0 49 74 12 0
## 7 0 0 2 24 16 0
## 8 0 0 0 2 1 0
## Correct %: 0.584375
## Correct % +/- 1: 0.728125
## For k = 25 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 7 4 0 0
## 5 0 0 96 27 2 0
## 6 0 0 52 74 9 0
## 7 0 0 3 20 19 0
## 8 0 0 0 2 1 0
## Correct %: 0.590625
## Correct % +/- 1: 0.728125
## For k = 30 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 8 3 0 0
## 5 0 0 94 29 2 0
## 6 0 0 52 74 9 0
## 7 0 0 2 23 17 0
## 8 0 0 0 2 1 0
## Correct %: 0.578125
## Correct % +/- 1: 0.725
## For k = 40 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 7 4 0 0
## 5 0 0 98 25 2 0
## 6 0 0 49 81 5 0
## 7 0 0 3 23 16 0
## 8 0 0 0 2 1 0
## Correct %: 0.609375
## Correct % +/- 1: 0.728125
## For k = 50 : Predicted
## Actual 3 4 5 6 7 8
## 3 0 0 3 1 0 0
## 4 0 0 7 4 0 0
## 5 0 0 100 25 0 0
## 6 0 0 52 78 5 0
## 7 0 0 4 26 12 0
## 8 0 0 0 2 1 0
## Correct %: 0.59375
## Correct % +/- 1: 0.7125
nb.red = naiveBayes(quality ~ . - citric.acid - free.sulfur.dioxide -
pH, data = train)
nb.red.class = predict(nb.red, test)
red.nb.table = table(Actual = test$quality, Predicted = nb.red.class)
red.nb.table
## Predicted
## Actual 3 4 5 6 7 8
## 3 0 2 2 0 0 0
## 4 0 1 5 4 0 1
## 5 0 2 91 30 1 1
## 6 0 2 48 73 9 3
## 7 0 0 2 20 18 2
## 8 0 0 0 1 2 0
print(mean(test$quality == nb.red.class))
## [1] 0.571875
A useful summary here is a plot of the error for each rating value by each model. Here is some example code with placeholder values.
model.error = data.frame(
response = 3:8,
lm = c(100, 80, 20, 40, 60, 100),
svm = c(100, 80, 40, 50, 60, 80),
rf = c(100, 100, 10, 30, 50, 100)
)
bar_matrix = t(as.matrix(model.error[ , -1])) # transpose for barplot
colnames(bar_matrix) = model.error$response
barplot(
bar_matrix,
beside = TRUE,
col = c("pink", "lightblue", "lightgreen"),
legend = rownames(bar_matrix)
)
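To fill that data frame with real numbers instead of placeholders, a small hypothetical helper (rating.error, not part of the original analysis) could compute per-rating error from the rounded predictions above; since we never fit an SVM, the tree model stands in for that column:
# per-rating error rate (in percent) for one model's rounded predictions;
# returns NaN for ratings absent from the test set
rating.error = function(actual, predicted, ratings = 3:8) {
  sapply(ratings, function(r) 100 * mean(predicted[actual == r] != r))
}
model.error.real = data.frame(
  response = 3:8,
  lm = rating.error(test$quality, pred2.round),
  tree = rating.error(test$quality, red.tree.pred.r),
  rf = rating.error(test$quality, rf.red.pred.best.r)
)
The barplot code above then works unchanged with model.error.real in place of model.error.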
whitewine = read.csv("winequality-white.csv", sep=';', header=T)
names(whitewine)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
whitewine[whitewine$quality == 9, ]
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 775 9.1 0.27 0.45 10.6 0.035
## 821 6.6 0.36 0.29 1.6 0.021
## 828 7.4 0.24 0.36 2.0 0.031
## 877 6.9 0.36 0.34 4.2 0.018
## 1606 7.1 0.26 0.49 2.2 0.032
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 775 28 124 0.99700 3.20 0.46 10.4
## 821 24 85 0.98965 3.41 0.61 12.4
## 828 27 139 0.99055 3.28 0.48 12.5
## 877 57 119 0.98980 3.28 0.36 12.7
## 1606 31 113 0.99030 3.37 0.42 12.9
## quality
## 775 9
## 821 9
## 828 9
## 877 9
## 1606 9
set.seed(1)
index.w = sample(1:nrow(whitewine), 0.8 * nrow(whitewine))
train.w = whitewine[index.w, ]
test.w = whitewine[-index.w, ]
for (name in setdiff(names(whitewine), c("quality"))) {
#par(mfrow = c(1,2))
formula = as.formula(paste(name, "~ quality"))
boxplot(formula, data = whitewine)
}
There seems to be much less separation between quality levels than we saw with red wine.
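One way to quantify that impression is to rank the predictors by absolute correlation with quality (a quick sketch):
# predictors ordered by |correlation| with quality
cors = cor(whitewine)[, "quality"]
sort(abs(cors[names(cors) != "quality"]), decreasing = TRUE)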
white.model = lm(quality ~ ., data = train.w)
summary(white.model)
##
## Call:
## lm(formula = quality ~ ., data = train.w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8815 -0.4852 -0.0325 0.4635 2.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.578e+02 2.024e+01 7.796 8.16e-15 ***
## fixed.acidity 6.680e-02 2.290e-02 2.918 0.00355 **
## volatile.acidity -1.812e+00 1.271e-01 -14.254 < 2e-16 ***
## citric.acid 3.447e-03 1.072e-01 0.032 0.97436
## residual.sugar 8.519e-02 8.203e-03 10.386 < 2e-16 ***
## chlorides -2.723e-01 6.017e-01 -0.452 0.65095
## free.sulfur.dioxide 4.067e-03 9.319e-04 4.364 1.31e-05 ***
## total.sulfur.dioxide -4.189e-04 4.230e-04 -0.990 0.32208
## density -1.579e+02 2.053e+01 -7.691 1.83e-14 ***
## pH 7.043e-01 1.170e-01 6.020 1.91e-09 ***
## sulphates 6.986e-01 1.127e-01 6.197 6.33e-10 ***
## alcohol 1.843e-01 2.622e-02 7.031 2.40e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.755 on 3906 degrees of freedom
## Multiple R-squared: 0.283, Adjusted R-squared: 0.281
## F-statistic: 140.1 on 11 and 3906 DF, p-value: < 2.2e-16
#plot(white.model)
pred.w = predict(white.model, newdata = test.w, type='response')
pred.round.w = round(pred.w)
actual.w = test.w$quality
RMSE.w = sqrt(mean((pred.w - actual.w)^2))
RMSE.w
## [1] 0.7374623
white.lm.table = table(Actual = actual.w, Predicted = pred.round.w)
white.lm.table
## Predicted
## Actual 4 5 6 7
## 3 0 0 2 0
## 4 0 14 6 1
## 5 1 130 168 2
## 6 0 66 326 42
## 7 0 9 130 47
## 8 0 1 20 13
## 9 0 0 1 1
exact = sum(pred.round.w == actual.w) / nrow(test.w)
within.1 = sum(abs(pred.round.w - actual.w) == 1) / nrow(test.w)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.5132653
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9561224
lm.results.w = data.frame(actual = test.w$quality, predicted = pred.round.w)
for (i in 3:8){
  sub = subset(lm.results.w, actual == i)
  error = mean(sub$actual != sub$predicted)  # per-rating error rate
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 1
## Error when rating = 5 : 0.5681063
## Error when rating = 6 : 0.2488479
## Error when rating = 7 : 0.7473118
## Error when rating = 8 : 1
white.model.2 = lm(quality ~ poly(fixed.acidity,2) + poly(volatile.acidity,3) +
residual.sugar + free.sulfur.dioxide + poly(density,2) +
pH + sulphates + poly(alcohol,3), data = train.w)
summary(white.model.2)
##
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + poly(volatile.acidity,
## 3) + residual.sugar + free.sulfur.dioxide + poly(density,
## 2) + pH + sulphates + poly(alcohol, 3), data = train.w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7865 -0.4886 -0.0201 0.4791 2.4358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.407e+00 4.426e-01 3.178 0.001492 **
## poly(fixed.acidity, 2)1 7.410e+00 1.333e+00 5.558 2.92e-08 ***
## poly(fixed.acidity, 2)2 -3.622e+00 7.664e-01 -4.726 2.37e-06 ***
## poly(volatile.acidity, 3)1 -1.187e+01 7.876e-01 -15.076 < 2e-16 ***
## poly(volatile.acidity, 3)2 1.979e+00 7.793e-01 2.540 0.011136 *
## poly(volatile.acidity, 3)3 -2.554e+00 7.684e-01 -3.324 0.000897 ***
## residual.sugar 1.058e-01 9.056e-03 11.681 < 2e-16 ***
## free.sulfur.dioxide 3.517e-03 7.442e-04 4.726 2.38e-06 ***
## poly(density, 2)1 -4.332e+01 4.438e+00 -9.760 < 2e-16 ***
## poly(density, 2)2 5.109e+00 9.811e-01 5.207 2.02e-07 ***
## pH 1.030e+00 1.232e-01 8.364 < 2e-16 ***
## sulphates 7.766e-01 1.125e-01 6.904 5.88e-12 ***
## poly(alcohol, 3)1 6.064e+00 2.487e+00 2.438 0.014796 *
## poly(alcohol, 3)2 2.954e+00 8.091e-01 3.651 0.000265 ***
## poly(alcohol, 3)3 -1.658e+00 7.702e-01 -2.153 0.031387 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7468 on 3903 degrees of freedom
## Multiple R-squared: 0.299, Adjusted R-squared: 0.2965
## F-statistic: 118.9 on 14 and 3903 DF, p-value: < 2.2e-16
#plot(white.model.2)
pred2.w = predict(white.model.2, newdata = test.w, type='response')
pred2.round.w = round(pred2.w)
actual.w = test.w$quality
RMSE2.w = sqrt(mean((pred2.w - actual.w)^2))
RMSE2.w
## [1] 0.7286815
white.lm.table.2 = table(Actual = actual.w, Predicted = pred2.round.w)
white.lm.table.2
## Predicted
## Actual 5 6 7
## 3 0 2 0
## 4 15 5 1
## 5 146 154 1
## 6 67 318 49
## 7 6 123 57
## 8 1 19 14
## 9 0 1 1
lm.2.results.w = data.frame(actual = test.w$quality, predicted = pred2.round.w)
for (i in 4:8){
  sub = subset(lm.2.results.w, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 4 : 1
## Error when rating = 5 : 0.5149502
## Error when rating = 6 : 0.2672811
## Error when rating = 7 : 0.6935484
## Error when rating = 8 : 1
exact = sum(pred2.round.w == actual.w) / nrow(test.w)
within.1 = sum(abs(pred2.round.w - actual.w) == 1) / nrow(test.w)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.5316327
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9622449
Those results were not very good.
white.tree = tree(quality ~ .,data = train.w)
summary(white.tree)
##
## Regression tree:
## tree(formula = quality ~ ., data = train.w)
## Variables actually used in tree construction:
## [1] "alcohol" "volatile.acidity" "free.sulfur.dioxide"
## Number of terminal nodes: 5
## Residual mean deviance: 0.5906 = 2311 / 3913
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.5910 -0.3539 -0.1841 0.0000 0.6461 2.6460
plot(white.tree)
text(white.tree, pretty = 0, cex = 0.7)
white.cv.tree = cv.tree(white.tree)
plot(white.cv.tree$size, white.cv.tree$dev, type='b')
white.tree.pred = predict(white.tree, newdata = test.w)
white.tree.pred.r = round(white.tree.pred)
tree.RMSE = sqrt(mean((white.tree.pred - test.w$quality)^2))
tree.RMSE
## [1] 0.7402891
white.tree.table = table(Actual = test.w$quality, Predicted = white.tree.pred.r)
white.tree.table
## Predicted
## Actual 5 6 7
## 3 0 2 0
## 4 14 5 2
## 5 190 107 4
## 6 112 244 78
## 7 14 96 76
## 8 0 13 21
## 9 1 0 1
exact = sum(white.tree.pred.r == test.w$quality) / nrow(test.w)
within.1 = sum(abs(white.tree.pred.r - test.w$quality) == 1) / nrow(test.w)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.5204082
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9571429
tree.results.w = data.frame(actual = test.w$quality,
predicted = white.tree.pred.r)
for (i in 3:8){
  sub = subset(tree.results.w, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 1
## Error when rating = 5 : 0.3687708
## Error when rating = 6 : 0.437788
## Error when rating = 7 : 0.5913978
## Error when rating = 8 : 1
As with red wine, the next step is to prune using the CV curve and compare tree sizes; a sketch follows.
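A sketch mirroring the red-wine case, using the white.cv.tree fit above:
# prune to the CV-preferred size and check test RMSE
best.size.w = white.cv.tree$size[which.min(white.cv.tree$dev)]
white.tree.pruned = prune.tree(white.tree, best = best.size.w)
pruned.pred = predict(white.tree.pruned, newdata = test.w)
sqrt(mean((pruned.pred - test.w$quality)^2))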
rf.white = randomForest(quality ~ .- citric.acid - chlorides -
total.sulfur.dioxide, data = train.w, importance=TRUE)
varImpPlot(rf.white)
rf.white.pred = predict(rf.white, newdata = test.w)
rf.white.pred.r = round(rf.white.pred)
rf.white.rmse = sqrt(mean((rf.white.pred - test.w$quality)^2))
rf.white.rmse
## [1] 0.5888631
white.rf.table = table(Actual = test.w$quality, Predicted = rf.white.pred.r)
white.rf.table
## Predicted
## Actual 4 5 6 7 8
## 3 0 0 2 0 0
## 4 1 15 4 1 0
## 5 0 201 99 1 0
## 6 0 43 362 29 0
## 7 0 1 83 102 0
## 8 0 0 7 24 3
## 9 0 1 0 1 0
exact = sum(rf.white.pred.r == test.w$quality) / nrow(test.w)
within.1 = sum(abs(rf.white.pred.r - test.w$quality) == 1) / nrow(test.w)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.6826531
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9816327
rf.white.results = data.frame(actual = test.w$quality,
predicted = rf.white.pred.r)
for (i in 3:8){
  sub = subset(rf.white.results, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 0.952381
## Error when rating = 5 : 0.3322259
## Error when rating = 6 : 0.1658986
## Error when rating = 7 : 0.4516129
## Error when rating = 8 : 0.9117647
# predefining variables
p = ncol(whitewine) - 1
m = c(p, floor(p / 2), floor(sqrt(p)))
names(m) = c("p", "p/2", "sqrt(p)")
ntrees = seq(1, 150, by = 5)
# to store the errors
results <- data.frame()
for (j in 1:3) {
for (i in seq_along(ntrees)) {
model = randomForest(quality ~ ., data = train.w, mtry = m[j],
ntree = ntrees[i], keep.forest = TRUE)
preds = predict(model, newdata = test.w)
mse = mean((preds - test.w$quality)^2)
results = rbind(results, data.frame(ntree = ntrees[i],MSE = mse,mtry = m[j]))
}
}
plot(x = results[results$mtry == 11, ]$ntree,
y = results[results$mtry == 11, ]$MSE,
col='blue', type = 'l',
xlab = 'number of trees', ylab = 'MSE',
#ylim = c(0.2,0.8)
)
lines(x = results[results$mtry == 5, ]$ntree,
y = results[results$mtry == 5, ]$MSE,
col='red', type = 'l')
lines(x = results[results$mtry == 3, ]$ntree,
y = results[results$mtry == 3, ]$MSE,
col='darkgreen', type = 'l')
legend("topright", legend = names(m),
col = c("blue", "red", "darkgreen"),
lwd = 2, title = "mtry")
Using mtry = sqrt(p) performs best here, and a larger number of trees continues to help.
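randomForest also ships a tuner that searches over mtry by OOB error; a sketch with mostly default settings could confirm the choice:
# tuneRF scales mtry by stepFactor until the OOB error stops improving
set.seed(1)
tuneRF(train.w[, names(train.w) != "quality"], train.w$quality,
       ntreeTry = 150, stepFactor = 1.5, improve = 0.01)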
rf.white.best = randomForest(quality ~ ., data = train.w, mtry = 3,
ntree = 150, keep.forest = TRUE)
rf.white.pred.best = predict(rf.white.best, newdata = test.w)
rf.white.pred.best.r = round(rf.white.pred.best)
rf.white.rmse = sqrt(mean((rf.white.pred.best - test.w$quality)^2))
rf.white.rmse
## [1] 0.5885784
white.rf.table.2 = table(Actual = test.w$quality, Predicted = rf.white.pred.best.r)
white.rf.table.2
## Predicted
## Actual 4 5 6 7 8
## 3 0 0 2 0 0
## 4 1 14 6 0 0
## 5 0 207 93 1 0
## 6 0 40 369 25 0
## 7 0 2 77 107 0
## 8 0 0 13 17 4
## 9 0 0 1 1 0
exact = sum(rf.white.pred.best.r == test.w$quality) / nrow(test.w)
within.1 = sum(abs(rf.white.pred.best.r - test.w$quality) == 1) / nrow(test.w)
cat("Percent of test data correclty labeled:", exact, '\n')
## Percent of test data correclty labeled: 0.7020408
cat("Percent of test data +/- 1 point:", exact + within.1, '\n')
## Percent of test data +/- 1 point: 0.9734694
rf.white.best.res = data.frame(actual = test.w$quality,
predicted = rf.white.pred.best.r)
for (i in 3:8){
  sub = subset(rf.white.best.res, actual == i)
  error = mean(sub$actual != sub$predicted)
  cat("Error when rating = ", i, ":", error, "\n")
}
## Error when rating = 3 : 1
## Error when rating = 4 : 0.952381
## Error when rating = 5 : 0.3122924
## Error when rating = 6 : 0.1497696
## Error when rating = 7 : 0.4247312
## Error when rating = 8 : 0.8823529
# excluded variables
excluded.var = c("quality", "citric.acid", "chlorides", "total.sulfur.dioxide")
# set up variables
x.train.w = train.w[, -which(names(train.w) %in% excluded.var)]
y.train.w = train.w$quality
x.test.w = test.w[, -which(names(test.w) %in% excluded.var)]
y.test.w = test.w$quality
kval = c(1,2,3,5,10,15,20,25)
for (val in kval){
  set.seed(1)
  knn.pred = knn(x.train.w, x.test.w, y.train.w, k = val)
  cat("For k = ", val, ":")
  print(table(Actual = y.test.w, Predicted = knn.pred))
  correct = mean(knn.pred == y.test.w)
  # convert factor predictions back to numeric ratings before differencing;
  # as.integer() on a factor returns level codes, not the ratings themselves
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test.w) == 1)
  cat("Correct %:", correct, "\n")
  cat("Correct % +/- 1:", correct + pm1, "\n")
}
## For k = 1 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 2 0 0 0 0
## 4 0 7 9 4 1 0 0
## 5 0 14 175 91 18 3 0
## 6 0 5 75 292 53 9 0
## 7 0 3 15 56 100 12 0
## 8 0 0 4 12 7 11 0
## 9 0 0 0 2 0 0 0
## Correct %: 0.5969388
## Correct % +/- 1: 0.7693878
## For k = 2 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 1 1 0 0 0 0
## 4 0 0 13 4 4 0 0
## 5 0 12 145 113 26 5 0
## 6 0 6 109 247 61 11 0
## 7 0 2 21 76 80 7 0
## 8 0 0 9 7 8 10 0
## 9 0 1 0 1 0 0 0
## Correct %: 0.4918367
## Correct % +/- 1: 0.7
## For k = 3 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 1 0 1 0 0 0 0
## 4 0 4 8 8 1 0 0
## 5 0 11 155 113 20 2 0
## 6 0 6 105 246 70 7 0
## 7 0 1 18 79 75 13 0
## 8 0 0 5 12 10 7 0
## 9 0 0 0 2 0 0 0
## Correct %: 0.4979592
## Correct % +/- 1: 0.7091837
## For k = 5 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 2 0 0 0 0
## 4 0 2 9 9 1 0 0
## 5 0 8 157 113 19 4 0
## 6 0 4 108 247 66 9 0
## 7 0 0 14 100 65 7 0
## 8 0 0 4 15 9 6 0
## 9 0 0 0 2 0 0 0
## Correct %: 0.4867347
## Correct % +/- 1: 0.6908163
## For k = 10 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 0 2 0 0 0
## 4 0 3 11 6 1 0 0
## 5 0 4 137 143 12 5 0
## 6 0 3 98 278 51 4 0
## 7 0 0 17 96 67 6 0
## 8 0 0 4 17 10 3 0
## 9 0 0 0 2 0 0 0
## Correct %: 0.4979592
## Correct % +/- 1: 0.7214286
## For k = 15 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 1 1 0 0 0
## 4 0 1 13 7 0 0 0
## 5 0 5 146 136 9 5 0
## 6 0 2 97 298 35 2 0
## 7 0 0 16 112 55 3 0
## 8 0 0 3 19 10 2 0
## 9 0 0 0 1 1 0 0
## Correct %: 0.5122449
## Correct % +/- 1: 0.7091837
## For k = 20 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 0 2 0 0 0
## 4 0 2 11 8 0 0 0
## 5 0 2 141 149 9 0 0
## 6 0 3 89 314 27 1 0
## 7 0 0 14 121 48 3 0
## 8 0 0 4 22 6 2 0
## 9 0 0 0 1 1 0 0
## Correct %: 0.5173469
## Correct % +/- 1: 0.7132653
## For k = 25 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 0 2 0 0 0
## 4 0 3 11 7 0 0 0
## 5 0 4 139 153 5 0 0
## 6 0 1 101 306 25 1 0
## 7 0 0 16 129 41 0 0
## 8 0 0 7 20 6 1 0
## 9 0 0 0 2 0 0 0
## Correct %: 0.5
## Correct % +/- 1: 0.694898
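Rather than eyeballing the printouts, we could record the accuracy for each k and pick the best one programmatically (a sketch reusing the objects above):
# exact-match accuracy per k, with the same seeding as the loop
acc = sapply(kval, function(val) {
  set.seed(1)
  mean(knn(x.train.w, x.test.w, y.train.w, k = val) == y.test.w)
})
kval[which.max(acc)]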
knn.pred = knn(x.train.w, x.test.w, y.train.w, k = 25)  # final fit at k = 25
white.knn.table = table(Actual = y.test.w, Predicted = knn.pred)
white.knn.table
## Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 0 2 0 0 0
## 4 0 2 12 7 0 0 0
## 5 0 4 136 156 5 0 0
## 6 0 1 98 310 25 0 0
## 7 0 0 18 127 39 2 0
## 8 0 0 8 20 5 1 0
## 9 0 0 0 2 0 0 0
# make a standardized version
whitewine.standard = whitewine
whitewine.standard[ , -which(names(whitewine) == "quality")] = scale(whitewine[ , -which(names(whitewine) == "quality")])
train.knn = whitewine.standard[index.w, ]  # use the white-wine split index.w
test.knn = whitewine.standard[-index.w, ]
x.train = train.knn[, -which(names(train.knn) %in% excluded.var)]
y.train = train.knn$quality
x.test = test.knn[, -which(names(test.knn) %in% excluded.var)]
y.test = test.knn$quality
kval = c(1,2,3,5,10,15,20,25,30,40,50)
for (val in kval){
  set.seed(1)
  knn.pred = knn(x.train, x.test, y.train, k = val)
  cat("For k = ", val, ":")
  print(table(Actual = y.test, Predicted = knn.pred))
  correct = mean(knn.pred == y.test)
  # as.character() first so we difference ratings, not factor level codes
  pm1 = mean(abs(as.integer(as.character(knn.pred)) - y.test) == 1)
  cat("Correct %:", correct, "\n")
  cat("Correct % +/- 1:", correct + pm1, "\n")
}
## For k = 1 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 2 6 3 0 0 0
## 4 1 15 32 48 10 2 0
## 5 1 35 452 453 112 19 0
## 6 4 51 353 760 398 87 1
## 7 1 6 60 237 282 59 1
## 8 0 2 10 41 58 14 0
## 9 0 0 0 1 1 0 1
## Correct %: 0.4211108
## Correct % +/- 1: 0.6910749
## For k = 2 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 3 5 3 0 0 0
## 4 0 14 32 49 8 5 0
## 5 2 47 427 456 124 16 0
## 6 4 28 394 734 410 82 2
## 7 1 5 64 248 279 49 0
## 8 0 3 9 44 51 18 0
## 9 0 0 0 2 1 0 0
## Correct %: 0.4067422
## Correct % +/- 1: 0.6772589
## For k = 3 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 7 4 0 0 0
## 4 0 11 40 44 8 5 0
## 5 0 28 416 485 130 13 0
## 6 3 27 369 765 423 66 1
## 7 2 2 57 240 287 56 2
## 8 0 1 5 42 63 14 0
## 9 0 1 0 0 1 0 1
## Correct %: 0.4128212
## Correct % +/- 1: 0.6974302
## For k = 5 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 7 4 0 0 0
## 4 0 7 46 46 6 3 0
## 5 0 15 403 511 132 11 0
## 6 0 7 322 843 440 42 0
## 7 0 3 41 270 294 38 0
## 8 0 0 4 46 61 14 0
## 9 0 0 0 2 1 0 0
## Correct %: 0.4313346
## Correct % +/- 1: 0.7231279
## For k = 10 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 6 5 0 0 0
## 4 0 2 50 46 9 1 0
## 5 0 3 427 527 112 3 0
## 6 0 1 294 936 416 7 0
## 7 0 0 24 308 297 17 0
## 8 0 0 6 53 61 5 0
## 9 0 0 0 1 2 0 0
## Correct %: 0.4606245
## Correct % +/- 1: 0.7444045
## For k = 15 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 8 3 0 0 0
## 4 0 1 51 45 11 0 0
## 5 0 3 444 544 79 2 0
## 6 0 0 307 970 376 1 0
## 7 0 0 31 323 290 2 0
## 8 0 0 1 60 64 0 0
## 9 0 0 0 2 1 0 0
## Correct %: 0.4711246
## Correct % +/- 1: 0.7444045
## For k = 20 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 6 5 0 0 0
## 4 0 0 53 43 12 0 0
## 5 0 3 440 562 67 0 0
## 6 0 0 297 984 372 1 0
## 7 0 0 30 326 289 1 0
## 8 0 0 4 59 62 0 0
## 9 0 0 0 1 2 0 0
## Correct %: 0.4733352
## Correct % +/- 1: 0.7510362
## For k = 25 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 5 6 0 0 0
## 4 0 0 52 44 12 0 0
## 5 0 2 471 543 56 0 0
## 6 0 2 275 1005 372 0 0
## 7 0 0 26 325 295 0 0
## 8 0 0 2 59 64 0 0
## 9 0 0 0 1 2 0 0
## Correct %: 0.4893617
## Correct % +/- 1: 0.7615363
## For k = 30 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 6 5 0 0 0
## 4 0 0 51 44 13 0 0
## 5 0 0 470 551 51 0 0
## 6 0 0 289 1017 347 1 0
## 7 0 0 30 327 289 0 0
## 8 0 0 3 62 60 0 0
## 9 0 0 0 1 2 0 0
## Correct %: 0.4907433
## Correct % +/- 1: 0.7579442
## For k = 40 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 6 5 0 0 0
## 4 0 0 54 42 12 0 0
## 5 0 0 456 579 37 0 0
## 6 0 0 289 1049 316 0 0
## 7 0 0 29 344 273 0 0
## 8 0 0 2 60 63 0 0
## 9 0 0 0 1 2 0 0
## Correct %: 0.4912959
## Correct % +/- 1: 0.7582205
## For k = 50 : Predicted
## Actual 3 4 5 6 7 8 9
## 3 0 0 5 5 1 0 0
## 4 0 0 56 41 11 0 0
## 5 0 0 456 578 38 0 0
## 6 0 0 286 1063 305 0 0
## 7 0 0 30 358 258 0 0
## 8 0 0 2 64 59 0 0
## 9 0 0 0 1 2 0 0
## Correct %: 0.4910196
## Correct % +/- 1: 0.7549047
nb.white = naiveBayes(quality ~ . - citric.acid - chlorides -
total.sulfur.dioxide, data = train.w)
nb.white.class = predict(nb.white, test.w)
white.nb.table = table(Actual = test.w$quality, Predicted = nb.white.class)
white.nb.table
## Predicted
## Actual 3 4 5 6 7 8 9
## 3 1 0 0 1 0 0 0
## 4 0 3 7 9 2 0 0
## 5 1 13 175 105 6 1 0
## 6 0 3 148 191 89 0 3
## 7 0 0 26 64 88 5 3
## 8 0 1 4 9 20 0 0
## 9 0 0 1 0 1 0 0
print(mean(test.w$quality == nb.white.class))
## [1] 0.4673469
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
test.table = as.data.frame(red.lm.table)
test.table$Correct = ifelse(as.character(test.table$Actual) ==
as.character(test.table$Predicted),
"Correct", "Incorrect")
names(test.table)
## [1] "Actual" "Predicted" "Freq" "Correct"
t1 = ggplot(test.table, aes(x = factor(Actual), y = Freq,
fill = factor(Correct, levels = c("Correct", "Incorrect")))) +
geom_bar(stat = "identity", position = "stack") + # 'dodge' for side-by-side bars
labs(x = "Actual Label", y = "Count", fill = "Prediction Status") +
theme_minimal() +
scale_fill_manual(values = c("lightblue", "pink")) +
theme(legend.position = "none")
t2 = ggplot(test.table, aes(x = factor(Predicted), y = Freq,
fill = factor(Correct, levels = c("Correct", "Incorrect")))) +
geom_bar(stat = "identity", position = "stack") + # 'dodge' for side-by-side bars
labs(x = "Predicted Label", y = "Count", fill = "Prediction Status") +
theme_minimal() +
scale_fill_manual(values = c("lightblue", "pink"))
grid.arrange(t1, t2, ncol = 2,
top = "LM Precision and Recall")
# red-wine confusion tables used below:
# red.lm.table, red.tree.table, red.rf.table.2, red.knn.table
lm.table = as.data.frame(red.lm.table)
lm.table$model = "Regression"
tree.table = as.data.frame(red.tree.table)
tree.table$model = "Tree"
rf.table = as.data.frame(red.rf.table.2)
rf.table$model = "Random Forest"
knn.table = as.data.frame(red.knn.table)
knn.table$model = "KNN"
list.df = list(lm.table, tree.table, rf.table, knn.table)
red.df = do.call(rbind, list.df)
red.df$Correct = ifelse(as.character(red.df$Actual) ==
as.character(red.df$Predicted),
"Correct", "Incorrect")
ggplot(red.df, aes(x = factor(Actual), y = Freq, fill=Correct)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ model, ncol = 4) +
labs(x = "Actual Red Wine Rating", y = "Count", fill = "Prediction Status",
title = "How Well Each Model Predicts Rating by Rating") +
theme_minimal() +
scale_fill_manual(values = c("lightblue", "pink"))
ggplot(red.df, aes(x = factor(Predicted, levels = 3:8), y = Freq, fill=Correct)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ model, ncol = 4) +
labs(x = "Predicted Red Wine Rating", y = "Count", fill = "Prediction Status",
title = "Per-Class Precision for Red Wine") +
theme_minimal() +
scale_fill_manual(values = c("lightblue", "pink"))
# white-wine confusion tables used below:
# white.lm.table, white.tree.table, white.rf.table.2, white.knn.table
lm.table = as.data.frame(white.lm.table)
lm.table$model = "Regression"
tree.table = as.data.frame(white.tree.table)
tree.table$model = "Tree"
rf.table = as.data.frame(white.rf.table.2)
rf.table$model = "Random Forest"
knn.table = as.data.frame(white.knn.table)
knn.table$model = "KNN"
list.df = list(lm.table, tree.table, rf.table, knn.table)
white.df = do.call(rbind, list.df)
white.df$Correct = ifelse(as.character(white.df$Actual) ==
as.character(white.df$Predicted),
"Correct", "Incorrect")
ggplot(white.df, aes(x = factor(Actual), y = Freq, fill=Correct)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ model, ncol = 4) +
labs(x = "Actual Red Wine Rating", y = "Count", fill = "Prediction Status",
title = "Per-Class Recall for White Wine") +
theme_minimal() +
scale_fill_manual(values = c("lightblue", "pink"))
white.df
## Actual Predicted Freq model Correct
## 1 3 4 0 Regression Incorrect
## 2 4 4 0 Regression Correct
## 3 5 4 1 Regression Incorrect
## 4 6 4 0 Regression Incorrect
## 5 7 4 0 Regression Incorrect
## 6 8 4 0 Regression Incorrect
## 7 9 4 0 Regression Incorrect
## 8 3 5 0 Regression Incorrect
## 9 4 5 14 Regression Incorrect
## 10 5 5 130 Regression Correct
## 11 6 5 66 Regression Incorrect
## 12 7 5 9 Regression Incorrect
## 13 8 5 1 Regression Incorrect
## 14 9 5 0 Regression Incorrect
## 15 3 6 2 Regression Incorrect
## 16 4 6 6 Regression Incorrect
## 17 5 6 168 Regression Incorrect
## 18 6 6 326 Regression Correct
## 19 7 6 130 Regression Incorrect
## 20 8 6 20 Regression Incorrect
## 21 9 6 1 Regression Incorrect
## 22 3 7 0 Regression Incorrect
## 23 4 7 1 Regression Incorrect
## 24 5 7 2 Regression Incorrect
## 25 6 7 42 Regression Incorrect
## 26 7 7 47 Regression Correct
## 27 8 7 13 Regression Incorrect
## 28 9 7 1 Regression Incorrect
## 29 3 5 0 Tree Incorrect
## 30 4 5 14 Tree Incorrect
## 31 5 5 190 Tree Correct
## 32 6 5 112 Tree Incorrect
## 33 7 5 14 Tree Incorrect
## 34 8 5 0 Tree Incorrect
## 35 9 5 1 Tree Incorrect
## 36 3 6 2 Tree Incorrect
## 37 4 6 5 Tree Incorrect
## 38 5 6 107 Tree Incorrect
## 39 6 6 244 Tree Correct
## 40 7 6 96 Tree Incorrect
## 41 8 6 13 Tree Incorrect
## 42 9 6 0 Tree Incorrect
## 43 3 7 0 Tree Incorrect
## 44 4 7 2 Tree Incorrect
## 45 5 7 4 Tree Incorrect
## 46 6 7 78 Tree Incorrect
## 47 7 7 76 Tree Correct
## 48 8 7 21 Tree Incorrect
## 49 9 7 1 Tree Incorrect
## 50 3 4 0 Random Forest Incorrect
## 51 4 4 1 Random Forest Correct
## 52 5 4 0 Random Forest Incorrect
## 53 6 4 0 Random Forest Incorrect
## 54 7 4 0 Random Forest Incorrect
## 55 8 4 0 Random Forest Incorrect
## 56 9 4 0 Random Forest Incorrect
## 57 3 5 0 Random Forest Incorrect
## 58 4 5 14 Random Forest Incorrect
## 59 5 5 207 Random Forest Correct
## 60 6 5 40 Random Forest Incorrect
## 61 7 5 2 Random Forest Incorrect
## 62 8 5 0 Random Forest Incorrect
## 63 9 5 0 Random Forest Incorrect
## 64 3 6 2 Random Forest Incorrect
## 65 4 6 6 Random Forest Incorrect
## 66 5 6 93 Random Forest Incorrect
## 67 6 6 369 Random Forest Correct
## 68 7 6 77 Random Forest Incorrect
## 69 8 6 13 Random Forest Incorrect
## 70 9 6 1 Random Forest Incorrect
## 71 3 7 0 Random Forest Incorrect
## 72 4 7 0 Random Forest Incorrect
## 73 5 7 1 Random Forest Incorrect
## 74 6 7 25 Random Forest Incorrect
## 75 7 7 107 Random Forest Correct
## 76 8 7 17 Random Forest Incorrect
## 77 9 7 1 Random Forest Incorrect
## 78 3 8 0 Random Forest Incorrect
## 79 4 8 0 Random Forest Incorrect
## 80 5 8 0 Random Forest Incorrect
## 81 6 8 0 Random Forest Incorrect
## 82 7 8 0 Random Forest Incorrect
## 83 8 8 4 Random Forest Correct
## 84 9 8 0 Random Forest Incorrect
## 85 3 3 0 KNN Correct
## 86 4 3 0 KNN Incorrect
## 87 5 3 0 KNN Incorrect
## 88 6 3 0 KNN Incorrect
## 89 7 3 0 KNN Incorrect
## 90 8 3 0 KNN Incorrect
## 91 9 3 0 KNN Incorrect
## 92 3 4 0 KNN Incorrect
## 93 4 4 2 KNN Correct
## 94 5 4 4 KNN Incorrect
## 95 6 4 1 KNN Incorrect
## 96 7 4 0 KNN Incorrect
## 97 8 4 0 KNN Incorrect
## 98 9 4 0 KNN Incorrect
## 99 3 5 0 KNN Incorrect
## 100 4 5 12 KNN Incorrect
## 101 5 5 136 KNN Correct
## 102 6 5 98 KNN Incorrect
## 103 7 5 18 KNN Incorrect
## 104 8 5 8 KNN Incorrect
## 105 9 5 0 KNN Incorrect
## 106 3 6 2 KNN Incorrect
## 107 4 6 7 KNN Incorrect
## 108 5 6 156 KNN Incorrect
## 109 6 6 310 KNN Correct
## 110 7 6 127 KNN Incorrect
## 111 8 6 20 KNN Incorrect
## 112 9 6 2 KNN Incorrect
## 113 3 7 0 KNN Incorrect
## 114 4 7 0 KNN Incorrect
## 115 5 7 5 KNN Incorrect
## 116 6 7 25 KNN Incorrect
## 117 7 7 39 KNN Correct
## 118 8 7 5 KNN Incorrect
## 119 9 7 0 KNN Incorrect
## 120 3 8 0 KNN Incorrect
## 121 4 8 0 KNN Incorrect
## 122 5 8 0 KNN Incorrect
## 123 6 8 0 KNN Incorrect
## 124 7 8 2 KNN Incorrect
## 125 8 8 1 KNN Correct
## 126 9 8 0 KNN Incorrect
## 127 3 9 0 KNN Incorrect
## 128 4 9 0 KNN Incorrect
## 129 5 9 0 KNN Incorrect
## 130 6 9 0 KNN Incorrect
## 131 7 9 0 KNN Incorrect
## 132 8 9 0 KNN Incorrect
## 133 9 9 0 KNN Correct
ggplot(white.df, aes(x = factor(Actual), y = Freq, fill=model)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ Correct, ncol = 4) +
labs(x = "Actual Red Wine Rating", y = "Count", fill = "Prediction Status",
title = "Per-Class Recall for White Wine") +
theme_minimal() +
scale_fill_manual(values = c("pink", "lightgreen", "skyblue", "purple"))
ggplot(white.df, aes(x = factor(Predicted, levels = 3:9), y = Freq, fill=Correct)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ model, ncol = 4) +
labs(x = "Predicted White Wine Rating", y = "Count", fill = "Prediction Status",
title = "Per-Class Precision for White Wine") +
theme_minimal() +
scale_fill_manual(values = c("lightblue", "pink"))
ggplot(white.df, aes(x = factor(Predicted, levels = 3:9), y = Freq, fill=model)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ Correct, ncol = 2) +
labs(x = "Predicted White Wine Rating", y = "Count", fill = "Model",
title = "Per-Class Precision for White Wine") +
theme_minimal() +
scale_fill_manual(values = c("pink", "lightgreen", "skyblue", "purple"))
Rating 9 was showing up as NA because factor(Predicted, levels = 4:8) coerces any value outside 4 through 8 to NA; widening the levels to 3:9, as above, keeps every predicted rating.
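A one-line demonstration of that coercion:
factor(c(3, 9), levels = 4:8) # both entries become NA since neither is a listed level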
nrow(whitewine)
## [1] 4898
nrow(redwine)
## [1] 1599