Data Exploration
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Register a parallel backend for foreach, leaving one core free.
# detectCores() can report a single core or NA (count unknown), so the
# worker count is clamped to at least 1 rather than passing 0 or NA on to
# registerDoParallel().
registerDoParallel(cores = max(1L, detectCores() - 1L, na.rm = TRUE))
# Seed the RNG so the random steps that follow start from a fixed state.
set.seed(10)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(corrplot)
## corrplot 0.84 loaded
library(kknn)
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(e1071)
# Read the white-wine quality table into `white` and show its structure.
# NOTE: the path is absolute and specific to one machine; adjust it before
# running the script elsewhere.
white <- read.csv(
  file = "C:/Users/Richy/Downloads/winequality-white.csv"
)
str(object = white)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
# Count how many wines fall into each quality score.
table(white[["quality"]])
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
#par(mfrow = c(4,3))
# Scatter every predictor against a jittered copy of quality and overlay the
# least-squares line. Predictors are selected by name (all columns except
# "quality") instead of the fixed positions 1:11, so the loop keeps plotting
# the right columns if the column count or order ever changes.
for (predictor in setdiff(names(white), "quality")) {
  # jitter() spreads the integer quality scores so overlapping points show.
  plot(white[[predictor]], jitter(white[["quality"]]), xlab = predictor,
       ylab = "quality", col = "firebrick", cex = 0.8, cex.lab = 1.3)
  # The fitted line uses the raw (un-jittered) quality values.
  abline(lm(white[["quality"]] ~ white[[predictor]]), lty = 2, lwd = 2)
}











#par(mfrow = c(1, 1))
# Remove the observation(s) holding the largest residual sugar value, then
# plot the correlation matrix of the remaining data.
# na.rm = TRUE keeps a missing value from turning the maximum into NA, and
# the length guard matters because white[-integer(0), ] selects zero rows:
# an empty match would otherwise silently discard the whole data set.
max.sug <- which(
  white$residual.sugar == max(white$residual.sugar, na.rm = TRUE)
)
if (length(max.sug) > 0) {
  white <- white[-max.sug, ]
}
# Back to a single plotting panel for the correlation plot.
par(mfrow = c(1, 1))
# Pairwise correlations over all columns (quality is still numeric here).
cor.white <- cor(white)
corrplot(cor.white, method = "number")

Modeling
# Treat quality as a categorical outcome so the models below are fitted as
# classifiers.
white$quality <- as.factor(white$quality)
# Hold out one third of the rows for testing. `list = FALSE` (spelled out,
# since `F` is an ordinary variable that can be reassigned) makes
# createDataPartition() return row indices usable directly for subsetting.
inTrain <- createDataPartition(white$quality, p = 2 / 3, list = FALSE)
train.white <- white[inTrain, ]
test.white <- white[-inTrain, ]
#k-nearest neighbours
# Resampling scheme passed to the train() calls in this script:
# repeated cross-validation with number = 5 and repeats = 5.
t.ctrl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5
)
# Candidate settings for the kknn model: every combination of five odd kmax
# values, two distance values and three kernels (5 x 2 x 3 = 30 rows).
kknn.grid <- expand.grid(
  kmax = c(3, 5, 7, 9, 11),
  distance = c(1, 2),
  kernel = c("rectangular", "gaussian", "cos")
)
# Tune the kknn classifier on the training rows: all other columns predict
# quality, predictors are centered and scaled, and each row of kknn.grid is
# evaluated under the resampling scheme in t.ctrl.
kknn.train <- train(
  quality ~ .,
  data = train.white,
  method = "kknn",
  preProcess = c("center", "scale"),
  tuneGrid = kknn.grid,
  trControl = t.ctrl
)
# Resampled performance across the tuning grid.
plot(kknn.train)

# Parameter combination selected by train().
kknn.train[["bestTune"]]
## kmax distance kernel
## 15 7 1 cos
#randomForest
# Candidate mtry values for the random forest: 1 through 11.
rf.grid <- expand.grid(mtry = seq_len(11L))
# Tune the random forest on the training rows: all other columns predict
# quality, predictors are centered and scaled, and each mtry in rf.grid is
# evaluated under the resampling scheme in t.ctrl.
rf.train <- train(
  quality ~ .,
  data = train.white,
  method = "rf",
  preProcess = c("center", "scale"),
  tuneGrid = rf.grid,
  trControl = t.ctrl
)
# Resampled performance for each mtry value.
plot(rf.train)

# mtry value selected by train().
rf.train[["bestTune"]]
## mtry
## 1 1
#SVM
# Tuning grid for the radial SVM: C in {2, 4, 8} crossed with eight evenly
# spaced sigma values from 0.25 to 2 (24 rows).
# `length.out` is written in full rather than relying on partial matching
# of the abbreviated argument name `length`.
svm.grid <- expand.grid(C = 2^(1:3), sigma = seq(0.25, 2, length.out = 8))
# Tune the radial-kernel SVM on the training rows: all other columns predict
# quality, predictors are centered and scaled, and each (C, sigma) pair in
# svm.grid is evaluated under the resampling scheme in t.ctrl.
svm.train <- train(
  quality ~ .,
  data = train.white,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneGrid = svm.grid,
  trControl = t.ctrl
)
# Resampled performance across the (C, sigma) grid.
plot(svm.train)

# (sigma, C) pair selected by train().
svm.train[["bestTune"]]
## sigma C
## 13 1.25 4
Model Selection
# Predict quality for the test rows with the tuned kknn model and compare
# the predictions with the observed quality classes.
kknn.predict <- predict(kknn.train, newdata = test.white)
confusionMatrix(data = kknn.predict, reference = test.white[["quality"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 1 3 1 0 0 0
## 4 0 20 15 12 3 0 0
## 5 3 20 300 122 12 4 0
## 6 3 11 148 504 88 15 0
## 7 0 2 17 85 173 17 0
## 8 0 0 2 8 16 22 1
## 9 0 0 0 0 1 0 0
##
## Overall Statistics
##
## Accuracy : 0.6255
## 95% CI : (0.6015, 0.6491)
## No Information Rate : 0.4494
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4403
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.37037 0.6186 0.6885 0.5904 0.37931
## Specificity 0.996919 0.98095 0.8593 0.7046 0.9094 0.98281
## Pos Pred Value 0.000000 0.40000 0.6508 0.6554 0.5884 0.44898
## Neg Pred Value 0.996305 0.97847 0.8416 0.7349 0.9101 0.97722
## Prevalence 0.003683 0.03315 0.2977 0.4494 0.1799 0.03560
## Detection Rate 0.000000 0.01228 0.1842 0.3094 0.1062 0.01351
## Detection Prevalence 0.003069 0.03069 0.2830 0.4721 0.1805 0.03008
## Balanced Accuracy 0.498460 0.67566 0.7389 0.6965 0.7499 0.68106
## Class: 9
## Sensitivity 0.0000000
## Specificity 0.9993857
## Pos Pred Value 0.0000000
## Neg Pred Value 0.9993857
## Prevalence 0.0006139
## Detection Rate 0.0000000
## Detection Prevalence 0.0006139
## Balanced Accuracy 0.4996929
# Predict quality for the test rows with the tuned random forest and compare
# the predictions with the observed quality classes.
rf.predict <- predict(rf.train, newdata = test.white)
confusionMatrix(data = rf.predict, reference = test.white[["quality"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 7 0 1 0 0 0
## 5 1 30 310 84 4 0 0
## 6 5 17 173 609 148 26 1
## 7 0 0 2 36 141 15 0
## 8 0 0 0 2 0 17 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6654
## 95% CI : (0.6419, 0.6883)
## No Information Rate : 0.4494
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4686
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.129630 0.6392 0.8320 0.48123 0.29310
## Specificity 1.000000 0.999365 0.8960 0.5875 0.96033 0.99873
## Pos Pred Value NaN 0.875000 0.7226 0.6221 0.72680 0.89474
## Neg Pred Value 0.996317 0.971006 0.8542 0.8108 0.89408 0.97453
## Prevalence 0.003683 0.033149 0.2977 0.4494 0.17986 0.03560
## Detection Rate 0.000000 0.004297 0.1903 0.3738 0.08656 0.01044
## Detection Prevalence 0.000000 0.004911 0.2634 0.6010 0.11909 0.01166
## Balanced Accuracy 0.500000 0.564497 0.7676 0.7097 0.72078 0.64592
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9993861
## Prevalence 0.0006139
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
# Predict quality for the test rows with the tuned radial SVM and compare
# the predictions with the observed quality classes.
svm.predict <- predict(svm.train, newdata = test.white)
confusionMatrix(data = svm.predict, reference = test.white[["quality"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 6 1 2 0 0 0
## 5 1 10 243 69 7 0 0
## 6 5 38 238 630 141 33 1
## 7 0 0 3 29 142 6 0
## 8 0 0 0 2 3 19 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6384
## 95% CI : (0.6146, 0.6618)
## No Information Rate : 0.4494
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4164
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.000000 0.111111 0.5010 0.8607 0.48464 0.32759
## Specificity 1.000000 0.998095 0.9240 0.4916 0.97156 0.99682
## Pos Pred Value NaN 0.666667 0.7364 0.5801 0.78889 0.79167
## Neg Pred Value 0.996317 0.970370 0.8137 0.8122 0.89579 0.97570
## Prevalence 0.003683 0.033149 0.2977 0.4494 0.17986 0.03560
## Detection Rate 0.000000 0.003683 0.1492 0.3867 0.08717 0.01166
## Detection Prevalence 0.000000 0.005525 0.2026 0.6667 0.11050 0.01473
## Balanced Accuracy 0.500000 0.554603 0.7125 0.6761 0.72810 0.66220
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9993861
## Prevalence 0.0006139
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000