Data Exploration

library(doParallel)

## Loading required package: foreach

## Loading required package: iterators

## Loading required package: parallel

# Register a parallel backend for foreach, leaving one core free.
# detectCores() returns NA when the core count is unknown and 1 on a
# single-core machine, so clamp the request to at least one worker instead of
# passing 0 or NA to registerDoParallel().
registerDoParallel(cores = max(1L, detectCores() - 1L, na.rm = TRUE))
# Fix the random-number state for the random steps that follow.
set.seed(10)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(corrplot)

## corrplot 0.84 loaded

library(kknn)

## 
## Attaching package: 'kknn'

## The following object is masked from 'package:caret':
## 
##     contr.dummy

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(e1071)
# Load the white-wine quality CSV into a data frame, then list each column's
# type together with its first few values.
white <- read.csv(file = "C:/Users/Richy/Downloads/winequality-white.csv",
                  header = TRUE)
str(object = white)

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...

# Count how many wines received each quality score.
table(white[["quality"]])

## 
##    3    4    5    6    7    8    9 
##   20  163 1457 2198  880  175    5

# Scatter every predictor against (jittered) quality and overlay a dashed
# least-squares line. Predictors are taken from the column names instead of
# the hard-coded positions 1:11, so the loop keeps working if columns are
# added, removed, or reordered.
# NOTE: jitter() draws random numbers, so this loop advances the RNG stream
# that later random steps in the script continue from.
#par(mfrow = c(4,3))  # uncomment to tile the plots on a single page
for (predictor in setdiff(names(white), "quality")) {
  plot(white[[predictor]], jitter(white[["quality"]]), xlab = predictor,
       ylab = "quality", col = "firebrick", cex = 0.8, cex.lab = 1.3)
  abline(lm(white[["quality"]] ~ white[[predictor]]), lty = 2, lwd = 2)
}

#par(mfrow = c(1, 1))
# Drop the observation(s) holding the largest residual.sugar value.
# The removal is guarded: white[-integer(0), ] would silently drop EVERY row,
# which is what happens if which() finds no match (e.g. when the maximum is
# NA because the column contains missing values).
max.sug <- which(white$residual.sugar ==
                   max(white$residual.sugar, na.rm = TRUE))
if (length(max.sug) > 0) {
  white <- white[-max.sug, ]
}
# Reset to a single-panel layout, then display the pairwise correlations of
# all columns (quality is still numeric at this point) as numbers.
par(mfrow = c(1, 1))
cor.white <- cor(white)
corrplot(cor.white, method = "number")

Modeling

# Treat quality as a class label so the models below are fitted as
# classifiers rather than regressions.
white$quality <- as.factor(white$quality)
# Hold out roughly one third of the wines for testing. list = FALSE returns
# the row indices as a matrix that can index the data frame directly.
# (FALSE is spelled out because F is an ordinary, reassignable variable.)
inTrain <- createDataPartition(white$quality, p = 2/3, list = FALSE)
train.white <- white[inTrain, ]
test.white <- white[-inTrain, ]
# k-nearest neighbours
# Resampling scheme reused by the models in this script: 5-fold
# cross-validation, repeated 5 times.
t.ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
# Candidate settings: odd kmax from 3 to 11, distance parameter 1 or 2, and
# three weighting kernels.
kknn.grid <- expand.grid(
  kmax = seq(3, 11, by = 2),
  distance = c(1, 2),
  kernel = c("rectangular", "gaussian", "cos")
)
# Predictors are centred and scaled as part of the training call.
kknn.train <- train(
  quality ~ .,
  data = train.white,
  method = "kknn",
  preProcess = c("center", "scale"),
  tuneGrid = kknn.grid,
  trControl = t.ctrl
)
plot(kknn.train)

kknn.train[["bestTune"]]

##    kmax distance kernel
## 15    7        1    cos

# Random forest
# Tune mtry over every value from 1 to 11 (one per predictor column).
rf.grid <- expand.grid(mtry = seq_len(11))
rf.train <- train(
  quality ~ .,
  data = train.white,
  method = "rf",
  preProcess = c("center", "scale"),
  tuneGrid = rf.grid,
  trControl = t.ctrl
)
plot(rf.train)

rf.train[["bestTune"]]

##   mtry
## 1    1

# SVM with a radial kernel
# Grid: cost C in {2, 4, 8} crossed with eight evenly spaced sigma values
# from 0.25 to 2. The seq() argument is spelled out as length.out instead of
# relying on partial matching of `length`.
svm.grid <- expand.grid(C = 2^(1:3), sigma = seq(0.25, 2, length.out = 8))
svm.train <- train(quality ~ ., data = train.white, method = "svmRadial",
                   trControl = t.ctrl, tuneGrid = svm.grid,
                   preProcess = c("center", "scale"))
plot(svm.train)

# Show the (sigma, C) pair selected by resampling.
svm.train$bestTune

##    sigma C
## 13  1.25 4

Model Selection

# Classify the held-out wines with the tuned k-NN model and compare the
# predictions against the true quality labels.
kknn.predict <- predict(kknn.train, newdata = test.white)
confusionMatrix(data = kknn.predict, reference = test.white[["quality"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   1   3   1   0   0   0
##          4   0  20  15  12   3   0   0
##          5   3  20 300 122  12   4   0
##          6   3  11 148 504  88  15   0
##          7   0   2  17  85 173  17   0
##          8   0   0   2   8  16  22   1
##          9   0   0   0   0   1   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6255          
##                  95% CI : (0.6015, 0.6491)
##     No Information Rate : 0.4494          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4403          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000  0.37037   0.6186   0.6885   0.5904  0.37931
## Specificity          0.996919  0.98095   0.8593   0.7046   0.9094  0.98281
## Pos Pred Value       0.000000  0.40000   0.6508   0.6554   0.5884  0.44898
## Neg Pred Value       0.996305  0.97847   0.8416   0.7349   0.9101  0.97722
## Prevalence           0.003683  0.03315   0.2977   0.4494   0.1799  0.03560
## Detection Rate       0.000000  0.01228   0.1842   0.3094   0.1062  0.01351
## Detection Prevalence 0.003069  0.03069   0.2830   0.4721   0.1805  0.03008
## Balanced Accuracy    0.498460  0.67566   0.7389   0.6965   0.7499  0.68106
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          0.9993857
## Pos Pred Value       0.0000000
## Neg Pred Value       0.9993857
## Prevalence           0.0006139
## Detection Rate       0.0000000
## Detection Prevalence 0.0006139
## Balanced Accuracy    0.4996929

# Classify the held-out wines with the tuned random forest and compare the
# predictions against the true quality labels.
rf.predict <- predict(rf.train, newdata = test.white)
confusionMatrix(data = rf.predict, reference = test.white[["quality"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0   7   0   1   0   0   0
##          5   1  30 310  84   4   0   0
##          6   5  17 173 609 148  26   1
##          7   0   0   2  36 141  15   0
##          8   0   0   0   2   0  17   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6654          
##                  95% CI : (0.6419, 0.6883)
##     No Information Rate : 0.4494          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4686          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000 0.129630   0.6392   0.8320  0.48123  0.29310
## Specificity          1.000000 0.999365   0.8960   0.5875  0.96033  0.99873
## Pos Pred Value            NaN 0.875000   0.7226   0.6221  0.72680  0.89474
## Neg Pred Value       0.996317 0.971006   0.8542   0.8108  0.89408  0.97453
## Prevalence           0.003683 0.033149   0.2977   0.4494  0.17986  0.03560
## Detection Rate       0.000000 0.004297   0.1903   0.3738  0.08656  0.01044
## Detection Prevalence 0.000000 0.004911   0.2634   0.6010  0.11909  0.01166
## Balanced Accuracy    0.500000 0.564497   0.7676   0.7097  0.72078  0.64592
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          1.0000000
## Pos Pred Value             NaN
## Neg Pred Value       0.9993861
## Prevalence           0.0006139
## Detection Rate       0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy    0.5000000

# Classify the held-out wines with the tuned radial SVM and compare the
# predictions against the true quality labels.
svm.predict <- predict(svm.train, newdata = test.white)
confusionMatrix(data = svm.predict, reference = test.white[["quality"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0   6   1   2   0   0   0
##          5   1  10 243  69   7   0   0
##          6   5  38 238 630 141  33   1
##          7   0   0   3  29 142   6   0
##          8   0   0   0   2   3  19   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6384          
##                  95% CI : (0.6146, 0.6618)
##     No Information Rate : 0.4494          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4164          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000 0.111111   0.5010   0.8607  0.48464  0.32759
## Specificity          1.000000 0.998095   0.9240   0.4916  0.97156  0.99682
## Pos Pred Value            NaN 0.666667   0.7364   0.5801  0.78889  0.79167
## Neg Pred Value       0.996317 0.970370   0.8137   0.8122  0.89579  0.97570
## Prevalence           0.003683 0.033149   0.2977   0.4494  0.17986  0.03560
## Detection Rate       0.000000 0.003683   0.1492   0.3867  0.08717  0.01166
## Detection Prevalence 0.000000 0.005525   0.2026   0.6667  0.11050  0.01473
## Balanced Accuracy    0.500000 0.554603   0.7125   0.6761  0.72810  0.66220
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          1.0000000
## Pos Pred Value             NaN
## Neg Pred Value       0.9993861
## Prevalence           0.0006139
## Detection Rate       0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy    0.5000000

KNN

Richy Varghese

May 9, 2019

Data Exploration

Modeling

Model Selection