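The analysis loads its packages through pacman. A minimal sketch of that setup chunk, assuming the package list from the original call plus the packages actually used below (the original also attempted lightgbm, which fails to load and is not used here):

# Install pacman if missing, then install/load everything in one call
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, skimr, GGally, plotly, viridis, caret,
               rpart, visNetwork, randomForest)
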
# UCI white-wine quality data, semicolon-separated
url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
wine <- read.table(url, sep = ';', header = TRUE)
head(wine)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.0             0.27        0.36           20.7     0.045
## 2           6.3             0.30        0.34            1.6     0.049
## 3           8.1             0.28        0.40            6.9     0.050
## 4           7.2             0.23        0.32            8.5     0.058
## 5           7.2             0.23        0.32            8.5     0.058
## 6           8.1             0.28        0.40            6.9     0.050
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  45                  170  1.0010 3.00      0.45     8.8
## 2                  14                  132  0.9940 3.30      0.49     9.5
## 3                  30                   97  0.9951 3.26      0.44    10.1
## 4                  47                  186  0.9956 3.19      0.40     9.9
## 5                  47                  186  0.9956 3.19      0.40     9.9
## 6                  30                   97  0.9951 3.26      0.44    10.1
##   quality
## 1       6
## 2       6
## 3       6
## 4       6
## 5       6
## 6       6
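A quick structural check confirms the import parsed correctly; a sketch using skimr, which the setup loads (str() or summary() would serve equally well):

# Expect 4898 rows and 12 numeric columns (11 predictors plus quality)
dim(wine)
skimr::skim(wine)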

Distribution of Wine Quality

barplot(table(wine$quality))

# Label each wine by quality: below 6 is 'bad', exactly 6 is 'normal',
# above 6 is 'good'
wine$taste <- ifelse(wine$quality < 6, 'bad', 'good')
wine$taste[wine$quality == 6] <- 'normal'
wine$taste <- as.factor(wine$taste)

Class counts after grouping all wines into bad, normal, or good

table(wine$taste)
## 
##    bad   good normal 
##   1640   1060   2198
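
The same three-way labeling can be written in one step with cut(), which also returns a factor; an equivalent sketch, written to a hypothetical taste_cut column so the original factor and its alphabetical level order stay untouched:

# quality 3-5 -> 'bad', 6 -> 'normal', 7-9 -> 'good'
wine$taste_cut <- cut(wine$quality,
                      breaks = c(-Inf, 5, 6, Inf),
                      labels = c('bad', 'normal', 'good'))
table(wine$taste_cut)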

Training and Testing Datasets

wine$quality <- as.factor(wine$quality)  # treat quality as categorical

set.seed(123)
samp <- sample(nrow(wine), 0.7 * nrow(wine))  # random 70/30 split
train <- wine[samp, ]
test <- wine[-samp, ]
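
sample() draws rows uniformly, so class proportions can drift slightly between train and test. A stratified alternative with caret, shown as a sketch rather than what the analysis below uses (idx, train_s, and test_s are illustrative names):

# createDataPartition samples within each taste class, preserving proportions
set.seed(123)
idx <- caret::createDataPartition(wine$taste, p = 0.7, list = FALSE)
train_s <- wine[idx, ]
test_s  <- wine[-idx, ]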

Decision Tree

library(rpart)       # recursive partitioning trees
library(visNetwork)  # provides visTree() for interactive tree plots

set.seed(1)
rpart_model <- rpart(taste ~ . - quality, data = train)
#rpart.plot(rpart_model)
visTree(rpart_model)
rpart_result <- predict(rpart_model,
                        newdata = test[, !colnames(test) %in% c("taste")],
                        type = 'class')
confusionMatrix(rpart_result, test$taste)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad    310   18    177
##     good     5   62     36
##     normal 207  221    434
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5483         
##                  95% CI : (0.5224, 0.574)
##     No Information Rate : 0.4401         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.2541         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity              0.5939     0.20598        0.6708
## Specificity              0.7943     0.96493        0.4800
## Pos Pred Value           0.6139     0.60194        0.5035
## Neg Pred Value           0.7803     0.82516        0.6497
## Prevalence               0.3551     0.20476        0.4401
## Detection Rate           0.2109     0.04218        0.2952
## Detection Prevalence     0.3435     0.07007        0.5864
## Balanced Accuracy        0.6941     0.58545        0.5754
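
Accuracy of about 0.55 against a no-information rate of 0.44 leaves room for improvement. One standard step is to inspect rpart's cross-validation table and prune; a sketch (best_cp and pruned_model are illustrative names):

# Cross-validated error (xerror) at each complexity parameter value
printcp(rpart_model)
# Prune back to the cp that minimizes cross-validated error
best_cp <- rpart_model$cptable[which.min(rpart_model$cptable[, "xerror"]), "CP"]
pruned_model <- prune(rpart_model, cp = best_cp)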

Random Forest

Tuning Hyperparameters

The main parameters for increasing predictive power:

ntree = number of trees
maxnodes = maximum number of terminal nodes
nodesize = minimum size of terminal nodes
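
Another influential parameter is mtry, the number of variables tried at each split (3 in the fit below). randomForest ships tuneRF to search over it by out-of-bag error; a sketch with illustrative settings, not part of the original analysis:

# Grow trial forests at varying mtry; stepFactor scales mtry between trials
# and improve is the minimum relative OOB gain needed to keep searching
set.seed(1)
tuned <- randomForest::tuneRF(x = train[, !colnames(train) %in% c("taste", "quality")],
                              y = train$taste,
                              ntreeTry = 100,
                              stepFactor = 1.5,
                              improve = 0.01)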

library(randomForest)
model <- randomForest(taste ~ . - quality, data = train,
                      ntree = 100,        # number of trees
                      nodesize = 5,       # minimum terminal-node size
                      nPerm = 2,          # OOB permutations; only used by regression forests
                      keep.forest = TRUE) # retain the forest for prediction
model
## 
## Call:
##  randomForest(formula = taste ~ . - quality, data = train, ntree = 100,      nodesize = 5, nPerm = 2, keep.forest = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 29.78%
## Confusion matrix:
##        bad good normal class.error
## bad    804   20    294   0.2808587
## good    21  466    272   0.3860343
## normal 260  154   1137   0.2669246
pred <- predict(model, newdata = test)
confusionMatrix(pred, test$taste)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad    373    4     92
##     good    10  193     56
##     normal 139  104    499
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7245          
##                  95% CI : (0.7009, 0.7472)
##     No Information Rate : 0.4401          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5616          
##                                           
##  Mcnemar's Test P-Value : 7.371e-06       
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity              0.7146      0.6412        0.7713
## Specificity              0.8987      0.9435        0.7047
## Pos Pred Value           0.7953      0.7452        0.6725
## Neg Pred Value           0.8511      0.9108        0.7967
## Prevalence               0.3551      0.2048        0.4401
## Detection Rate           0.2537      0.1313        0.3395
## Detection Prevalence     0.3190      0.1762        0.5048
## Balanced Accuracy        0.8066      0.7924        0.7380
varImpPlot(model)
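
varImpPlot() charts each predictor's importance; the underlying numbers can be pulled out directly when a table is preferred (a sketch; imp is an illustrative name):

# Mean decrease in Gini impurity, strongest predictors first
imp <- importance(model)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]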