## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.5.2
## Warning: package 'lightgbm' is not available (for R version 3.5.1)
## Warning: 'BiocManager' not available. Could not check Bioconductor.
##
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'lightgbm'
## Warning in pacman::p_load(tidyverse, skimr, GGally, plotly, viridis, caret, : Failed to install/load:
## lightgbm
# Fetch the UCI white-wine quality table. The file is read as
# semicolon-separated text whose first row supplies the column names.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine <- read.table(file = url, header = TRUE, sep = ";")
# Show the first six rows as a quick sanity check of the parse.
head(wine, n = 6L)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
# Bar chart of how many wines received each raw quality score.
barplot(table(wine$quality))

# Classify all wines into bad, normal, or good:
#   quality < 6  -> "bad"
#   quality == 6 -> "normal"
#   quality > 6  -> "good"
# (This sentence used to sit in the script as bare text, which is not valid R
# and stopped the file from being parsed; it is now a comment.)
wine$taste <- ifelse(
  wine$quality < 6,
  "bad",
  ifelse(wine$quality == 6, "normal", "good")
)
# Store the label as a factor; levels are in alphabetical order
# (bad, good, normal), which is the order the tables below are printed in.
wine$taste <- factor(wine$taste)

# Number of wines in each taste class.
table(wine$taste)
##
## bad good normal
## 1640 1060 2198
# Convert the raw quality score to a factor. Both models below exclude it
# from their formulas, so it is not used as a predictor.
wine[["quality"]] <- as.factor(wine[["quality"]])

# Fix the RNG state so the random split is repeatable.
set.seed(123)

# Draw 70% of the row indices (rounded down) for training;
# every row that was not drawn goes to the test set.
samp <- sample.int(nrow(wine), size = floor(0.7 * nrow(wine)))
train <- wine[samp, , drop = FALSE]
test <- wine[-samp, , drop = FALSE]
# Start the tree fit from a fixed random state.
set.seed(1)

# Single classification tree: taste explained by every column except the
# raw quality score that the taste label was derived from.
rpart_model <- rpart(formula = taste ~ . - quality, data = train)

# Static alternative to the interactive tree view below:
# rpart.plot(rpart_model)
visTree(rpart_model)

# Predict a class label for each test row (the taste column itself is
# dropped from the data handed to predict) and compare with the true labels.
rpart_result <- predict(
  rpart_model,
  newdata = test[, setdiff(colnames(test), "taste")],
  type = "class"
)
confusionMatrix(data = rpart_result, reference = test$taste)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 310 18 177
## good 5 62 36
## normal 207 221 434
##
## Overall Statistics
##
## Accuracy : 0.5483
## 95% CI : (0.5224, 0.574)
## No Information Rate : 0.4401
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2541
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.5939 0.20598 0.6708
## Specificity 0.7943 0.96493 0.4800
## Pos Pred Value 0.6139 0.60194 0.5035
## Neg Pred Value 0.7803 0.82516 0.6497
## Prevalence 0.3551 0.20476 0.4401
## Detection Rate 0.2109 0.04218 0.2952
## Detection Prevalence 0.3435 0.07007 0.5864
## Balanced Accuracy 0.6941 0.58545 0.5754
library(randomForest)

# Random forest on the same formula as the single tree: taste explained by
# every column except the raw quality score. Arguments are passed in the
# same order as before so the echoed call stays unchanged.
model <- randomForest(
  taste ~ . - quality,
  data = train,
  ntree = 100,
  nodesize = 5,
  nPerm = 2,
  keep.forest = TRUE
)

# Print the fitted forest: its call, out-of-bag error estimate and
# out-of-bag confusion matrix.
print(model)
##
## Call:
## randomForest(formula = taste ~ . - quality, data = train, ntree = 100, nodesize = 5, nPerm = 2, keep.forest = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 29.78%
## Confusion matrix:
## bad good normal class.error
## bad 804 20 294 0.2808587
## good 21 466 272 0.3860343
## normal 260 154 1137 0.2669246
# Label the held-out rows with the fitted forest, then cross-tabulate the
# predictions against the true taste classes.
pred <- predict(object = model, newdata = test)
confusionMatrix(data = pred, reference = test$taste)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 373 4 92
## good 10 193 56
## normal 139 104 499
##
## Overall Statistics
##
## Accuracy : 0.7245
## 95% CI : (0.7009, 0.7472)
## No Information Rate : 0.4401
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5616
##
## Mcnemar's Test P-Value : 7.371e-06
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.7146 0.6412 0.7713
## Specificity 0.8987 0.9435 0.7047
## Pos Pred Value 0.7953 0.7452 0.6725
## Neg Pred Value 0.8511 0.9108 0.7967
## Prevalence 0.3551 0.2048 0.4401
## Detection Rate 0.2537 0.1313 0.3395
## Detection Prevalence 0.3190 0.1762 0.5048
## Balanced Accuracy 0.8066 0.7924 0.7380
# Plot the variable-importance measures stored in the fitted forest.
# NOTE(review): the forest was fit without `importance = TRUE`, so presumably
# only the Gini-based measure is drawn - confirm against the rendered plot.
varImpPlot(x = model)