# Housekeeping
rm(list=ls()) # clear workspace
cat("\014")  # clear console
graphics.off() # shuts down all open graphics devices 

1 Business Questions

This dataset provides a comprehensive overview of 1,599 red wines. There are three questions our team would like to answer:
1. What variables determine the quality of the wine?
2. Use chemical properties to classify each wine as bad, normal, or good in terms of quality.
3. Use chemical properties to predict the precise numerical quality rating of the wine.

2 Load Data & Libraries

2.1 Import Dataset

Credit to the UCI Machine Learning Repository.
Link: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
Citation: P. Cortez, A. Cerdeira, F. Almeida, T. Matos, and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

# This dataset is for red wines
Xred <- read.csv("/Users/thuhuong/Desktop/winequality-red.csv")

2.2 Libraries

library(ggplot2)
library(moments)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.90 loaded
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
library(dbscan)
library(dplyr)
library(nnet)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ISLR)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(class)
library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(lightgbm)
## Loading required package: R6
## 
## Attaching package: 'lightgbm'
## The following objects are masked from 'package:xgboost':
## 
##     getinfo, setinfo, slice
## The following object is masked from 'package:dplyr':
## 
##     slice
library(adabag)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-2
library(gbm)
## Loaded gbm 2.1.8

2.3 First look at the data

dim(Xred) # This dataset has 1599 rows and 12 columns
## [1] 1599   12
head(Xred) # Show the first six rows
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5
str(Xred) # Structure of the dataset
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
summary(Xred) # Summary of the dataset
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000

3 Exploratory Data Analysis

# Separate dataset into different components to use in EDA
drop <- c("quality")
Xred_train = Xred[,!(names(Xred) %in% drop)]
yred <- subset(Xred, select = c("quality"))

3.1 Overall Dataset

# Duplicated Rows
sum(ifelse(duplicated(Xred) == TRUE,1,0)) #240 duplicated records in red
## [1] 240
# In our case, duplicated rows are acceptable, as wines with identical physicochemical values should receive the same quality rating.
# Missing or Null Values
sum(is.na(Xred)) # No irregularities found. Great dataset!
## [1] 0
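Returning to the duplicates for a moment: we keep them in the main analysis, but if a deduplicated copy were ever needed, a minimal sketch (not part of the original pipeline) would be:

# Sketch: build a deduplicated copy for comparison only
Xred_unique <- Xred[!duplicated(Xred), ]
dim(Xred_unique) # should have 1599 - 240 = 1359 rows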

3.2 Features

3.2.1 Dependent Feature

summary(Xred$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.636   6.000   8.000
ggplot(Xred, aes(quality)) + geom_boxplot() # Visualize the distribution of quality

print(skewness(Xred$quality)) # Skewness ~0.22: a slight right skew (note: e1071 masks moments::skewness)
## [1] 0.2173931
print(kurtosis(Xred$quality)) # Excess kurtosis ~0.29 (e1071's kurtosis, which masks moments::kurtosis)
## [1] 0.2879148
hist(Xred$quality) # Visualize the distribution of quality ratings; 5 and 6 are by far the most common

# The dataset is imbalanced, which matters for the classification task later on. We will have to address this with an undersampling or oversampling method.
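To quantify that imbalance, a quick sketch (not part of the original pipeline) shows the share of each quality score:

# Sketch: share of each quality rating, in percent
round(prop.table(table(Xred$quality)) * 100, 1)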

3.2.2 Independent Features

There are 11 independent numerical features that may determine the quality rating. To guide our exploration, we use two methods to rank feature importance:
1. Correlation
2. Recursive Feature Elimination

# Using Correlation
correlation_matrix = cor(Xred_train, yred)
correlation_matrix = correlation_matrix[order(correlation_matrix, decreasing = TRUE),]
head(correlation_matrix,5)
##        alcohol      sulphates    citric.acid  fixed.acidity residual.sugar 
##     0.47616632     0.25139708     0.22637251     0.12405165     0.01373164
# According to this ranking, the five features with the largest positive correlations with quality are alcohol, sulphates, citric.acid, fixed.acidity, and residual.sugar. Note that sorting by signed correlation pushes strongly negative correlates (e.g., volatile.acidity) to the bottom of the list.
# Using Recursive Feature Elimination
subsets <- c(1:5, 10, 15, 20, 25)
ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)
lmProfile <- rfe(Xred_train, as.matrix(yred),
                 sizes = subsets,
                 rfeControl = ctrl)
lmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE  RMSESD RsquaredSD   MAESD Selected
##          1 0.7952  0.03385 0.6629 0.03550    0.02213 0.02018         
##          2 0.7897  0.04798 0.6542 0.03907    0.03767 0.02838         
##          3 0.7293  0.18803 0.5878 0.03619    0.05751 0.02903         
##          4 0.7059  0.23909 0.5663 0.03721    0.06031 0.03033         
##          5 0.7056  0.23991 0.5675 0.03712    0.06014 0.03017         
##         10 0.6536  0.34775 0.5112 0.04020    0.05210 0.03012         
##         11 0.6502  0.35434 0.5044 0.03929    0.05034 0.02920        *
## 
## The top 5 variables (out of 11):
##    density, chlorides, volatile.acidity, sulphates, pH
# According to recursive feature elimination, the five most important features are density, chlorides, volatile.acidity, sulphates, and pH.
# Combining the two methods, the important features appear to be: alcohol, sulphates, citric.acid, fixed.acidity, residual.sugar, density, chlorides, volatile.acidity, and pH. We will plot some of them below.
# That leaves free.sulfur.dioxide and total.sulfur.dioxide as the least important variables.
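As a small supplementary check (a sketch using caret's accessor for rfe objects), we can also list the predictors retained at the selected subset size:

# Sketch: predictors retained by RFE at the winning subset size
predictors(lmProfile)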
# Alcohol
ggplot(Xred, aes(x=alcohol, y=quality)) + geom_point() + geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Sulphates
ggplot(Xred, aes(x=sulphates, y=quality)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Citric.acid
ggplot(Xred, aes(x=citric.acid, y=quality)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Density
ggplot(Xred, aes(x=density, y=quality)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Volatile.acidity
ggplot(Xred, aes(x=volatile.acidity, y=quality)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

3.3 Correlation

cor.table.r = cor(Xred)
corrplot(cor.table.r)

Let’s take a look at a few interesting correlations.

# Density vs. Alcohol
ggplot(Xred, aes(x=density, y=alcohol)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# This is a self-explanatory physical relationship: ethanol is less dense than water, so more alcoholic wines tend to have lower density.
# Fixed.acidity vs. citric.acid
ggplot(Xred, aes(x=fixed.acidity, y=citric.acid)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# The predominant fixed acids found in wine are tartaric, malic, citric, and succinic. Since citric acid is one component of fixed acidity, the correlation between the two is high.
# Fixed.acidity vs. density
ggplot(Xred, aes(x=fixed.acidity, y=density)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Another high correlation is between density and fixed.acidity. The main fixed acids are all denser than water, so, all else being equal, a higher acid concentration makes the wine denser.
# Fixed.acidity vs. pH
ggplot(Xred, aes(x=fixed.acidity, y=pH)) + geom_point() + 
  geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# There is a negative correlation between fixed.acidity and pH, which makes sense: higher acidity means a lower pH. All wines lie on the acidic side of the pH scale, most between about 2.5 and 4.5; in our data, pH ranges from 2.74 to 4.01.

4 Feature Engineering

4.1 Clustering the Quality of Wine

The most important part of our project is to cluster the wines into different categories: bad, normal, and good. To aid the classification task, we can add a new feature that groups wines by common physicochemical properties.
For clustering, we could simply use the most popular algorithm, k-means. However, we want to experiment with several clustering techniques and compare their results before settling on the final clusters. We therefore use four clustering methods:
1. K-Means Clustering
2. Hierarchical Clustering
3. Density Based Clustering
4. Expectation-Maximization Clustering (EM)

4.1.1 K-Means Clustering

Guidance for performing K-Means Clustering comes from the University of Cincinnati’s Business Analytics program.
Citation: https://uc-r.github.io/kmeans_clustering

# Scale the dataset before applying the k-means function
Xred_km <- scale(Xred)
head(Xred_km)
##      fixed.acidity volatile.acidity citric.acid residual.sugar   chlorides
## [1,]    -0.5281944        0.9615758   -1.391037    -0.45307667 -0.24363047
## [2,]    -0.2984541        1.9668271   -1.391037     0.04340257  0.22380518
## [3,]    -0.2984541        1.2966596   -1.185699    -0.16937425  0.09632273
## [4,]     1.6543385       -1.3840105    1.483689    -0.45307667 -0.26487754
## [5,]    -0.5281944        0.9615758   -1.391037    -0.45307667 -0.24363047
## [6,]    -0.5281944        0.7381867   -1.391037    -0.52400227 -0.26487754
##      free.sulfur.dioxide total.sulfur.dioxide    density         pH   sulphates
## [1,]         -0.46604672           -0.3790141 0.55809987  1.2882399 -0.57902538
## [2,]          0.87236532            0.6241680 0.02825193 -0.7197081  0.12891007
## [3,]         -0.08364328            0.2289750 0.13422152 -0.3310730 -0.04807379
## [4,]          0.10755844            0.4113718 0.66406945 -0.9787982 -0.46103614
## [5,]         -0.46604672           -0.3790141 0.55809987  1.2882399 -0.57902538
## [6,]         -0.27484500           -0.1966174 0.55809987  1.2882399 -0.57902538
##         alcohol    quality
## [1,] -0.9599458 -0.7875763
## [2,] -0.5845942 -0.7875763
## [3,] -0.5845942 -0.7875763
## [4,] -0.5845942  0.4507074
## [5,] -0.9599458 -0.7875763
## [6,] -0.9599458 -0.7875763
# To get a first sense of possible clusters, we compute the pairwise distances between observations and visualize them.
distance <- get_dist(Xred_km)
fviz_dist(distance, show_labels = FALSE, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# There seems to be 2 to 4 clusters in the data.
# Set center to 3, based on EDA
kmc3 <- kmeans(Xred_km, centers = 3, nstart = 25)
kmc3
## K-means clustering with 3 clusters of sizes 704, 506, 389
## 
## Cluster means:
##   fixed.acidity volatile.acidity citric.acid residual.sugar    chlorides
## 1   -0.64820431       0.49325980  -0.7788889    -0.22866362 -0.172706284
## 2    0.94775088      -0.72929480   1.0064621     0.03702487  0.241189151
## 3   -0.05970723       0.05595957   0.1004318     0.36566736 -0.001173487
##   free.sulfur.dioxide total.sulfur.dioxide    density         pH  sulphates
## 1          -0.2584815           -0.3700325 -0.4266647  0.6202733 -0.3150580
## 2          -0.4276930           -0.4643849  0.3636687 -0.7275167  0.5872260
## 3           1.0241225            1.2737317  0.2991146 -0.1762184 -0.1936646
##       alcohol    quality
## 1  0.03241658 -0.1613987
## 2  0.38154044  0.5632786
## 3 -0.55496332 -0.4406022
## 
## Clustering vector:
##    [1] 1 3 1 2 1 1 1 1 1 3 1 3 1 2 3 3 3 2 1 2 3 3 2 3 1 1 1 2 1 1 1 1 3 3 1 1 1
##   [38] 2 1 3 3 3 2 1 1 1 3 2 1 3 1 1 1 3 3 3 2 3 1 1 3 3 1 1 1 1 1 1 2 1 1 3 3 1
##   [75] 3 2 2 1 1 3 1 2 3 2 1 1 2 1 3 1 3 2 2 1 1 1 1 1 1 1 1 1 1 3 1 3 2 1 3 3 1
##  [112] 3 3 2 1 2 1 1 1 3 1 1 1 1 3 3 1 1 1 1 3 1 1 1 1 3 1 1 3 3 3 1 1 1 1 3 1 3
##  [149] 1 1 2 2 3 3 3 3 3 3 1 1 1 1 1 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1
##  [186] 3 3 1 3 3 3 1 3 1 1 3 1 2 1 1 2 3 1 1 1 2 2 3 3 2 2 1 2 3 1 3 1 1 1 3 3 3
##  [223] 1 1 3 3 2 1 3 1 1 1 3 1 1 1 1 1 1 1 2 2 3 2 2 1 1 3 1 1 2 1 2 3 1 3 2 1 2
##  [260] 2 3 1 1 3 2 2 3 2 1 2 3 2 2 1 3 3 1 2 2 2 2 2 1 2 3 3 2 3 2 3 2 2 2 1 2 2
##  [297] 3 1 1 1 1 2 1 1 3 2 1 2 2 1 2 3 3 3 3 1 3 3 2 3 2 3 1 3 3 3 2 2 2 2 2 2 3
##  [334] 1 1 2 2 3 2 2 2 2 2 2 2 3 1 2 2 1 2 1 1 2 3 1 2 2 2 2 3 2 2 2 2 2 2 2 2 2
##  [371] 1 2 2 3 2 2 2 2 2 3 2 2 2 2 3 1 3 1 3 2 1 2 2 3 2 2 3 2 2 1 3 1 2 2 1 2 2
##  [408] 2 2 3 3 3 3 2 3 3 2 3 2 1 2 1 1 2 1 1 1 1 1 2 2 3 2 2 2 2 3 2 2 1 2 2 2 2
##  [445] 1 1 2 2 1 2 2 2 1 2 1 2 1 3 2 3 2 1 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 2
##  [482] 2 2 2 2 2 2 2 2 2 3 2 2 3 3 2 1 3 2 3 1 2 2 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2
##  [519] 2 3 2 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 1 2 2 3 2 1 2 2 3 1 2 2 2 1 2 2 1 2
##  [556] 2 2 2 2 2 2 3 3 3 2 2 1 1 2 1 2 1 2 3 2 2 1 3 3 2 2 2 2 2 3 1 2 3 1 2 3 3
##  [593] 3 2 1 3 2 2 1 2 1 2 1 2 3 1 2 3 2 1 3 2 1 2 2 3 3 2 2 2 3 3 2 1 3 3 1 1 1
##  [630] 3 1 2 1 3 3 1 3 3 1 2 3 3 3 3 3 1 1 1 2 3 2 3 2 2 2 3 2 2 1 1 1 3 1 2 2 3
##  [667] 2 2 2 2 3 1 3 1 2 2 2 1 3 2 2 1 3 1 3 1 1 3 1 2 1 3 2 3 3 1 1 1 3 2 3 1 1
##  [704] 3 1 1 1 1 1 2 3 3 1 1 3 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 2 1 1 3 1 1 1 1 3 1
##  [741] 1 3 1 3 3 1 3 3 1 1 1 1 3 1 2 1 1 1 1 3 3 1 1 1 1 1 3 3 3 1 3 3 3 2 2 1 1
##  [778] 1 1 3 1 1 3 1 1 3 3 3 3 3 3 3 1 1 2 2 3 2 2 2 3 1 1 1 1 2 2 2 1 1 1 2 2 1
##  [815] 2 2 2 2 1 3 1 1 1 1 1 1 2 1 1 1 1 1 3 3 1 1 3 3 2 1 2 1 3 3 2 1 1 1 1 1 2
##  [852] 2 3 3 3 1 3 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 2 2 1 1 3 1 1 1 2 1 3 1 1 2
##  [889] 1 3 3 1 2 1 1 1 2 1 2 1 2 1 1 1 1 3 1 1 1 1 2 2 2 2 1 2 1 3 3 1 2 3 1 3 2
##  [926] 3 3 3 2 2 1 1 3 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 1
##  [963] 1 2 2 2 2 3 2 1 2 2 2 1 2 3 3 3 1 2 2 1 3 2 2 1 2 3 1 2 1 3 1 3 3 1 1 1 1
## [1000] 1 1 2 2 1 3 1 2 2 2 3 2 2 1 1 1 2 2 3 3 1 2 2 1 2 1 1 1 1 3 1 1 1 1 1 1 2
## [1037] 2 1 2 2 1 1 2 2 1 1 1 1 2 2 1 2 1 2 3 3 2 3 2 2 2 2 2 2 1 1 1 2 2 3 2 3 3
## [1074] 1 3 3 2 2 2 3 2 3 3 2 3 3 2 2 2 2 2 2 1 2 1 2 1 1 2 1 2 1 1 1 1 1 2 2 1 2
## [1111] 1 1 2 2 1 1 1 1 1 1 2 1 1 2 1 2 1 1 3 2 1 3 2 1 2 2 2 2 3 3 3 3 1 1 3 3 1
## [1148] 2 2 2 2 1 1 2 1 1 3 1 2 2 2 2 2 1 1 2 2 2 1 1 2 1 2 3 3 1 1 1 1 2 2 2 3 1
## [1185] 3 1 1 1 3 1 2 1 2 1 1 1 3 1 2 3 1 2 2 3 2 2 2 3 2 1 1 1 1 2 2 2 3 3 2 2 2
## [1222] 2 3 2 2 3 3 1 1 3 2 3 3 3 1 3 1 1 1 1 3 3 2 3 3 1 1 1 1 1 1 3 1 1 1 1 3 1
## [1259] 1 1 2 1 3 1 1 1 1 2 1 1 1 1 1 3 1 3 2 1 3 2 1 1 1 3 1 2 2 1 3 3 1 1 1 1 1
## [1296] 3 3 1 1 1 1 1 2 3 3 3 3 1 3 3 3 1 1 1 3 3 1 2 3 2 3 1 2 2 1 1 1 1 1 3 3 3
## [1333] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 1 1 1 1 1 1 3 2 2 1 2 1 1 1 1 3 3
## [1370] 1 2 2 2 3 1 3 1 1 1 1 1 1 3 3 3 3 1 1 1 3 1 1 1 1 3 1 1 3 1 1 3 3 2 2 1 2
## [1407] 2 1 2 1 1 1 2 3 2 1 2 2 1 3 1 3 1 1 2 2 2 1 1 3 1 3 1 1 3 3 3 1 1 3 2 3 1
## [1444] 1 3 3 1 1 3 2 2 2 1 3 2 1 3 3 1 2 3 1 1 1 1 1 3 1 3 1 1 1 2 1 3 1 3 1 1 2
## [1481] 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 3 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1
## [1518] 1 1 1 1 1 1 3 1 1 1 1 2 3 1 1 1 3 1 1 1 1 1 3 1 2 1 2 2 1 1 1 2 2 1 1 1 1
## [1555] 1 1 1 1 3 3 3 3 1 1 1 1 2 1 1 1 2 1 3 1 3 1 2 1 1 1 1 1 1 3 2 2 3 1 3 3 1
## [1592] 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 4686.527 5609.049 3731.160
##  (between_SS / total_SS =  26.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Let's Visualize the result
fviz_cluster(kmc3, data = Xred_km)

# Let's try different cluster settings to make sure. This block creates a set of clustering objects, one for each value passed to the centers argument.
kmc2 <- kmeans(Xred_km, centers = 2, nstart = 25)
kmc4 <- kmeans(Xred_km, centers = 4, nstart = 25)
kmc5 <- kmeans(Xred_km, centers = 5, nstart = 25)
kmc6 <- kmeans(Xred_km, centers = 6, nstart = 25)
kmc7 <- kmeans(Xred_km, centers = 7, nstart = 25)
kmc8 <- kmeans(Xred_km, centers = 8, nstart = 25)
kmc9 <- kmeans(Xred_km, centers = 9, nstart = 25)
# Plots to compare
p1 <- fviz_cluster(kmc2, geom = "point", data = Xred_km) + ggtitle("k = 2")
p2 <- fviz_cluster(kmc3, geom = "point", data = Xred_km) + ggtitle("k = 3")
p3 <- fviz_cluster(kmc4, geom = "point", data = Xred_km) + ggtitle("k = 4")
p4 <- fviz_cluster(kmc5, geom = "point", data = Xred_km) + ggtitle("k = 5")
p5 <- fviz_cluster(kmc6, geom = "point", data = Xred_km) + ggtitle("k = 6")
p6 <- fviz_cluster(kmc7, geom = "point", data = Xred_km) + ggtitle("k = 7")
p7 <- fviz_cluster(kmc8, geom = "point", data = Xred_km) + ggtitle("k = 8")
p8 <- fviz_cluster(kmc9, geom = "point", data = Xred_km) + ggtitle("k = 9")
# Assign these plots to a grid
grid.arrange(p1, p2, p3, p4, nrow = 2)

grid.arrange(p5, p6, p7, p8, nrow = 2)

# More than 5 clusters doesn't make sense, and 4 already looks like too many. The answer is likely 2 or 3 clusters.
# Determine Optimal Number of Clusters
# 1. Elbow Method
set.seed(123)
fviz_nbclust(Xred_km, kmeans, method = "wss")

# The elbow appears to be around 3, but it is not very clear. Let's try two other methods.
# 2. Silhouette Method
fviz_nbclust(Xred_km, kmeans, method = "silhouette")

# Interestingly, this suggests the optimal number of clusters is 2, not 3 as in our original hypothesis.
# 3. Gap Statistic
# Published by three Stanford professors. http://web.stanford.edu/~hastie/Papers/gap.pdf
set.seed(123)
gap_stat <- clusGap(Xred_km, FUN = kmeans, nstart = 25, K.max = 9, B = 50)
## Warning: did not converge in 10 iterations
## (the same warning is repeated for several of the k-means runs inside clusGap)
fviz_gap_stat(gap_stat)

# The gap statistic points to 3 clusters.
# Let's proceed with 2 clusters: the silhouette method favors 2, and 2 sits within the 2-3 range suggested by the methods overall.
set.seed(123)
finalclusters <- kmeans(Xred_km, 2, nstart = 25)
print(finalclusters)
## K-means clustering with 2 clusters of sizes 585, 1014
## 
## Cluster means:
##   fixed.acidity volatile.acidity citric.acid residual.sugar  chlorides
## 1     0.8858867       -0.7295470   0.9891242     0.12584600  0.2323040
## 2    -0.5110885        0.4208925  -0.5706485    -0.07260346 -0.1340215
##   free.sulfur.dioxide total.sulfur.dioxide    density         pH  sulphates
## 1          -0.1781002           -0.2488421  0.3902513 -0.6740906  0.5693024
## 2           0.1027501            0.1435627 -0.2251450  0.3888984 -0.3284437
##      alcohol    quality
## 1  0.2903706  0.4676411
## 2 -0.1675215 -0.2697930
## 
## Clustering vector:
##    [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2
##   [38] 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2
##   [75] 1 1 1 2 2 2 2 1 2 1 2 2 1 2 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2
##  [112] 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1
##  [149] 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2
##  [186] 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2
##  [223] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1
##  [260] 1 2 2 2 2 1 1 2 1 2 1 2 1 1 2 2 2 2 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1
##  [297] 2 2 2 2 2 1 2 2 2 1 2 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 2
##  [334] 2 2 1 1 2 1 1 1 1 1 1 1 2 2 1 1 2 1 2 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1
##  [371] 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 1 1 2 1 1 2 1 1 2 2 2 1 1 2 1 1
##  [408] 1 1 1 2 2 2 1 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1
##  [445] 2 2 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 2 2 1
##  [482] 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [519] 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 1 1 2 1 1 2 1
##  [556] 1 1 1 1 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 1 2 2 2 1 1 1 1 1 1 2 1 2 2 1 1 2
##  [593] 1 1 2 2 1 1 2 1 2 1 2 1 2 2 1 1 1 2 1 1 2 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2
##  [630] 2 2 1 2 2 2 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 1 2 2 2 2 2 1 1 2
##  [667] 1 1 1 1 2 2 2 2 1 1 1 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2
##  [704] 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
##  [741] 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2
##  [778] 2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 2 2 1 1 2
##  [815] 1 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1
##  [852] 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 1 2 2 2 2 1
##  [889] 2 1 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 2 2 1
##  [926] 1 1 2 1 1 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 2
##  [963] 2 1 1 1 1 2 1 2 1 1 1 2 1 2 2 2 2 1 1 2 2 1 1 2 1 2 2 1 2 2 2 2 1 2 2 2 2
## [1000] 2 2 1 1 2 2 2 1 1 1 1 1 1 2 2 2 1 1 1 1 2 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1
## [1037] 1 2 1 1 2 2 1 1 2 2 2 2 1 1 2 1 2 1 2 2 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 2 2
## [1074] 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1 1 2 1 2 1 2 2 1 2 1 2 2 2 2 2 1 1 2 1
## [1111] 2 2 1 1 2 2 2 2 2 2 1 2 2 1 2 1 2 2 1 1 2 2 1 2 1 1 1 1 2 2 2 1 2 2 2 1 2
## [1148] 1 1 1 1 2 2 1 2 2 1 2 1 1 1 1 1 2 2 1 1 1 2 2 1 2 1 2 2 2 2 2 2 1 1 1 1 2
## [1185] 2 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 1
## [1222] 1 2 1 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1259] 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2
## [1296] 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 2 2 2 2 2 2
## [1333] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 1 2
## [1370] 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1
## [1407] 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 2 2 2 1 2 2
## [1444] 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 2 1
## [1481] 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2
## [1518] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 2 2 2 1 1 2 2 2 2
## [1555] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2
## [1592] 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 7434.904 8334.629
##  (between_SS / total_SS =  17.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Visualize the final result
fviz_cluster(finalclusters, data = Xred_km)
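As a quick numerical sanity check on the choice of k (a sketch reusing the scaled matrix and the two k-means fits from above, not part of the original pipeline), we can compare average silhouette widths for k = 2 and k = 3:

# Sketch: average silhouette width for the k = 2 and k = 3 solutions
d_km <- dist(Xred_km)
mean(silhouette(finalclusters$cluster, d_km)[, "sil_width"]) # k = 2
mean(silhouette(kmc3$cluster, d_km)[, "sil_width"])          # k = 3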

4.1.2 Hierarchical Clustering

Source for guidance: https://uc-r.github.io/hc_clustering
Hierarchical clustering is an alternative to k-means for identifying groups in the dataset. It does not require us to pre-specify the number of clusters, as k-means does. It also has the added advantage of producing an attractive tree-based representation of the observations, called a dendrogram.

# Set-up dataset
Xred_hc <- scale(Xred)
head(Xred_hc)
##      fixed.acidity volatile.acidity citric.acid residual.sugar   chlorides
## [1,]    -0.5281944        0.9615758   -1.391037    -0.45307667 -0.24363047
## [2,]    -0.2984541        1.9668271   -1.391037     0.04340257  0.22380518
## [3,]    -0.2984541        1.2966596   -1.185699    -0.16937425  0.09632273
## [4,]     1.6543385       -1.3840105    1.483689    -0.45307667 -0.26487754
## [5,]    -0.5281944        0.9615758   -1.391037    -0.45307667 -0.24363047
## [6,]    -0.5281944        0.7381867   -1.391037    -0.52400227 -0.26487754
##      free.sulfur.dioxide total.sulfur.dioxide    density         pH   sulphates
## [1,]         -0.46604672           -0.3790141 0.55809987  1.2882399 -0.57902538
## [2,]          0.87236532            0.6241680 0.02825193 -0.7197081  0.12891007
## [3,]         -0.08364328            0.2289750 0.13422152 -0.3310730 -0.04807379
## [4,]          0.10755844            0.4113718 0.66406945 -0.9787982 -0.46103614
## [5,]         -0.46604672           -0.3790141 0.55809987  1.2882399 -0.57902538
## [6,]         -0.27484500           -0.1966174 0.55809987  1.2882399 -0.57902538
##         alcohol    quality
## [1,] -0.9599458 -0.7875763
## [2,] -0.5845942 -0.7875763
## [3,] -0.5845942 -0.7875763
## [4,] -0.5845942  0.4507074
## [5,] -0.9599458 -0.7875763
## [6,] -0.9599458 -0.7875763
# Agglomerative Hierarchical Clustering: Each object is initially considered a single-element cluster (leaf). At each step of the algorithm, the two most similar clusters are combined into a new, bigger cluster (node). This procedure is repeated until all points are members of one single big cluster (root) -- a bottom-up approach
# Dissimilarity matrix
d <- dist(Xred_hc, method = "euclidean") 
# Hierarchical clustering using Complete Linkage
hc1 <- agnes(Xred_hc, method = "complete") 
# Plot the obtained dendrogram
pltree(hc1, cex = 0.6, hang = -1) 

# Agglomerative coefficient
# The agglomerative coefficient measures the amount of clustering structure found (values closer to 1 suggest strong clustering structure)
hc1$ac 
## [1] 0.9487263
# Methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# Function to compute coefficient
ac <- function(x) {
  agnes(Xred_hc, method = x)$ac
}
map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9374775 0.8573798 0.9487263 0.9858633
# We see that Ward's method identifies the strongest clustering structure of the four methods assessed.
# Visualize the final dendrogram using Ward's method
hc2 <- agnes(Xred_hc, method = "ward")
pltree(hc2, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

# Divisive Hierarchical Clustering: It begins with the root, in which all objects are included in a single cluster. At each iteration, the most heterogeneous cluster is divided into two. The process repeats until every object is in its own cluster -- a top-down approach
# Compute divisive hierarchical clustering
hc3 <- diana(Xred_hc)
# Divisive coefficient; amount of clustering structure found
hc3$dc
## [1] 0.9402224
# Plot dendrogram
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of diana")

# Working with Dendrograms
# Ward's method
hc4 <- hclust(d, method = "ward.D2")
# Cut tree into 3 groups
sub_grp <- cutree(hc4, k = 3)
# Number of members in each cluster
table(sub_grp)
## sub_grp
##   1   2   3 
## 923 411 265
# Draw dendrogram with a border around the 3 clusters
plot(hc4, cex = 0.6)
rect.hclust(hc4, k = 3, border = 2:5)

# Visualize the result in a scatter plot
fviz_cluster(list(data = Xred, cluster = sub_grp))

# Determine Optimal Clusters
# Elbow Method
fviz_nbclust(Xred_hc, FUN = hcut, method = "wss")

# After 2 clusters, the rate of change in the total within-cluster sum of squares levels off.
# Average Silhouette Method
fviz_nbclust(Xred_hc, FUN = hcut, method = "silhouette")

# This method indicates the optimal number of clusters to be 2.
# Gap Statistic Method
gap_stat <- clusGap(Xred_hc, FUN = hcut, nstart = 25, K.max = 9, B = 50)
fviz_gap_stat(gap_stat)

# The gap statistic suggests only 1 cluster.
# Applying the optimal number of 2 clusters
# Cut tree into 2 groups
sub_grp_final <- cutree(hc4, k = 2)
# Number of members in each cluster
table(sub_grp_final)
## sub_grp_final
##    1    2 
## 1334  265
# Draw dendrogram with a border around the 2 clusters
plot(hc4, cex = 0.6)
rect.hclust(hc4, k = 2, border = 2:5)

# Visualize the result in a scatter plot
fviz_cluster(list(data = Xred_hc, cluster = sub_grp_final))

4.1.3 Density Based Clustering

# Finding the optimal eps
kNNdistplot(Xred, k=3)
abline(h = 9, lty=2)

# Plotting & Manipulating the Cluster
set.seed(123)
f <- fpc::dbscan(Xred, eps = 8, MinPts = 4)
f
## dbscan Pts=1599 MinPts=4 eps=8
##         0    1
## border 21    7
## seed    0 1571
## total  21 1578
d2 <- dbscan::dbscan(Xred, eps = 8, MinPts = 4)
## Warning in dbscan::dbscan(Xred, eps = 8, MinPts = 4): converting argument MinPts
## (fpc) to minPts (dbscan)!
d2
## DBSCAN clustering for 1599 objects.
## Parameters: eps = 8, minPts = 4
## The clustering contains 1 cluster(s) and 21 noise points.
## 
##    0    1 
##   21 1578 
## 
## Available fields: cluster, eps, minPts
## 0 means noise 
fviz_cluster(f, Xred, geom = "point")

4.1.4 Expectation-Maximization Clustering (EM)

This method fits Gaussian Mixture Models (GMMs), a model-based clustering approach built on density estimation. Given the results of the density-based clustering in section 4.1.3, we do not expect this method to work well for our dataset, but we still want to try it as practice.
We will use the mclust R package, which is developed specifically for EM-based clustering.
Citation:
Scrucca L., Fop M., Murphy T. B. and Raftery A. E. (2016) mclust 5: clustering, classification and density estimation using Gaussian finite mixture models The R Journal 8/1, pp. 289-317.
url = {https://doi.org/10.32614/RJ-2016-021}

# Use the mclust function to find the best fit
fit <- Mclust(Xred)
summary(fit)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 8 components: 
## 
##  log-likelihood    n  df       BIC       ICL
##       -2764.953 1599 265 -7484.846 -7937.639
## 
## Clustering table:
##   1   2   3   4   5   6   7   8 
## 148 151 173 149 162 385 169 262
# Eight clusters is definitely not what we were looking for. Still, let's visualize the model selection.
plot(fit, what = "BIC")

# The classification plot shows the cluster assignments for every pairwise combination of variables.
plot(fit, what = "classification")

# The density based methods are overkill for our data and lead to over-clustering.
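That said, if we wanted the mixture model to be directly comparable with the other methods, mclust's G argument can restrict the number of components. A minimal sketch (not part of the original pipeline):

# Sketch: restrict the mixture model to 2 or 3 components for comparability
fit_small <- Mclust(Xred, G = 2:3)
summary(fit_small)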

4.1.5 Clustering: Comparing Results

Of the four clustering methods we tried, K-Means Clustering and Hierarchical Clustering make the most sense. We will take a closer look at these two.

# K-Means Clustering
X_clustering <- Xred # copy Xred into X_clustering so the original dataset stays untouched
km_final <- X_clustering %>%
              mutate(cluster = finalclusters$cluster)
head(km_final)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality cluster
## 1       5       2
## 2       5       2
## 3       5       2
## 4       6       1
## 5       5       2
## 6       5       2
# Take a look into the characteristics of Cluster 1
summary(km_final[km_final$cluster == 1,])
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 5.600   Min.   :0.1200   Min.   :0.0900   Min.   : 0.900  
##  1st Qu.: 8.600   1st Qu.:0.3100   1st Qu.:0.3900   1st Qu.: 2.000  
##  Median : 9.800   Median :0.3800   Median :0.4600   Median : 2.300  
##  Mean   : 9.862   Mean   :0.3972   Mean   :0.4637   Mean   : 2.716  
##  3rd Qu.:10.800   3rd Qu.:0.4700   3rd Qu.:0.5300   3rd Qu.: 2.700  
##  Max.   :15.900   Max.   :0.9000   Max.   :1.0000   Max.   :15.500  
##    chlorides      free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.0380   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.0690   1st Qu.: 6.00       1st Qu.: 19.00       1st Qu.:0.9960  
##  Median :0.0810   Median :10.00       Median : 29.00       Median :0.9974  
##  Mean   :0.0984   Mean   :14.01       Mean   : 38.28       Mean   :0.9975  
##  3rd Qu.:0.0950   3rd Qu.:19.00       3rd Qu.: 49.00       3rd Qu.:0.9988  
##  Max.   :0.6110   Max.   :55.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality         cluster 
##  Min.   :2.740   Min.   :0.4200   Min.   : 8.40   Min.   :3.000   Min.   :1  
##  1st Qu.:3.140   1st Qu.:0.6300   1st Qu.: 9.80   1st Qu.:5.000   1st Qu.:1  
##  Median :3.210   Median :0.7200   Median :10.70   Median :6.000   Median :1  
##  Mean   :3.207   Mean   :0.7546   Mean   :10.73   Mean   :6.014   Mean   :1  
##  3rd Qu.:3.300   3rd Qu.:0.8300   3rd Qu.:11.50   3rd Qu.:7.000   3rd Qu.:1  
##  Max.   :3.540   Max.   :2.0000   Max.   :14.90   Max.   :8.000   Max.   :1
summary(km_final[km_final$cluster == 2,])
##  fixed.acidity   volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.60   Min.   :0.1900   Min.   :0.0000   Min.   : 1.200  
##  1st Qu.: 6.80   1st Qu.:0.5000   1st Qu.:0.0400   1st Qu.: 1.900  
##  Median : 7.40   Median :0.6000   Median :0.1400   Median : 2.100  
##  Mean   : 7.43   Mean   :0.6032   Mean   :0.1598   Mean   : 2.436  
##  3rd Qu.: 8.00   3rd Qu.:0.6850   3rd Qu.:0.2500   3rd Qu.: 2.500  
##  Max.   :10.60   Max.   :1.5800   Max.   :0.7000   Max.   :13.400  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 3.00       Min.   :  7.00       Min.   :0.9902  
##  1st Qu.:0.07000   1st Qu.: 9.00       1st Qu.: 25.00       1st Qu.:0.9955  
##  Median :0.07900   Median :15.00       Median : 42.00       Median :0.9964  
##  Mean   :0.08116   Mean   :16.95       Mean   : 51.19       Mean   :0.9963  
##  3rd Qu.:0.08775   3rd Qu.:23.00       3rd Qu.: 70.00       3rd Qu.:0.9974  
##  Max.   :0.26700   Max.   :72.00       Max.   :165.00       Max.   :1.0014  
##        pH          sulphates         alcohol         quality         cluster 
##  Min.   :2.880   Min.   :0.3300   Min.   : 9.00   Min.   :3.000   Min.   :2  
##  1st Qu.:3.290   1st Qu.:0.5300   1st Qu.: 9.50   1st Qu.:5.000   1st Qu.:2  
##  Median :3.360   Median :0.5900   Median : 9.90   Median :5.000   Median :2  
##  Mean   :3.371   Mean   :0.6025   Mean   :10.24   Mean   :5.418   Mean   :2  
##  3rd Qu.:3.450   3rd Qu.:0.6500   3rd Qu.:10.80   3rd Qu.:6.000   3rd Qu.:2  
##  Max.   :4.010   Max.   :1.2000   Max.   :14.00   Max.   :8.000   Max.   :2
# Surprisingly, there is no distinctive difference between the two clusters, even in the quality column. Behind the scenes, the clustering algorithm does not separate wines based on any single feature.
# Hierarchical Clustering
hc_final <- X_clustering %>%
              mutate(cluster = sub_grp_final)
head(hc_final)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality cluster
## 1       5       1
## 2       5       1
## 3       5       1
## 4       6       1
## 5       5       1
## 6       5       1
# Take a look into the characteristics of Cluster 1
summary(hc_final[hc_final$cluster == 1,])
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.000   1st Qu.:0.4200   1st Qu.:0.0700   1st Qu.: 1.900  
##  Median : 7.600   Median :0.5500   Median :0.2200   Median : 2.200  
##  Mean   : 7.839   Mean   :0.5513   Mean   :0.2217   Mean   : 2.485  
##  3rd Qu.: 8.600   3rd Qu.:0.6550   3rd Qu.:0.3400   3rd Qu.: 2.500  
##  Max.   :12.500   Max.   :1.5800   Max.   :0.7800   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.0        Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 9.00       1st Qu.: 25.0        1st Qu.:0.9954  
##  Median :0.07900   Median :15.00       Median : 42.0        Median :0.9965  
##  Mean   :0.08279   Mean   :17.21       Mean   : 50.3        Mean   :0.9964  
##  3rd Qu.:0.08900   3rd Qu.:24.00       3rd Qu.: 67.0        3rd Qu.:0.9975  
##  Max.   :0.27000   Max.   :72.00       Max.   :289.0        Max.   :1.0037  
##        pH          sulphates         alcohol         quality         cluster 
##  Min.   :2.870   Min.   :0.3300   Min.   : 8.40   Min.   :3.000   Min.   :1  
##  1st Qu.:3.250   1st Qu.:0.5400   1st Qu.: 9.50   1st Qu.:5.000   1st Qu.:1  
##  Median :3.340   Median :0.6000   Median :10.10   Median :5.000   Median :1  
##  Mean   :3.341   Mean   :0.6355   Mean   :10.37   Mean   :5.545   Mean   :1  
##  3rd Qu.:3.420   3rd Qu.:0.7000   3rd Qu.:11.00   3rd Qu.:6.000   3rd Qu.:1  
##  Max.   :4.010   Max.   :1.9800   Max.   :14.00   Max.   :8.000   Max.   :1
summary(hc_final[hc_final$cluster == 2,])
##  fixed.acidity   volatile.acidity  citric.acid     residual.sugar 
##  Min.   : 7.20   Min.   :0.1800   Min.   :0.2000   Min.   :1.400  
##  1st Qu.: 9.40   1st Qu.:0.3200   1st Qu.:0.4600   1st Qu.:2.000  
##  Median :10.60   Median :0.4000   Median :0.5000   Median :2.400  
##  Mean   :10.74   Mean   :0.4097   Mean   :0.5192   Mean   :2.809  
##  3rd Qu.:11.90   3rd Qu.:0.4800   3rd Qu.:0.6000   3rd Qu.:3.000  
##  Max.   :15.90   Max.   :0.8400   Max.   :1.0000   Max.   :9.000  
##    chlorides     free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.039   Min.   : 1.000      Min.   : 6.00        Min.   :0.9948  
##  1st Qu.:0.072   1st Qu.: 6.000      1st Qu.:17.00        1st Qu.:0.9970  
##  Median :0.083   Median : 7.000      Median :23.00        Median :0.9980  
##  Mean   :0.111   Mean   : 9.158      Mean   :27.17        Mean   :0.9982  
##  3rd Qu.:0.098   3rd Qu.:12.000      3rd Qu.:35.00        3rd Qu.:0.9996  
##  Max.   :0.611   Max.   :32.000      Max.   :81.00        Max.   :1.0032  
##        pH          sulphates         alcohol         quality         cluster 
##  Min.   :2.740   Min.   :0.4300   Min.   : 8.40   Min.   :4.000   Min.   :2  
##  1st Qu.:3.070   1st Qu.:0.6500   1st Qu.: 9.80   1st Qu.:6.000   1st Qu.:2  
##  Median :3.170   Median :0.7300   Median :10.60   Median :6.000   Median :2  
##  Mean   :3.161   Mean   :0.7722   Mean   :10.69   Mean   :6.094   Mean   :2  
##  3rd Qu.:3.240   3rd Qu.:0.8400   3rd Qu.:11.40   3rd Qu.:7.000   3rd Qu.:2  
##  Max.   :3.480   Max.   :2.0000   Max.   :14.90   Max.   :8.000   Max.   :2
# Similarly, the hierarchical clustering algorithm does not separate wines based on any single feature. In addition, the characteristics of HC's cluster 2 resemble KM's cluster 1, and HC's cluster 1 resembles KM's cluster 2.
# Since the two methods are not significantly different from each other, we go with the more popular one: K-Means Clustering.
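A quick cross-tabulation (a sketch reusing the two final assignments from above) makes the overlap between the two methods explicit:

# Sketch: compare the k-means and hierarchical cluster assignments head-to-head
table(kmeans = finalclusters$cluster, hclust = sub_grp_final)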

As a result of the clustering step, we create an additional variable named cluster and add it to our original Xred dataset. This feature will help with the classification task later on.

# Add cluster column to Xred
X4red <- Xred %>%
  mutate(cluster = finalclusters$cluster) %>%
  group_by(cluster)
# Since cluster does not represent a quality score, we create a categorical variable named type that will serve as the target variable for the classification task.
X4red$type <- ifelse(X4red$quality < 5, "bad", (ifelse(X4red$quality < 7, "normal", "good")))
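A quick check (a sketch, not in the original write-up) of how many wines fall into each of the new type categories:

# Sketch: class counts for the new target variable
table(X4red$type)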

4.2 Handling Multi-collinearity

# Drop fixed.acidity since it is highly correlated with three other variables: citric.acid, density, and pH. This reduces multicollinearity, which arises when two or more independent variables in a multiple regression model are highly intercorrelated.
# Drop quality since we won't need it for the classification task
X4red <- X4red %>% 
                select(-one_of('quality','fixed.acidity'))
head(X4red)
## # A tibble: 6 x 12
## # Groups:   cluster [2]
##   volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
##              <dbl>       <dbl>          <dbl>     <dbl>               <dbl>
## 1             0.7         0               1.9     0.076                  11
## 2             0.88        0               2.6     0.098                  25
## 3             0.76        0.04            2.3     0.092                  15
## 4             0.28        0.56            1.9     0.075                  17
## 5             0.7         0               1.9     0.076                  11
## 6             0.66        0               1.8     0.075                  13
## # … with 7 more variables: total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>,
## #   sulphates <dbl>, alcohol <dbl>, cluster <int>, type <chr>
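As a sanity check on the multicollinearity argument, a sketch reusing the correlation table from section 3.3 pulls out the correlations that motivated dropping fixed.acidity:

# Sketch: correlations of fixed.acidity with the variables it conflicts with
round(cor.table.r["fixed.acidity", c("citric.acid", "density", "pH")], 2)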

4.3 Test and Train Dataset

# Create a test and train dataset by splitting X4red to 70:30
set.seed(1234)
splitindex <- createDataPartition(X4red$type, p=0.7, list=FALSE, times=1)
X4red_train <- X4red[splitindex,]
table(X4red_train$type)
## 
##    bad   good normal 
##     45    152    924
X4red_test <- X4red[-splitindex,]
table(X4red_test$type)
## 
##    bad   good normal 
##     18     65    395

4.4 Balancing Dataset

The train dataset is highly imbalanced: the minority class, bad, accounts for only 4.01% of X4red_train.
Imbalanced datasets pose a challenge for predictive modeling because most classification algorithms assume a roughly equal number of examples per class; violating that assumption tends to produce poor predictive performance on the minority class(es).
Since X4red_train is already quite small, we use oversampling so that none of the training information is discarded.
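The minority-class share quoted above can be verified directly (a sketch, run before the oversampling step below):

# Sketch: class shares in the training split, in percent
round(prop.table(table(X4red_train$type)) * 100, 2)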

# Oversampling the imbalanced dataset
set.seed(234)
X4red_train <- upSample(x = X4red_train[, -12],
                   y = as.factor(X4red_train$type))
# The upSample function renames the target variable from type to Class, so Class is our target variable from here on. The classes are now balanced, with 924 samples each.
table(X4red_train$Class)
## 
##    bad   good normal 
##    924    924    924
names(X4red_test)[12] <- "Class"
X4red_test$Class <- as.factor(X4red_test$Class)

5 Modeling - Classification: Bad/Normal/Good

5.1 Logistic Regression

# Fit the model on the train dataset
model1 <- multinom(Class ~ ., data = X4red_train)
## # weights:  39 (24 variable)
## initial  value 3045.353264 
## iter  10 value 2412.715248
## iter  20 value 2018.928426
## iter  30 value 2010.623731
## iter  40 value 2006.008867
## iter  50 value 2005.496529
## iter  60 value 2005.387039
## iter  70 value 2005.295465
## iter  80 value 2005.237719
## iter  90 value 2005.186050
## final  value 2005.145558 
## converged
summary(model1)
## Call:
## multinom(formula = Class ~ ., data = X4red_train)
## 
## Coefficients:
##        (Intercept) volatile.acidity citric.acid residual.sugar  chlorides
## good     -61.49818        -6.663263   -2.384354     -0.2450162 -12.347272
## normal  -142.59605        -5.280397   -1.706137     -0.3462484  -2.816825
##        free.sulfur.dioxide total.sulfur.dioxide   density        pH sulphates
## good            0.03715415         -0.002253426  60.93148 -2.919018  3.751323
## normal          0.02366501          0.013034211 150.19473 -1.773522 -0.367506
##          alcohol     cluster
## good   1.3851622 -1.03458657
## normal 0.2872678 -0.01840033
## 
## Std. Errors:
##        (Intercept) volatile.acidity citric.acid residual.sugar chlorides
## good     0.9343012        0.5374589   0.6330079     0.04343921  1.746952
## normal   0.7455405        0.3776624   0.4665346     0.03712102  1.130305
##        free.sulfur.dioxide total.sulfur.dioxide   density        pH sulphates
## good           0.009389438          0.003211168 0.9302974 0.5769931 0.4849103
## normal         0.007744228          0.002582445 0.7438576 0.4676803 0.3863389
##           alcohol   cluster
## good   0.08276945 0.2527466
## normal 0.06976332 0.1993658
## 
## Residual Deviance: 4010.291 
## AIC: 4058.291
# Make prediction on the test dataset
p1 <- predict(model1, X4red_test, type = 'class')
head(p1)
## [1] bad    bad    bad    bad    normal normal
## Levels: bad good normal
logistic.pred <- data.frame(p1, X4red_test$Class)
names(logistic.pred) <- c("prediction", "label")
# Calculate the final accuracy
result1 = sum(logistic.pred$prediction==logistic.pred$label)/nrow(logistic.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result1)))
## [1] "Final Accuracy = 58.58%"
# Calculate the confusion matrix
confusionMatrix(logistic.pred$prediction,logistic.pred$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad     13    5    109
##     good     0   50     69
##     normal   5   10    217
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5858          
##                  95% CI : (0.5402, 0.6303)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2537          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity             0.72222      0.7692        0.5494
## Specificity             0.75217      0.8329        0.8193
## Pos Pred Value          0.10236      0.4202        0.9353
## Neg Pred Value          0.98575      0.9582        0.2764
## Prevalence              0.03766      0.1360        0.8264
## Detection Rate          0.02720      0.1046        0.4540
## Detection Prevalence    0.26569      0.2490        0.4854
## Balanced Accuracy       0.73720      0.8011        0.6843
# Calculate multi-class AUC
auc1 <- multiclass.roc(as.numeric(logistic.pred$label), as.numeric(logistic.pred$prediction))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc1
## 
## Call:
## multiclass.roc.default(response = as.numeric(logistic.pred$label),     predictor = as.numeric(logistic.pred$prediction))
## 
## Data: as.numeric(logistic.pred$prediction) with 3 levels of as.numeric(logistic.pred$label): 1, 2, 3.
## Multi-class area under the curve: 0.671

5.2 Decision Tree

# Fit the model on the train dataset
tree <- rpart(Class~., X4red_train)
# Visualize the tree built
rpart.plot(tree)

# Make prediction on the test dataset
p2 <- predict(tree, X4red_test, type="class")
head(p2)
##      1      2      3      4      5      6 
##    bad    bad normal    bad    bad    bad 
## Levels: bad good normal
# Calculate the final accuracy
result2 = sum(p2==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result2)))
## [1] "Final Accuracy = 59.62%"
# Calculate the confusion matrix
confusionMatrix(p2,X4red_test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad     10    6    121
##     good     0   45     44
##     normal   8   14    230
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5962          
##                  95% CI : (0.5507, 0.6406)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2356          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity             0.55556     0.69231        0.5823
## Specificity             0.72391     0.89346        0.7349
## Pos Pred Value          0.07299     0.50562        0.9127
## Neg Pred Value          0.97654     0.94859        0.2699
## Prevalence              0.03766     0.13598        0.8264
## Detection Rate          0.02092     0.09414        0.4812
## Detection Prevalence    0.28661     0.18619        0.5272
## Balanced Accuracy       0.63973     0.79289        0.6586
# Calculate multi-class AUC
auc2 <- multiclass.roc(as.numeric(X4red_test$Class), as.numeric(p2))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc2
## 
## Call:
## multiclass.roc.default(response = as.numeric(X4red_test$Class),     predictor = as.numeric(p2))
## 
## Data: as.numeric(p2) with 3 levels of as.numeric(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.5867
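The default rpart fit can over- or under-grow the tree. A hedged refinement is to inspect the complexity-parameter table and prune at the cp with the lowest cross-validated error (a minimal sketch; pruned_tree is an illustrative object name):

printcp(tree)   # cross-validated error (xerror) for each complexity parameter
best_cp <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(tree, cp = best_cp)
rpart.plot(pruned_tree)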

5.3 Random Forest

# Fit the model on the train dataset using default method
model2 <- randomForest(Class ~ ., data = X4red_train) ## default for classification mtry is sqrt(p)
model2 # shows out of bag error rate as 1.66% 
## 
## Call:
##  randomForest(formula = Class ~ ., data = X4red_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.66%
## Confusion matrix:
##        bad good normal class.error
## bad    924    0      0 0.000000000
## good     0  923      1 0.001082251
## normal   3   42    879 0.048701299
# Make prediction on the test dataset
p3 <- predict(model2, X4red_test)
head(p3) # look at prediction
##      1      2      3      4      5      6 
## normal normal normal normal normal normal 
## Levels: bad good normal
# Calculate the final accuracy
result3 = sum(p3==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result3)))
## [1] "Final Accuracy = 87.45%"
# Calculate the confusion matrix
confusionMatrix(p3, X4red_test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad      0    0      2
##     good     0   38     13
##     normal  18   27    380
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8745          
##                  95% CI : (0.8414, 0.9028)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 0.002447        
##                                           
##                   Kappa : 0.4991          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity            0.000000      0.5846        0.9620
## Specificity            0.995652      0.9685        0.4578
## Pos Pred Value         0.000000      0.7451        0.8941
## Neg Pred Value         0.962185      0.9368        0.7170
## Prevalence             0.037657      0.1360        0.8264
## Detection Rate         0.000000      0.0795        0.7950
## Detection Prevalence   0.004184      0.1067        0.8891
## Balanced Accuracy      0.497826      0.7766        0.7099
# Calculate multi-class AUC
auc3 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(p3))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc3
## 
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class),     predictor = as.integer(p3))
## 
## Data: as.integer(p3) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6817
# Error rate of Random Forest 
plot(model2)

# As the number of trees grows, the OOB error flattens out
# Tune mtry
train <- as.data.frame(X4red_train)
t <- tuneRF(train[,-12], train[,12],
           stepFactor = 0.5, 
           plot= TRUE,
           ntreeTry = 500,
           trace = TRUE,
           improve = 0.05)
## mtry = 3  OOB error = 1.55% 
## Searching left ...
## mtry = 6     OOB error = 2.13% 
## -0.372093 0.05 
## Searching right ...
## mtry = 1     OOB error = 2.71% 
## -0.744186 0.05

# OOB error is highest at mtry = 1 and bottoms out at mtry = 3, so mtry = 3 is the value to choose. It matches the default for classification (the square root of the number of predictors).
# ntree = 500 and mtry = 3 are therefore the chosen parameters. Since they match the defaults, we keep model2 as is.
# Histogram of tree sizes, i.e. the number of nodes per tree
hist(treesize(model2),
     main = "Number of Nodes per Tree",
     col = "purple")

# Variable Importance 
varImpPlot(model2,
           sort = T,
           main = "Variable Importance")

# varImpPlot ranks variables by Mean Decrease in Gini, i.e. how much each variable improves node purity across the forest's splits
# (a permutation-based Mean Decrease in Accuracy panel would also appear if the forest were trained with importance = TRUE)
importance(model2) # gives actual data points for above charts
##                      MeanDecreaseGini
## volatile.acidity            265.49365
## citric.acid                 127.26183
## residual.sugar              130.03275
## chlorides                   157.65967
## free.sulfur.dioxide         132.97718
## total.sulfur.dioxide        159.49283
## density                     162.35419
## pH                          130.81704
## sulphates                   234.84849
## alcohol                     283.77406
## cluster                      62.58418
varUsed(model2) # how many of each variables are actually used in the random forest
##  [1] 9486 7910 7925 9155 7553 8603 9216 8606 8837 9145 1155
# Cluster, the last variable, was used only 1155 times in the random forest, far less often than any of the chemical properties
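If the permutation-based importance measure is of interest, the forest can be refit with importance = TRUE; a minimal sketch (model2_imp is an illustrative object name):

model2_imp <- randomForest(Class ~ ., data = X4red_train, importance = TRUE)
importance(model2_imp)   # now reports MeanDecreaseAccuracy alongside MeanDecreaseGini
varImpPlot(model2_imp, sort = TRUE, main = "Permutation and Gini Importance")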

5.4 K-Nearest Neighbor (KNN)

Guidance for K-Nearest Neighbors:
1. https://daviddalpiaz.github.io/r4sl/knn-class.html
2. https://www.geeksforgeeks.org/k-nn-classifier-in-r-programming/
3. https://towardsdatascience.com/k-nearest-neighbors-algorithm-with-examples-in-r-simply-explained-knn-1f2c88da405c

# Define a min-max normalize() helper (for reference; z-score scaling via scale() is used below)
normalize <- function(x) {
  (x - min(x))/(max(x)-min(x))
}
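# (Optional sketch) normalize() would give min-max scaling as an alternative to the
# z-score scaling applied below; these objects are illustrative and not used further
X4red_train_mm <- as.data.frame(lapply(X4red_train[, 1:10], normalize))
X4red_test_mm <- as.data.frame(lapply(X4red_test[, 1:10], normalize))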
# Scale the features
X4red_test_knn <- scale(X4red_test[,1:10])
X4red_train_knn <- scale(X4red_train[,1:10])
head(X4red_test_knn)
##      volatile.acidity citric.acid residual.sugar  chlorides free.sulfur.dioxide
## [1,]        0.8220416   -1.316788    -0.49412161 -0.2849096         -0.45894214
## [2,]        1.7870908   -1.316788    -0.02302006  0.3654289          0.86944920
## [3,]        1.1437247   -1.111579    -0.22492072  0.1880639         -0.07940175
## [4,]        0.8220416   -1.316788    -0.49412161 -0.2849096         -0.45894214
## [5,]        0.2859031   -1.008975    -0.69602227 -0.4918355         -0.07940175
## [6,]        0.5539723   -1.316788    -0.96522316 -0.6100789         -0.07940175
##      total.sulfur.dioxide     density          pH   sulphates    alcohol
## [1,]           -0.4029842  0.50625426  1.30615953 -0.53751202 -0.8947077
## [2,]            0.5650616 -0.01658200 -0.70346272  0.23251019 -0.5242433
## [3,]            0.1837102  0.08798525 -0.31450358  0.04000464 -0.5242433
## [4,]           -0.4029842  0.50625426  1.30615953 -0.53751202 -0.8947077
## [5,]            0.3303838 -0.22571651 -0.05519748 -1.17919720 -0.8947077
## [6,]           -0.7843355 -1.16682179  0.52824124 -1.11502868 -0.3390111
# Get the dependent (class) variable
Yknn_train <- X4red_train[,12]
Yknn_test <- X4red_test[,12]
# Everything is set up; run the K-Nearest Neighbors classifier with an initial guess of k = 38
knn_predicted_values <- knn(X4red_train_knn, X4red_test_knn, cl = Yknn_train, k = 38)
head(knn_predicted_values)
## [1] bad    normal normal bad    normal bad   
## Levels: bad good normal
our_Y_knn_test <- pull(Yknn_test, "Class")
# Calculate the final accuracy
result4 = sum(knn_predicted_values==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result4)))
## [1] "Final Accuracy = 48.95%"
# Calculate the confusion matrix
confusionMatrix(knn_predicted_values, X4red_test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad     14    8    133
##     good     0   56     98
##     normal   4    1    164
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4895          
##                  95% CI : (0.4439, 0.5353)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2169          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity             0.77778      0.8615        0.4152
## Specificity             0.69348      0.7627        0.9398
## Pos Pred Value          0.09032      0.3636        0.9704
## Neg Pred Value          0.98762      0.9722        0.2524
## Prevalence              0.03766      0.1360        0.8264
## Detection Rate          0.02929      0.1172        0.3431
## Detection Prevalence    0.32427      0.3222        0.3536
## Balanced Accuracy       0.73563      0.8121        0.6775
# Calculate multi-class AUC
auc4 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(knn_predicted_values))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc4
## 
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class),     predictor = as.integer(knn_predicted_values))
## 
## Data: as.integer(knn_predicted_values) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6649
# Figure out the optimal number of K
set.seed(42)
k_to_try = 1:100
err_k = rep(x = 0, times = length(k_to_try))
knnerror <- function(actual, predicted){
  mean(actual != predicted)
}
for (i in seq_along(k_to_try)) {
  pred = knn(train = X4red_train_knn, 
             test  = X4red_test_knn, 
             cl    = Yknn_train, 
             k     = k_to_try[i])
  err_k[i] = knnerror(our_Y_knn_test, pred)
}
# Let's plot error vs K
plot(err_k, type = "b", col = "dodgerblue", cex = 1, pch = 20, 
     xlab = "k, number of neighbors", ylab = "classification error",
     main = "(Test) Error Rate vs Neighbors")
# add line for min error seen
abline(h = min(err_k), col = "darkorange", lty = 3)
# add line for the prevalence of the minority class ("bad") in the test set
abline(h = mean(our_Y_knn_test == "bad"), col = "grey", lty = 2)

# Test error generally climbs as K increases, and the minimum sits at a very small K. Let's find exactly which K minimizes it.
print(min(err_k))
## [1] 0.1820084
print(which(err_k == min(err_k)))
## [1] 1
# K = 1 gives the lowest test error here (keeping in mind that a 1-nearest-neighbor rule can overfit). Let's repeat the process with K = 1.
final_knn_values <- knn(X4red_train_knn, X4red_test_knn, cl = Yknn_train, k = 1)
# Calculate the final accuracy
result5 = sum(final_knn_values==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result5)))
## [1] "Final Accuracy = 81.80%"
# Calculate the confusion matrix
confusionMatrix(final_knn_values, X4red_test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad      3    0     10
##     good     0   37     34
##     normal  15   28    351
## 
## Overall Statistics
##                                           
##                Accuracy : 0.818           
##                  95% CI : (0.7804, 0.8516)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 0.7097          
##                                           
##                   Kappa : 0.3885          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity            0.166667     0.56923        0.8886
## Specificity            0.978261     0.91768        0.4819
## Pos Pred Value         0.230769     0.52113        0.8909
## Neg Pred Value         0.967742     0.93120        0.4762
## Prevalence             0.037657     0.13598        0.8264
## Detection Rate         0.006276     0.07741        0.7343
## Detection Prevalence   0.027197     0.14854        0.8243
## Balanced Accuracy      0.572464     0.74345        0.6853
# Calculate multi-class AUC
auc5 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(final_knn_values))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc5
## 
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class),     predictor = as.integer(final_knn_values))
## 
## Data: as.integer(final_knn_values) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6368
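Because K was chosen by minimizing the error on the test set itself, the 81.80% figure is likely optimistic. A hedged alternative is to pick K by cross-validation on the training data only, for example with caret (a minimal sketch; the fold count and K grid are illustrative choices):

set.seed(42)
knn_cv <- caret::train(x = as.data.frame(X4red_train[, 1:10]),
                       y = X4red_train$Class,
                       method = "knn",
                       preProcess = c("center", "scale"),
                       tuneGrid = data.frame(k = seq(1, 51, by = 2)),
                       trControl = trainControl(method = "cv", number = 5))
knn_cv$bestTune   # K selected from the training folds only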

5.5 Support Vector Machine (SVM)

Guidance for SVM: https://www.geeksforgeeks.org/classifying-data-using-support-vector-machinessvms-in-r/

# Finalize Train and Test Dataset
X4red_test_svm <- X4red_test[,c(1,2,3,4,5,6,7,8,9,10,12)]
X4red_test_svm[-11] <- scale(X4red_test_svm[-11])
X4red_train_svm <- X4red_train[,c(1,2,3,4,5,6,7,8,9,10,12)]
X4red_train_svm[-11] <- scale(X4red_train_svm[-11])
head(X4red_test_svm)
## # A tibble: 6 x 11
##   volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
##              <dbl>       <dbl>          <dbl>     <dbl>               <dbl>
## 1            0.822       -1.32        -0.494     -0.285             -0.459 
## 2            1.79        -1.32        -0.0230     0.365              0.869 
## 3            1.14        -1.11        -0.225      0.188             -0.0794
## 4            0.822       -1.32        -0.494     -0.285             -0.459 
## 5            0.286       -1.01        -0.696     -0.492             -0.0794
## 6            0.554       -1.32        -0.965     -0.610             -0.0794
## # … with 6 more variables: total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>,
## #   sulphates <dbl>, alcohol <dbl>, Class <fct>
# Get dependent variable
Ysvm_train <- X4red_train_svm[,11]
Ysvm_test <- X4red_test_svm[,11]
# Fit a SVM classifier
svm_classifier <- svm(formula = Class~.,
                      data = X4red_train_svm,
                      type = 'C-classification',
                      kernel = 'linear')
svm_classifier
## 
## Call:
## svm(formula = Class ~ ., data = X4red_train_svm, type = "C-classification", 
##     kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  1861
# Predicting Test set results
svm_predict <- predict(svm_classifier, newdata = X4red_test_svm[-11])
head(svm_predict)
##      1      2      3      4      5      6 
##    bad    bad    bad    bad normal    bad 
## Levels: bad good normal
# Calculate the final accuracy
result6 = sum(svm_predict==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result6)))
## [1] "Final Accuracy = 47.91%"
# Calculate the confusion matrix
confusionMatrix(svm_predict, X4red_test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad     14    6    136
##     good     0   57    101
##     normal   4    2    158
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4791          
##                  95% CI : (0.4335, 0.5249)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2098          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity             0.77778      0.8769        0.4000
## Specificity             0.69130      0.7554        0.9277
## Pos Pred Value          0.08974      0.3608        0.9634
## Neg Pred Value          0.98758      0.9750        0.2452
## Prevalence              0.03766      0.1360        0.8264
## Detection Rate          0.02929      0.1192        0.3305
## Detection Prevalence    0.32636      0.3305        0.3431
## Balanced Accuracy       0.73454      0.8162        0.6639
# Calculate multi-class AUC
auc6 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(svm_predict))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc6
## 
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class),     predictor = as.integer(svm_predict))
## 
## Data: as.integer(svm_predict) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6597
# A linear SVM is easiest to interpret (and visualize) when there are only two predictors. With eleven predictors we cannot plot the decision boundary, and the linear kernel clearly does not separate these classes well here.
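A radial-kernel SVM with tuned cost and gamma may handle this non-linearly separable data better; a minimal sketch using e1071's tune(), where the grid values are illustrative assumptions:

set.seed(42)
svm_tuned <- tune(svm, Class ~ ., data = X4red_train_svm,
                  kernel = "radial",
                  ranges = list(cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1)))
summary(svm_tuned)   # cross-validated error for each cost/gamma combination
svm_radial_pred <- predict(svm_tuned$best.model, newdata = X4red_test_svm[-11])
mean(svm_radial_pred == X4red_test_svm$Class)   # test accuracy of the tuned radial SVM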

5.6 XGBoost

# Divide train and test dataset into data and label
train.data = X4red_train
class1 = train.data$Class
label1 = as.integer(train.data$Class)-1
train.data$Class = NULL
train.data = as.matrix(train.data)
train.label = label1
test.data = X4red_test
test.data$cluster <- as.numeric(test.data$cluster)
test.data$Class <- as.factor(test.data$Class)
class2 = as.factor(test.data$Class)
label2 = as.integer(as.factor(test.data$Class))-1
test.data$Class = NULL
test.data = as.matrix(test.data)
test.label = label2
# Transform the two data sets into xgb.Matrix
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)
# Define the parameters for multinomial classification
num_class = length(levels(class1))
xgb.params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=num_class
)
# Train the XGBoost classifier
xgb.fit=xgb.train(
  params=xgb.params,
  data=xgb.train,
  nrounds=10000,
  nthreads=1,
  early_stopping_rounds=50,
  watchlist=list(val1=xgb.train,val2=xgb.test),
  verbose=0
)
## [00:10:26] WARNING: amalgamation/../src/learner.cc:576: 
## Parameters: { "nthreads" } might not be used.
## 
##   This could be a false alarm, with some parameters getting used by language bindings but
##   then being mistakenly passed down to XGBoost core, or some parameter actually being used
##   but getting flagged wrongly here. Please open an issue if you find any such cases.
# Review the final model and results
xgb.fit
## ##### xgb.Booster
## raw: 74.9 Mb 
## call:
##   xgb.train(params = xgb.params, data = xgb.train, nrounds = 10000, 
##     watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0, 
##     early_stopping_rounds = 50, nthreads = 1)
## params (as set within xgb.train):
##   booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "3", nthreads = "1", validate_parameters = "TRUE"
## xgb.attributes:
##   best_iteration, best_msg, best_ntreelimit, best_score, niter
## callbacks:
##   cb.evaluation.log()
##   cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     verbose = verbose)
## # of features: 11 
## niter: 10000
## best_iteration : 10000 
## best_ntreelimit : 10000 
## best_score : 0.444072 
## best_msg : [10000]   val1-mlogloss:0.158273  val2-mlogloss:0.444072 
## nfeatures : 11 
## evaluation_log:
##      iter val1_mlogloss val2_mlogloss
##         1      1.097714      1.097978
##         2      1.096851      1.097405
## ---                                  
##      9999      0.158285      0.444081
##     10000      0.158273      0.444072
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test.data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = levels(class2)
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = levels(class2)[test.label+1]
# Calculate the final accuracy
result7 = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result7)))
## [1] "Final Accuracy = 83.47%"
# Calculate the confusion matrix
confusionMatrix(as.factor(xgb.pred$prediction),as.factor(xgb.pred$label))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad      8    1     17
##     good     0   48     35
##     normal  10   16    343
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8347          
##                  95% CI : (0.7983, 0.8669)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 0.3403          
##                                           
##                   Kappa : 0.5087          
##                                           
##  Mcnemar's Test P-Value : 0.0195          
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity             0.44444      0.7385        0.8684
## Specificity             0.96087      0.9153        0.6867
## Pos Pred Value          0.30769      0.5783        0.9295
## Neg Pred Value          0.97788      0.9570        0.5229
## Prevalence              0.03766      0.1360        0.8264
## Detection Rate          0.01674      0.1004        0.7176
## Detection Prevalence    0.05439      0.1736        0.7720
## Balanced Accuracy       0.70266      0.8269        0.7776
# Calculate multi-class AUC
auc7 <- multiclass.roc(as.integer(as.factor(xgb.pred$label)),as.integer(as.factor(xgb.pred$prediction)))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc7
## 
## Call:
## multiclass.roc.default(response = as.integer(as.factor(xgb.pred$label)),     predictor = as.integer(as.factor(xgb.pred$prediction)))
## 
## Data: as.integer(as.factor(xgb.pred$prediction)) with 3 levels of as.integer(as.factor(xgb.pred$label)): 1, 2, 3.
## Multi-class area under the curve: 0.6542
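With eta = 0.001 the booster used all 10,000 rounds (best_iteration = 10000), so the learning rate and round budget are worth a sanity check. A minimal sketch using cross-validation via xgb.cv, reusing xgb.params and the training DMatrix (the 5-fold setting is an illustrative choice):

set.seed(42)
xgb.cvfit <- xgb.cv(params = xgb.params,
                    data = xgb.train,
                    nrounds = 10000,
                    nfold = 5,
                    early_stopping_rounds = 50,
                    verbose = 0)
xgb.cvfit$best_iteration   # number of rounds suggested by the cross-validated mlogloss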

5.7 LightGBM

# Transform the two data sets into lgb.Dataset
lgb.train = lgb.Dataset(data=train.data,label=train.label)
lgb.test = lgb.Dataset(data=test.data,label=test.label)
# Define the parameters for multinomial classification
lgb.params <- list(
    objective = "multiclass", 
    metric = "multi_logloss",
    num_class = 3L,
    min_data = 1L,
    learning_rate = 1.0
)
# Train the LightGBM classifier
lgb.fit=lgb.train(
  params=lgb.params,
  data=lgb.train,
  nrounds = 10000L,
  valids = list(val1=lgb.train,val2=lgb.test),
  early_stopping_rounds=50
)
## [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
## You can set `force_row_wise=true` to remove the overhead.
## And if memory is not enough, you can set `force_col_wise=true`.
## [LightGBM] [Info] Total Bins 946
## [LightGBM] [Info] Number of data points in the train set: 2772, number of used features: 11
## [LightGBM] [Info] Start training from score -1.098612
## [LightGBM] [Info] Start training from score -1.098612
## [LightGBM] [Info] Start training from score -1.098612
## [1] "[1]:  val1's multi_logloss:0.308736  val2's multi_logloss:0.70352"
## [1] "[2]:  val1's multi_logloss:0.142459  val2's multi_logloss:0.619246"
## [1] "[3]:  val1's multi_logloss:0.0700909  val2's multi_logloss:0.62743"
## [1] "[4]:  val1's multi_logloss:0.0332382  val2's multi_logloss:0.589903"
## [1] "[5]:  val1's multi_logloss:0.0162919  val2's multi_logloss:0.60438"
## [1] "[6]:  val1's multi_logloss:0.00893837  val2's multi_logloss:0.605546"
## [1] "[7]:  val1's multi_logloss:0.00481682  val2's multi_logloss:0.620166"
## [1] "[8]:  val1's multi_logloss:0.00264789  val2's multi_logloss:0.653594"
## [1] "[9]:  val1's multi_logloss:0.0014357  val2's multi_logloss:0.695867"
## [1] "[10]:  val1's multi_logloss:0.000769721  val2's multi_logloss:0.721256"
## [1] "[11]:  val1's multi_logloss:0.000418808  val2's multi_logloss:0.760272"
## [1] "[12]:  val1's multi_logloss:0.000239663  val2's multi_logloss:0.776561"
## [1] "[13]:  val1's multi_logloss:0.000138903  val2's multi_logloss:0.790873"
## [1] "[14]:  val1's multi_logloss:7.75786e-05  val2's multi_logloss:0.809931"
## [1] "[15]:  val1's multi_logloss:4.43349e-05  val2's multi_logloss:0.847126"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[16]:  val1's multi_logloss:2.68454e-05  val2's multi_logloss:0.870349"
## [1] "[17]:  val1's multi_logloss:1.69985e-05  val2's multi_logloss:0.908464"
## [1] "[18]:  val1's multi_logloss:1.14122e-05  val2's multi_logloss:0.939347"
## [1] "[19]:  val1's multi_logloss:8.17648e-06  val2's multi_logloss:0.951316"
## [1] "[20]:  val1's multi_logloss:6.37837e-06  val2's multi_logloss:0.959388"
## [1] "[21]:  val1's multi_logloss:5.38712e-06  val2's multi_logloss:0.979799"
## [1] "[22]:  val1's multi_logloss:4.72323e-06  val2's multi_logloss:0.999218"
## [1] "[23]:  val1's multi_logloss:4.24951e-06  val2's multi_logloss:1.00509"
## [1] "[24]:  val1's multi_logloss:3.90897e-06  val2's multi_logloss:1.00399"
## [1] "[25]:  val1's multi_logloss:3.59214e-06  val2's multi_logloss:1.00779"
## [1] "[26]:  val1's multi_logloss:3.34248e-06  val2's multi_logloss:1.00913"
## [1] "[27]:  val1's multi_logloss:3.14463e-06  val2's multi_logloss:1.01367"
## [1] "[28]:  val1's multi_logloss:2.99919e-06  val2's multi_logloss:1.01593"
## [1] "[29]:  val1's multi_logloss:2.87908e-06  val2's multi_logloss:1.0226"
## [1] "[30]:  val1's multi_logloss:2.7801e-06  val2's multi_logloss:1.0302"
## [1] "[31]:  val1's multi_logloss:2.64759e-06  val2's multi_logloss:1.03443"
## [1] "[32]:  val1's multi_logloss:2.54394e-06  val2's multi_logloss:1.04187"
## [1] "[33]:  val1's multi_logloss:2.45106e-06  val2's multi_logloss:1.04452"
## [1] "[34]:  val1's multi_logloss:2.37794e-06  val2's multi_logloss:1.05096"
## [1] "[35]:  val1's multi_logloss:2.31499e-06  val2's multi_logloss:1.0507"
## [1] "[36]:  val1's multi_logloss:2.2556e-06  val2's multi_logloss:1.05631"
## [1] "[37]:  val1's multi_logloss:2.19636e-06  val2's multi_logloss:1.05808"
## [1] "[38]:  val1's multi_logloss:2.13612e-06  val2's multi_logloss:1.05724"
## [1] "[39]:  val1's multi_logloss:2.08752e-06  val2's multi_logloss:1.05995"
## [1] "[40]:  val1's multi_logloss:2.05726e-06  val2's multi_logloss:1.059"
## [1] "[41]:  val1's multi_logloss:2.0005e-06  val2's multi_logloss:1.06427"
## [1] "[42]:  val1's multi_logloss:1.96501e-06  val2's multi_logloss:1.06545"
## [1] "[43]:  val1's multi_logloss:1.93624e-06  val2's multi_logloss:1.0646"
## [1] "[44]:  val1's multi_logloss:1.89989e-06  val2's multi_logloss:1.06107"
## [1] "[45]:  val1's multi_logloss:1.86434e-06  val2's multi_logloss:1.06425"
## [1] "[46]:  val1's multi_logloss:1.8423e-06  val2's multi_logloss:1.07086"
## [1] "[47]:  val1's multi_logloss:1.81136e-06  val2's multi_logloss:1.07168"
## [1] "[48]:  val1's multi_logloss:1.78957e-06  val2's multi_logloss:1.0714"
## [1] "[49]:  val1's multi_logloss:1.7614e-06  val2's multi_logloss:1.0697"
## [1] "[50]:  val1's multi_logloss:1.74134e-06  val2's multi_logloss:1.06975"
## [1] "[51]:  val1's multi_logloss:1.72606e-06  val2's multi_logloss:1.06888"
## [1] "[52]:  val1's multi_logloss:1.70982e-06  val2's multi_logloss:1.07092"
## [1] "[53]:  val1's multi_logloss:1.68341e-06  val2's multi_logloss:1.07005"
## [1] "[54]:  val1's multi_logloss:1.66536e-06  val2's multi_logloss:1.0704"
# Review the final model and results
lgb.fit
## <lgb.Booster>
##   Public:
##     add_valid: function (data, name) 
##     best_iter: 4
##     best_score: 0.589902716611008
##     current_iter: function () 
##     dump_model: function (num_iteration = NULL, feature_importance_type = 0L) 
##     eval: function (data, name, feval = NULL) 
##     eval_train: function (feval = NULL) 
##     eval_valid: function (feval = NULL) 
##     finalize: function () 
##     initialize: function (params = list(), train_set = NULL, modelfile = NULL, 
##     lower_bound: function () 
##     params: list
##     predict: function (data, start_iteration = NULL, num_iteration = NULL, 
##     raw: NA
##     record_evals: list
##     reset_parameter: function (params, ...) 
##     rollback_one_iter: function () 
##     save: function () 
##     save_model: function (filename, num_iteration = NULL, feature_importance_type = 0L) 
##     save_model_to_string: function (num_iteration = NULL, feature_importance_type = 0L) 
##     set_train_data_name: function (name) 
##     to_predictor: function () 
##     update: function (train_set = NULL, fobj = NULL) 
##     upper_bound: function () 
##   Private:
##     eval_names: multi_logloss
##     get_eval_info: function () 
##     handle: lgb.Booster.handle
##     higher_better_inner_eval: FALSE
##     init_predictor: NULL
##     inner_eval: function (data_name, data_idx, feval = NULL) 
##     inner_predict: function (idx) 
##     is_predicted_cur_iter: list
##     name_train_set: val1
##     name_valid_sets: list
##     num_class: 3
##     num_dataset: 2
##     predict_buffer: list
##     set_objective_to_none: FALSE
##     train_set: lgb.Dataset, R6
##     train_set_version: 1
##     valid_sets: list
# Predict outcomes with the test data
lgb.pred = predict(lgb.fit,test.data,reshape=T)
lgb.pred = as.data.frame(lgb.pred)
colnames(lgb.pred) = levels(class2)
# Use the predicted label with the highest probability
lgb.pred$prediction = apply(lgb.pred,1,function(x) colnames(lgb.pred)[which.max(x)])
lgb.pred$label = levels(class2)[test.label+1]
# Calculate the final accuracy
result8 = sum(lgb.pred$prediction==lgb.pred$label)/nrow(lgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result8)))
## [1] "Final Accuracy = 84.31%"
# Calculate the confusion matrix
confusionMatrix(as.factor(lgb.pred$prediction),as.factor(lgb.pred$label))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad      5    1      9
##     good     1   36     24
##     normal  12   28    362
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8431          
##                  95% CI : (0.8073, 0.8745)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 0.1832          
##                                           
##                   Kappa : 0.4523          
##                                           
##  Mcnemar's Test P-Value : 0.8646          
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity             0.27778     0.55385        0.9165
## Specificity             0.97826     0.93947        0.5181
## Pos Pred Value          0.33333     0.59016        0.9005
## Neg Pred Value          0.97192     0.93046        0.5658
## Prevalence              0.03766     0.13598        0.8264
## Detection Rate          0.01046     0.07531        0.7573
## Detection Prevalence    0.03138     0.12762        0.8410
## Balanced Accuracy       0.62802     0.74666        0.7173
# Calculate multi-class AUC
auc8 <- multiclass.roc(as.integer(as.factor(lgb.pred$label)),as.integer(as.factor(lgb.pred$prediction)))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc8
## 
## Call:
## multiclass.roc.default(response = as.integer(as.factor(lgb.pred$label)),     predictor = as.integer(as.factor(lgb.pred$prediction)))
## 
## Data: as.integer(as.factor(lgb.pred$prediction)) with 3 levels of as.integer(as.factor(lgb.pred$label)): 1, 2, 3.
## Multi-class area under the curve: 0.6371
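Feature importance for this classifier can be inspected just as it is for the boosted regression models later in the report (a minimal sketch):

lgb.imp <- lgb.importance(lgb.fit)
print(lgb.imp)
lgb.plot.importance(lgb.imp, top_n = 11)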

5.8 AdaBoost

# Train the AdaBoost classifier
adaboost <- boosting(Class~., data=X4red_train, boos=TRUE, mfinal = 50)
# Plot variable importance
importanceplot(adaboost, cex.names=.3)

# Predict outcomes with the test data
test.data <- as.data.frame(test.data)
adaboost.pred <- predict.boosting(adaboost, newdata=test.data)
ada.pred <- data.frame(as.factor(adaboost.pred$class), class2)
names(ada.pred) <- c("prediction", "label")
# Calculate the final accuracy
result9 = sum(ada.pred$prediction==ada.pred$label)/nrow(ada.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result9)))
## [1] "Final Accuracy = 86.19%"
# Calculate the confusion matrix
confusionMatrix(as.factor(ada.pred$prediction),as.factor(ada.pred$label))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good normal
##     bad      3    0      9
##     good     0   40     17
##     normal  15   25    369
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8619          
##                  95% CI : (0.8277, 0.8916)
##     No Information Rate : 0.8264          
##     P-Value [Acc > NIR] : 0.02088         
##                                           
##                   Kappa : 0.4993          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: bad Class: good Class: normal
## Sensitivity            0.166667     0.61538        0.9342
## Specificity            0.980435     0.95884        0.5181
## Pos Pred Value         0.250000     0.70175        0.9022
## Neg Pred Value         0.967811     0.94062        0.6232
## Prevalence             0.037657     0.13598        0.8264
## Detection Rate         0.006276     0.08368        0.7720
## Detection Prevalence   0.025105     0.11925        0.8556
## Balanced Accuracy      0.573551     0.78711        0.7261
# Calculate multi-class AUC
auc9 <- multiclass.roc(as.integer(as.factor(ada.pred$label)),as.integer(as.factor(ada.pred$prediction)))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc9
## 
## Call:
## multiclass.roc.default(response = as.integer(as.factor(ada.pred$label)),     predictor = as.integer(as.factor(ada.pred$prediction)))
## 
## Data: as.integer(as.factor(ada.pred$prediction)) with 3 levels of as.integer(as.factor(ada.pred$label)): 1, 2, 3.
## Multi-class area under the curve: 0.665
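adabag can also track how the test error evolves as boosting iterations are added, which helps judge whether mfinal = 50 is sufficient. A minimal sketch (ada_test is an illustrative copy of the test data with the true class column re-attached, since errorevol needs it):

ada_test <- data.frame(test.data, Class = class2)
ada_evol <- errorevol(adaboost, newdata = ada_test)
plot(ada_evol$error, type = "l", col = "darkred",
     xlab = "number of boosting iterations", ylab = "test error")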

6 Modeling - Regression: Quality Rating Prediction

X5red <- Xred
head(X5red)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5
# Drop fixed.acidity to avoid multicollinearity issues
X5red <- X5red %>% 
            select(-one_of('fixed.acidity'))
# Create a 70/30 split
set.seed(1234)
splitindex2 <- createDataPartition(X5red$quality, p=0.7, list=FALSE, times=1)
X5red_train <- X5red[splitindex2,]
table(X5red_train$quality)
## 
##   3   4   5   6   7   8 
##   7  43 471 447 140  12
X5red_test <- X5red[-splitindex2,]
table(X5red_test$quality)
## 
##   3   4   5   6   7   8 
##   3  10 210 191  59   6

6.1 Linear Regression

# Fit a multiple linear regression model
lmmodel <- lm(quality ~., X5red_train)
summary(lmmodel)
## 
## Call:
## lm(formula = quality ~ ., data = X5red_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.61641 -0.35575 -0.05257  0.44666  2.03589 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          11.5831446 16.1108396   0.719 0.472313    
## volatile.acidity     -1.1266551  0.1485411  -7.585 7.02e-14 ***
## citric.acid          -0.1293092  0.1677458  -0.771 0.440952    
## residual.sugar        0.0171160  0.0168176   1.018 0.309024    
## chlorides            -1.7466245  0.4639141  -3.765 0.000175 ***
## free.sulfur.dioxide   0.0022740  0.0026210   0.868 0.385805    
## total.sulfur.dioxide -0.0029122  0.0008746  -3.330 0.000898 ***
## density              -7.1303565 16.0537276  -0.444 0.657017    
## pH                   -0.4336497  0.1656097  -2.619 0.008952 ** 
## sulphates             0.8533680  0.1297696   6.576 7.43e-11 ***
## alcohol               0.2751083  0.0264145  10.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6593 on 1109 degrees of freedom
## Multiple R-squared:  0.3495, Adjusted R-squared:  0.3436 
## F-statistic: 59.58 on 10 and 1109 DF,  p-value: < 2.2e-16
# Predict the test data
lm_pred <- predict(lmmodel,X5red_test)
head(lm_pred)
##        3        4        6        7        9       11 
## 5.205333 5.665306 5.079564 5.111391 5.337221 5.093868
# Accuracy check
lm_mse = mean((X5red_test$quality - lm_pred)^2)
lm_mae = caret::MAE(X5red_test$quality, lm_pred)
lm_rmse = caret::RMSE(X5red_test$quality, lm_pred)
cat("MSE: ", lm_mse, "MAE: ", lm_mae, "RMSE: ", lm_rmse)
## MSE:  0.388386 MAE:  0.4950919 RMSE:  0.6232063
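Several predictors (citric.acid, residual.sugar, free.sulfur.dioxide, density) are not significant. As a quick, hedged check, AIC-based stepwise selection with base R's step() shows whether a reduced model predicts about as well (a minimal sketch):

lm_step <- step(lmmodel, direction = "both", trace = 0)
summary(lm_step)   # reduced model retained by AIC
lm_step_pred <- predict(lm_step, X5red_test)
caret::RMSE(X5red_test$quality, lm_step_pred)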

6.2 Random Forest

# Fit a random forest model using default parameters
rf <- randomForest(quality~.,data=X5red_train,importance = TRUE)
rf
## 
## Call:
##  randomForest(formula = quality ~ ., data = X5red_train, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.3557046
##                     % Var explained: 46.23
# Predict the test dataset
rf_pred <- predict(rf, X5red_test)
head(rf_pred)
##        3        4        6        7        9       11 
## 5.270167 5.575400 5.110800 5.097633 5.372467 5.243133
# Accuracy check
rf_mse = mean((X5red_test$quality - rf_pred)^2)
rf_mae = caret::MAE(X5red_test$quality, rf_pred)
rf_rmse = caret::RMSE(X5red_test$quality, rf_pred)
cat("MSE: ", rf_mse, "MAE: ", rf_mae, "RMSE: ", rf_rmse)
## MSE:  0.3092426 MAE:  0.4182004 RMSE:  0.5560958
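Because importance = TRUE was set here, both the permutation-based %IncMSE and the node-purity IncNodePurity measures are available for the regression forest (a minimal sketch):

importance(rf)
varImpPlot(rf, sort = TRUE, main = "Variable Importance (regression)")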

6.3 Regularization Modeling

Guidance from Smith College: http://www.science.smith.edu/~jcrouser/SDS293/labs/lab10-r.html

# Set up datasets
X5red_reg <- subset(X5red, select = -c(11))
Y5red_reg <- subset(X5red, select = c(11))
our_Y_reg <- pull(Y5red_reg, "quality")

6.3.1 Ridge Regression

The glmnet() function has an alpha argument that determines which type of model is fit: if alpha = 0, a ridge regression model is fit, and if alpha = 1, a lasso model is fit.
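Any alpha strictly between 0 and 1 mixes the two penalties into an elastic net; for example (a minimal sketch, with alpha = 0.5 as a purely illustrative choice that is not used further, and glmnet choosing its own lambda path):

enet_mod <- glmnet(data.matrix(X5red_reg), our_Y_reg, alpha = 0.5)
plot(enet_mod)   # coefficient paths under the combined L1/L2 penalty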

grid = 10^seq(10, -2, length = 100)
ridge_mod = glmnet(X5red_reg, our_Y_reg, alpha = 0, lambda = grid)
dim(coef(ridge_mod))
## [1]  11 100
# Draw plot of coefficients
plot(ridge_mod)

# The coefficient paths show every coefficient shrinking toward zero as the penalty grows
# Let's apply this to our train and test data
Y5red_train_reg <- subset(X5red_train, select = c(11))
X5red_train_reg <- data.matrix(subset(X5red_train, select = -c(11)))
our_Y_reg_train <- pull(Y5red_train_reg, "quality")
Y5red_test_reg <- subset(X5red_test, select = c(11))
X5red_test_reg <- data.matrix(subset(X5red_test, select = -c(11)))
our_Y_reg_test <- pull(Y5red_test_reg, "quality")
# Fit a ridge regression model over the full lambda grid; the best lambda will be chosen by cross-validation below.
ridge_mod = glmnet(X5red_train_reg, our_Y_reg_train, alpha=0, lambda = grid, thresh = 1e-12)
# Cross validation: Instead of choosing a random lambda, we use cross-validation to choose this tuning parameter.
set.seed(1)
cv.out = cv.glmnet(X5red_train_reg, our_Y_reg_train, alpha = 0) # Fit ridge regression model on training data
bestlam = cv.out$lambda.min  # Select the lambda that minimizes the cross-validated MSE
bestlam
## [1] 0.03858909
# Draw plot of the cross-validated MSE as a function of lambda
plot(cv.out)

# Use the best lambda to predict the test data
ridge_pred = predict(ridge_mod, s = bestlam, newx = X5red_test_reg)
ridge_pred <- as.vector(ridge_pred)
head(ridge_pred)
## [1] 5.204171 5.671423 5.088011 5.119452 5.336803 5.108101
# Accuracy check
ridge_mse = mean((X5red_test$quality - ridge_pred)^2)
ridge_mae = caret::MAE(X5red_test$quality, ridge_pred)
ridge_rmse = caret::RMSE(X5red_test$quality, ridge_pred)
cat("MSE: ", ridge_mse, "MAE: ", ridge_mae, "RMSE: ", ridge_rmse)
## MSE:  0.3910052 MAE:  0.4981249 RMSE:  0.6253041

6.3.2 Lasso Regression

# This time, for the glmnet function, we use alpha = 1
lasso_mod = glmnet(X5red_train_reg, 
                   our_Y_reg_train,
                   alpha = 1, 
                   lambda = grid)
plot(lasso_mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# Let's perform cross-validation
# Fit Lasso model on training data
set.seed(1)
cv.out2 = cv.glmnet(X5red_train_reg, our_Y_reg_train, alpha = 1)
plot(cv.out2) 

# Select best lambda
bestlam2 = cv.out2$lambda.min
print(bestlam2)
## [1] 0.000912435
# Use best lambda to predict test data
lasso_pred = predict(lasso_mod, s = bestlam2, newx = X5red_test_reg)
lasso_pred <- as.vector(lasso_pred)
head(lasso_pred)
## [1] 5.191617 5.691305 5.112436 5.120707 5.318177 5.100923
# Accuracy check
lasso_mse = mean((X5red_test$quality - lasso_pred)^2)
lasso_mae = caret::MAE(X5red_test$quality, lasso_pred)
lasso_rmse = caret::RMSE(X5red_test$quality, lasso_pred)
cat("MSE: ", lasso_mse, "MAE: ", lasso_mae, "RMSE: ", lasso_rmse)
## MSE:  0.3940802 MAE:  0.5003707 RMSE:  0.6277581
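A key appeal of the lasso is sparsity: the coefficients kept at the chosen lambda can be read off directly (a minimal sketch; at such a small lambda, few if any coefficients may be exactly zero):

lasso_coef <- predict(lasso_mod, type = "coefficients", s = bestlam2)
lasso_coef   # variables shrunk exactly to zero have been dropped from the model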

6.4 XGBoost

# Set up test and train dependent and independent sets
train_x = data.matrix(X5red_train[,-11])
train_y = X5red_train[,11]
test_x = data.matrix(X5red_test[,-11])
test_y = X5red_test[,11]
# Convert dataset to xgb.DMatrix
xgb_train = xgb.DMatrix(data=train_x, label=train_y)
xgb_test = xgb.DMatrix(data=test_x, label=test_y)
# Fitting the model
xgbc = xgboost(data = xgb_train, max.depth = 2, nrounds = 50, eta = 0.3, gamma = 0)
## [1]  train-rmse:3.675893 
## [2]  train-rmse:2.622542 
## [3]  train-rmse:1.898558 
## [4]  train-rmse:1.409962 
## [5]  train-rmse:1.089524 
## [6]  train-rmse:0.888388 
## [7]  train-rmse:0.768303 
## [8]  train-rmse:0.696406 
## [9]  train-rmse:0.657402 
## [10] train-rmse:0.634882 
## [11] train-rmse:0.622415 
## [12] train-rmse:0.615167 
## [13] train-rmse:0.608908 
## [14] train-rmse:0.605823 
## [15] train-rmse:0.601920 
## [16] train-rmse:0.596653 
## [17] train-rmse:0.594761 
## [18] train-rmse:0.592256 
## [19] train-rmse:0.588774 
## [20] train-rmse:0.586063 
## [21] train-rmse:0.584521 
## [22] train-rmse:0.583443 
## [23] train-rmse:0.579704 
## [24] train-rmse:0.577861 
## [25] train-rmse:0.575236 
## [26] train-rmse:0.573417 
## [27] train-rmse:0.572473 
## [28] train-rmse:0.570917 
## [29] train-rmse:0.569900 
## [30] train-rmse:0.569218 
## [31] train-rmse:0.566729 
## [32] train-rmse:0.564760 
## [33] train-rmse:0.563887 
## [34] train-rmse:0.562945 
## [35] train-rmse:0.562162 
## [36] train-rmse:0.561427 
## [37] train-rmse:0.559963 
## [38] train-rmse:0.558266 
## [39] train-rmse:0.556743 
## [40] train-rmse:0.554678 
## [41] train-rmse:0.552732 
## [42] train-rmse:0.551073 
## [43] train-rmse:0.550555 
## [44] train-rmse:0.548427 
## [45] train-rmse:0.545629 
## [46] train-rmse:0.544948 
## [47] train-rmse:0.543194 
## [48] train-rmse:0.540699 
## [49] train-rmse:0.540079 
## [50] train-rmse:0.539636
print(xgbc)
## ##### xgb.Booster
## raw: 38.2 Kb 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, max.depth = 2, eta = 0.3, gamma = 0)
## params (as set within xgb.train):
##   max_depth = "2", eta = "0.3", gamma = "0", validate_parameters = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 10 
## niter: 50
## nfeatures : 10 
## evaluation_log:
##     iter train_rmse
##        1   3.675893
##        2   2.622542
## ---                
##       49   0.540079
##       50   0.539636
# Predict the test data with the xgbc model
pred_y = predict(xgbc,test_x)
head(pred_y)
## [1] 5.316401 5.619293 5.342974 5.118239 5.328153 5.190927
# Visualize y original test and y predicted data in a plot
x = 1:length(test_y)
plot(x, test_y, col = "red", type = "l")
lines(x, pred_y, col = "blue", type = "l")
legend("topright",  legend = c("original test_y", "predicted test_y"), 
       col = c("red", "blue"), box.lty = 1, cex = 0.8, lty = c(1, 1))

# Accuracy check
mse = mean((test_y - pred_y)^2)
mae = caret::MAE(test_y, pred_y)
rmse = caret::RMSE(test_y, pred_y)
cat("MSE: ", mse, "MAE: ", mae, "RMSE: ", rmse)
## MSE:  0.3633217 MAE:  0.4727701 RMSE:  0.6027617
# Feature importance
importance_matrix <- xgb.importance(model = xgbc)
print(importance_matrix)
##                  Feature       Gain      Cover  Frequency
##  1:              alcohol 0.32341509 0.15749814 0.14184397
##  2:            sulphates 0.20720771 0.07271356 0.08510638
##  3:     volatile.acidity 0.19006030 0.08056331 0.12056738
##  4: total.sulfur.dioxide 0.07001677 0.13282618 0.12765957
##  5:                   pH 0.05592583 0.12146469 0.12056738
##  6:              density 0.04973980 0.14258000 0.13475177
##  7:            chlorides 0.03418957 0.06410038 0.06382979
##  8:          citric.acid 0.03148723 0.09456534 0.09219858
##  9:       residual.sugar 0.02419104 0.09774477 0.07801418
## 10:  free.sulfur.dioxide 0.01376666 0.03594363 0.03546099
xgb.plot.importance(importance_matrix = importance_matrix) # Alcohol, sulphates, and volatile.acidity are the three most important variables in determining the quality of a wine.
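Since quality is an integer score, rounding the numeric predictions gives a rough classification-style accuracy that can be compared with the earlier classifiers (a minimal sketch):

xgb_round_acc <- mean(round(pred_y) == test_y)
print(paste("Rounded-prediction accuracy =", sprintf("%1.2f%%", 100*xgb_round_acc)))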

6.5 LightGBM

# Convert dataset to lgb.Dataset
lgb_train = lgb.Dataset(data=train_x, label=train_y)
lgb_test = lgb.Dataset(data=test_x, label=test_y)
# Fitting the model
lgbd = lightgbm(boosting_type = 'gbdt', objective = "regression", metric = 'rmse', lgb_train, nrounds = 50) 
## Warning in (function (params = list(), data, nrounds = 100L, valids = list(), :
## lgb.train: Found the following passed through '...': boosting_type, objective,
## metric. These will be used, but in future releases of lightgbm, this warning
## will become an error. Add these to 'params' instead. See ?lgb.train for
## documentation on how to call this function.
## [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000027 seconds.
## You can set `force_row_wise=true` to remove the overhead.
## And if memory is not enough, you can set `force_col_wise=true`.
## [LightGBM] [Info] Total Bins 915
## [LightGBM] [Info] Number of data points in the train set: 1120, number of used features: 10
## [LightGBM] [Info] Start training from score 5.630357
## [1] "[1]:  train's rmse:0.775186"
## [1] "[2]:  train's rmse:0.742942"
## [1] "[3]:  train's rmse:0.714977"
## [1] "[4]:  train's rmse:0.688906"
## [1] "[5]:  train's rmse:0.666146"
## [1] "[6]:  train's rmse:0.645208"
## [1] "[7]:  train's rmse:0.627439"
## [1] "[8]:  train's rmse:0.61115"
## [1] "[9]:  train's rmse:0.594696"
## [1] "[10]:  train's rmse:0.580377"
## [1] "[11]:  train's rmse:0.567246"
## [1] "[12]:  train's rmse:0.555614"
## [1] "[13]:  train's rmse:0.545501"
## [1] "[14]:  train's rmse:0.535901"
## [1] "[15]:  train's rmse:0.526397"
## [1] "[16]:  train's rmse:0.517803"
## [1] "[17]:  train's rmse:0.509556"
## [1] "[18]:  train's rmse:0.50181"
## [1] "[19]:  train's rmse:0.494594"
## [1] "[20]:  train's rmse:0.487754"
## [1] "[21]:  train's rmse:0.480944"
## [1] "[22]:  train's rmse:0.474107"
## [1] "[23]:  train's rmse:0.467495"
## [1] "[24]:  train's rmse:0.461551"
## [1] "[25]:  train's rmse:0.455975"
## [1] "[26]:  train's rmse:0.450292"
## [1] "[27]:  train's rmse:0.44512"
## [1] "[28]:  train's rmse:0.440155"
## [1] "[29]:  train's rmse:0.434612"
## [1] "[30]:  train's rmse:0.429691"
## [1] "[31]:  train's rmse:0.425077"
## [1] "[32]:  train's rmse:0.420487"
## [1] "[33]:  train's rmse:0.414955"
## [1] "[34]:  train's rmse:0.410699"
## [1] "[35]:  train's rmse:0.406949"
## [1] "[36]:  train's rmse:0.402598"
## [1] "[37]:  train's rmse:0.398814"
## [1] "[38]:  train's rmse:0.394605"
## [1] "[39]:  train's rmse:0.390025"
## [1] "[40]:  train's rmse:0.386324"
## [1] "[41]:  train's rmse:0.382477"
## [1] "[42]:  train's rmse:0.377629"
## [1] "[43]:  train's rmse:0.374125"
## [1] "[44]:  train's rmse:0.369756"
## [1] "[45]:  train's rmse:0.366184"
## [1] "[46]:  train's rmse:0.362641"
## [1] "[47]:  train's rmse:0.36003"
## [1] "[48]:  train's rmse:0.356629"
## [1] "[49]:  train's rmse:0.352808"
## [1] "[50]:  train's rmse:0.348812"
print(lgbd)
## <lgb.Booster>
##   Public:
##     add_valid: function (data, name) 
##     best_iter: -1
##     best_score: NA
##     current_iter: function () 
##     dump_model: function (num_iteration = NULL, feature_importance_type = 0L) 
##     eval: function (data, name, feval = NULL) 
##     eval_train: function (feval = NULL) 
##     eval_valid: function (feval = NULL) 
##     finalize: function () 
##     initialize: function (params = list(), train_set = NULL, modelfile = NULL, 
##     lower_bound: function () 
##     params: list
##     predict: function (data, start_iteration = NULL, num_iteration = NULL, 
##     raw: NA
##     record_evals: list
##     reset_parameter: function (params, ...) 
##     rollback_one_iter: function () 
##     save: function () 
##     save_model: function (filename, num_iteration = NULL, feature_importance_type = 0L) 
##     save_model_to_string: function (num_iteration = NULL, feature_importance_type = 0L) 
##     set_train_data_name: function (name) 
##     to_predictor: function () 
##     update: function (train_set = NULL, fobj = NULL) 
##     upper_bound: function () 
##   Private:
##     eval_names: rmse
##     get_eval_info: function () 
##     handle: lgb.Booster.handle
##     higher_better_inner_eval: FALSE
##     init_predictor: NULL
##     inner_eval: function (data_name, data_idx, feval = NULL) 
##     inner_predict: function (idx) 
##     is_predicted_cur_iter: list
##     name_train_set: train
##     name_valid_sets: list
##     num_class: 1
##     num_dataset: 1
##     predict_buffer: list
##     set_objective_to_none: FALSE
##     train_set: lgb.Dataset, R6
##     train_set_version: 1
##     valid_sets: list
# Predict the test data with the lgbd (LightGBM) model
pred_y2 <- predict(lgbd, data = test_x)
head(pred_y2)
## [1] 5.155409 5.551289 5.141556 4.883104 5.542482 5.272589
# Accuracy check
mse2 = mean((test_y - pred_y2)^2)
mae2 = caret::MAE(test_y, pred_y2)
rmse2 = caret::RMSE(test_y, pred_y2)
cat("MSE: ", mse2, "MAE: ", mae2, "RMSE: ", rmse2) # LightGBM seems to perform better than XGBoost a bit
## MSE:  0.337878 MAE:  0.4468788 RMSE:  0.5812728
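Collecting both sets of test-set metrics into one small table makes the comparison explicit. A quick sketch, assuming the metric objects from the two chunks above are still in memory:

# Side-by-side test-set metrics for XGBoost and LightGBM
data.frame(model = c("XGBoost", "LightGBM"),
           MSE  = c(mse, mse2),
           MAE  = c(mae, mae2),
           RMSE = c(rmse, rmse2))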
# Feature importance
importance_matrix <- lgb.importance(model = lgbd)
print(importance_matrix)
##                  Feature       Gain      Cover  Frequency
##  1:              alcohol 0.31890439 0.13869487 0.11200000
##  2:            sulphates 0.14732333 0.10744348 0.10466667
##  3:     volatile.acidity 0.12953290 0.11998389 0.11733333
##  4: total.sulfur.dioxide 0.09795119 0.10685207 0.12400000
##  5:                   pH 0.06209832 0.07342794 0.09200000
##  6:            chlorides 0.06135905 0.10888491 0.10666667
##  7:              density 0.05640627 0.09962375 0.10400000
##  8:          citric.acid 0.05178529 0.09784952 0.09933333
##  9:  free.sulfur.dioxide 0.03885817 0.04580768 0.06400000
## 10:       residual.sugar 0.03578108 0.10143189 0.07600000
lgb.plot.importance(importance_matrix) # Consistent with XGBoost, LightGBM also ranks alcohol, sulphates, and volatile.acidity as the three most important variables.

6.6 Gradient Boosting (gbm)

# Fitting the model
model_gbm = gbm(quality ~.,
                data = X5red_train,
                distribution = "gaussian",
                cv.folds = 10,
                shrinkage = .01,
                n.minobsinnode = 10,
                n.trees = 50)
print(model_gbm)
## gbm(formula = quality ~ ., distribution = "gaussian", data = X5red_train, 
##     n.trees = 50, n.minobsinnode = 10, shrinkage = 0.01, cv.folds = 10)
## A gradient boosted model with gaussian loss function.
## 50 iterations were performed.
## The best cross-validation iteration was 50.
## There were 10 predictors of which 3 had non-zero influence.
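The printout shows that the best cross-validation iteration equals n.trees (50), which suggests the boosting was stopped too early for such a small shrinkage (0.01). A hedged sketch of how this could be checked with gbm.perf() before trusting the fit:

# gbm.perf() returns the CV-estimated optimal number of trees and plots the error curves;
# if it equals n.trees, refitting with more trees is likely worthwhile
best_iter <- gbm.perf(model_gbm, method = "cv")
best_iter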
# Make predictions on the test dataset
# (Note: predict.gbm() expects new data via the 'newdata' argument, so 'data = test_x'
# is silently ignored here; see the corrected sketch at the end of this section.)
pred_y3 <- predict.gbm(model_gbm, data = test_x)
## Using 50 trees...
head(pred_y3)
## [1] 5.517190 5.569225 5.517190 5.534084 5.669058 5.517190
# Accuracy check
mse3 = mean((test_y - pred_y3)^2)
## Warning in test_y - pred_y3: longer object length is not a multiple of shorter
## object length
mae3 = caret::MAE(test_y, pred_y3)
## Warning in pred - obs: longer object length is not a multiple of shorter object
## length
rmse3 = caret::RMSE(test_y, pred_y3)
## Warning in pred - obs: longer object length is not a multiple of shorter object
## length
cat("MSE: ", mse3, "MAE: ", mae3, "RMSE: ", rmse3) # Gradient boosting seems to not perform well compared to the two aboved-mention methods.
## MSE:  0.6191481 MAE:  0.665234 RMSE:  0.7868596
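Because predict.gbm() takes its evaluation data through the 'newdata' argument, the 'data = test_x' call above was effectively ignored and the predictions were not made on the held-out set (hence the length-mismatch warnings). A corrected sketch of the evaluation, not re-run here and assuming test_x holds the same predictor columns as X5red_train:

# Corrected evaluation: pass the test predictors via 'newdata'
pred_y3_fixed <- predict.gbm(model_gbm, newdata = as.data.frame(test_x), n.trees = 50)
mse3_fixed  <- mean((test_y - pred_y3_fixed)^2)
mae3_fixed  <- caret::MAE(test_y, pred_y3_fixed)
rmse3_fixed <- caret::RMSE(test_y, pred_y3_fixed)
cat("MSE: ", mse3_fixed, "MAE: ", mae3_fixed, "RMSE: ", rmse3_fixed)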