# Housekeeping
rm(list=ls()) # clear workspace
cat("\014") # clear console
graphics.off() # shuts down all open graphics devices
This dataset provides a comprehensive overview of thousands of wines. There are three questions our team would like to answer:
1. What variables determine the quality of a wine?
2. Can chemical properties predict whether a wine's quality is poor, normal, or excellent?
3. Can chemical properties predict the precise numerical quality rating of a wine?
Credit to the UCI Machine Learning Repository.
Link: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
Citation: P. Cortez, A. Cerdeira, F. Almeida, T. Matos, and J. Reis. Modeling wine preferences by data mining from physicochemical properties. Decision Support Systems, Elsevier, 47(4):547-553, 2009.
# This dataset is for red wines
Xred <- read.csv("/Users/thuhuong/Desktop/winequality-red.csv")
library(ggplot2)
library(moments)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.90 loaded
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
library(dbscan)
library(dplyr)
library(nnet)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ISLR)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(class)
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
##
## kurtosis, moment, skewness
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(lightgbm)
## Loading required package: R6
##
## Attaching package: 'lightgbm'
## The following objects are masked from 'package:xgboost':
##
## getinfo, setinfo, slice
## The following object is masked from 'package:dplyr':
##
## slice
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
library(gbm)
## Loaded gbm 2.1.8
dim(Xred) # This dataset has 1599 rows and 12 columns
## [1] 1599 12
head(Xred) # Show the first six rows
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
str(Xred) # Structure of the dataset
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
summary(Xred) # Summary of the dataset
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Separate dataset into different components to use in EDA
drop <- c("quality")
Xred_train = Xred[,!(names(Xred) %in% drop)]
yred <- subset(Xred, select = c("quality"))
# Duplicated Rows
sum(duplicated(Xred)) # 240 duplicated records in the red-wine data
## [1] 240
# In our case, duplicated rows are acceptable: wines with identical physicochemical values should receive the same quality rating.
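If we did want to de-duplicate, a minimal sketch (not run here) using dplyr, which is already loaded:
Xred_dedup <- distinct(Xred) # keep only the first occurrence of each row
nrow(Xred_dedup) # 1599 - 240 = 1359 rows would remain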
# Missing or Null Values
sum(is.na(Xred)) # No irregularities found. Great dataset!
## [1] 0
summary(Xred$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.636 6.000 8.000
ggplot(Xred, aes(quality)) + geom_boxplot() # Visualize the distribution of quality
print(skewness(Xred$quality)) # Skewness ~0.217 (e1071's estimator; it masks moments::skewness)
## [1] 0.2173931
print(kurtosis(Xred$quality)) # Excess kurtosis ~0.288 (e1071 masks moments::kurtosis, which would report ~3.29)
## [1] 0.2879148
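Note that e1071 masks the moments versions of these functions (see the attach messages above), and e1071::kurtosis() reports excess kurtosis with a small-sample adjustment, which is why the printed value is ~0.29 rather than ~3.29. A minimal sketch (not run here) that calls each package explicitly:
moments::skewness(Xred$quality) # classical skewness, ~0.218
moments::kurtosis(Xred$quality) # classical kurtosis, ~3.29 (a normal distribution has kurtosis 3)
e1071::kurtosis(Xred$quality) # excess kurtosis with a small-sample correction, ~0.288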
hist(Xred$quality) # Visualize the distribution of quality ratings; 5 and 6 are the most common
# The dataset looks imbalanced, which matters for the classification task later on. Consequently, we will have to address the imbalance with an undersampling or oversampling method.
There are 11 independent numerical features that may determine the quality rating. To guide our exploration of these features, we use two methods to rank feature importance:
1. Correlation
2. Recursive Feature Elimination
# Using Correlation
correlation_matrix = cor(Xred_train, yred)
correlation_matrix = correlation_matrix[order(correlation_matrix, decreasing = TRUE),]
head(correlation_matrix,5)
## alcohol sulphates citric.acid fixed.acidity residual.sugar
## 0.47616632 0.25139708 0.22637251 0.12405165 0.01373164
# According to correlation with quality, the five most important features are alcohol, sulphates, citric.acid, fixed.acidity, and residual.sugar. (Sorting the signed correlations this way hides strongly negative correlates such as volatile.acidity, which recursive feature elimination flags below.)
# Using Recursive Feature Elimination
subsets <- c(1:5, 10, 15, 20, 25)
ctrl <- rfeControl(functions = lmFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
lmProfile <- rfe(Xred_train, as.matrix(yred),
sizes = subsets,
rfeControl = ctrl)
lmProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 0.7952 0.03385 0.6629 0.03550 0.02213 0.02018
## 2 0.7897 0.04798 0.6542 0.03907 0.03767 0.02838
## 3 0.7293 0.18803 0.5878 0.03619 0.05751 0.02903
## 4 0.7059 0.23909 0.5663 0.03721 0.06031 0.03033
## 5 0.7056 0.23991 0.5675 0.03712 0.06014 0.03017
## 10 0.6536 0.34775 0.5112 0.04020 0.05210 0.03012
## 11 0.6502 0.35434 0.5044 0.03929 0.05034 0.02920 *
##
## The top 5 variables (out of 11):
## density, chlorides, volatile.acidity, sulphates, pH
# According to recursive feature elimination, the five most important features are density, chlorides, volatile.acidity, sulphates, and pH.
# Combining the two methods, the important features appear to be: alcohol, sulphates, citric.acid, fixed.acidity, residual.sugar, density, chlorides, volatile.acidity, and pH. We will plot some of them below.
# That leaves free.sulfur.dioxide and total.sulfur.dioxide as the less important variables.
# Alcohol
ggplot(Xred, aes(x=alcohol, y=quality)) + geom_point() + geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Sulphates
ggplot(Xred, aes(x=sulphates, y=quality)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Citric.acid
ggplot(Xred, aes(x=citric.acid, y=quality)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Density
ggplot(Xred, aes(x=density, y=quality)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Volatile.acidity
ggplot(Xred, aes(x=volatile.acidity, y=quality)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
cor.table.r = cor(Xred)
corrplot(cor.table.r)
Let’s take a look at a few interesting correlations
# Density vs. Alcohol
ggplot(Xred, aes(x=density, y=alcohol)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# This is a self-explanatory physical relationship: ethanol is less dense than water, so more alcoholic wines have lower density.
# Fixed.acidity vs. citric.acid
ggplot(Xred, aes(x=fixed.acidity, y=citric.acid)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# The predominant fixed acids found in wines are tartaric, malic, citric, and succinic. Since citric acid is a component of fixed acidity, their correlation is high.
# Fixed.acidity vs. density
ggplot(Xred, aes(x=fixed.acidity, y=density)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Another high correlation is between density and fixed.acidity. The main fixed acids all have higher density than water, so the higher the acid concentration in the wine, the denser it becomes, all else being equal.
# Fixed.acidity vs. pH
ggplot(Xred, aes(x=fixed.acidity, y=pH)) + geom_point() +
geom_smooth(method=lm, se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# There is a negative correlation between fixed.acidity and pH. This makes sense, as higher acidity leads to a lower pH. Fundamentally, all wines lie on the acidic side of the pH scale; most range from about 2.5 to 4.5. In our data, pH ranges from 2.74 to 4.01.
The most important part of our project is to group the wines into different categories: bad, normal, and good. To aid that classification task, we can add a new feature that clusters wines based on common physicochemical components.
For clustering, we could simply use the most popular algorithm, K-Means. However, we want to experiment with several different clustering techniques and compare them before settling on the final clusters. Thus, we decide to use four clustering methods:
1. K-Means Clustering
2. Hierarchical Clustering
3. Density Based Clustering
4. Expectation-Maximization Clustering (EM)
Guidance for performing K-Means Clustering comes from the University of Cincinnati's Business Analytics program.
Citation: https://uc-r.github.io/kmeans_clustering
# Standardize (scale) the dataset before using the k-means function
Xred_km <- scale(Xred)
head(Xred_km)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## [1,] -0.5281944 0.9615758 -1.391037 -0.45307667 -0.24363047
## [2,] -0.2984541 1.9668271 -1.391037 0.04340257 0.22380518
## [3,] -0.2984541 1.2966596 -1.185699 -0.16937425 0.09632273
## [4,] 1.6543385 -1.3840105 1.483689 -0.45307667 -0.26487754
## [5,] -0.5281944 0.9615758 -1.391037 -0.45307667 -0.24363047
## [6,] -0.5281944 0.7381867 -1.391037 -0.52400227 -0.26487754
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## [1,] -0.46604672 -0.3790141 0.55809987 1.2882399 -0.57902538
## [2,] 0.87236532 0.6241680 0.02825193 -0.7197081 0.12891007
## [3,] -0.08364328 0.2289750 0.13422152 -0.3310730 -0.04807379
## [4,] 0.10755844 0.4113718 0.66406945 -0.9787982 -0.46103614
## [5,] -0.46604672 -0.3790141 0.55809987 1.2882399 -0.57902538
## [6,] -0.27484500 -0.1966174 0.55809987 1.2882399 -0.57902538
## alcohol quality
## [1,] -0.9599458 -0.7875763
## [2,] -0.5845942 -0.7875763
## [3,] -0.5845942 -0.7875763
## [4,] -0.5845942 0.4507074
## [5,] -0.9599458 -0.7875763
## [6,] -0.9599458 -0.7875763
# To visualize the dataset and check whether any clusters exist, we can compute the distance between each pair of data points and plot the distance matrix.
distance <- get_dist(Xred_km)
fviz_dist(distance, show_labels = FALSE, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# There seem to be 2 to 4 clusters in the data.
# Set centers to 3, based on the EDA
kmc3 <- kmeans(Xred_km, centers = 3, nstart = 25)
kmc3
## K-means clustering with 3 clusters of sizes 704, 506, 389
##
## Cluster means:
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 -0.64820431 0.49325980 -0.7788889 -0.22866362 -0.172706284
## 2 0.94775088 -0.72929480 1.0064621 0.03702487 0.241189151
## 3 -0.05970723 0.05595957 0.1004318 0.36566736 -0.001173487
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 1 -0.2584815 -0.3700325 -0.4266647 0.6202733 -0.3150580
## 2 -0.4276930 -0.4643849 0.3636687 -0.7275167 0.5872260
## 3 1.0241225 1.2737317 0.2991146 -0.1762184 -0.1936646
## alcohol quality
## 1 0.03241658 -0.1613987
## 2 0.38154044 0.5632786
## 3 -0.55496332 -0.4406022
##
## Clustering vector:
## [1] 1 3 1 2 1 1 1 1 1 3 1 3 1 2 3 3 3 2 1 2 3 3 2 3 1 1 1 2 1 1 1 1 3 3 1 1 1
## [38] 2 1 3 3 3 2 1 1 1 3 2 1 3 1 1 1 3 3 3 2 3 1 1 3 3 1 1 1 1 1 1 2 1 1 3 3 1
## [75] 3 2 2 1 1 3 1 2 3 2 1 1 2 1 3 1 3 2 2 1 1 1 1 1 1 1 1 1 1 3 1 3 2 1 3 3 1
## [112] 3 3 2 1 2 1 1 1 3 1 1 1 1 3 3 1 1 1 1 3 1 1 1 1 3 1 1 3 3 3 1 1 1 1 3 1 3
## [149] 1 1 2 2 3 3 3 3 3 3 1 1 1 1 1 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1
## [186] 3 3 1 3 3 3 1 3 1 1 3 1 2 1 1 2 3 1 1 1 2 2 3 3 2 2 1 2 3 1 3 1 1 1 3 3 3
## [223] 1 1 3 3 2 1 3 1 1 1 3 1 1 1 1 1 1 1 2 2 3 2 2 1 1 3 1 1 2 1 2 3 1 3 2 1 2
## [260] 2 3 1 1 3 2 2 3 2 1 2 3 2 2 1 3 3 1 2 2 2 2 2 1 2 3 3 2 3 2 3 2 2 2 1 2 2
## [297] 3 1 1 1 1 2 1 1 3 2 1 2 2 1 2 3 3 3 3 1 3 3 2 3 2 3 1 3 3 3 2 2 2 2 2 2 3
## [334] 1 1 2 2 3 2 2 2 2 2 2 2 3 1 2 2 1 2 1 1 2 3 1 2 2 2 2 3 2 2 2 2 2 2 2 2 2
## [371] 1 2 2 3 2 2 2 2 2 3 2 2 2 2 3 1 3 1 3 2 1 2 2 3 2 2 3 2 2 1 3 1 2 2 1 2 2
## [408] 2 2 3 3 3 3 2 3 3 2 3 2 1 2 1 1 2 1 1 1 1 1 2 2 3 2 2 2 2 3 2 2 1 2 2 2 2
## [445] 1 1 2 2 1 2 2 2 1 2 1 2 1 3 2 3 2 1 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 2
## [482] 2 2 2 2 2 2 2 2 2 3 2 2 3 3 2 1 3 2 3 1 2 2 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2
## [519] 2 3 2 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 1 2 2 3 2 1 2 2 3 1 2 2 2 1 2 2 1 2
## [556] 2 2 2 2 2 2 3 3 3 2 2 1 1 2 1 2 1 2 3 2 2 1 3 3 2 2 2 2 2 3 1 2 3 1 2 3 3
## [593] 3 2 1 3 2 2 1 2 1 2 1 2 3 1 2 3 2 1 3 2 1 2 2 3 3 2 2 2 3 3 2 1 3 3 1 1 1
## [630] 3 1 2 1 3 3 1 3 3 1 2 3 3 3 3 3 1 1 1 2 3 2 3 2 2 2 3 2 2 1 1 1 3 1 2 2 3
## [667] 2 2 2 2 3 1 3 1 2 2 2 1 3 2 2 1 3 1 3 1 1 3 1 2 1 3 2 3 3 1 1 1 3 2 3 1 1
## [704] 3 1 1 1 1 1 2 3 3 1 1 3 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 2 1 1 3 1 1 1 1 3 1
## [741] 1 3 1 3 3 1 3 3 1 1 1 1 3 1 2 1 1 1 1 3 3 1 1 1 1 1 3 3 3 1 3 3 3 2 2 1 1
## [778] 1 1 3 1 1 3 1 1 3 3 3 3 3 3 3 1 1 2 2 3 2 2 2 3 1 1 1 1 2 2 2 1 1 1 2 2 1
## [815] 2 2 2 2 1 3 1 1 1 1 1 1 2 1 1 1 1 1 3 3 1 1 3 3 2 1 2 1 3 3 2 1 1 1 1 1 2
## [852] 2 3 3 3 1 3 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 2 2 1 1 3 1 1 1 2 1 3 1 1 2
## [889] 1 3 3 1 2 1 1 1 2 1 2 1 2 1 1 1 1 3 1 1 1 1 2 2 2 2 1 2 1 3 3 1 2 3 1 3 2
## [926] 3 3 3 2 2 1 1 3 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 1
## [963] 1 2 2 2 2 3 2 1 2 2 2 1 2 3 3 3 1 2 2 1 3 2 2 1 2 3 1 2 1 3 1 3 3 1 1 1 1
## [1000] 1 1 2 2 1 3 1 2 2 2 3 2 2 1 1 1 2 2 3 3 1 2 2 1 2 1 1 1 1 3 1 1 1 1 1 1 2
## [1037] 2 1 2 2 1 1 2 2 1 1 1 1 2 2 1 2 1 2 3 3 2 3 2 2 2 2 2 2 1 1 1 2 2 3 2 3 3
## [1074] 1 3 3 2 2 2 3 2 3 3 2 3 3 2 2 2 2 2 2 1 2 1 2 1 1 2 1 2 1 1 1 1 1 2 2 1 2
## [1111] 1 1 2 2 1 1 1 1 1 1 2 1 1 2 1 2 1 1 3 2 1 3 2 1 2 2 2 2 3 3 3 3 1 1 3 3 1
## [1148] 2 2 2 2 1 1 2 1 1 3 1 2 2 2 2 2 1 1 2 2 2 1 1 2 1 2 3 3 1 1 1 1 2 2 2 3 1
## [1185] 3 1 1 1 3 1 2 1 2 1 1 1 3 1 2 3 1 2 2 3 2 2 2 3 2 1 1 1 1 2 2 2 3 3 2 2 2
## [1222] 2 3 2 2 3 3 1 1 3 2 3 3 3 1 3 1 1 1 1 3 3 2 3 3 1 1 1 1 1 1 3 1 1 1 1 3 1
## [1259] 1 1 2 1 3 1 1 1 1 2 1 1 1 1 1 3 1 3 2 1 3 2 1 1 1 3 1 2 2 1 3 3 1 1 1 1 1
## [1296] 3 3 1 1 1 1 1 2 3 3 3 3 1 3 3 3 1 1 1 3 3 1 2 3 2 3 1 2 2 1 1 1 1 1 3 3 3
## [1333] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 1 1 1 1 1 1 3 2 2 1 2 1 1 1 1 3 3
## [1370] 1 2 2 2 3 1 3 1 1 1 1 1 1 3 3 3 3 1 1 1 3 1 1 1 1 3 1 1 3 1 1 3 3 2 2 1 2
## [1407] 2 1 2 1 1 1 2 3 2 1 2 2 1 3 1 3 1 1 2 2 2 1 1 3 1 3 1 1 3 3 3 1 1 3 2 3 1
## [1444] 1 3 3 1 1 3 2 2 2 1 3 2 1 3 3 1 2 3 1 1 1 1 1 3 1 3 1 1 1 2 1 3 1 3 1 1 2
## [1481] 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 3 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1
## [1518] 1 1 1 1 1 1 3 1 1 1 1 2 3 1 1 1 3 1 1 1 1 1 3 1 2 1 2 2 1 1 1 2 2 1 1 1 1
## [1555] 1 1 1 1 3 3 3 3 1 1 1 1 2 1 1 1 2 1 3 1 3 1 2 1 1 1 1 1 1 3 2 2 3 1 3 3 1
## [1592] 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 4686.527 5609.049 3731.160
## (between_SS / total_SS = 26.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Let's Visualize the result
fviz_cluster(kmc3, data = Xred_km)
# Let's try different numbers of clusters to make sure. This block creates a set of clustering objects based on the number of clusters passed to the centers argument.
kmc2 <- kmeans(Xred_km, centers = 2, nstart = 25)
kmc4 <- kmeans(Xred_km, centers = 4, nstart = 25)
kmc5 <- kmeans(Xred_km, centers = 5, nstart = 25)
kmc6 <- kmeans(Xred_km, centers = 6, nstart = 25)
kmc7 <- kmeans(Xred_km, centers = 7, nstart = 25)
kmc8 <- kmeans(Xred_km, centers = 8, nstart = 25)
kmc9 <- kmeans(Xred_km, centers = 9, nstart = 25)
# Plots to compare
p1 <- fviz_cluster(kmc2, geom = "point", data = Xred_km) + ggtitle("k = 2")
p2 <- fviz_cluster(kmc3, geom = "point", data = Xred_km) + ggtitle("k = 3")
p3 <- fviz_cluster(kmc4, geom = "point", data = Xred_km) + ggtitle("k = 4")
p4 <- fviz_cluster(kmc5, geom = "point", data = Xred_km) + ggtitle("k = 5")
p5 <- fviz_cluster(kmc6, geom = "point", data = Xred_km) + ggtitle("k = 6")
p6 <- fviz_cluster(kmc7, geom = "point", data = Xred_km) + ggtitle("k = 7")
p7 <- fviz_cluster(kmc8, geom = "point", data = Xred_km) + ggtitle("k = 8")
p8 <- fviz_cluster(kmc9, geom = "point", data = Xred_km) + ggtitle("k = 9")
# Assign these plots to a grid
grid.arrange(p1, p2, p3, p4, nrow = 2)
grid.arrange(p5, p6, p7, p8, nrow = 2)
# More than 5 clusters doesn't make sense, and 4 already looks like too many. It is probably 2 or 3 clusters.
# Determine Optimal Number of Clusters
# 1. Elbow Method
set.seed(123)
fviz_nbclust(Xred_km, kmeans, method = "wss")
# Looks like the elbow is at 3, but it's not very clear. Let's use two other methods.
# 2. Silhouette Method
fviz_nbclust(Xred_km, kmeans, method = "silhouette")
# Very interesting: this suggests the optimal number of clusters is 2, not 3 as we originally hypothesized.
# 3. Gap Statistic
# Published by three professors from Stanford: http://web.stanford.edu/~hastie/Papers/gap.pdf
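For reference, the gap statistic compares the log of the pooled within-cluster dispersion W_k with its expected value under a uniform reference distribution generated by Monte Carlo sampling:

Gap_n(k) = E*_n[ log(W_k) ] - log(W_k)

The estimated number of clusters is the smallest k such that Gap(k) >= Gap(k+1) - s_{k+1}, where s_{k+1} is the standard error of the simulated reference values (the one-standard-error rule from the paper above).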
set.seed(123)
gap_stat <- clusGap(Xred_km, FUN = kmeans, nstart = 25, K.max = 9, B = 50)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
fviz_gap_stat(gap_stat)
# This method points to 3 clusters.
# Let's proceed with 2 clusters, since 2 falls within the 2-3 range suggested by the three methods.
set.seed(123)
finalclusters <- kmeans(Xred_km, 2, nstart = 25)
print(finalclusters)
## K-means clustering with 2 clusters of sizes 585, 1014
##
## Cluster means:
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 0.8858867 -0.7295470 0.9891242 0.12584600 0.2323040
## 2 -0.5110885 0.4208925 -0.5706485 -0.07260346 -0.1340215
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 1 -0.1781002 -0.2488421 0.3902513 -0.6740906 0.5693024
## 2 0.1027501 0.1435627 -0.2251450 0.3888984 -0.3284437
## alcohol quality
## 1 0.2903706 0.4676411
## 2 -0.1675215 -0.2697930
##
## Clustering vector:
## [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 1 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [38] 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2
## [75] 1 1 1 2 2 2 2 1 2 1 2 2 1 2 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2
## [112] 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1
## [149] 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2
## [186] 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1
## [260] 1 2 2 2 2 1 1 2 1 2 1 2 1 1 2 2 2 2 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 2 1 1
## [297] 2 2 2 2 2 1 2 2 2 1 2 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 2
## [334] 2 2 1 1 2 1 1 1 1 1 1 1 2 2 1 1 2 1 2 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [371] 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 1 1 2 1 1 2 1 1 2 2 2 1 1 2 1 1
## [408] 1 1 1 2 2 2 1 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1
## [445] 2 2 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 2 2 1
## [482] 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [519] 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 1 1 2 1 1 2 1
## [556] 1 1 1 1 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 1 2 2 2 1 1 1 1 1 1 2 1 2 2 1 1 2
## [593] 1 1 2 2 1 1 2 1 2 1 2 1 2 2 1 1 1 2 1 1 2 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## [630] 2 2 1 2 2 2 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 1 2 2 2 2 2 1 1 2
## [667] 1 1 1 1 2 2 2 2 1 1 1 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2
## [704] 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [741] 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2
## [778] 2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 2 2 1 1 2
## [815] 1 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1
## [852] 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 1 2 2 2 2 1
## [889] 2 1 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 2 2 1
## [926] 1 1 2 1 1 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 2
## [963] 2 1 1 1 1 2 1 2 1 1 1 2 1 2 2 2 2 1 1 2 2 1 1 2 1 2 2 1 2 2 2 2 1 2 2 2 2
## [1000] 2 2 1 1 2 2 2 1 1 1 1 1 1 2 2 2 1 1 1 1 2 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1
## [1037] 1 2 1 1 2 2 1 1 2 2 2 2 1 1 2 1 2 1 2 2 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 2 2
## [1074] 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1 1 2 1 2 1 2 2 1 2 1 2 2 2 2 2 1 1 2 1
## [1111] 2 2 1 1 2 2 2 2 2 2 1 2 2 1 2 1 2 2 1 1 2 2 1 2 1 1 1 1 2 2 2 1 2 2 2 1 2
## [1148] 1 1 1 1 2 2 1 2 2 1 2 1 1 1 1 1 2 2 1 1 1 2 2 1 2 1 2 2 2 2 2 2 1 1 1 1 2
## [1185] 2 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 1
## [1222] 1 2 1 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [1259] 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2
## [1296] 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 2 2 2 2 2 2
## [1333] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 1 2
## [1370] 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1
## [1407] 1 2 1 2 2 2 1 1 1 2 1 1 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 2 2 2 1 2 2
## [1444] 2 2 2 2 2 2 1 1 1 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 2 1
## [1481] 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2
## [1518] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 2 2 2 1 1 2 2 2 2
## [1555] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2
## [1592] 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 7434.904 8334.629
## (between_SS / total_SS = 17.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualize the final result
fviz_cluster(finalclusters, data = Xred_km)
Source for guidance: https://uc-r.github.io/hc_clustering
Hierarchical clustering is an alternative approach to k-means for identifying groups in the dataset. It does not require us to pre-specify the number of clusters, as k-means does. Furthermore, hierarchical clustering has the added advantage of producing an attractive tree-based representation of the observations, called a dendrogram.
# Set-up dataset
Xred_hc <- scale(Xred)
head(Xred_hc)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## [1,] -0.5281944 0.9615758 -1.391037 -0.45307667 -0.24363047
## [2,] -0.2984541 1.9668271 -1.391037 0.04340257 0.22380518
## [3,] -0.2984541 1.2966596 -1.185699 -0.16937425 0.09632273
## [4,] 1.6543385 -1.3840105 1.483689 -0.45307667 -0.26487754
## [5,] -0.5281944 0.9615758 -1.391037 -0.45307667 -0.24363047
## [6,] -0.5281944 0.7381867 -1.391037 -0.52400227 -0.26487754
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## [1,] -0.46604672 -0.3790141 0.55809987 1.2882399 -0.57902538
## [2,] 0.87236532 0.6241680 0.02825193 -0.7197081 0.12891007
## [3,] -0.08364328 0.2289750 0.13422152 -0.3310730 -0.04807379
## [4,] 0.10755844 0.4113718 0.66406945 -0.9787982 -0.46103614
## [5,] -0.46604672 -0.3790141 0.55809987 1.2882399 -0.57902538
## [6,] -0.27484500 -0.1966174 0.55809987 1.2882399 -0.57902538
## alcohol quality
## [1,] -0.9599458 -0.7875763
## [2,] -0.5845942 -0.7875763
## [3,] -0.5845942 -0.7875763
## [4,] -0.5845942 0.4507074
## [5,] -0.9599458 -0.7875763
## [6,] -0.9599458 -0.7875763
# Agglomerative Hierarchical Clustering: each object is initially considered a single-element cluster (leaf). At each step of the algorithm, the two most similar clusters are combined into a new, bigger cluster (node). This procedure is iterated until all points are members of one single big cluster (root) -- a bottom-up approach
# Dissimilarity matrix
d <- dist(Xred_hc, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- agnes(Xred_hc, method = "complete")
# Plot the obtained dendrogram
pltree(hc1, cex = 0.6, hang = -1)
# Agglomerative coefficient
# The agglomerative coefficient measures the amount of clustering structure found (values closer to 1 suggest strong clustering structure)
hc1$ac
## [1] 0.9487263
# Methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# Function to compute coefficient
ac <- function(x) {
agnes(Xred_hc, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.9374775 0.8573798 0.9487263 0.9858633
# We see that Ward's method identifies the strongest clustering structure of the four methods assessed.
# Visualize the final dendrogram using Ward's method
hc2 <- agnes(Xred_hc, method = "ward")
pltree(hc2, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
# Divisive Hierarchical Clustering: it begins with the root, in which all objects are included in a single cluster. At each iteration, the most heterogeneous cluster is divided into two. The process is repeated until each object is in its own cluster -- a top-down approach
# Compute divisive hierarchical clustering
hc3 <- diana(Xred_hc)
# Divisive coefficient; amount of clustering structure found
hc3$dc
## [1] 0.9402224
# Plot dendrogram
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# Working with Dendrograms
# Ward's method
hc4 <- hclust(d, method = "ward.D2")
# Cut tree into 3 groups
sub_grp <- cutree(hc4, k = 3)
# Number of members in each cluster
table(sub_grp)
## sub_grp
## 1 2 3
## 923 411 265
# Draw dendrogram with a border around the 3 clusters
plot(hc4, cex = 0.6)
rect.hclust(hc4, k = 3, border = 2:5)
# Visualize the result in a scatter plot
fviz_cluster(list(data = Xred, cluster = sub_grp))
# Determine Optimal Clusters
# Elbow Method
fviz_nbclust(Xred_hc, FUN = hcut, method = "wss")
# Looks like after 2 clusters, the rate of change in the total within-cluster sum of squares decreases.
# Average Silhouette Method
fviz_nbclust(Xred_hc, FUN = hcut, method = "silhouette")
# This method indicates the optimal number of clusters to be 2.
# Gap Statistic Method
gap_stat <- clusGap(Xred_hc, FUN = hcut, nstart = 25, K.max = 9, B = 50)
fviz_gap_stat(gap_stat)
# The gap statistic method indicates the optimal number of clusters is 1.
# Applying the chosen number of clusters: 2
# Cut tree into 2 groups
sub_grp_final <- cutree(hc4, k = 2)
# Number of members in each cluster
table(sub_grp_final)
## sub_grp_final
## 1 2
## 1334 265
# Draw dendrogram with a border around the 2 clusters
plot(hc4, cex = 0.6)
rect.hclust(hc4, k = 2, border = 2:5)
# Visualize the result in a scatter plot
fviz_cluster(list(data = Xred_hc, cluster = sub_grp_final))
# Density-based clustering (DBSCAN): finding the optimal eps by looking for the knee of the sorted kNN-distance curve
kNNdistplot(Xred, k=3)
abline(h = 9, lty=2)
# Plotting & Manipulating the Cluster
set.seed(123)
f <- fpc::dbscan(Xred, eps = 8, MinPts = 4)
f
## dbscan Pts=1599 MinPts=4 eps=8
## 0 1
## border 21 7
## seed 0 1571
## total 21 1578
d2 <- dbscan::dbscan(Xred, eps = 8, MinPts = 4)
## Warning in dbscan::dbscan(Xred, eps = 8, MinPts = 4): converting argument MinPts
## (fpc) to minPts (dbscan)!
d2
## DBSCAN clustering for 1599 objects.
## Parameters: eps = 8, minPts = 4
## The clustering contains 1 cluster(s) and 21 noise points.
##
## 0 1
## 21 1578
##
## Available fields: cluster, eps, minPts
# In the output above, cluster 0 means noise (points not assigned to any cluster)
fviz_cluster(f, Xred, geom = "point")
Expectation-Maximization clustering fits Gaussian Mixture Models (GMMs), a model-based approach to density estimation. Given the results of the density-based clustering above, we do not expect this method to work well for our dataset. However, we still want to try it out as practice.
We will use the mclust R package, which is developed specifically for EM Clustering.
Citation:
Scrucca L., Fop M., Murphy T. B. and Raftery A. E. (2016). mclust 5: clustering, classification and density estimation using Gaussian finite mixture models. The R Journal, 8/1, pp. 289-317. https://doi.org/10.32614/RJ-2016-021
# Use the mclust function to find the best fit
fit <- Mclust(Xred)
summary(fit)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 8 components:
##
## log-likelihood n df BIC ICL
## -2764.953 1599 265 -7484.846 -7937.639
##
## Clustering table:
## 1 2 3 4 5 6 7 8
## 148 151 173 149 162 385 169 262
# 8 clusters is definitely incorrect, and not what we were looking for. However, let's visualize this method.
plot(fit, what = "BIC")
# The classification plot considers every pair of variables and shows the cluster assignments in pairwise scatterplots.
plot(fit, what = "classification")
# The density based methods are overkill for our data and lead to over-clustering.
Based on the four clustering methods we performed, K-Means Clustering and Hierarchical Clustering make the most sense. We will take a closer look at these two.
# K-Means Clustering
X_clustering <- Xred # create an X_clustering copy to preserve the original Xred dataset
km_final <- X_clustering %>%
mutate(cluster = finalclusters$cluster)
head(km_final)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality cluster
## 1 5 2
## 2 5 2
## 3 5 2
## 4 6 1
## 5 5 2
## 6 5 2
# Take a look into the characteristics of Cluster 1
summary(km_final[km_final$cluster == 1,])
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 5.600 Min. :0.1200 Min. :0.0900 Min. : 0.900
## 1st Qu.: 8.600 1st Qu.:0.3100 1st Qu.:0.3900 1st Qu.: 2.000
## Median : 9.800 Median :0.3800 Median :0.4600 Median : 2.300
## Mean : 9.862 Mean :0.3972 Mean :0.4637 Mean : 2.716
## 3rd Qu.:10.800 3rd Qu.:0.4700 3rd Qu.:0.5300 3rd Qu.: 2.700
## Max. :15.900 Max. :0.9000 Max. :1.0000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.0380 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.0690 1st Qu.: 6.00 1st Qu.: 19.00 1st Qu.:0.9960
## Median :0.0810 Median :10.00 Median : 29.00 Median :0.9974
## Mean :0.0984 Mean :14.01 Mean : 38.28 Mean :0.9975
## 3rd Qu.:0.0950 3rd Qu.:19.00 3rd Qu.: 49.00 3rd Qu.:0.9988
## Max. :0.6110 Max. :55.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality cluster
## Min. :2.740 Min. :0.4200 Min. : 8.40 Min. :3.000 Min. :1
## 1st Qu.:3.140 1st Qu.:0.6300 1st Qu.: 9.80 1st Qu.:5.000 1st Qu.:1
## Median :3.210 Median :0.7200 Median :10.70 Median :6.000 Median :1
## Mean :3.207 Mean :0.7546 Mean :10.73 Mean :6.014 Mean :1
## 3rd Qu.:3.300 3rd Qu.:0.8300 3rd Qu.:11.50 3rd Qu.:7.000 3rd Qu.:1
## Max. :3.540 Max. :2.0000 Max. :14.90 Max. :8.000 Max. :1
summary(km_final[km_final$cluster == 2,])
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1900 Min. :0.0000 Min. : 1.200
## 1st Qu.: 6.80 1st Qu.:0.5000 1st Qu.:0.0400 1st Qu.: 1.900
## Median : 7.40 Median :0.6000 Median :0.1400 Median : 2.100
## Mean : 7.43 Mean :0.6032 Mean :0.1598 Mean : 2.436
## 3rd Qu.: 8.00 3rd Qu.:0.6850 3rd Qu.:0.2500 3rd Qu.: 2.500
## Max. :10.60 Max. :1.5800 Max. :0.7000 Max. :13.400
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 3.00 Min. : 7.00 Min. :0.9902
## 1st Qu.:0.07000 1st Qu.: 9.00 1st Qu.: 25.00 1st Qu.:0.9955
## Median :0.07900 Median :15.00 Median : 42.00 Median :0.9964
## Mean :0.08116 Mean :16.95 Mean : 51.19 Mean :0.9963
## 3rd Qu.:0.08775 3rd Qu.:23.00 3rd Qu.: 70.00 3rd Qu.:0.9974
## Max. :0.26700 Max. :72.00 Max. :165.00 Max. :1.0014
## pH sulphates alcohol quality cluster
## Min. :2.880 Min. :0.3300 Min. : 9.00 Min. :3.000 Min. :2
## 1st Qu.:3.290 1st Qu.:0.5300 1st Qu.: 9.50 1st Qu.:5.000 1st Qu.:2
## Median :3.360 Median :0.5900 Median : 9.90 Median :5.000 Median :2
## Mean :3.371 Mean :0.6025 Mean :10.24 Mean :5.418 Mean :2
## 3rd Qu.:3.450 3rd Qu.:0.6500 3rd Qu.:10.80 3rd Qu.:6.000 3rd Qu.:2
## Max. :4.010 Max. :1.2000 Max. :14.00 Max. :8.000 Max. :2
# Surprisingly, there are no distinctive differences between the two clusters, even in the quality column. Behind the scenes, the clustering algorithm does not separate wines based on any single feature.
# Hierarchical Clustering
hc_final <- X_clustering %>%
mutate(cluster = sub_grp_final)
head(hc_final)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality cluster
## 1 5 1
## 2 5 1
## 3 5 1
## 4 6 1
## 5 5 1
## 6 5 1
# Take a look into the characteristics of Cluster 1
summary(hc_final[hc_final$cluster == 1,])
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.000 1st Qu.:0.4200 1st Qu.:0.0700 1st Qu.: 1.900
## Median : 7.600 Median :0.5500 Median :0.2200 Median : 2.200
## Mean : 7.839 Mean :0.5513 Mean :0.2217 Mean : 2.485
## 3rd Qu.: 8.600 3rd Qu.:0.6550 3rd Qu.:0.3400 3rd Qu.: 2.500
## Max. :12.500 Max. :1.5800 Max. :0.7800 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.0 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 9.00 1st Qu.: 25.0 1st Qu.:0.9954
## Median :0.07900 Median :15.00 Median : 42.0 Median :0.9965
## Mean :0.08279 Mean :17.21 Mean : 50.3 Mean :0.9964
## 3rd Qu.:0.08900 3rd Qu.:24.00 3rd Qu.: 67.0 3rd Qu.:0.9975
## Max. :0.27000 Max. :72.00 Max. :289.0 Max. :1.0037
## pH sulphates alcohol quality cluster
## Min. :2.870 Min. :0.3300 Min. : 8.40 Min. :3.000 Min. :1
## 1st Qu.:3.250 1st Qu.:0.5400 1st Qu.: 9.50 1st Qu.:5.000 1st Qu.:1
## Median :3.340 Median :0.6000 Median :10.10 Median :5.000 Median :1
## Mean :3.341 Mean :0.6355 Mean :10.37 Mean :5.545 Mean :1
## 3rd Qu.:3.420 3rd Qu.:0.7000 3rd Qu.:11.00 3rd Qu.:6.000 3rd Qu.:1
## Max. :4.010 Max. :1.9800 Max. :14.00 Max. :8.000 Max. :1
summary(hc_final[hc_final$cluster == 2,])
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 7.20 Min. :0.1800 Min. :0.2000 Min. :1.400
## 1st Qu.: 9.40 1st Qu.:0.3200 1st Qu.:0.4600 1st Qu.:2.000
## Median :10.60 Median :0.4000 Median :0.5000 Median :2.400
## Mean :10.74 Mean :0.4097 Mean :0.5192 Mean :2.809
## 3rd Qu.:11.90 3rd Qu.:0.4800 3rd Qu.:0.6000 3rd Qu.:3.000
## Max. :15.90 Max. :0.8400 Max. :1.0000 Max. :9.000
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.039 Min. : 1.000 Min. : 6.00 Min. :0.9948
## 1st Qu.:0.072 1st Qu.: 6.000 1st Qu.:17.00 1st Qu.:0.9970
## Median :0.083 Median : 7.000 Median :23.00 Median :0.9980
## Mean :0.111 Mean : 9.158 Mean :27.17 Mean :0.9982
## 3rd Qu.:0.098 3rd Qu.:12.000 3rd Qu.:35.00 3rd Qu.:0.9996
## Max. :0.611 Max. :32.000 Max. :81.00 Max. :1.0032
## pH sulphates alcohol quality cluster
## Min. :2.740 Min. :0.4300 Min. : 8.40 Min. :4.000 Min. :2
## 1st Qu.:3.070 1st Qu.:0.6500 1st Qu.: 9.80 1st Qu.:6.000 1st Qu.:2
## Median :3.170 Median :0.7300 Median :10.60 Median :6.000 Median :2
## Mean :3.161 Mean :0.7722 Mean :10.69 Mean :6.094 Mean :2
## 3rd Qu.:3.240 3rd Qu.:0.8400 3rd Qu.:11.40 3rd Qu.:7.000 3rd Qu.:2
## Max. :3.480 Max. :2.0000 Max. :14.90 Max. :8.000 Max. :2
# Similarly, the hierarchical clustering algorithm does not separate wines based on any single feature. Additionally, it looks like the characteristics of HC's cluster 2 are similar to KM's cluster 1, and HC's cluster 1 is similar to KM's cluster 2.
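To make that correspondence explicit, a quick cross-tabulation of the two cluster assignments (a sketch, not run above) could be used:
table(KMeans = finalclusters$cluster, Hierarchical = sub_grp_final)
# If the observation above holds, most wines in KM cluster 1 should land in HC cluster 2, and most wines in KM cluster 2 in HC cluster 1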
# Either way, since these two methods are not significantly different from each other, we decide to go with the more popular one: K-Means Clustering.
As a result of the clustering step, we create an additional variable named cluster to add to our original Xred dataset. This feature will help in the classification task later on.
# Add cluster column to Xred
X4red <- Xred %>%
mutate(cluster = finalclusters$cluster) %>%
group_by(cluster)
# Since cluster does not represent a quality score, we will create a categorical variable named type to serve as the target variable for the classification task.
X4red$type <- ifelse(X4red$quality < 5, "bad", (ifelse(X4red$quality < 7, "normal", "good")))
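For readability, the same three quality bands could equivalently be encoded with dplyr's case_when (a sketch, not run here; the nested ifelse above already does the job):
X4red$type <- case_when(
  X4red$quality < 5 ~ "bad",
  X4red$quality < 7 ~ "normal",
  TRUE ~ "good"
)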
# Drop fixed.acidity since it has high correlations with three variables: citric.acid, density, and pH. This reduces the multicollinearity problem, which arises when two or more independent variables in a regression model are highly intercorrelated.
# Drop quality since we won't need it for the classification task
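To back up the correlation claim, a quick check against the raw data (a sketch, not run here):
cor(Xred$fixed.acidity, Xred[, c("citric.acid", "density", "pH")])
# fixed.acidity correlates positively with citric.acid and density, and negatively with pH, as discussed in the EDA above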
X4red <- X4red %>%
select(-one_of('quality','fixed.acidity'))
head(X4red)
## # A tibble: 6 x 12
## # Groups: cluster [2]
## volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.7 0 1.9 0.076 11
## 2 0.88 0 2.6 0.098 25
## 3 0.76 0.04 2.3 0.092 15
## 4 0.28 0.56 1.9 0.075 17
## 5 0.7 0 1.9 0.076 11
## 6 0.66 0 1.8 0.075 13
## # … with 7 more variables: total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>,
## # sulphates <dbl>, alcohol <dbl>, cluster <int>, type <chr>
# Create train and test datasets by splitting X4red 70:30
set.seed(1234)
splitindex <- createDataPartition(X4red$type, p=0.7, list=FALSE, times=1)
X4red_train <- X4red[splitindex,]
table(X4red_train$type)
##
## bad good normal
## 45 152 924
X4red_test <- X4red[-splitindex,]
table(X4red_test$type)
##
## bad good normal
## 18 65 395
The train dataset is highly imbalanced, with the minority class, bad, accounting for only 4.01% of the entire X4red_train dataset.
Imbalanced datasets pose a challenge for predictive modeling, as most machine learning algorithms used for classification were designed around the assumption of an equal number of examples for each class. This results in models with poor predictive performance, particularly for the minority class(es).
In this situation, since X4red_train is already quite small, we will use oversampling so that we keep all of the information in the train dataset.
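The 4.01% figure comes directly from the class counts above; a one-line check (a sketch, not run here):
round(100 * prop.table(table(X4red_train$type)), 2) # bad ~4.01%, good ~13.56%, normal ~82.43%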
# Oversampling the imbalanced dataset
set.seed(234)
X4red_train <- upSample(x = X4red_train[, -12],
y = as.factor(X4red_train$type))
# The upSample function renames our target variable from type to Class, so Class will be our target variable from here on. The classes are now balanced, with 924 samples each.
table(X4red_train$Class)
##
## bad good normal
## 924 924 924
names(X4red_test)[12] <- "Class"
X4red_test$Class <- as.factor(X4red_test$Class)
# Fit a multinomial logistic regression model (nnet::multinom) on the train dataset
model1 <- multinom(Class ~ ., data = X4red_train)
## # weights: 39 (24 variable)
## initial value 3045.353264
## iter 10 value 2412.715248
## iter 20 value 2018.928426
## iter 30 value 2010.623731
## iter 40 value 2006.008867
## iter 50 value 2005.496529
## iter 60 value 2005.387039
## iter 70 value 2005.295465
## iter 80 value 2005.237719
## iter 90 value 2005.186050
## final value 2005.145558
## converged
summary(model1)
## Call:
## multinom(formula = Class ~ ., data = X4red_train)
##
## Coefficients:
## (Intercept) volatile.acidity citric.acid residual.sugar chlorides
## good -61.49818 -6.663263 -2.384354 -0.2450162 -12.347272
## normal -142.59605 -5.280397 -1.706137 -0.3462484 -2.816825
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## good 0.03715415 -0.002253426 60.93148 -2.919018 3.751323
## normal 0.02366501 0.013034211 150.19473 -1.773522 -0.367506
## alcohol cluster
## good 1.3851622 -1.03458657
## normal 0.2872678 -0.01840033
##
## Std. Errors:
## (Intercept) volatile.acidity citric.acid residual.sugar chlorides
## good 0.9343012 0.5374589 0.6330079 0.04343921 1.746952
## normal 0.7455405 0.3776624 0.4665346 0.03712102 1.130305
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## good 0.009389438 0.003211168 0.9302974 0.5769931 0.4849103
## normal 0.007744228 0.002582445 0.7438576 0.4676803 0.3863389
## alcohol cluster
## good 0.08276945 0.2527466
## normal 0.06976332 0.1993658
##
## Residual Deviance: 4010.291
## AIC: 4058.291
# Make prediction on the test dataset
p1 <- predict(model1, X4red_test, type = 'class')
head(p1)
## [1] bad bad bad bad normal normal
## Levels: bad good normal
logistic.pred <- data.frame(p1, X4red_test$Class)
names(logistic.pred) <- c("prediction", "label")
# Calculate the final accuracy
result1 = sum(logistic.pred$prediction==logistic.pred$label)/nrow(logistic.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result1)))
## [1] "Final Accuracy = 58.58%"
# Calculate the confusion matrix
confusionMatrix(logistic.pred$prediction,logistic.pred$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 13 5 109
## good 0 50 69
## normal 5 10 217
##
## Overall Statistics
##
## Accuracy : 0.5858
## 95% CI : (0.5402, 0.6303)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2537
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.72222 0.7692 0.5494
## Specificity 0.75217 0.8329 0.8193
## Pos Pred Value 0.10236 0.4202 0.9353
## Neg Pred Value 0.98575 0.9582 0.2764
## Prevalence 0.03766 0.1360 0.8264
## Detection Rate 0.02720 0.1046 0.4540
## Detection Prevalence 0.26569 0.2490 0.4854
## Balanced Accuracy 0.73720 0.8011 0.6843
# Calculate multi-class AUC
auc1 <- multiclass.roc(as.numeric(logistic.pred$label), as.numeric(logistic.pred$prediction))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc1
##
## Call:
## multiclass.roc.default(response = as.numeric(logistic.pred$label), predictor = as.numeric(logistic.pred$prediction))
##
## Data: as.numeric(logistic.pred$prediction) with 3 levels of as.numeric(logistic.pred$label): 1, 2, 3.
## Multi-class area under the curve: 0.671
# Fit a decision tree (rpart) model on the train dataset
tree <- rpart(Class~., X4red_train)
# Visualize the tree built
rpart.plot(tree)
# Make prediction on the test dataset
p2 <- predict(tree, X4red_test, type="class")
head(p2)
## 1 2 3 4 5 6
## bad bad normal bad bad bad
## Levels: bad good normal
# Calculate the final accuracy
result2 = sum(p2==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result2)))
## [1] "Final Accuracy = 59.62%"
# Calculate the confusion matrix
confusionMatrix(p2,X4red_test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 10 6 121
## good 0 45 44
## normal 8 14 230
##
## Overall Statistics
##
## Accuracy : 0.5962
## 95% CI : (0.5507, 0.6406)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2356
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.55556 0.69231 0.5823
## Specificity 0.72391 0.89346 0.7349
## Pos Pred Value 0.07299 0.50562 0.9127
## Neg Pred Value 0.97654 0.94859 0.2699
## Prevalence 0.03766 0.13598 0.8264
## Detection Rate 0.02092 0.09414 0.4812
## Detection Prevalence 0.28661 0.18619 0.5272
## Balanced Accuracy 0.63973 0.79289 0.6586
# Calculate multi-class AUC
auc2 <- multiclass.roc(as.numeric(X4red_test$Class), as.numeric(p2))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc2
##
## Call:
## multiclass.roc.default(response = as.numeric(X4red_test$Class), predictor = as.numeric(p2))
##
## Data: as.numeric(p2) with 3 levels of as.numeric(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.5867
# Fit a random forest model on the train dataset using the default settings
model2 <- randomForest(Class ~ ., data = X4red_train) ## default for classification mtry is sqrt(p)
model2 # shows out of bag error rate as 1.66%
##
## Call:
## randomForest(formula = Class ~ ., data = X4red_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.66%
## Confusion matrix:
## bad good normal class.error
## bad 924 0 0 0.000000000
## good 0 923 1 0.001082251
## normal 3 42 879 0.048701299
# Make prediction on the test dataset
p3 <- predict(model2, X4red_test)
head(p3) # look at prediction
## 1 2 3 4 5 6
## normal normal normal normal normal normal
## Levels: bad good normal
# Calculate the final accuracy
result3 = sum(p3==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result3)))
## [1] "Final Accuracy = 87.45%"
# Calculate the confusion matrix
confusionMatrix(p3, X4red_test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 0 0 2
## good 0 38 13
## normal 18 27 380
##
## Overall Statistics
##
## Accuracy : 0.8745
## 95% CI : (0.8414, 0.9028)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 0.002447
##
## Kappa : 0.4991
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.000000 0.5846 0.9620
## Specificity 0.995652 0.9685 0.4578
## Pos Pred Value 0.000000 0.7451 0.8941
## Neg Pred Value 0.962185 0.9368 0.7170
## Prevalence 0.037657 0.1360 0.8264
## Detection Rate 0.000000 0.0795 0.7950
## Detection Prevalence 0.004184 0.1067 0.8891
## Balanced Accuracy 0.497826 0.7766 0.7099
# Calculate multi-class AUC
auc3 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(p3))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc3
##
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class), predictor = as.integer(p3))
##
## Data: as.integer(p3) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6817
# Error rate of Random Forest
plot(model2)
# as the number of trees grows, the OOB error flattens out
# Tune mtry
train <- as.data.frame(X4red_train)
t <- tuneRF(train[,-12], train[,12],
stepFactor = 0.5,
plot= TRUE,
ntreeTry = 500,
trace = TRUE,
improve = 0.05)
## mtry = 3 OOB error = 1.55%
## Searching left ...
## mtry = 6 OOB error = 2.13%
## -0.372093 0.05
## Searching right ...
## mtry = 1 OOB error = 2.71%
## -0.744186 0.05
# OOB error is high at mtry = 1 but, using the elbow method, it bottoms out at mtry = 3, which is the value to choose. This mtry is consistent with the default fit.
# ntree = 500 and mtry = 3 are therefore the optimal parameters. Since they match the defaults, we will keep model2 as is.
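If we did want to set those parameters explicitly instead of relying on the defaults, the refit would look like this (a sketch, not run here; aside from random-forest randomness it should behave essentially like model2):
model2_tuned <- randomForest(Class ~ ., data = X4red_train, ntree = 500, mtry = 3)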
# Histogram of tree sizes - the size of each tree in terms of number of nodes
hist(treesize(model2),
main = "# of Nodes for the Trees",
col = "purple")
# Variable Importance
varImpPlot(model2,
sort = T,
main = "Variable Importance")
# Because importance was not computed via permutation (importance = TRUE was not set), the plot shows a single panel: each variable's mean decrease in Gini impurity, in decreasing order
# This measures how much each variable contributes to node purity across the forest
importance(model2) # gives the actual values behind the plot above
## MeanDecreaseGini
## volatile.acidity 265.49365
## citric.acid 127.26183
## residual.sugar 130.03275
## chlorides 157.65967
## free.sulfur.dioxide 132.97718
## total.sulfur.dioxide 159.49283
## density 162.35419
## pH 130.81704
## sulphates 234.84849
## alcohol 283.77406
## cluster 62.58418
varUsed(model2) # how many times each variable is actually used in the random forest
## [1] 9486 7910 7925 9155 7553 8603 9216 8606 8837 9145 1155
# Cluster, the last variable, was used only 1155 times in the random forest, far less often than the other variables
Guidance for K-Nearest Neighbors:
1. https://daviddalpiaz.github.io/r4sl/knn-class.html
2. https://www.geeksforgeeks.org/k-nn-classifier-in-r-programming/
3. https://towardsdatascience.com/k-nearest-neighbors-algorithm-with-examples-in-r-simply-explained-knn-1f2c88da405c
# Normalize function
# Write a min-max normalization function called normalize
normalize <- function(x) {
(x - min(x))/(max(x)-min(x))
}
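Note that normalize() (min-max scaling) is not actually used below; we standardize with scale() instead. If min-max scaling were preferred, it could be applied column-wise like this (a sketch, not run here):
X4red_train_mm <- as.data.frame(lapply(X4red_train[, 1:10], normalize))
X4red_test_mm <- as.data.frame(lapply(X4red_test[, 1:10], normalize))
# Strictly speaking, the test set should be rescaled with the training-set minima and maxima, but this mirrors how scale() is applied to each set separately below.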
# Scale the features
X4red_test_knn <- scale(X4red_test[,1:10])
X4red_train_knn <- scale(X4red_train[,1:10])
head(X4red_test_knn)
## volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
## [1,] 0.8220416 -1.316788 -0.49412161 -0.2849096 -0.45894214
## [2,] 1.7870908 -1.316788 -0.02302006 0.3654289 0.86944920
## [3,] 1.1437247 -1.111579 -0.22492072 0.1880639 -0.07940175
## [4,] 0.8220416 -1.316788 -0.49412161 -0.2849096 -0.45894214
## [5,] 0.2859031 -1.008975 -0.69602227 -0.4918355 -0.07940175
## [6,] 0.5539723 -1.316788 -0.96522316 -0.6100789 -0.07940175
## total.sulfur.dioxide density pH sulphates alcohol
## [1,] -0.4029842 0.50625426 1.30615953 -0.53751202 -0.8947077
## [2,] 0.5650616 -0.01658200 -0.70346272 0.23251019 -0.5242433
## [3,] 0.1837102 0.08798525 -0.31450358 0.04000464 -0.5242433
## [4,] -0.4029842 0.50625426 1.30615953 -0.53751202 -0.8947077
## [5,] 0.3303838 -0.22571651 -0.05519748 -1.17919720 -0.8947077
## [6,] -0.7843355 -1.16682179 0.52824124 -1.11502868 -0.3390111
# Get the dependent (target) variable
Yknn_train <- X4red_train[,12]
Yknn_test <- X4red_test[,12]
# Everything should be set up; let's run the K-Nearest Neighbors classifier
knn_predicted_values <- knn(X4red_train_knn, X4red_test_knn, cl = Yknn_train, k = 38)
head(knn_predicted_values)
## [1] bad normal normal bad normal bad
## Levels: bad good normal
our_Y_knn_test <- pull(Yknn_test, "Class")
# Calculate the final accuracy
result4 = sum(knn_predicted_values==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result4)))
## [1] "Final Accuracy = 48.95%"
# Calculate the confusion matrix
confusionMatrix(knn_predicted_values, X4red_test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 14 8 133
## good 0 56 98
## normal 4 1 164
##
## Overall Statistics
##
## Accuracy : 0.4895
## 95% CI : (0.4439, 0.5353)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2169
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.77778 0.8615 0.4152
## Specificity 0.69348 0.7627 0.9398
## Pos Pred Value 0.09032 0.3636 0.9704
## Neg Pred Value 0.98762 0.9722 0.2524
## Prevalence 0.03766 0.1360 0.8264
## Detection Rate 0.02929 0.1172 0.3431
## Detection Prevalence 0.32427 0.3222 0.3536
## Balanced Accuracy 0.73563 0.8121 0.6775
# Calculate multi-class AUC
auc4 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(knn_predicted_values))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc4
##
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class), predictor = as.integer(knn_predicted_values))
##
## Data: as.integer(knn_predicted_values) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6649
# Figure out the optimal number of K
set.seed(42)
k_to_try = 1:100
err_k = rep(x = 0, times = length(k_to_try))
knnerror <- function(actual, predicted){
mean(actual != predicted)
}
for (i in seq_along(k_to_try)) {
pred = knn(train = X4red_train_knn,
test = X4red_test_knn,
cl = Yknn_train,
k = k_to_try[i])
err_k[i] = knnerror(our_Y_knn_test, pred)
}
# Let's plot error vs K
plot(err_k, type = "b", col = "dodgerblue", cex = 1, pch = 20,
xlab = "k, number of neighbors", ylab = "classification error",
main = "(Test) Error Rate vs Neighbors")
# add line for min error seen
abline(h = min(err_k), col = "darkorange", lty = 3)
# add line for the minority-class ("bad") prevalence in the test set
abline(h = mean(our_Y_knn_test == "bad"), col = "grey", lty = 2)
# Interesting... the test error climbs quickly once K exceeds about 5. Let's find the K that minimizes it.
print(min(err_k))
## [1] 0.1820084
print(which(err_k == min(err_k)))
## [1] 1
# Looks like K = 1 is our answer. Let's repeat the process with K = 1
final_knn_values <- knn(X4red_train_knn, X4red_test_knn, cl = Yknn_train, k = 1)
# Calculate the final accuracy
result5 = sum(final_knn_values==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result5)))
## [1] "Final Accuracy = 81.80%"
# Calculate the confusion matrix
confusionMatrix(final_knn_values, X4red_test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 3 0 10
## good 0 37 34
## normal 15 28 351
##
## Overall Statistics
##
## Accuracy : 0.818
## 95% CI : (0.7804, 0.8516)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 0.7097
##
## Kappa : 0.3885
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.166667 0.56923 0.8886
## Specificity 0.978261 0.91768 0.4819
## Pos Pred Value 0.230769 0.52113 0.8909
## Neg Pred Value 0.967742 0.93120 0.4762
## Prevalence 0.037657 0.13598 0.8264
## Detection Rate 0.006276 0.07741 0.7343
## Detection Prevalence 0.027197 0.14854 0.8243
## Balanced Accuracy 0.572464 0.74345 0.6853
# Calculate multi-class AUC
auc5 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(final_knn_values))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc5
##
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class), predictor = as.integer(final_knn_values))
##
## Data: as.integer(final_knn_values) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6368
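Since K = 1 was selected by minimizing error on the same test set that produced the final 81.80% accuracy, that figure is likely optimistic. A minimal sketch of a cross-validated search on the training data only, using the already-loaded caret package (knn_cv is a hypothetical object name and the 1:50 grid is an assumption):
# 5-fold cross-validation on the training data to choose K without touching the test set
set.seed(42)
knn_cv <- train(x = as.data.frame(X4red_train_knn),
                y = pull(X4red_train, "Class"),
                method = "knn",
                trControl = trainControl(method = "cv", number = 5),
                tuneGrid = data.frame(k = 1:50))
knn_cv$bestTune # K chosen by cross-validation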
Guidance for SVM: https://www.geeksforgeeks.org/classifying-data-using-support-vector-machinessvms-in-r/
# Finalize Train and Test Dataset
X4red_test_svm <- X4red_test[,c(1,2,3,4,5,6,7,8,9,10,12)]
X4red_test_svm[-11] <- scale(X4red_test_svm[-11])
X4red_train_svm <- X4red_train[,c(1,2,3,4,5,6,7,8,9,10,12)]
X4red_train_svm[-11] <- scale(X4red_train_svm[-11])
head(X4red_test_svm)
## # A tibble: 6 x 11
## volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.822 -1.32 -0.494 -0.285 -0.459
## 2 1.79 -1.32 -0.0230 0.365 0.869
## 3 1.14 -1.11 -0.225 0.188 -0.0794
## 4 0.822 -1.32 -0.494 -0.285 -0.459
## 5 0.286 -1.01 -0.696 -0.492 -0.0794
## 6 0.554 -1.32 -0.965 -0.610 -0.0794
## # … with 6 more variables: total.sulfur.dioxide <dbl>, density <dbl>, pH <dbl>,
## # sulphates <dbl>, alcohol <dbl>, Class <fct>
# Get dependent variable
Ysvm_train <- X4red_train_svm[,11]
Ysvm_test <- X4red_test_svm[,11]
# Fit an SVM classifier
svm_classifier <- svm(formula = Class~.,
data = X4red_train_svm,
type = 'C-classification',
kernel = 'linear')
svm_classifier
##
## Call:
## svm(formula = Class ~ ., data = X4red_train_svm, type = "C-classification",
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 1861
# Predicting Test set results
svm_predict <- predict(svm_classifier, newdata = X4red_test_svm[-11])
head(svm_predict)
## 1 2 3 4 5 6
## bad bad bad bad normal bad
## Levels: bad good normal
# Calculate the final accuracy
result6 = sum(svm_predict==X4red_test$Class)/nrow(X4red_test)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result6)))
## [1] "Final Accuracy = 47.91%"
# Calculate the confusion matrix
confusionMatrix(svm_predict, X4red_test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 14 6 136
## good 0 57 101
## normal 4 2 158
##
## Overall Statistics
##
## Accuracy : 0.4791
## 95% CI : (0.4335, 0.5249)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2098
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.77778 0.8769 0.4000
## Specificity 0.69130 0.7554 0.9277
## Pos Pred Value 0.08974 0.3608 0.9634
## Neg Pred Value 0.98758 0.9750 0.2452
## Prevalence 0.03766 0.1360 0.8264
## Detection Rate 0.02929 0.1192 0.3305
## Detection Prevalence 0.32636 0.3305 0.3431
## Balanced Accuracy 0.73454 0.8162 0.6639
# Calculate multi-class AUC
auc6 <- multiclass.roc(as.integer(X4red_test$Class),as.integer(svm_predict))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc6
##
## Call:
## multiclass.roc.default(response = as.integer(X4red_test$Class), predictor = as.integer(svm_predict))
##
## Data: as.integer(svm_predict) with 3 levels of as.integer(X4red_test$Class): 1, 2, 3.
## Multi-class area under the curve: 0.6597
# An SVM is easiest to visualize when there are only 2 independent variables. With this many variables we cannot plot the decision boundary, and the linear-kernel SVM does not perform very well here.
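A radial-kernel SVM with tuned cost and gamma can sometimes do better than the linear kernel here; a minimal sketch using e1071's tune() (the grid values and the svm_tuned name are assumptions, not part of the original analysis):
# Grid-search cost and gamma for a radial-kernel SVM via cross-validation on the training data
set.seed(42)
svm_tuned <- tune(svm, Class ~ ., data = X4red_train_svm,
                  kernel = "radial",
                  ranges = list(cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1)))
summary(svm_tuned)
svm_tuned_pred <- predict(svm_tuned$best.model, newdata = X4red_test_svm[-11])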
# Divide train and test dataset into data and label
train.data = X4red_train
class1 = train.data$Class
label1 = as.integer(train.data$Class)-1
train.data$Class = NULL
train.data = as.matrix(train.data)
train.label = label1
test.data = X4red_test
test.data$cluster <- as.numeric(test.data$cluster)
test.data$Class <- as.factor(test.data$Class)
class2 = as.factor(test.data$Class)
label2 = as.integer(as.factor(test.data$Class))-1
test.data$Class = NULL
test.data = as.matrix(test.data)
test.label = label2
# Transform the two data sets into xgb.DMatrix objects
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)
# Define the parameters for multinomial classification
num_class = length(levels(class1))
xgb.params = list(
booster="gbtree",
eta=0.001,
max_depth=5,
gamma=3,
subsample=0.75,
colsample_bytree=1,
objective="multi:softprob",
eval_metric="mlogloss",
num_class=num_class
)
# Train the XGBoost classifier
xgb.fit=xgb.train(
params=xgb.params,
data=xgb.train,
nrounds=10000,
nthreads=1,
early_stopping_rounds=50,
watchlist=list(val1=xgb.train,val2=xgb.test),
verbose=0
)
## [00:10:26] WARNING: amalgamation/../src/learner.cc:576:
## Parameters: { "nthreads" } might not be used.
##
## This could be a false alarm, with some parameters getting used by language bindings but
## then being mistakenly passed down to XGBoost core, or some parameter actually being used
## but getting flagged wrongly here. Please open an issue if you find any such cases.
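Note also that early stopping above monitors val2, which is the same test set scored below, so the stopping point is effectively tuned on the evaluation data. A minimal cross-validated alternative for choosing nrounds (xgb.cvres and nfold = 5 are assumptions) uses the training DMatrix only:
# Pick the number of boosting rounds by 5-fold cross-validation on the training data
set.seed(42)
xgb.cvres <- xgb.cv(params = xgb.params,
                    data = xgb.train,
                    nrounds = 10000,
                    nfold = 5,
                    early_stopping_rounds = 50,
                    verbose = 0)
xgb.cvres$best_iteration # suggested nrounds for refitting the final model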
# Review the final model and results
xgb.fit
## ##### xgb.Booster
## raw: 74.9 Mb
## call:
## xgb.train(params = xgb.params, data = xgb.train, nrounds = 10000,
## watchlist = list(val1 = xgb.train, val2 = xgb.test), verbose = 0,
## early_stopping_rounds = 50, nthreads = 1)
## params (as set within xgb.train):
## booster = "gbtree", eta = "0.001", max_depth = "5", gamma = "3", subsample = "0.75", colsample_bytree = "1", objective = "multi:softprob", eval_metric = "mlogloss", num_class = "3", nthreads = "1", validate_parameters = "TRUE"
## xgb.attributes:
## best_iteration, best_msg, best_ntreelimit, best_score, niter
## callbacks:
## cb.evaluation.log()
## cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,
## verbose = verbose)
## # of features: 11
## niter: 10000
## best_iteration : 10000
## best_ntreelimit : 10000
## best_score : 0.444072
## best_msg : [10000] val1-mlogloss:0.158273 val2-mlogloss:0.444072
## nfeatures : 11
## evaluation_log:
## iter val1_mlogloss val2_mlogloss
## 1 1.097714 1.097978
## 2 1.096851 1.097405
## ---
## 9999 0.158285 0.444081
## 10000 0.158273 0.444072
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test.data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = levels(class2)
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = levels(class2)[test.label+1]
# Calculate the final accuracy
result7 = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result7)))
## [1] "Final Accuracy = 83.47%"
# Calculate the confusion matrix
confusionMatrix(as.factor(xgb.pred$prediction),as.factor(xgb.pred$label))
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 8 1 17
## good 0 48 35
## normal 10 16 343
##
## Overall Statistics
##
## Accuracy : 0.8347
## 95% CI : (0.7983, 0.8669)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 0.3403
##
## Kappa : 0.5087
##
## Mcnemar's Test P-Value : 0.0195
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.44444 0.7385 0.8684
## Specificity 0.96087 0.9153 0.6867
## Pos Pred Value 0.30769 0.5783 0.9295
## Neg Pred Value 0.97788 0.9570 0.5229
## Prevalence 0.03766 0.1360 0.8264
## Detection Rate 0.01674 0.1004 0.7176
## Detection Prevalence 0.05439 0.1736 0.7720
## Balanced Accuracy 0.70266 0.8269 0.7776
# Calculate multi-class AUC
auc7 <- multiclass.roc(as.integer(as.factor(xgb.pred$label)),as.integer(as.factor(xgb.pred$prediction)))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc7
##
## Call:
## multiclass.roc.default(response = as.integer(as.factor(xgb.pred$label)), predictor = as.integer(as.factor(xgb.pred$prediction)))
##
## Data: as.integer(as.factor(xgb.pred$prediction)) with 3 levels of as.integer(as.factor(xgb.pred$label)): 1, 2, 3.
## Multi-class area under the curve: 0.6542
# Transform the two data sets into lgb.Dataset
lgb.train = lgb.Dataset(data=train.data,label=train.label)
lgb.test = lgb.Dataset(data=test.data,label=test.label)
# Define the parameters for multinomial classification
lgb.params <- list(
objective = "multiclass",
metric = "multi_logloss",
num_class = 3L,
min_data = 1L,
learning_rate = 1.0
)
# Train the LightGBM classifier
lgb.fit=lgb.train(
params=lgb.params,
data=lgb.train,
nrounds = 10000L,
valids = list(val1=lgb.train,val2=lgb.test),
early_stopping_rounds=50
)
## [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
## You can set `force_row_wise=true` to remove the overhead.
## And if memory is not enough, you can set `force_col_wise=true`.
## [LightGBM] [Info] Total Bins 946
## [LightGBM] [Info] Number of data points in the train set: 2772, number of used features: 11
## [LightGBM] [Info] Start training from score -1.098612
## [LightGBM] [Info] Start training from score -1.098612
## [LightGBM] [Info] Start training from score -1.098612
## [1] "[1]: val1's multi_logloss:0.308736 val2's multi_logloss:0.70352"
## [1] "[2]: val1's multi_logloss:0.142459 val2's multi_logloss:0.619246"
## [1] "[3]: val1's multi_logloss:0.0700909 val2's multi_logloss:0.62743"
## [1] "[4]: val1's multi_logloss:0.0332382 val2's multi_logloss:0.589903"
## [1] "[5]: val1's multi_logloss:0.0162919 val2's multi_logloss:0.60438"
## [1] "[6]: val1's multi_logloss:0.00893837 val2's multi_logloss:0.605546"
## [1] "[7]: val1's multi_logloss:0.00481682 val2's multi_logloss:0.620166"
## [1] "[8]: val1's multi_logloss:0.00264789 val2's multi_logloss:0.653594"
## [1] "[9]: val1's multi_logloss:0.0014357 val2's multi_logloss:0.695867"
## [1] "[10]: val1's multi_logloss:0.000769721 val2's multi_logloss:0.721256"
## [1] "[11]: val1's multi_logloss:0.000418808 val2's multi_logloss:0.760272"
## [1] "[12]: val1's multi_logloss:0.000239663 val2's multi_logloss:0.776561"
## [1] "[13]: val1's multi_logloss:0.000138903 val2's multi_logloss:0.790873"
## [1] "[14]: val1's multi_logloss:7.75786e-05 val2's multi_logloss:0.809931"
## [1] "[15]: val1's multi_logloss:4.43349e-05 val2's multi_logloss:0.847126"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[16]: val1's multi_logloss:2.68454e-05 val2's multi_logloss:0.870349"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[17]: val1's multi_logloss:1.69985e-05 val2's multi_logloss:0.908464"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[18]: val1's multi_logloss:1.14122e-05 val2's multi_logloss:0.939347"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[19]: val1's multi_logloss:8.17648e-06 val2's multi_logloss:0.951316"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[20]: val1's multi_logloss:6.37837e-06 val2's multi_logloss:0.959388"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[21]: val1's multi_logloss:5.38712e-06 val2's multi_logloss:0.979799"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[22]: val1's multi_logloss:4.72323e-06 val2's multi_logloss:0.999218"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[23]: val1's multi_logloss:4.24951e-06 val2's multi_logloss:1.00509"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[24]: val1's multi_logloss:3.90897e-06 val2's multi_logloss:1.00399"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[25]: val1's multi_logloss:3.59214e-06 val2's multi_logloss:1.00779"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[26]: val1's multi_logloss:3.34248e-06 val2's multi_logloss:1.00913"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[27]: val1's multi_logloss:3.14463e-06 val2's multi_logloss:1.01367"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[28]: val1's multi_logloss:2.99919e-06 val2's multi_logloss:1.01593"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[29]: val1's multi_logloss:2.87908e-06 val2's multi_logloss:1.0226"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[30]: val1's multi_logloss:2.7801e-06 val2's multi_logloss:1.0302"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[31]: val1's multi_logloss:2.64759e-06 val2's multi_logloss:1.03443"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[32]: val1's multi_logloss:2.54394e-06 val2's multi_logloss:1.04187"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[33]: val1's multi_logloss:2.45106e-06 val2's multi_logloss:1.04452"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[34]: val1's multi_logloss:2.37794e-06 val2's multi_logloss:1.05096"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[35]: val1's multi_logloss:2.31499e-06 val2's multi_logloss:1.0507"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[36]: val1's multi_logloss:2.2556e-06 val2's multi_logloss:1.05631"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[37]: val1's multi_logloss:2.19636e-06 val2's multi_logloss:1.05808"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[38]: val1's multi_logloss:2.13612e-06 val2's multi_logloss:1.05724"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[39]: val1's multi_logloss:2.08752e-06 val2's multi_logloss:1.05995"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[40]: val1's multi_logloss:2.05726e-06 val2's multi_logloss:1.059"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[41]: val1's multi_logloss:2.0005e-06 val2's multi_logloss:1.06427"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[42]: val1's multi_logloss:1.96501e-06 val2's multi_logloss:1.06545"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[43]: val1's multi_logloss:1.93624e-06 val2's multi_logloss:1.0646"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[44]: val1's multi_logloss:1.89989e-06 val2's multi_logloss:1.06107"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[45]: val1's multi_logloss:1.86434e-06 val2's multi_logloss:1.06425"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[46]: val1's multi_logloss:1.8423e-06 val2's multi_logloss:1.07086"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[47]: val1's multi_logloss:1.81136e-06 val2's multi_logloss:1.07168"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[48]: val1's multi_logloss:1.78957e-06 val2's multi_logloss:1.0714"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[49]: val1's multi_logloss:1.7614e-06 val2's multi_logloss:1.0697"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[50]: val1's multi_logloss:1.74134e-06 val2's multi_logloss:1.06975"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[51]: val1's multi_logloss:1.72606e-06 val2's multi_logloss:1.06888"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[52]: val1's multi_logloss:1.70982e-06 val2's multi_logloss:1.07092"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[53]: val1's multi_logloss:1.68341e-06 val2's multi_logloss:1.07005"
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
## [1] "[54]: val1's multi_logloss:1.66536e-06 val2's multi_logloss:1.0704"
# Review the final model and results
lgb.fit
## <lgb.Booster>
## Public:
## add_valid: function (data, name)
## best_iter: 4
## best_score: 0.589902716611008
## current_iter: function ()
## dump_model: function (num_iteration = NULL, feature_importance_type = 0L)
## eval: function (data, name, feval = NULL)
## eval_train: function (feval = NULL)
## eval_valid: function (feval = NULL)
## finalize: function ()
## initialize: function (params = list(), train_set = NULL, modelfile = NULL,
## lower_bound: function ()
## params: list
## predict: function (data, start_iteration = NULL, num_iteration = NULL,
## raw: NA
## record_evals: list
## reset_parameter: function (params, ...)
## rollback_one_iter: function ()
## save: function ()
## save_model: function (filename, num_iteration = NULL, feature_importance_type = 0L)
## save_model_to_string: function (num_iteration = NULL, feature_importance_type = 0L)
## set_train_data_name: function (name)
## to_predictor: function ()
## update: function (train_set = NULL, fobj = NULL)
## upper_bound: function ()
## Private:
## eval_names: multi_logloss
## get_eval_info: function ()
## handle: lgb.Booster.handle
## higher_better_inner_eval: FALSE
## init_predictor: NULL
## inner_eval: function (data_name, data_idx, feval = NULL)
## inner_predict: function (idx)
## is_predicted_cur_iter: list
## name_train_set: val1
## name_valid_sets: list
## num_class: 3
## num_dataset: 2
## predict_buffer: list
## set_objective_to_none: FALSE
## train_set: lgb.Dataset, R6
## train_set_version: 1
## valid_sets: list
# Predict outcomes with the test data
lgb.pred = predict(lgb.fit,test.data,reshape=T)
lgb.pred = as.data.frame(lgb.pred)
colnames(lgb.pred) = levels(class2)
# Use the predicted label with the highest probability
lgb.pred$prediction = apply(lgb.pred,1,function(x) colnames(lgb.pred)[which.max(x)])
lgb.pred$label = levels(class2)[test.label+1]
# Calculate the final accuracy
result8 = sum(lgb.pred$prediction==lgb.pred$label)/nrow(lgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result8)))
## [1] "Final Accuracy = 84.31%"
# Calculate the confusion matrix
confusionMatrix(as.factor(lgb.pred$prediction),as.factor(lgb.pred$label))
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 5 1 9
## good 1 36 24
## normal 12 28 362
##
## Overall Statistics
##
## Accuracy : 0.8431
## 95% CI : (0.8073, 0.8745)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 0.1832
##
## Kappa : 0.4523
##
## Mcnemar's Test P-Value : 0.8646
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.27778 0.55385 0.9165
## Specificity 0.97826 0.93947 0.5181
## Pos Pred Value 0.33333 0.59016 0.9005
## Neg Pred Value 0.97192 0.93046 0.5658
## Prevalence 0.03766 0.13598 0.8264
## Detection Rate 0.01046 0.07531 0.7573
## Detection Prevalence 0.03138 0.12762 0.8410
## Balanced Accuracy 0.62802 0.74666 0.7173
# Calculate multi-class AUC
auc8 <- multiclass.roc(as.integer(as.factor(lgb.pred$label)),as.integer(as.factor(lgb.pred$prediction)))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc8
##
## Call:
## multiclass.roc.default(response = as.integer(as.factor(lgb.pred$label)), predictor = as.integer(as.factor(lgb.pred$prediction)))
##
## Data: as.integer(as.factor(lgb.pred$prediction)) with 3 levels of as.integer(as.factor(lgb.pred$label)): 1, 2, 3.
## Multi-class area under the curve: 0.6371
# Train the AdaBoost classifier
adaboost <- boosting(Class~., data=X4red_train, boos=TRUE, mfinal = 50)
# Plot variable importance
importanceplot(adaboost, cex.names=.3)
# Predict outcomes with the test data
test.data <- as.data.frame(test.data)
adaboost.pred <- predict.boosting(adaboost, newdata=test.data)
ada.pred <- data.frame(as.factor(adaboost.pred$class), class2)
names(ada.pred) <- c("prediction", "label")
# Calculate the final accuracy
result9 = sum(ada.pred$prediction==ada.pred$label)/nrow(ada.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result9)))
## [1] "Final Accuracy = 86.19%"
# Calculate the confusion matrix
confusionMatrix(as.factor(ada.pred$prediction),as.factor(ada.pred$label))
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good normal
## bad 3 0 9
## good 0 40 17
## normal 15 25 369
##
## Overall Statistics
##
## Accuracy : 0.8619
## 95% CI : (0.8277, 0.8916)
## No Information Rate : 0.8264
## P-Value [Acc > NIR] : 0.02088
##
## Kappa : 0.4993
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: bad Class: good Class: normal
## Sensitivity 0.166667 0.61538 0.9342
## Specificity 0.980435 0.95884 0.5181
## Pos Pred Value 0.250000 0.70175 0.9022
## Neg Pred Value 0.967811 0.94062 0.6232
## Prevalence 0.037657 0.13598 0.8264
## Detection Rate 0.006276 0.08368 0.7720
## Detection Prevalence 0.025105 0.11925 0.8556
## Balanced Accuracy 0.573551 0.78711 0.7261
# Calculate multi-class AUC
auc9 <- multiclass.roc(as.integer(as.factor(ada.pred$label)),as.integer(as.factor(ada.pred$prediction)))
## Setting direction: controls > cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc9
##
## Call:
## multiclass.roc.default(response = as.integer(as.factor(ada.pred$label)), predictor = as.integer(as.factor(ada.pred$prediction)))
##
## Data: as.integer(as.factor(ada.pred$prediction)) with 3 levels of as.integer(as.factor(ada.pred$label)): 1, 2, 3.
## Multi-class area under the curve: 0.665
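To put the six classifiers side by side before moving on to regression, the accuracy and AUC objects computed above can be collected into one table (a minimal sketch using the result4-result9 and auc4-auc9 objects already in the workspace; classif_summary is a hypothetical name):
# Summarize the classification results computed above
classif_summary <- data.frame(
  model = c("KNN (k = 38)", "KNN (k = 1)", "Linear SVM", "XGBoost", "LightGBM", "AdaBoost"),
  accuracy = c(result4, result5, result6, result7, result8, result9),
  auc = c(as.numeric(auc4$auc), as.numeric(auc5$auc), as.numeric(auc6$auc),
          as.numeric(auc7$auc), as.numeric(auc8$auc), as.numeric(auc9$auc))
)
classif_summary[order(-classif_summary$accuracy), ]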
X5red <- Xred
head(X5red)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Drop fixed.acidity to avoid multicollinearity issues
X5red <- X5red %>%
select(-one_of('fixed.acidity'))
# Create a 70/30 split
set.seed(1234)
splitindex2 <- createDataPartition(X5red$quality, p=0.7, list=FALSE, times=1)
X5red_train <- X5red[splitindex2,]
table(X5red_train$quality)
##
## 3 4 5 6 7 8
## 7 43 471 447 140 12
X5red_test <- X5red[-splitindex2,]
table(X5red_test$quality)
##
## 3 4 5 6 7 8
## 3 10 210 191 59 6
# Fit a multiple linear regression model
lmmodel <- lm(quality ~., X5red_train)
summary(lmmodel)
##
## Call:
## lm(formula = quality ~ ., data = X5red_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.61641 -0.35575 -0.05257 0.44666 2.03589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.5831446 16.1108396 0.719 0.472313
## volatile.acidity -1.1266551 0.1485411 -7.585 7.02e-14 ***
## citric.acid -0.1293092 0.1677458 -0.771 0.440952
## residual.sugar 0.0171160 0.0168176 1.018 0.309024
## chlorides -1.7466245 0.4639141 -3.765 0.000175 ***
## free.sulfur.dioxide 0.0022740 0.0026210 0.868 0.385805
## total.sulfur.dioxide -0.0029122 0.0008746 -3.330 0.000898 ***
## density -7.1303565 16.0537276 -0.444 0.657017
## pH -0.4336497 0.1656097 -2.619 0.008952 **
## sulphates 0.8533680 0.1297696 6.576 7.43e-11 ***
## alcohol 0.2751083 0.0264145 10.415 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6593 on 1109 degrees of freedom
## Multiple R-squared: 0.3495, Adjusted R-squared: 0.3436
## F-statistic: 59.58 on 10 and 1109 DF, p-value: < 2.2e-16
# Predict the test data
lm_pred <- predict(lmmodel,X5red_test)
head(lm_pred)
## 3 4 6 7 9 11
## 5.205333 5.665306 5.079564 5.111391 5.337221 5.093868
# Accuracy check
lm_mse = mean((X5red_test$quality - lm_pred)^2)
lm_mae = caret::MAE(X5red_test$quality, lm_pred)
lm_rmse = caret::RMSE(X5red_test$quality, lm_pred)
cat("MSE: ", lm_mse, "MAE: ", lm_mae, "RMSE: ", lm_rmse)
## MSE: 0.388386 MAE: 0.4950919 RMSE: 0.6232063
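Before judging the linear model purely on test-set error, its residual diagnostics are worth a quick look; a minimal sketch using base R's plot method for lm objects:
# Residual diagnostics: residuals vs fitted, normal Q-Q, scale-location, and leverage plots
par(mfrow = c(2, 2))
plot(lmmodel)
par(mfrow = c(1, 1))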
# Fit a random forest model using default parameters
rf <- randomForest(quality~.,data=X5red_train,importance = TRUE)
rf
##
## Call:
## randomForest(formula = quality ~ ., data = X5red_train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.3557046
## % Var explained: 46.23
# Predict the test dataset
rf_pred <- predict(rf, X5red_test)
head(rf_pred)
## 3 4 6 7 9 11
## 5.270167 5.575400 5.110800 5.097633 5.372467 5.243133
# Accuracy check
rf_mse = mean((X5red_test$quality - rf_pred)^2)
rf_mae = caret::MAE(X5red_test$quality, rf_pred)
rf_rmse = caret::RMSE(X5red_test$quality, rf_pred)
cat("MSE: ", rf_mse, "MAE: ", rf_mae, "RMSE: ", rf_rmse)
## MSE: 0.3092426 MAE: 0.4182004 RMSE: 0.5560958
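Since the forest was fit with importance = TRUE, its variable importance can be inspected directly, which gives a cross-check against the boosting importances computed later (a minimal sketch):
# Permutation (%IncMSE) and node-impurity (IncNodePurity) importance from the random forest
randomForest::importance(rf)
varImpPlot(rf)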
Guidance from Smith College: http://www.science.smith.edu/~jcrouser/SDS293/labs/lab10-r.html
# Set up datasets
X5red_reg <- subset(X5red, select = -c(11))
Y5red_reg <- subset(X5red, select = c(11))
our_Y_reg <- pull(Y5red_reg, "quality")
The glmnet() function has an alpha argument that determines what type of model is fit: if alpha = 0 a ridge regression model is fit, and if alpha = 1 a lasso model is fit.
grid = 10^seq(10, -2, length = 100)
ridge_mod = glmnet(X5red_reg, our_Y_reg, alpha = 0, lambda = grid)
dim(coef(ridge_mod))
## [1] 11 100
# Draw plot of coefficients
plot(ridge_mod)
# The plot shows the coefficient paths: how each coefficient shrinks toward zero as the ridge penalty grows
# Let's apply this to our train and test data
Y5red_train_reg <- subset(X5red_train, select = c(11))
X5red_train_reg <- data.matrix(subset(X5red_train, select = -c(11)))
our_Y_reg_train <- pull(Y5red_train_reg, "quality")
Y5red_test_reg <- subset(X5red_test, select = c(11))
X5red_test_reg <- data.matrix(subset(X5red_test, select = -c(11)))
our_Y_reg_test <- pull(Y5red_test_reg, "quality")
# First, fit a ridge regression model on the training data over the full lambda grid.
ridge_mod = glmnet(X5red_train_reg, our_Y_reg_train, alpha=0, lambda = grid, thresh = 1e-12)
# Cross-validation: instead of picking an arbitrary lambda, we use cross-validation to choose this tuning parameter.
set.seed(1)
cv.out = cv.glmnet(X5red_train_reg, our_Y_reg_train, alpha = 0) # Fit ridge regression model on training data
bestlam = cv.out$lambda.min # Select the lambda that minimizes the cross-validated MSE
bestlam
## [1] 0.03858909
# Draw plot of the cross-validated MSE as a function of lambda
plot(cv.out)
# Use the best lambda to predict the test data
ridge_pred = predict(ridge_mod, s = bestlam, newx = X5red_test_reg)
ridge_pred <- as.vector(ridge_pred)
head(ridge_pred)
## [1] 5.204171 5.671423 5.088011 5.119452 5.336803 5.108101
# Accuracy check
ridge_mse = mean((X5red_test$quality - ridge_pred)^2)
ridge_mae = caret::MAE(X5red_test$quality, ridge_pred)
ridge_rmse = caret::RMSE(X5red_test$quality, ridge_pred)
cat("MSE: ", ridge_mse, "MAE: ", ridge_mae, "RMSE: ", ridge_rmse)
## MSE: 0.3910052 MAE: 0.4981249 RMSE: 0.6253041
# This time, for the glmnet function, we use alpha = 1
lasso_mod = glmnet(X5red_train_reg,
our_Y_reg_train,
alpha = 1,
lambda = grid)
plot(lasso_mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
# Let's perform cross-validation
# Fit Lasso model on training data
set.seed(1)
cv.out2 = cv.glmnet(X5red_train_reg, our_Y_reg_train, alpha = 1)
plot(cv.out2)
# Select best lambda
bestlam2 = cv.out2$lambda.min
print(bestlam2)
## [1] 0.000912435
# Use best lambda to predict test data
lasso_pred = predict(lasso_mod, s = bestlam2, newx = X5red_test_reg)
lasso_pred <- as.vector(lasso_pred)
head(lasso_pred)
## [1] 5.191617 5.691305 5.112436 5.120707 5.318177 5.100923
# Accuracy check
lasso_mse = mean((X5red_test$quality - lasso_pred)^2)
lasso_mae = caret::MAE(X5red_test$quality, lasso_pred)
lasso_rmse = caret::RMSE(X5red_test$quality, lasso_pred)
cat("MSE: ", lasso_mse, "MAE: ", lasso_mae, "RMSE: ", lasso_rmse)
## MSE: 0.3940802 MAE: 0.5003707 RMSE: 0.6277581
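To see which predictors the lasso actually zeroes out at the cross-validated lambda, the coefficients can be pulled from the fitted path (a minimal sketch using the lasso_mod and bestlam2 objects from above; lasso_coef is a hypothetical name):
# Coefficients at the CV-selected lambda; entries shrunk exactly to zero are dropped by the lasso
lasso_coef <- predict(lasso_mod, type = "coefficients", s = bestlam2)
lasso_coef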
# Set up test and train dependent and independent sets
train_x = data.matrix(X5red_train[,-11])
train_y = X5red_train[,11]
test_x = data.matrix(X5red_test[,-11])
test_y = X5red_test[,11]
# Convert dataset to xgb.DMatrix
xgb_train = xgb.DMatrix(data=train_x, label=train_y)
xgb_test = xgb.DMatrix(data=test_x, label=test_y)
# Fitting the model
xgbc = xgboost(data = xgb_train, max.depth = 2, nrounds = 50, eta = 0.3, gamma = 0)
## [1] train-rmse:3.675893
## [2] train-rmse:2.622542
## [3] train-rmse:1.898558
## [4] train-rmse:1.409962
## [5] train-rmse:1.089524
## [6] train-rmse:0.888388
## [7] train-rmse:0.768303
## [8] train-rmse:0.696406
## [9] train-rmse:0.657402
## [10] train-rmse:0.634882
## [11] train-rmse:0.622415
## [12] train-rmse:0.615167
## [13] train-rmse:0.608908
## [14] train-rmse:0.605823
## [15] train-rmse:0.601920
## [16] train-rmse:0.596653
## [17] train-rmse:0.594761
## [18] train-rmse:0.592256
## [19] train-rmse:0.588774
## [20] train-rmse:0.586063
## [21] train-rmse:0.584521
## [22] train-rmse:0.583443
## [23] train-rmse:0.579704
## [24] train-rmse:0.577861
## [25] train-rmse:0.575236
## [26] train-rmse:0.573417
## [27] train-rmse:0.572473
## [28] train-rmse:0.570917
## [29] train-rmse:0.569900
## [30] train-rmse:0.569218
## [31] train-rmse:0.566729
## [32] train-rmse:0.564760
## [33] train-rmse:0.563887
## [34] train-rmse:0.562945
## [35] train-rmse:0.562162
## [36] train-rmse:0.561427
## [37] train-rmse:0.559963
## [38] train-rmse:0.558266
## [39] train-rmse:0.556743
## [40] train-rmse:0.554678
## [41] train-rmse:0.552732
## [42] train-rmse:0.551073
## [43] train-rmse:0.550555
## [44] train-rmse:0.548427
## [45] train-rmse:0.545629
## [46] train-rmse:0.544948
## [47] train-rmse:0.543194
## [48] train-rmse:0.540699
## [49] train-rmse:0.540079
## [50] train-rmse:0.539636
print(xgbc)
## ##### xgb.Booster
## raw: 38.2 Kb
## call:
## xgb.train(params = params, data = dtrain, nrounds = nrounds,
## watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
## early_stopping_rounds = early_stopping_rounds, maximize = maximize,
## save_period = save_period, save_name = save_name, xgb_model = xgb_model,
## callbacks = callbacks, max.depth = 2, eta = 0.3, gamma = 0)
## params (as set within xgb.train):
## max_depth = "2", eta = "0.3", gamma = "0", validate_parameters = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 10
## niter: 50
## nfeatures : 10
## evaluation_log:
## iter train_rmse
## 1 3.675893
## 2 2.622542
## ---
## 49 0.540079
## 50 0.539636
# Predict the x test data with the xgbc model
pred_y = predict(xgbc,test_x)
head(pred_y)
## [1] 5.316401 5.619293 5.342974 5.118239 5.328153 5.190927
# Visualize the original and predicted test-set values in one plot
x = 1:length(test_y)
plot(x, test_y, col = "red", type = "l")
lines(x, pred_y, col = "blue", type = "l")
legend("topright", legend = c("original test_y", "predicted test_y"),
col = c("red", "blue"), box.lty = 1, cex = 0.8, lty = c(1, 1))
# Accuracy check
mse = mean((test_y - pred_y)^2)
mae = caret::MAE(test_y, pred_y)
rmse = caret::RMSE(test_y, pred_y)
cat("MSE: ", mse, "MAE: ", mae, "RMSE: ", rmse)
## MSE: 0.3633217 MAE: 0.4727701 RMSE: 0.6027617
# Feature importance
importance_matrix <- xgb.importance(model = xgbc)
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: alcohol 0.32341509 0.15749814 0.14184397
## 2: sulphates 0.20720771 0.07271356 0.08510638
## 3: volatile.acidity 0.19006030 0.08056331 0.12056738
## 4: total.sulfur.dioxide 0.07001677 0.13282618 0.12765957
## 5: pH 0.05592583 0.12146469 0.12056738
## 6: density 0.04973980 0.14258000 0.13475177
## 7: chlorides 0.03418957 0.06410038 0.06382979
## 8: citric.acid 0.03148723 0.09456534 0.09219858
## 9: residual.sugar 0.02419104 0.09774477 0.07801418
## 10: free.sulfur.dioxide 0.01376666 0.03594363 0.03546099
xgb.plot.importance(importance_matrix = importance_matrix) # Alcohol, sulphates, and volatile.acidity are the three most important variables in determining the quality of a wine.
# Convert dataset to lgb.Dataset
lgb_train = lgb.Dataset(data=train_x, label=train_y)
lgb_test = lgb.Dataset(data=test_x, label=test_y)
# Fitting the model
lgbd = lightgbm(boosting_type = 'gbdt', objective = "regression", metric = 'rmse', lgb_train, nrounds = 50)
## Warning in (function (params = list(), data, nrounds = 100L, valids = list(), :
## lgb.train: Found the following passed through '...': boosting_type, objective,
## metric. These will be used, but in future releases of lightgbm, this warning
## will become an error. Add these to 'params' instead. See ?lgb.train for
## documentation on how to call this function.
## [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000027 seconds.
## You can set `force_row_wise=true` to remove the overhead.
## And if memory is not enough, you can set `force_col_wise=true`.
## [LightGBM] [Info] Total Bins 915
## [LightGBM] [Info] Number of data points in the train set: 1120, number of used features: 10
## [LightGBM] [Info] Start training from score 5.630357
## [1] "[1]: train's rmse:0.775186"
## [1] "[2]: train's rmse:0.742942"
## [1] "[3]: train's rmse:0.714977"
## [1] "[4]: train's rmse:0.688906"
## [1] "[5]: train's rmse:0.666146"
## [1] "[6]: train's rmse:0.645208"
## [1] "[7]: train's rmse:0.627439"
## [1] "[8]: train's rmse:0.61115"
## [1] "[9]: train's rmse:0.594696"
## [1] "[10]: train's rmse:0.580377"
## [1] "[11]: train's rmse:0.567246"
## [1] "[12]: train's rmse:0.555614"
## [1] "[13]: train's rmse:0.545501"
## [1] "[14]: train's rmse:0.535901"
## [1] "[15]: train's rmse:0.526397"
## [1] "[16]: train's rmse:0.517803"
## [1] "[17]: train's rmse:0.509556"
## [1] "[18]: train's rmse:0.50181"
## [1] "[19]: train's rmse:0.494594"
## [1] "[20]: train's rmse:0.487754"
## [1] "[21]: train's rmse:0.480944"
## [1] "[22]: train's rmse:0.474107"
## [1] "[23]: train's rmse:0.467495"
## [1] "[24]: train's rmse:0.461551"
## [1] "[25]: train's rmse:0.455975"
## [1] "[26]: train's rmse:0.450292"
## [1] "[27]: train's rmse:0.44512"
## [1] "[28]: train's rmse:0.440155"
## [1] "[29]: train's rmse:0.434612"
## [1] "[30]: train's rmse:0.429691"
## [1] "[31]: train's rmse:0.425077"
## [1] "[32]: train's rmse:0.420487"
## [1] "[33]: train's rmse:0.414955"
## [1] "[34]: train's rmse:0.410699"
## [1] "[35]: train's rmse:0.406949"
## [1] "[36]: train's rmse:0.402598"
## [1] "[37]: train's rmse:0.398814"
## [1] "[38]: train's rmse:0.394605"
## [1] "[39]: train's rmse:0.390025"
## [1] "[40]: train's rmse:0.386324"
## [1] "[41]: train's rmse:0.382477"
## [1] "[42]: train's rmse:0.377629"
## [1] "[43]: train's rmse:0.374125"
## [1] "[44]: train's rmse:0.369756"
## [1] "[45]: train's rmse:0.366184"
## [1] "[46]: train's rmse:0.362641"
## [1] "[47]: train's rmse:0.36003"
## [1] "[48]: train's rmse:0.356629"
## [1] "[49]: train's rmse:0.352808"
## [1] "[50]: train's rmse:0.348812"
print(lgbd)
## <lgb.Booster>
## Public:
## add_valid: function (data, name)
## best_iter: -1
## best_score: NA
## current_iter: function ()
## dump_model: function (num_iteration = NULL, feature_importance_type = 0L)
## eval: function (data, name, feval = NULL)
## eval_train: function (feval = NULL)
## eval_valid: function (feval = NULL)
## finalize: function ()
## initialize: function (params = list(), train_set = NULL, modelfile = NULL,
## lower_bound: function ()
## params: list
## predict: function (data, start_iteration = NULL, num_iteration = NULL,
## raw: NA
## record_evals: list
## reset_parameter: function (params, ...)
## rollback_one_iter: function ()
## save: function ()
## save_model: function (filename, num_iteration = NULL, feature_importance_type = 0L)
## save_model_to_string: function (num_iteration = NULL, feature_importance_type = 0L)
## set_train_data_name: function (name)
## to_predictor: function ()
## update: function (train_set = NULL, fobj = NULL)
## upper_bound: function ()
## Private:
## eval_names: rmse
## get_eval_info: function ()
## handle: lgb.Booster.handle
## higher_better_inner_eval: FALSE
## init_predictor: NULL
## inner_eval: function (data_name, data_idx, feval = NULL)
## inner_predict: function (idx)
## is_predicted_cur_iter: list
## name_train_set: train
## name_valid_sets: list
## num_class: 1
## num_dataset: 1
## predict_buffer: list
## set_objective_to_none: FALSE
## train_set: lgb.Dataset, R6
## train_set_version: 1
## valid_sets: list
# Predict the x test data with the lgbd model
pred_y2 <- predict(lgbd, data = test_x)
head(pred_y2)
## [1] 5.155409 5.551289 5.141556 4.883104 5.542482 5.272589
# Accuracy check
mse2 = mean((test_y - pred_y2)^2)
mae2 = caret::MAE(test_y, pred_y2)
rmse2 = caret::RMSE(test_y, pred_y2)
cat("MSE: ", mse2, "MAE: ", mae2, "RMSE: ", rmse2) # LightGBM seems to perform better than XGBoost a bit
## MSE: 0.337878 MAE: 0.4468788 RMSE: 0.5812728
# Feature importance
importance_matrix <- lgb.importance(model = lgbd)
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: alcohol 0.31890439 0.13869487 0.11200000
## 2: sulphates 0.14732333 0.10744348 0.10466667
## 3: volatile.acidity 0.12953290 0.11998389 0.11733333
## 4: total.sulfur.dioxide 0.09795119 0.10685207 0.12400000
## 5: pH 0.06209832 0.07342794 0.09200000
## 6: chlorides 0.06135905 0.10888491 0.10666667
## 7: density 0.05640627 0.09962375 0.10400000
## 8: citric.acid 0.05178529 0.09784952 0.09933333
## 9: free.sulfur.dioxide 0.03885817 0.04580768 0.06400000
## 10: residual.sugar 0.03578108 0.10143189 0.07600000
lgb.plot.importance(importance_matrix) # As with XGBoost, LightGBM ranks alcohol, sulphates, and volatile.acidity as the three most important variables.
# Fitting the model
model_gbm = gbm(quality ~.,
data = X5red_train,
distribution = "gaussian",
cv.folds = 10,
shrinkage = .01,
n.minobsinnode = 10,
n.trees = 50)
print(model_gbm)
## gbm(formula = quality ~ ., distribution = "gaussian", data = X5red_train,
## n.trees = 50, n.minobsinnode = 10, shrinkage = 0.01, cv.folds = 10)
## A gradient boosted model with gaussian loss function.
## 50 iterations were performed.
## The best cross-validation iteration was 50.
## There were 10 predictors of which 3 had non-zero influence.
# Make predictions on the test dataset
pred_y3 <- predict.gbm(model_gbm, data = test_x)
## Using 50 trees...
head(pred_y3)
## [1] 5.517190 5.569225 5.517190 5.534084 5.669058 5.517190
# Accuracy check
mse3 = mean((test_y - pred_y3)^2)
## Warning in test_y - pred_y3: longer object length is not a multiple of shorter
## object length
mae3 = caret::MAE(test_y, pred_y3)
## Warning in pred - obs: longer object length is not a multiple of shorter object
## length
rmse3 = caret::RMSE(test_y, pred_y3)
## Warning in pred - obs: longer object length is not a multiple of shorter object
## length
cat("MSE: ", mse3, "MAE: ", mae3, "RMSE: ", rmse3) # Gradient boosting seems to not perform well compared to the two aboved-mention methods.
## MSE: 0.6191481 MAE: 0.665234 RMSE: 0.7868596
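The length-mismatch warnings above arise because predict.gbm() was called with data = instead of newdata =, so the values being scored appear to be predictions for the 1,120 training rows recycled against the 479-row test_y; the MSE/MAE/RMSE printed here are therefore not comparable with the earlier models. A minimal corrected sketch (pred_y3_fixed is a hypothetical name) scores the actual test set:
# Predict the held-out test set explicitly, then recompute the error metrics
pred_y3_fixed <- predict(model_gbm, newdata = X5red_test, n.trees = 50)
cat("MSE: ", mean((test_y - pred_y3_fixed)^2),
    "MAE: ", caret::MAE(test_y, pred_y3_fixed),
    "RMSE: ", caret::RMSE(test_y, pred_y3_fixed))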