Apply \(K\)-means and hierarchical clustering to wine data
# Load required packages and data (wine data)
# install.packages(c("HDclassif", "useful", "factoextra"))
library(HDclassif)
## Loading required package: MASS
data(wine)
head(wine)
## class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450
# Check the dimensions: 178 obs., 14 variables (including the "class" variable, our label, which we rename "Type" below)
dim(wine)
## [1] 178 14
# Rename the columns
names(wine) <- c("Type", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline")
# Set k = 3 (the default) and plot the k-means clustering
set.seed(123) # Set random seed for reproducibility
wineK3 <- kmeans(x = wine, centers = 3) # Data as the first argument, the number of centroids as the second argument
library(useful) # The namesake, a very useful package (I think ... )
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
plot(wineK3, data = wine) # Map the k-means labels to wine data
# Cluster plot
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(wineK3, data = wine) # The x- and y-axis are the two principal components that account for the largest proportion of variance
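# A minimal sketch of the projection behind those axes (an assumption about fviz_cluster's
# internals: with more than two variables it runs a PCA on the standardized data and plots
# the first two components):
pca <- prcomp(wine, scale. = TRUE)
summary(pca)$importance[2, 1:2] # Proportion of variance explained by Dim1 and Dim2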
# Overlay their original classification ("Type") onto the plot, do you think K-means did a good job at classifying the data?
plot(wineK3, data = wine, class = "Type", col = c("black", "ivory4", "ivory3"))
# K-means can be sensitive to its random starting conditions, so let's run it with different random starts using the "nstart" argument. "nstart" tries multiple initial centroid configurations and reports the best one: setting nstart = 25 generates 25 sets of initial random centroids and keeps the run with the lowest within-cluster sum of squares.
set.seed(123) # Set seed for reproducibility
wineK3N25 <- kmeans(wine, centers = 3, nstart = 25)
# Compare the results
wineK3$size
## [1] 62 47 69
wineK3N25$size
## [1] 62 47 69
# Determine the optimal number of clusters using Hartigan's rule, which compares the within-cluster sum of squared errors of the k-cluster solution with that of the (k + 1)-cluster solution, adjusting for the number of rows and clusters. If the Hartigan score is greater than 10, it is worth moving to k + 1 clusters; if it's less than 10, stick with the current k clusters.
wineBest <- FitKMeans(wine, max.clusters = 20, nstart = 25, seed = 123)
# Plotting
# This generates an elbow plot of the Hartigan score as a decreasing function of the number of clusters. Check at which number of clusters the score falls below 10 and use that as your decision criterion.
PlotHartigan(wineBest) # Here I'll say 18
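# To demystify the rule, here is a minimal hand-rolled sketch of the Hartigan statistic
# (hartigan_stat() is a hypothetical helper, assuming H(k) = (W_k / W_{k+1} - 1) * (n - k - 1),
# where W_k is the total within-cluster sum of squares with k clusters):
hartigan_stat <- function(dat, k, nstart = 25) {
  W_k  <- kmeans(dat, centers = k, nstart = nstart)$tot.withinss
  W_k1 <- kmeans(dat, centers = k + 1, nstart = nstart)$tot.withinss
  (W_k / W_k1 - 1) * (nrow(dat) - k - 1)
}
set.seed(123)
hartigan_stat(wine, k = 3) # If this exceeds 10, moving from 3 to 4 clusters pays off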
# Compared with the true labels, does the k-means solution provide a good fit?
table(wine$Type, wineK3N25$cluster) # table(true labels, estimated)
##
## 1 2 3
## 1 13 46 0
## 2 20 1 50
## 3 29 0 19
# Calculate accuracy
sum(diag(table(wine$Type, wineK3N25$cluster)))/sum(table(wine$Type, wineK3N25$cluster)) # Not high at all! (Keep in mind that cluster IDs are arbitrary, so the raw diagonal also depends on how the labels happen to align)
## [1] 0.1853933
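# Because k-means cluster IDs are arbitrary, a fairer agreement score matches clusters
# to labels first. A small sketch (a plain permutation search, not from any package)
# that relabels clusters by the best of all 3! = 6 label permutations:
perms <- list(c(1,2,3), c(1,3,2), c(2,1,3), c(2,3,1), c(3,1,2), c(3,2,1))
max(sapply(perms, function(p) mean(p[wineK3N25$cluster] == wine$Type)))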
# Plot the confusion matrix of wine clustering
par(mar=c(1,1,1,1)) # Set plotting margin
plot(table(wine$Type, wineK3N25$cluster),
main="Confusion Matrix for Wine Clustering",
xlab="Type (actual data)", ylab="Cluster (classified)")
## Hierarchical clustering
# Apply hclust on wine data, using "complete linkage" method
# dist() returns the distance matrix computed using the designated distance measure (euclidean) on the data supplied in the first argument
hc1 <- hclust(dist(wine, method = "euclidean"), method = "complete")
plot(hc1, cex = 0.6, hang = -1) # Notice how row.names become labels, a negative value of "hang" will cause the labels to hang down from 0.
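# Before deciding where to cut, we can draw borders around the clusters formed at a
# candidate k directly on the dendrogram (rect.hclust() ships with base R's stats package):
rect.hclust(hc1, k = 3, border = "red")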
# Should we cut the total number of clusters at 3 or 4?
# Let's begin with k = 3
clusterCut_3 <- cutree(hc1, k = 3)
# Make a table and compare the result with the original types.
table(clusterCut_3, wine$Type)
##
## clusterCut_3 1 2 3
## 1 43 0 0
## 2 16 15 21
## 3 0 56 27
# Compute accuracy
tab3 <- table(clusterCut_3, wine$Type)
sum(diag(tab3))/sum(tab3)
## [1] 0.4775281
# Calculate the accuracy score at k = 4
# Make a table and compare the result with the original types.
clusterCut_4 <- cutree(hc1, k = 4)
table(clusterCut_4, wine$Type)
##
## clusterCut_4 1 2 3
## 1 37 0 0
## 2 6 0 0
## 3 16 15 21
## 4 0 56 27
# Compute accuracy
tab4 <- table(clusterCut_4, wine$Type)
sum(diag(tab4))/sum(tab4)
## [1] 0.3258427
# Using 4 clusters does not seem to improve classification accuracy over classifying the data with 3 clusters.
## Applying the same tricks to coffee data
# install.packages("pgmm") # Requires the pgmm package
library(pgmm)
## Warning: package 'pgmm' was built under R version 4.4.3
data(coffee)
head(coffee)
## Variety Country Water Bean Weight Extract Yield ph Value Free Acid
## 1 1 mexico 8.939999 156.6 33.5 5.80 32.7
## 2 1 mexico 7.400000 157.3 32.1 5.81 30.8
## 3 1 guatemal 9.740000 152.9 33.1 5.26 36.7
## 4 1 honduras 10.400000 174.0 31.5 5.61 34.2
## 5 1 salvador 10.540000 145.1 35.2 5.77 31.8
## 6 1 salvador 10.000000 156.4 34.5 5.83 32.6
## Mineral Content Fat Caffine Trigonelline Chlorogenic Acid
## 1 3.80 15.2 1.13 1.03 5.38
## 2 3.71 15.0 1.25 1.01 5.13
## 3 4.15 16.1 1.21 1.05 5.94
## 4 3.94 15.8 1.06 0.94 5.87
## 5 4.09 15.2 1.11 0.99 5.09
## 6 3.88 15.4 1.20 0.81 5.30
## Neochlorogenic Acid Isochlorogenic Acid
## 1 0.40 0.79
## 2 0.32 0.97
## 3 0.24 0.76
## 4 0.39 0.59
## 5 0.49 0.72
## 6 0.43 0.69
# Remove label and factor variables (why do I do this?)
coffee_dat <- coffee[complete.cases(coffee), -c(1,2)]
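# Now apply the clustering. A hedged sketch of "the same tricks" on the coffee data
# (assumption: the two coffee varieties make k = 2 a natural starting point):
coffee_lab <- coffee[complete.cases(coffee), "Variety"] # Keep the labels aligned with coffee_dat
set.seed(123)
coffeeK2 <- kmeans(coffee_dat, centers = 2, nstart = 25)
table(coffee_lab, coffeeK2$cluster) # Compare the clusters against the Variety label
hc_coffee <- hclust(dist(coffee_dat, method = "euclidean"), method = "complete")
table(coffee_lab, cutree(hc_coffee, k = 2))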
Model performance metrics and many more…
# Load required packages and data (HMDA)
# install.packages(c("pROC", "plotROC", "AUC"))
library(e1071) # library of SVM models
## Warning: package 'e1071' was built under R version 4.4.3
library(caret) # for train() class of functions
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(AER) # HMDA data
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
library(gbm) # gradient boost machines
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(pROC) # Calculate ROC and AUC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(plotROC) # for plotting ROC
##
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
##
## ggroc
data(HMDA)
# Check variable columns
names(HMDA)
## [1] "deny" "pirat" "hirat" "lvrat" "chist" "mhist"
## [7] "phist" "unemp" "selfemp" "insurance" "condomin" "afam"
## [13] "single" "hschool"
# Subset needed columns
subset <- c("deny", "hirat", "lvrat", "mhist", "unemp")
# Subset data
data <- HMDA[complete.cases(HMDA), subset]
# Do a 75-25 train-test split
set.seed(123)
train_row_numbers <- createDataPartition(data$deny, p=0.75, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]
# Fit a logit model and predict on testing data
logit.fit <- glm(deny ~ hirat + unemp + mhist, family = binomial(link = "logit"), data = training)
logit.pred <- predict(logit.fit, testing, type = "response") # Set type = "response" to get predicted values in probability terms (within (0, 1) scale)
# Pause for a moment and see how the proportions of Type I and Type II errors change as a function of the discrimination threshold, as shown in the confusion matrix. Let's compare two thresholds:
logit.pred01 <- as.factor(ifelse(logit.pred > 0.1, "yes", "no"))
table(Predicted = logit.pred01, Actual = testing$deny)
## Actual
## Predicted no yes
## no 204 21
## yes 319 50
logit.pred02 <- as.factor(ifelse(logit.pred > 0.25, "yes", "no"))
table(Predicted = logit.pred02, Actual = testing$deny) # True negatives increase while true positives decrease: Type I errors (false positives) fall, but Type II errors (false negatives) rise. Classification accuracy is very sensitive to the threshold cutpoint.
## Actual
## Predicted no yes
## no 514 67
## yes 9 4
# Calculate sensitivity and specificity
# Threshold at 0.1
sensitivity(logit.pred01, testing$deny, positive = "yes") # Specify the positive class explicitly; otherwise the function treats the first factor level ("no") as the positive class
## [1] 0.7042254
specificity(logit.pred01, testing$deny, negative = "no")
## [1] 0.3900574
# Threshold at 0.25
sensitivity(logit.pred02, testing$deny, positive = "yes")
## [1] 0.05633803
specificity(logit.pred02, testing$deny, negative = "no")
## [1] 0.9827916
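# To see the tradeoff more systematically, a small sketch sweeping a grid of cutpoints
# (a plain sapply() loop, not any package's built-in tuner):
sapply(seq(0.05, 0.5, by = 0.05), function(th) {
  pred <- factor(ifelse(logit.pred > th, "yes", "no"), levels = c("no", "yes"))
  c(cutoff = th,
    sens = sensitivity(pred, testing$deny, positive = "yes"),
    spec = specificity(pred, testing$deny, negative = "no"))
})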
# Let's find the optimal threshold for the predicted probabilities and convert them to a binary ("yes", "no") class. To use the optimalCutoff() function, you need to install the InformationValue package manually
library(InformationValue) # The developer has removed this package from the CRAN repository, but you can install it from the CRAN archive or from the ZIP file in our NTU COOL course folder.
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
optCut.logit <- optimalCutoff(testing$deny, logit.pred)[1] # The returned cutoff maximizes the chance of correctly predicting the y = 1 class
optCut.logit
## [1] 0.02663739
# Now we can dichotomize predicted values into ("yes", "no") class:
logit.predicted <- as.factor(ifelse(logit.pred > optCut.logit, "yes", "no"))
table(Predicted = logit.predicted, Actual = testing$deny)
## Actual
## Predicted no yes
## no 1 0
## yes 522 71
# Calculate sensitivity and specificity
caret::sensitivity(logit.predicted, testing$deny, positive = "yes")
## [1] 1
caret::specificity(logit.predicted, testing$deny, negative = "no")
## [1] 0.001912046
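# In other words, the false positive rate (FPR = 1 - specificity) at this cutoff is nearly 1:
1 - caret::specificity(logit.predicted, testing$deny, negative = "no") # About 0.998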
detach("package:InformationValue", unload = TRUE) # Unload this package
# This "optimal" cutoff DOESN'T offer much improvement in mitigating Type I error, perhaps because the metric is optimized only for the y = 1 class. Instead, it inflates the false positive rate (FPR = 1 - specificity), i.e., the probability of incorrectly flagging a case as positive among all actual negatives. For ease of comparison, we will stick to prob = 0.1 as our threshold cutpoint for all other models.
logit.predicted <- logit.pred01
# A faster (and more intuitive) way to generate a confusion matrix is the confusion_matrix() function from the regclass package. Note that calling it on a fitted model reports the confusion matrix for the model's own training data (here, the 1786 training rows).
# install.packages("regclass")
library(regclass)
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following object is masked _by_ '.GlobalEnv':
##
## wine
## The following object is masked from 'package:AER':
##
## tobit
## The following object is masked from 'package:lmtest':
##
## lrtest
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:caret':
##
## predictors
## Loading required package: rpart
## Loading required package: randomForest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
##
## Attaching package: 'regclass'
## The following object is masked from 'package:lattice':
##
## qq
confusion_matrix(logit.fit)
## Predicted no Predicted yes Total
## Actual no 1571 1 1572
## Actual yes 208 6 214
## Total 1779 7 1786
# Fit a probit model and predict on testing data
probit.fit <- glm(deny ~ hirat + unemp + mhist, family = binomial(link = "probit"), data = training)
probit.pred <- predict(probit.fit, testing, type = "response")
# Now we can classify the predicted values into ("yes", "no") classes using the 0.1 threshold:
probit.predicted <- as.factor(ifelse(probit.pred > 0.1, "yes", "no"))
table(Predicted = probit.predicted, Actual = testing$deny)
## Actual
## Predicted no yes
## no 199 20
## yes 324 51
# Calculate sensitivity and specificity
sensitivity(probit.predicted, testing$deny, positive = "yes")
## [1] 0.7183099
specificity(probit.predicted, testing$deny, negative = "no")
## [1] 0.3804971
# Fit and predict a plain vanilla SVM model
svm.fit <- svm(deny ~ hirat + unemp + mhist, data = training)
svm.pred <- predict(svm.fit, testing) # svm() performs classification when the response is a factor, so the output is already a factor ("no", "yes")
svm.predicted <- svm.pred # I want to preserve svm.pred for later use
# Generate the confusion matrix
table(Predicted = svm.predicted, Actual = testing$deny)
## Actual
## Predicted no yes
## no 521 71
## yes 2 0
# Calculate sensitivity and specificity
sensitivity(svm.predicted, testing$deny, positive = "yes")
## [1] 0
specificity(svm.predicted, testing$deny, negative = "no")
## [1] 0.9961759
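# The plain SVM predicts "no" for nearly everyone because denials are rare in the data.
# A sketch using e1071's built-in probability machinery (refitting with probability = TRUE),
# so we can threshold at 0.1 like the other models:
svm.prob.fit <- svm(deny ~ hirat + unemp + mhist, data = training, probability = TRUE)
svm.prob <- attr(predict(svm.prob.fit, testing, probability = TRUE), "probabilities")[, "yes"]
table(Predicted = ifelse(svm.prob > 0.1, "yes", "no"), Actual = testing$deny)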
# Fit and predict a GBM model
# Let's time how fast GBM goes
library(tictoc) # Load the tictoc package for timing
# To feed "deny" into gbm, we first need to convert it to a numeric (0, 1) variable.
training$y <- ifelse(training$deny == "yes", 1, 0)
testing$y <- ifelse(testing$deny == "yes", 1, 0)
tic() # Set the timer
gbm.logit.fit <- gbm(y ~ .,
data = training,
interaction.depth = 2, # Tree depth
distribution = "bernoulli", # Meaning that the outcome is on the binary scale
cv.folds = 5, # 5-fold CV
shrinkage = .1, # Learning rate
n.trees = 50) # Set up to 50 trees (the default)
toc()
## 7.22 sec elapsed
gbm.pred <- predict(gbm.logit.fit, testing, type = "response")
## Using 50 trees...
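# Since we asked for 5-fold CV, gbm.perf() can report the CV-optimal number of trees
# (the predict() call above simply used all 50). A quick sketch:
best.iter <- gbm.perf(gbm.logit.fit, method = "cv") # Also plots the CV error curve
gbm.pred.cv <- predict(gbm.logit.fit, testing, n.trees = best.iter, type = "response")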
# Generate confusion matrix
gbm.predicted <- as.factor(ifelse(gbm.pred > 0.1, "yes", "no"))
table(Predicted = gbm.predicted, Actual = testing$deny)
## Actual
## Predicted no yes
## no 523 0
## yes 0 71
# Calculate sensitivity and specificity. Wow! But these perfect scores are too good to be true: "deny" is still a column of the training data, so the formula y ~ . lets gbm use the label itself as a predictor.
sensitivity(gbm.predicted, testing$deny, positive = "yes")
## [1] 1
specificity(gbm.predicted, testing$deny, negative = "no")
## [1] 1
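# A sketch of the leak-free fit, using only the three predictors the other models used
# (expect far less flattering, but honest, numbers):
set.seed(123)
gbm.clean.fit <- gbm(y ~ hirat + unemp + mhist, data = training,
                     interaction.depth = 2, distribution = "bernoulli",
                     cv.folds = 5, shrinkage = .1, n.trees = 50)
gbm.clean.pred <- predict(gbm.clean.fit, testing, type = "response")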
## Plot the ROC curves and compute AUCs; we'll do this step by step
# Construct the objects containing the ROC information (using roc() from the pROC package) for all 4 models
# Note: roc() expects numeric inputs, so we convert our predicted classes to numeric (0, 1); the actual values are already numeric, stored as the y column we appended to the testing data earlier.
logit_pred <- ifelse(logit.pred > 0.1, 1, 0)
probit_pred <- ifelse(probit.pred > 0.1, 1, 0)
svm_pred <- ifelse(svm.pred == "no", 0, 1)
gbm_pred <- ifelse(gbm.pred > 0.1, 1, 0)
# Feed the actual and predicted values into roc() to compute each model's ROC
ROC_logit <- roc(testing$y, logit_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_probit <- roc(testing$y, probit_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_svm <- roc(testing$y, svm_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_gbm <- roc(testing$y, gbm_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compute the AUCs from the ROC objects we made earlier; auc() calculates the area under the curve (AUC) for the ROC object you feed into it
auc(ROC_logit)
## Area under the curve: 0.5471
auc(ROC_probit)
## Area under the curve: 0.5494
auc(ROC_svm)
## Area under the curve: 0.4981
auc(ROC_gbm)
## Area under the curve: 1
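# Caveat: because we dichotomized each prediction at a single cutoff, every "curve"
# above collapses to one operating point, which understates the models. Feeding the
# raw probabilities traces the full curve; for example, for the logit model:
auc(roc(testing$y, logit.pred))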
## Draw all ROCs in one plot
# Label the AUC information in the legend
# I use the simplest plotting method for pedagogical purposes. The output isn't pretty, but it quickly displays the relevant ROC-AUC information in only a few lines of code. There are more refined ROC-plotting options available; for better visualizations, check out this page: https://rviews.rstudio.com/2019/03/01/some-r-packages-for-roc-curves/
# Plot the ROC of logit model as the base plot and then append other lines formed by the ROCs of other models onto the base plot
par(mar=c(1,1,1,1))
plot(ROC_logit, legacy.axes = TRUE, xlab = "1 - Specificity") # legacy.axes = TRUE puts the FPR (1 - specificity) on the x-axis, matching the label
lines(ROC_probit, col="blue")
lines(ROC_svm, col="red")
lines(ROC_gbm, col="green")
legend("bottomright",
       legend = c("model AUC", "logit: 0.5471", "probit: 0.5494", "SVM: 0.4981", "GBM: 1"),
       col = c("white", "black", "blue", "red", "green"),
       lty = 1, # All curves above are drawn as solid lines, so the legend keys match
       cex = 0.7)