library(MASS)
## Warning: package 'MASS' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
wine <- read.csv("wine.csv")
head(wine)
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34
## Proanthocyanins Color Hue Dilution Proline
## 1 2.29 5.64 1.04 3.92 1065
## 2 1.28 4.38 1.05 3.40 1050
## 3 2.81 5.68 1.03 3.17 1185
## 4 2.18 7.80 0.86 3.45 1480
## 5 1.82 4.32 1.04 2.93 735
## 6 1.97 6.75 1.05 2.85 1450
wine_x <- wine[, -1]
wine_scaled <- scale(wine_x)
summary(wine_scaled)
## Alcohol Malic Ash Alcalinity
## Min. :-2.42739 Min. :-1.4290 Min. :-3.66881 Min. :-2.663505
## 1st Qu.:-0.78603 1st Qu.:-0.6569 1st Qu.:-0.57051 1st Qu.:-0.687199
## Median : 0.06083 Median :-0.4219 Median :-0.02375 Median : 0.001514
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.83378 3rd Qu.: 0.6679 3rd Qu.: 0.69614 3rd Qu.: 0.600395
## Max. : 2.25341 Max. : 3.1004 Max. : 3.14745 Max. : 3.145637
## Magnesium Phenols Flavanoids Nonflavanoids
## Min. :-2.0824 Min. :-2.10132 Min. :-1.6912 Min. :-1.8630
## 1st Qu.:-0.8221 1st Qu.:-0.88298 1st Qu.:-0.8252 1st Qu.:-0.7381
## Median :-0.1219 Median : 0.09569 Median : 0.1059 Median :-0.1756
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5082 3rd Qu.: 0.80672 3rd Qu.: 0.8467 3rd Qu.: 0.6078
## Max. : 4.3591 Max. : 2.53237 Max. : 3.0542 Max. : 2.3956
## Proanthocyanins Color Hue Dilution
## Min. :-2.06321 Min. :-1.6297 Min. :-2.08884 Min. :-1.8897
## 1st Qu.:-0.59560 1st Qu.:-0.7929 1st Qu.:-0.76540 1st Qu.:-0.9496
## Median :-0.06272 Median :-0.1588 Median : 0.03303 Median : 0.2371
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.62741 3rd Qu.: 0.4926 3rd Qu.: 0.71116 3rd Qu.: 0.7864
## Max. : 3.47527 Max. : 3.4258 Max. : 3.29241 Max. : 1.9554
## Proline
## Min. :-1.4890
## 1st Qu.:-0.7824
## Median :-0.2331
## Mean : 0.0000
## 3rd Qu.: 0.7561
## Max. : 2.9631
apply(wine_scaled, 2, sd)
## Alcohol Malic Ash Alcalinity Magnesium
## 1 1 1 1 1
## Phenols Flavanoids Nonflavanoids Proanthocyanins Color
## 1 1 1 1 1
## Hue Dilution Proline
## 1 1 1
apply(wine_scaled, 2, range)
## Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
## [1,] -2.427388 -1.428952 -3.668813 -2.663505 -2.082381 -2.101318 -1.691200
## [2,] 2.253415 3.100446 3.147447 3.145637 4.359076 2.532372 3.054216
## Nonflavanoids Proanthocyanins Color Hue Dilution Proline
## [1,] -1.862979 -2.063214 -1.629691 -2.088840 -1.889723 -1.488987
## [2,] 2.395645 3.475269 3.425768 3.292407 1.955399 2.963114
set.seed(1)
km3 <- kmeans(wine_scaled, centers = 3, nstart = 25)
km3$centers
## Alcohol Malic Ash Alcalinity Magnesium Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## 3 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## Flavanoids Nonflavanoids Proanthocyanins Color Hue Dilution
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504 0.7770551
## 2 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122 -1.2887761
## 3 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046 0.2700025
## Proline
## 1 1.1220202
## 2 -0.4059428
## 3 -0.7517257
km3$size
## [1] 62 51 65
km3$tot.withinss
## [1] 1270.749
The WGSS tells us how closely the data points are clustered around the cluster centers. A smaller WGSS means that the clusters are more compact. Generally when the number of clusters increases, the WGSS decreases.
wine_clustered <- cbind(wine, Cluster = factor(km3$cluster))
head(wine_clustered)
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34
## Proanthocyanins Color Hue Dilution Proline Cluster
## 1 2.29 5.64 1.04 3.92 1065 1
## 2 1.28 4.38 1.05 3.40 1050 1
## 3 2.81 5.68 1.03 3.17 1185 1
## 4 2.18 7.80 0.86 3.45 1480 1
## 5 1.82 4.32 1.04 2.93 735 1
## 6 1.97 6.75 1.05 2.85 1450 1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
ggplot(wine_clustered,
aes(x = Alcohol, y = Color, color = Cluster)) +
geom_point() +
theme_minimal() +
labs(title = "k-means Clustering (k = 3)",
x = "Alcohol",
y = "Color Intensity")
### b)
centers <- as.data.frame(km3$centers)
centers$Cluster <- factor(1:3)
ggplot(wine_clustered,
aes(x = Alcohol, y = Color, color = Cluster)) +
geom_point(alpha = 0.7) +
geom_point(data = centers,
aes(x = Alcohol, y = Color),
size = 4, shape = 8) +
theme_minimal()
table(wine_clustered$Cluster, wine_clustered$Type)
##
## 1 2 3
## 1 59 3 0
## 2 0 3 48
## 3 0 65 0
mapped_cluster <- factor(ifelse(wine_clustered$Cluster == 1, 2,
ifelse(wine_clustered$Cluster == 2, 1, 3)))
table(Predicted = mapped_cluster,
Actual = wine_clustered$Type)
## Actual
## Predicted 1 2 3
## 1 0 3 48
## 2 59 3 0
## 3 0 65 0
The cluster labels produced by k-means don’t correspond to the actual wine type labels because the numbering of clusters is random. To make a comparison the clusters need to be matched to the wine type that appears most frequently within it. After this most wines in each cluster belonged to a single wine type with only a small amount of overlap. This means that k-means clustering is effective at separating the wines based on their chemical characteristics.
wgss <- numeric()
for (k in 2:5) {
set.seed(1)
km <- kmeans(wine_scaled, centers = k, nstart = 25)
wgss[k] <- km$tot.withinss
}
plot(2:5, wgss[2:5], type = "b",
xlab = "Number of clusters (k)",
ylab = "WGSS",
main = "Elbow Plot for k-means")
The plot suggests that k=3 is the most accurate choice for the number of clusters. This is because the decreases in 4 and 5 are not sharp like the decrease seen from k=2 to k=3.
wine <- read.csv("wine.csv")
head(wine)
## Type Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34
## Proanthocyanins Color Hue Dilution Proline
## 1 2.29 5.64 1.04 3.92 1065
## 2 1.28 4.38 1.05 3.40 1050
## 3 2.81 5.68 1.03 3.17 1185
## 4 2.18 7.80 0.86 3.45 1480
## 5 1.82 4.32 1.04 2.93 735
## 6 1.97 6.75 1.05 2.85 1450
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoids : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ Dilution : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
summary(wine)
## Type Alcohol Malic Ash
## Min. :1.000 Min. :11.03 Min. :0.740 Min. :1.360
## 1st Qu.:1.000 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210
## Median :2.000 Median :13.05 Median :1.865 Median :2.360
## Mean :1.938 Mean :13.00 Mean :2.336 Mean :2.367
## 3rd Qu.:3.000 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558
## Max. :3.000 Max. :14.83 Max. :5.800 Max. :3.230
## Alcalinity Magnesium Phenols Flavanoids
## Min. :10.60 Min. : 70.00 Min. :0.980 Min. :0.340
## 1st Qu.:17.20 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205
## Median :19.50 Median : 98.00 Median :2.355 Median :2.135
## Mean :19.49 Mean : 99.74 Mean :2.295 Mean :2.029
## 3rd Qu.:21.50 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875
## Max. :30.00 Max. :162.00 Max. :3.880 Max. :5.080
## Nonflavanoids Proanthocyanins Color Hue
## Min. :0.1300 Min. :0.410 Min. : 1.280 Min. :0.4800
## 1st Qu.:0.2700 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825
## Median :0.3400 Median :1.555 Median : 4.690 Median :0.9650
## Mean :0.3619 Mean :1.591 Mean : 5.058 Mean :0.9574
## 3rd Qu.:0.4375 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200
## Max. :0.6600 Max. :3.580 Max. :13.000 Max. :1.7100
## Dilution Proline
## Min. :1.270 Min. : 278.0
## 1st Qu.:1.938 1st Qu.: 500.5
## Median :2.780 Median : 673.5
## Mean :2.612 Mean : 746.9
## 3rd Qu.:3.170 3rd Qu.: 985.0
## Max. :4.000 Max. :1680.0
wine_x <- wine[, -1]
wine_x_scaled <- scale(wine_x)
wine_y <- wine$Type
set.seed(1)
n <- nrow(wine)
train_indices <- sample(1:n, size = 0.7 * n)
x_train <- wine_x_scaled[train_indices, ]
x_test <- wine_x_scaled[-train_indices, ]
y_train <- wine_y[train_indices]
y_test <- wine_y[-train_indices]
library(class)
knn_pred_3 <- knn(train = x_train,
test = x_test,
cl = y_train,
k = 3)
cm3 <- table(Predicted = knn_pred_3,
Actual = y_test)
cm3
## Actual
## Predicted 1 2 3
## 1 20 2 0
## 2 0 19 0
## 3 0 1 12
accuracy_3 <- sum(diag(cm3)) / sum(cm3)
accuracy_3
## [1] 0.9444444
k_values <- c(1, 3, 5, 7)
accuracies <- numeric()
for (i in seq_along(k_values)) {
preds <- knn(train = x_train,
test = x_test,
cl = y_train,
k = k_values[i])
cm <- table(preds, y_test)
accuracies[i] <- sum(diag(cm)) / sum(cm)
}
results <- data.frame(k = k_values,
Accuracy = accuracies)
results
## k Accuracy
## 1 1 0.9444444
## 2 3 0.9444444
## 3 5 0.9074074
## 4 7 0.9074074
The KNN classifier has the highest accuracy for k=1 and k=3, with both having an accuracy of 94.4%. When k=5 and k=7 the accuracy decreased to 90.7%. This means that smaller values of k are a better capture of the local structure of the data, while the larger values introduce more smoothing and reduce classification performance.