Final Project

library(MASS)

## Warning: package 'MASS' was built under R version 4.5.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Part 1:

Question 1:

a)

wine <- read.csv("wine.csv")
head(wine)

##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06          0.28
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76          0.26
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24          0.30
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49          0.24
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69          0.39
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39          0.34
##   Proanthocyanins Color  Hue Dilution Proline
## 1            2.29  5.64 1.04     3.92    1065
## 2            1.28  4.38 1.05     3.40    1050
## 3            2.81  5.68 1.03     3.17    1185
## 4            2.18  7.80 0.86     3.45    1480
## 5            1.82  4.32 1.04     2.93     735
## 6            1.97  6.75 1.05     2.85    1450

b)

wine_x <- wine[, -1]
wine_scaled <- scale(wine_x)

c)

summary(wine_scaled)

##     Alcohol             Malic              Ash             Alcalinity       
##  Min.   :-2.42739   Min.   :-1.4290   Min.   :-3.66881   Min.   :-2.663505  
##  1st Qu.:-0.78603   1st Qu.:-0.6569   1st Qu.:-0.57051   1st Qu.:-0.687199  
##  Median : 0.06083   Median :-0.4219   Median :-0.02375   Median : 0.001514  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.83378   3rd Qu.: 0.6679   3rd Qu.: 0.69614   3rd Qu.: 0.600395  
##  Max.   : 2.25341   Max.   : 3.1004   Max.   : 3.14745   Max.   : 3.145637  
##    Magnesium          Phenols           Flavanoids      Nonflavanoids    
##  Min.   :-2.0824   Min.   :-2.10132   Min.   :-1.6912   Min.   :-1.8630  
##  1st Qu.:-0.8221   1st Qu.:-0.88298   1st Qu.:-0.8252   1st Qu.:-0.7381  
##  Median :-0.1219   Median : 0.09569   Median : 0.1059   Median :-0.1756  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5082   3rd Qu.: 0.80672   3rd Qu.: 0.8467   3rd Qu.: 0.6078  
##  Max.   : 4.3591   Max.   : 2.53237   Max.   : 3.0542   Max.   : 2.3956  
##  Proanthocyanins        Color              Hue              Dilution      
##  Min.   :-2.06321   Min.   :-1.6297   Min.   :-2.08884   Min.   :-1.8897  
##  1st Qu.:-0.59560   1st Qu.:-0.7929   1st Qu.:-0.76540   1st Qu.:-0.9496  
##  Median :-0.06272   Median :-0.1588   Median : 0.03303   Median : 0.2371  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.62741   3rd Qu.: 0.4926   3rd Qu.: 0.71116   3rd Qu.: 0.7864  
##  Max.   : 3.47527   Max.   : 3.4258   Max.   : 3.29241   Max.   : 1.9554  
##     Proline       
##  Min.   :-1.4890  
##  1st Qu.:-0.7824  
##  Median :-0.2331  
##  Mean   : 0.0000  
##  3rd Qu.: 0.7561  
##  Max.   : 2.9631

apply(wine_scaled, 2, sd)

##         Alcohol           Malic             Ash      Alcalinity       Magnesium 
##               1               1               1               1               1 
##         Phenols      Flavanoids   Nonflavanoids Proanthocyanins           Color 
##               1               1               1               1               1 
##             Hue        Dilution         Proline 
##               1               1               1

apply(wine_scaled, 2, range)

##        Alcohol     Malic       Ash Alcalinity Magnesium   Phenols Flavanoids
## [1,] -2.427388 -1.428952 -3.668813  -2.663505 -2.082381 -2.101318  -1.691200
## [2,]  2.253415  3.100446  3.147447   3.145637  4.359076  2.532372   3.054216
##      Nonflavanoids Proanthocyanins     Color       Hue  Dilution   Proline
## [1,]     -1.862979       -2.063214 -1.629691 -2.088840 -1.889723 -1.488987
## [2,]      2.395645        3.475269  3.425768  3.292407  1.955399  2.963114

Question 2:

a)

set.seed(1)
km3 <- kmeans(wine_scaled, centers = 3, nstart = 25)

b)

km3$centers

##      Alcohol      Malic        Ash Alcalinity   Magnesium     Phenols
## 1  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
## 2  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
## 3 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
##    Flavanoids Nonflavanoids Proanthocyanins      Color        Hue   Dilution
## 1  0.97506900   -0.56050853      0.57865427  0.1705823  0.4726504  0.7770551
## 2 -1.21182921    0.72402116     -0.77751312  0.9388902 -1.1615122 -1.2887761
## 3  0.02075402   -0.03343924      0.05810161 -0.8993770  0.4605046  0.2700025
##      Proline
## 1  1.1220202
## 2 -0.4059428
## 3 -0.7517257

km3$size

## [1] 62 51 65

km3$tot.withinss

## [1] 1270.749

The WGSS tells us how closely the data points are clustered around the cluster centers. A smaller WGSS means that the clusters are more compact. Generally when the number of clusters increases, the WGSS decreases.

c)

wine_clustered <- cbind(wine, Cluster = factor(km3$cluster))
head(wine_clustered)

##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06          0.28
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76          0.26
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24          0.30
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49          0.24
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69          0.39
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39          0.34
##   Proanthocyanins Color  Hue Dilution Proline Cluster
## 1            2.29  5.64 1.04     3.92    1065       1
## 2            1.28  4.38 1.05     3.40    1050       1
## 3            2.81  5.68 1.03     3.17    1185       1
## 4            2.18  7.80 0.86     3.45    1480       1
## 5            1.82  4.32 1.04     2.93     735       1
## 6            1.97  6.75 1.05     2.85    1450       1

Question 3:

a)

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

ggplot(wine_clustered,
       aes(x = Alcohol, y = Color, color = Cluster)) +
  geom_point() +
  theme_minimal() +
  labs(title = "k-means Clustering (k = 3)",
       x = "Alcohol",
       y = "Color Intensity")

### b)

centers <- as.data.frame(km3$centers)
centers$Cluster <- factor(1:3)

ggplot(wine_clustered,
       aes(x = Alcohol, y = Color, color = Cluster)) +
  geom_point(alpha = 0.7) +
  geom_point(data = centers,
             aes(x = Alcohol, y = Color),
             size = 4, shape = 8) +
  theme_minimal()

Question 4:

table(wine_clustered$Cluster, wine_clustered$Type)

##    
##      1  2  3
##   1 59  3  0
##   2  0  3 48
##   3  0 65  0

mapped_cluster <- factor(ifelse(wine_clustered$Cluster == 1, 2,
                         ifelse(wine_clustered$Cluster == 2, 1, 3)))

table(Predicted = mapped_cluster,
      Actual = wine_clustered$Type)

##          Actual
## Predicted  1  2  3
##         1  0  3 48
##         2 59  3  0
##         3  0 65  0

The cluster labels produced by k-means don’t correspond to the actual wine type labels because the numbering of clusters is random. To make a comparison the clusters need to be matched to the wine type that appears most frequently within it. After this most wines in each cluster belonged to a single wine type with only a small amount of overlap. This means that k-means clustering is effective at separating the wines based on their chemical characteristics.

Question 5:

wgss <- numeric()

for (k in 2:5) {
  set.seed(1)
  km <- kmeans(wine_scaled, centers = k, nstart = 25)
  wgss[k] <- km$tot.withinss
}

plot(2:5, wgss[2:5], type = "b",
     xlab = "Number of clusters (k)",
     ylab = "WGSS",
     main = "Elbow Plot for k-means")

The plot suggests that k=3 is the most accurate choice for the number of clusters. This is because the decreases in 4 and 5 are not sharp like the decrease seen from k=2 to k=3.

Part 2:

Question 1:

a)

wine <- read.csv("wine.csv")
head(wine)

##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06          0.28
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76          0.26
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24          0.30
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49          0.24
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69          0.39
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39          0.34
##   Proanthocyanins Color  Hue Dilution Proline
## 1            2.29  5.64 1.04     3.92    1065
## 2            1.28  4.38 1.05     3.40    1050
## 3            2.81  5.68 1.03     3.17    1185
## 4            2.18  7.80 0.86     3.45    1480
## 5            1.82  4.32 1.04     2.93     735
## 6            1.97  6.75 1.05     2.85    1450

b)

str(wine)

## 'data.frame':    178 obs. of  14 variables:
##  $ Type           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol        : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash            : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Alcalinity     : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium      : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols        : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids     : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoids  : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins: num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color          : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue            : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ Dilution       : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline        : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

summary(wine)

##       Type          Alcohol          Malic            Ash       
##  Min.   :1.000   Min.   :11.03   Min.   :0.740   Min.   :1.360  
##  1st Qu.:1.000   1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210  
##  Median :2.000   Median :13.05   Median :1.865   Median :2.360  
##  Mean   :1.938   Mean   :13.00   Mean   :2.336   Mean   :2.367  
##  3rd Qu.:3.000   3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558  
##  Max.   :3.000   Max.   :14.83   Max.   :5.800   Max.   :3.230  
##    Alcalinity      Magnesium         Phenols        Flavanoids   
##  Min.   :10.60   Min.   : 70.00   Min.   :0.980   Min.   :0.340  
##  1st Qu.:17.20   1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205  
##  Median :19.50   Median : 98.00   Median :2.355   Median :2.135  
##  Mean   :19.49   Mean   : 99.74   Mean   :2.295   Mean   :2.029  
##  3rd Qu.:21.50   3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875  
##  Max.   :30.00   Max.   :162.00   Max.   :3.880   Max.   :5.080  
##  Nonflavanoids    Proanthocyanins     Color             Hue        
##  Min.   :0.1300   Min.   :0.410   Min.   : 1.280   Min.   :0.4800  
##  1st Qu.:0.2700   1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825  
##  Median :0.3400   Median :1.555   Median : 4.690   Median :0.9650  
##  Mean   :0.3619   Mean   :1.591   Mean   : 5.058   Mean   :0.9574  
##  3rd Qu.:0.4375   3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200  
##  Max.   :0.6600   Max.   :3.580   Max.   :13.000   Max.   :1.7100  
##     Dilution        Proline      
##  Min.   :1.270   Min.   : 278.0  
##  1st Qu.:1.938   1st Qu.: 500.5  
##  Median :2.780   Median : 673.5  
##  Mean   :2.612   Mean   : 746.9  
##  3rd Qu.:3.170   3rd Qu.: 985.0  
##  Max.   :4.000   Max.   :1680.0

c)

wine_x <- wine[, -1]
wine_x_scaled <- scale(wine_x)
wine_y <- wine$Type

d)

set.seed(1)

n <- nrow(wine)
train_indices <- sample(1:n, size = 0.7 * n)

x_train <- wine_x_scaled[train_indices, ]
x_test  <- wine_x_scaled[-train_indices, ]

y_train <- wine_y[train_indices]
y_test  <- wine_y[-train_indices]

Question 2:

a)

library(class)

knn_pred_3 <- knn(train = x_train,
                  test  = x_test,
                  cl    = y_train,
                  k     = 3)

Question 3:

a)

cm3 <- table(Predicted = knn_pred_3,
             Actual = y_test)
cm3

##          Actual
## Predicted  1  2  3
##         1 20  2  0
##         2  0 19  0
##         3  0  1 12

b)

accuracy_3 <- sum(diag(cm3)) / sum(cm3)
accuracy_3

## [1] 0.9444444

c)

k_values <- c(1, 3, 5, 7)
accuracies <- numeric()

for (i in seq_along(k_values)) {
  preds <- knn(train = x_train,
               test  = x_test,
               cl    = y_train,
               k     = k_values[i])
  
  cm <- table(preds, y_test)
  accuracies[i] <- sum(diag(cm)) / sum(cm)
}

results <- data.frame(k = k_values,
                      Accuracy = accuracies)
results

##   k  Accuracy
## 1 1 0.9444444
## 2 3 0.9444444
## 3 5 0.9074074
## 4 7 0.9074074

The KNN classifier has the highest accuracy for k=1 and k=3, with both having an accuracy of 94.4%. When k=5 and k=7 the accuracy decreased to 90.7%. This means that smaller values of k are a better capture of the local structure of the data, while the larger values introduce more smoothing and reduce classification performance.

Final Project

Andrew Stein

12-19-2025

Part 1:

Question 1:

a)

b)

c)

Question 2:

a)

b)

c)

Question 3:

a)

Question 4:

Question 5:

Part 2:

Question 1:

a)

b)

c)

d)

Question 2:

a)

Question 3:

a)

b)

c)