knitr::opts_chunk$set(echo = TRUE,
                      fig.width=10, 
                      fig.height=6, 
                      fig.align = "center")


# Load the needed package(s) below:
pacman::p_load(readxl, tidyverse, skimr, caret,
               GGally, class, rpart, rpart.plot)


# Change the default theme below:
theme_set(theme_classic())

theme_update(axis.title = element_text(size = 12),
             axis.text = element_text(size = 12),
             plot.title = element_text(size = 14, hjust = 0.5),
             plot.subtitle = element_text(size = 12, hjust = 0.5),
             plot.caption = element_text(size = 8))

Reading in the data and preliminary data check:

wines <- read_excel("wine cultivar.xlsx")


# Changing the cultivar to a factor 
wines$Cultivar <- factor(wines$Cultivar)

# checking the data for any missing values
skim(wines)
Data summary

Name                        wines
Number of rows              178
Number of columns           14
Column type frequency:
  factor                    1
  numeric                   13
Group variables             None

Variable type: factor

skim_variable  n_missing  complete_rate  ordered  n_unique  top_counts
Cultivar               0              1  FALSE           3  2: 71, 1: 59, 3: 48

Variable type: numeric

skim_variable    n_missing  complete_rate    mean      sd      p0     p25     p50     p75     p100  hist
Alcohol                  0              1   13.00    0.81   11.03   12.36   13.05   13.68    14.83  ▂▇▇▇▃
Malic.acid               0              1    2.34    1.12    0.74    1.60    1.87    3.08     5.80  ▇▅▂▂▁
Ash                      0              1    2.37    0.27    1.36    2.21    2.36    2.56     3.23  ▁▂▇▅▁
Alkalinity.ash           0              1   19.49    3.34   10.60   17.20   19.50   21.50    30.00  ▁▆▇▃▁
Magnesium                0              1   99.74   14.28   70.00   88.00   98.00  107.00   162.00  ▅▇▃▁▁
Phenols                  0              1    2.30    0.63    0.98    1.74    2.36    2.80     3.88  ▅▇▇▇▁
Flavanoids               0              1    2.03    1.00    0.34    1.20    2.13    2.88     5.08  ▆▆▇▂▁
NF.phenols               0              1    0.36    0.12    0.13    0.27    0.34    0.44     0.66  ▃▇▅▃▂
Proanthocyanins          0              1    1.59    0.57    0.41    1.25    1.56    1.95     3.58  ▃▇▆▂▁
Color.intensity          0              1    5.06    2.32    1.28    3.22    4.69    6.20    13.00  ▆▇▃▂▁
Hue                      0              1    0.96    0.23    0.48    0.78    0.96    1.12     1.71  ▅▇▇▃▁
OD.Ratio                 0              1    2.61    0.71    1.27    1.94    2.78    3.17     4.00  ▆▃▆▇▃
Proline                  0              1  746.89  314.91  278.00  500.50  673.50  985.00  1680.00  ▇▇▅▃▁
# Calculating k, p, and N
k <- n_distinct(wines$Cultivar)

p <- wines |> 
  dplyr::select(where(is.numeric)) |> 
  ncol() 

N <- nrow(wines) 

Question 1: Scatterplot of Linear Discriminants

Create a scatterplot using the appropriate method and indicate which cultivar an observation is by color.

## Getting the LDA object using lda()
wine_lda <- MASS::lda(Cultivar ~ ., data = wines)


# Creating a data frame that has the LDs and the cultivar factor
data.frame(cultivar = wines$Cultivar,
           predict(wine_lda)$x) |> 

  # Initializing ggplot  
  ggplot(mapping = aes(x = LD1, 
                       y = LD2, 
                       color = cultivar)) + 
  
  # geom_point() is for the scatterplot 
  geom_point(size = 1.5, 
             alpha = 0.75) + 
  
  # Where to place the legend of the cultivar colors
  theme(legend.position = "bottom") + 
  
  # Add in the 95% ellipses for each group
  stat_ellipse(
    linetype = "dashed", 
    linewidth = 1, 
    show.legend = FALSE
  ) +
  
  # Title and labels for the axes
  labs(
    title = "First two discriminant functions for Wine Cultivars",
    
    x = paste0("LD1 (Percent Explained: ", 
               round(wine_lda$svd[1]^2/sum(wine_lda$svd^2),3)*100, "%)"),
    
    y = paste0("LD2 (Percent Explained: ", 
               round(wine_lda$svd[2]^2/sum(wine_lda$svd^2),3)*100, "%)")
  )

Does it appear the cultivar types are well separated? Why or why not?

Yes, the plot of the first two linear discriminants shows very little overlap between the three cultivars: each one forms its own cluster.

Question 2: Would Classification Work Well?

Yes, classification should work well: the plot of the two discriminants shows that the three groups are well separated, so any reasonable classification method should achieve a low misclassification rate.
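
As a quick check of this claim (a small addition, not part of the original questions), the resubstitution error of the LDA fit from Question 1 can be computed directly; it should be very small if the groups are as well separated as the plot suggests.

# Apparent (resubstitution) error rate of the LDA fit: the proportion of
# wines whose predicted cultivar differs from the true cultivar
mean(predict(wine_lda)$class != wines$Cultivar)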

Question 3: LDA or QDA

Use either LDA or QDA to create a set of classification rules. Explain why you used LDA or QDA to form the rules.

## Box M test for equal covariance matrices
rstatix::box_m(wines |> select(where(is.numeric)), 
               group = wines$Cultivar)
## Fitting the QDA Model
wine_qda <- MASS::qda(Cultivar ~ ., 
                      data = wines,
                      prior = c(1/3, 1/3, 1/3))

There is very strong evidence from Box’s M-test that the three wine cultivar groups do not have equal covariance matrices.

We can also see this in the plot of the two discriminants, since the three ellipses are angled in different directions.

Since the covariance matrices are not equal, we should use QDA instead of LDA.
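
As an informal supplement to Box’s M test (a rough sketch, not a formal test), we can also compare the log generalized variance, i.e., the log-determinant of each cultivar’s sample covariance matrix; clearly different values are consistent with unequal covariance matrices.

# Log-determinant of each cultivar's covariance matrix: an informal look at
# how much the within-group covariance structure differs across cultivars
sapply(
  split(wines |> dplyr::select(where(is.numeric)), wines$Cultivar),
  \(d) determinant(cov(d), logarithm = TRUE)$modulus
)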

Question 4: LDA or QDA Confusion Matrices

Create the confusion matrix using cross-validation for the classification rules created in question 3.

What is the estimated error rate?

### Confusion Matrix for QDA - leave-one-out cross-validation (CV = TRUE)
data.frame(predicted = MASS::qda(Cultivar ~ ., 
                                 data = wines, 
                                 CV = TRUE)$class,
           actual = wines$Cultivar) |> 
  table() |> 
  caret::confusionMatrix()
## Confusion Matrix and Statistics
## 
##          actual
## predicted  1  2  3
##         1 59  1  0
##         2  0 70  0
##         3  0  0 48
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9944          
##                  95% CI : (0.9691, 0.9999)
##     No Information Rate : 0.3989          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9915          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   0.9859   1.0000
## Specificity            0.9916   1.0000   1.0000
## Pos Pred Value         0.9833   1.0000   1.0000
## Neg Pred Value         1.0000   0.9907   1.0000
## Prevalence             0.3315   0.3989   0.2697
## Detection Rate         0.3315   0.3933   0.2697
## Detection Prevalence   0.3371   0.3933   0.2697
## Balanced Accuracy      0.9958   0.9930   1.0000

The apparent (resubstitution) error rate and the leave-one-out cross-validated error rate are the same: 1/178, or about 0.56%.

The sole misclassification was a wine made from cultivar 2 grapes that we predicted was made from cultivar 1 grapes.
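
The same leave-one-out estimate can also be pulled out as a single number (a small convenience check using the same MASS::qda() call as above):

# Leave-one-out CV error rate for QDA: the proportion of held-out wines
# assigned to the wrong cultivar (should match 1/178 from the matrix above)
mean(MASS::qda(Cultivar ~ ., data = wines, CV = TRUE)$class != wines$Cultivar)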

Question 5: kNN - Which k to use?

Use kNN to classify the wine cultivars.

Which choice of k do you recommend? Justify your answer!

Include the confusion matrix for the result using your choice of k.

Before performing kNN, make sure to rescale the data!
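
The search below relies on a helper script, find best k function.R, that is not shown here. In case that script is unavailable, a stripped-down stand-in might look like the sketch below (the name find_best_k_sketch() is hypothetical, and unlike the real helper it only standardizes rather than comparing rescaling methods).

# Hypothetical, simplified stand-in for find_best_k(): standardize the
# predictors, then record the leave-one-out accuracy of class::knn.cv()
# for every candidate choice of k
find_best_k_sketch <- function(x, y, k = 1:20) {
  x_std <- scale(x)   # center and scale every column (mean 0, sd 1)
  data.frame(
    k = k,
    accuracy = sapply(k, \(kk) mean(class::knn.cv(train = x_std, cl = y, k = kk) == y))
  )
}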

## Reading in the find best k function.R script
source("find best k function.R")

# Using the find_best_k() function to find the accuracy of all choices of k
knn_df <- 
  find_best_k(x = wines |> dplyr::select(where(is.numeric)),
              y = wines$Cultivar,
              k = 1:177)

# Plotting the results
ggplot(
  data = knn_df,
  mapping = aes(
    x = k,
    y = accuracy,
    color = rescale_method
  )
) +
  
  geom_line(linewidth = 1) + 
  
  # Displaying the choice of k with the highest accuracy
  ggrepel::geom_text_repel(
    data = knn_df |> slice_max(accuracy),
    mapping = aes(label = k),
    show.legend = FALSE
  ) + 
  
  labs(color = "Rescale\nMethod") +
  
  theme(legend.position = "inside",
        legend.position.inside = c(0.9, 0.8))

## table of the best choice of k
knn_df |> 
  slice_max(accuracy) |> 
  gt::gt()
 k  rescale_method  accuracy
36  standardize     0.9831461
37  standardize     0.9831461
39  standardize     0.9831461
41  standardize     0.9831461

Looks like choosing k = 36, 37, 39, or 41 gives the highest accuracy (lowest error rate). We’ll use k = 41: of these, it is the largest, and a larger k averages over more neighbors, giving a smoother, simpler decision rule.

# Standardizing the data:
wines_sc <- 
  wines |> 
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = scale
    )
  )

wine_knn <- 
  class::knn.cv(
    train = wines_sc |> select(where(is.numeric)),  # Getting the expl vars
    cl = wines_sc$Cultivar,                         # Correct answer
    k = 41                                          # Choice of k
  )

data.frame(
  predicted = wine_knn,
  actual = wines$Cultivar
) |>
  table() |>
  confusionMatrix()
## Confusion Matrix and Statistics
## 
##          actual
## predicted  1  2  3
##         1 59  2  0
##         2  0 68  0
##         3  0  1 48
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9831          
##                  95% CI : (0.9515, 0.9965)
##     No Information Rate : 0.3989          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9745          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   0.9577   1.0000
## Specificity            0.9832   1.0000   0.9923
## Pos Pred Value         0.9672   1.0000   0.9796
## Neg Pred Value         1.0000   0.9727   1.0000
## Prevalence             0.3315   0.3989   0.2697
## Detection Rate         0.3315   0.3820   0.2697
## Detection Prevalence   0.3427   0.3820   0.2753
## Balanced Accuracy      0.9916   0.9789   0.9962

With k = 41, only 3 wines were misclassified (all 3 misclassified wines were cultivar 2)!
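
For comparison with the QDA result, the corresponding leave-one-out error rate can be computed from the predictions above (a small check, not part of the original output):

# LOOCV error rate for kNN with k = 41: 3 misclassified wines out of 178,
# or about 1.7%
mean(wine_knn != wines$Cultivar)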

Question 6: Classification Trees

Part a) Full tree

Create the full classification tree by specifying that minsplit = 2, minbucket = 1, and cp = -1

The function will keep splitting nodes until they are all pure (or a limit on tree size has been reached).

RNGversion("4.1.0"); set.seed(5230)
# Using rpart to fit the classification tree
wine_fullct <- rpart(
  Cultivar ~ ., 
  data = wines, 
  method = "class",                     # classification (not regression) tree
  parms = list(split = "information"),  # split using the information (entropy) criterion
  minsplit = 2,                         # a node with as few as 2 wines may still be split
  cp = -1,                              # never stop splitting on complexity grounds
  minbucket = 1                         # a leaf may contain a single wine
)

wine_fullct$cptable
##             CP nsplit   rel error    xerror       xstd
## 1  0.411214953      0 1.000000000 1.0000000 0.06105585
## 2  0.121495327      2 0.177570093 0.2523364 0.04472770
## 3  0.037383178      3 0.056074766 0.1962617 0.04022219
## 4  0.009345794      4 0.018691589 0.1121495 0.03126446
## 5  0.004672897      5 0.009345794 0.1028037 0.03002346
## 6 -1.000000000      7 0.000000000 0.1028037 0.03002346

Since there are 7 splits, there will be 8 total leaf (terminal) nodes.
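
That count can be verified from the fitted object: rpart stores one row per node in $frame, and terminal nodes are labelled "<leaf>" in the var column (a quick check on the wine_fullct object above).

# Number of terminal (leaf) nodes in the full tree; with 7 splits this
# should be 8
sum(wine_fullct$frame$var == "<leaf>")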

Part 6b: Pruning the Classification Tree

Prune the classification tree made in part 6a and graph your pruned tree. Make sure to briefly justify your answer!

# Converting the cp table to a data frame
cp_df <- data.frame(wine_fullct$cptable)

# Finding where to prune the tree
cp_df |> 
  # Finding all the rows of the table below min(xerror) + xstd
  filter(
    xerror < cp_df |> 
             slice_min(xerror, with_ties = F) |> 
             mutate(xcutoff = xerror + xstd) |> 
             pull(xcutoff)
  ) |> 
  slice(1) |> 
  pull(CP) ->
  
  cp_cutoff
  
cp_cutoff
## [1] 0.009345794
# Or we can create a graph
plotcp(wine_fullct)
## Warning in sqrt(cp0 * c(Inf, cp0[-length(cp0)])): NaNs produced

# The warning is harmless: it comes from the cp = -1 row of the table.
# The graph points to the same tree: the smallest tree whose xerror falls
# below the dashed 1-SE line (plotted at a cp of about 0.019, the geometric
# mean of 0.0093 and 0.0374)


# Pruning the tree using the prune() function at the cp cutoff found above (about 0.0093)
wine_prunect <- 
  prune(
    wine_fullct, 
    cp = cp_cutoff
  )

We want to prune the tree at a cp value of about 0.0093.

Plotting the pruned tree

rpart.plot(wine_prunect, 
           type = 5, 
           extra = 101,
           box.palette = list("lightblue","#CC79A7","orange"),
           legend.x = NA)

Part 6c) Create the confusion matrix for the pruned tree

data.frame(actual = wines$Cultivar,
           predicted = rpart.predict(wine_prunect,
                                     type = "class")) |> 
  table() |> 
  t()
##          actual
## predicted  1  2  3
##         1 58  0  0
##         2  1 70  0
##         3  0  1 48

Only 2 misclassifications using the classification tree. However, that estimate is based on resubstitution. For a less biased estimate of the error rate, we can multiply the root node (no-information) error rate by the xerror given in the cptable.

# Root node (no-information) error rate: 1 - 71/178, since cultivar 2 is the most common (71 wines)
root_error <- 
  wines |> 
  count(Cultivar) |> 
  summarize(no_info_error = 1 - max(n)/sum(n)) |> 
  pull(no_info_error)
  

# Looking at the xerror for our pruned tree: about 0.112
prune_xerror <- 
  wine_prunect |> 
  pluck("cptable") |> 
  data.frame() |> 
  slice_tail() |> 
  pull(xerror) 
  

c(
  "NIR error" = root_error, 
  "xerror" = prune_xerror, 
  "CV error" = root_error * prune_xerror
)
##  NIR error     xerror   CV error 
## 0.60112360 0.11214953 0.06741573

So our estimated error rate is 0.601 times 0.112, which is about 0.0674, or roughly 12 of the 178 wines.

Question 7

Between the 3 methods used to classify the wines, which approach would you recommend when classifying future wines?

QDA had the lowest estimated error rate of the three methods, with only 1 wine predicted into the wrong cultivar (about 0.56%, versus about 1.7% for kNN and about 6.7% for the pruned tree), so it is the method I would recommend for classifying future wines.
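
For reference, a small recap of the three cross-validated error rates reported above (the numbers are taken from the earlier output, not recomputed here):

# Estimated error rates collected from the output above
data.frame(
  method = c("QDA (leave-one-out CV)",
             "kNN, k = 41 (leave-one-out CV)",
             "Pruned classification tree (rpart CV)"),
  error_rate = c(1/178, 3/178, 0.0674)
)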