knitr::opts_chunk$set(echo = TRUE,
                      fig.width = 10,
                      fig.height = 6,
                      fig.align = "center")
# Load the needed package(s) below:
pacman::p_load(readxl, tidyverse, skimr, caret,
               GGally, class, rpart, rpart.plot)
# Change the default theme below:
theme_set(theme_classic())
theme_update(axis.title = element_text(size = 12),
             axis.text = element_text(size = 12),
             plot.title = element_text(size = 14, hjust = 0.5),
             plot.subtitle = element_text(size = 12, hjust = 0.5),
             plot.caption = element_text(size = 8))
wines <- read_excel("wine cultivar.xlsx")
# Changing the cultivar to a factor
wines$Cultivar <- factor(wines$Cultivar)
# checking the data for any missing values
skim(wines)
Name | wines |
---|---|
Number of rows | 178 |
Number of columns | 14 |
Column type frequency: | |
factor | 1 |
numeric | 13 |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Cultivar | 0 | 1 | FALSE | 3 | 2: 71, 1: 59, 3: 48 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Alcohol | 0 | 1 | 13.00 | 0.81 | 11.03 | 12.36 | 13.05 | 13.68 | 14.83 | ▂▇▇▇▃ |
Malic.acid | 0 | 1 | 2.34 | 1.12 | 0.74 | 1.60 | 1.87 | 3.08 | 5.80 | ▇▅▂▂▁ |
Ash | 0 | 1 | 2.37 | 0.27 | 1.36 | 2.21 | 2.36 | 2.56 | 3.23 | ▁▂▇▅▁ |
Alkalinity.ash | 0 | 1 | 19.49 | 3.34 | 10.60 | 17.20 | 19.50 | 21.50 | 30.00 | ▁▆▇▃▁ |
Magnesium | 0 | 1 | 99.74 | 14.28 | 70.00 | 88.00 | 98.00 | 107.00 | 162.00 | ▅▇▃▁▁ |
Phenols | 0 | 1 | 2.30 | 0.63 | 0.98 | 1.74 | 2.36 | 2.80 | 3.88 | ▅▇▇▇▁ |
Flavanoids | 0 | 1 | 2.03 | 1.00 | 0.34 | 1.20 | 2.13 | 2.88 | 5.08 | ▆▆▇▂▁ |
NF.phenols | 0 | 1 | 0.36 | 0.12 | 0.13 | 0.27 | 0.34 | 0.44 | 0.66 | ▃▇▅▃▂ |
Proanthocyanins | 0 | 1 | 1.59 | 0.57 | 0.41 | 1.25 | 1.56 | 1.95 | 3.58 | ▃▇▆▂▁ |
Color.intensity | 0 | 1 | 5.06 | 2.32 | 1.28 | 3.22 | 4.69 | 6.20 | 13.00 | ▆▇▃▂▁ |
Hue | 0 | 1 | 0.96 | 0.23 | 0.48 | 0.78 | 0.96 | 1.12 | 1.71 | ▅▇▇▃▁ |
OD.Ratio | 0 | 1 | 2.61 | 0.71 | 1.27 | 1.94 | 2.78 | 3.17 | 4.00 | ▆▃▆▇▃ |
Proline | 0 | 1 | 746.89 | 314.91 | 278.00 | 500.50 | 673.50 | 985.00 | 1680.00 | ▇▇▅▃▁ |
# Calculating k, p, and N
k <- n_distinct(wines$Cultivar)
p <- wines |>
  dplyr::select(where(is.numeric)) |>
  ncol()
N <- nrow(wines)
Create a scatterplot using the appropriate method and indicate which cultivar an observation is by color.
## Getting the LDA object using lda()
wine_lda <-
  MASS::lda(Cultivar ~ .,
            data = wines)
# Creating a data frame that has the LDs and the cultivar factor
data.frame(
  cultivar = wines$Cultivar,
  predict(wine_lda)$x) |>
  # Initializing ggplot
  ggplot(mapping = aes(x = LD1,
                       y = LD2,
                       color = cultivar)) +
  # geom_point() is for the scatterplot
  geom_point(size = 1.5,
             alpha = 0.75) +
  # Where to place the legend of the cultivar colors
  theme(legend.position = "bottom") +
  # Add in the 95% ellipses for each group
  stat_ellipse(
    linetype = "dashed",
    linewidth = 1,
    show.legend = FALSE
  ) +
  # Title and labels for the axes
  labs(
    title = "First two discriminant functions for Wine Cultivars",
    x = paste0("LD1 (Percent Explained: ",
               round(wine_lda$svd[1]^2 / sum(wine_lda$svd^2), 3) * 100, "%)"),
    y = paste0("LD2 (Percent Explained: ",
               round(wine_lda$svd[2]^2 / sum(wine_lda$svd^2), 3) * 100, "%)")
  )
Does it appear the cultivar types are well separated? Why or why not?
Yes. The plot of the first two linear discriminants shows the 3 cultivars form distinct clusters with very little overlap.
Because the groups are so well separated in the discriminant space, classification should work well: any reasonable classification method should achieve a low misclassification rate.
Use either LDA or QDA to create a set of classification rules. Explain why you used LDA or QDA to form the rules.
## Box M test for equal covariance matrices
rstatix::box_m(wines |> select(where(is.numeric)),
               group = wines$Cultivar)
## Fitting the QDA Model
wine_qda <- MASS::qda(Cultivar ~ .,
                      data = wines,
                      prior = c(1/3, 1/3, 1/3))
There is very strong evidence from Box's M-test that the 3 wine cultivar groups do not have equal covariance matrices.
We can also see this in the plot of the two discriminants, since the three ellipses are angled in different directions.
Since the covariance matrices are not equal, we should use QDA instead of LDA.
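As an informal supporting check (a sketch, not part of the original output), we can compare the log-determinant of each cultivar's sample covariance matrix; Box's M is built on these quantities, and noticeably different values point away from the common-covariance assumption that LDA relies on.
# Sketch (not in the original output): log-determinants of each cultivar's
# sample covariance matrix. Clearly different values are one symptom of
# unequal covariance matrices across the groups.
wines |>
  split(wines$Cultivar) |>
  sapply(\(d) determinant(cov(dplyr::select(d, where(is.numeric))))$modulus)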
Create the confusion matrix using cross-validation for the classification rules created in question 3.
What is the estimated error rate?
### Confusion Matrix for QDA - leave-one-out cross-validation
data.frame(predicted = MASS::qda(Cultivar ~ .,
                                 data = wines,
                                 CV = TRUE)$class,
           actual = wines$Cultivar) |>
  table() |>
  caret::confusionMatrix()
## Confusion Matrix and Statistics
##
## actual
## predicted 1 2 3
## 1 59 1 0
## 2 0 70 0
## 3 0 0 48
##
## Overall Statistics
##
## Accuracy : 0.9944
## 95% CI : (0.9691, 0.9999)
## No Information Rate : 0.3989
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9915
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 0.9859 1.0000
## Specificity 0.9916 1.0000 1.0000
## Pos Pred Value 0.9833 1.0000 1.0000
## Neg Pred Value 1.0000 0.9907 1.0000
## Prevalence 0.3315 0.3989 0.2697
## Detection Rate 0.3315 0.3933 0.2697
## Detection Prevalence 0.3371 0.3933 0.2697
## Balanced Accuracy 0.9958 0.9930 1.0000
The apparent (resubstitution) error rate and the cross-validated error rate are the same here: 1/178, or about 0.56%.
The sole misclassification was a wine made from cultivar 2 grapes that the model predicted was made with cultivar 1 grapes.
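As a quick check (a sketch), the same cross-validated error rate can be computed directly from the leave-one-out QDA predictions:
# Sketch: leave-one-out error rate for QDA, computed directly (should be 1/178)
mean(MASS::qda(Cultivar ~ ., data = wines, CV = TRUE)$class != wines$Cultivar)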
Use kNN to classify the wine cultivars.
Which choice of k do you recommend? Justify your answer!
Include the confusion matrix for the result using your choice of k.
Before performing kNN, make sure to rescale the data!
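The helper script find best k function.R is sourced below but not shown. A minimal sketch of what such a function might look like (the actual script may differ) is given here: it scores each candidate k with leave-one-out kNN via class::knn.cv() under two common rescaling methods.
# A minimal sketch of a find_best_k()-style helper (assumption: the real
# script in "find best k function.R" may be implemented differently)
find_best_k_sketch <- function(x, y, k = 1:20) {
  # Two common rescaling methods for kNN: standardize and min-max normalize
  rescaled <- list(
    standardize = as.data.frame(scale(x)),
    normalize   = as.data.frame(lapply(x, \(v) (v - min(v)) / (max(v) - min(v))))
  )
  purrr::map_dfr(names(rescaled), \(method) {
    purrr::map_dfr(k, \(kk) {
      pred <- class::knn.cv(train = rescaled[[method]], cl = y, k = kk)
      data.frame(k = kk,
                 rescale_method = method,
                 accuracy = mean(pred == y))  # leave-one-out accuracy
    })
  })
}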
## Reading in the find best k function.R script
source("find best k function.R")
# Using the find_best_k() function to find the accuracy of all choices of k
knn_df <-
  find_best_k(x = wines |> dplyr::select(where(is.numeric)),
              y = wines$Cultivar,
              k = 1:177)
# Plotting the results
ggplot(
  data = knn_df,
  mapping = aes(
    x = k,
    y = accuracy,
    color = rescale_method
  )
) +
  geom_line(linewidth = 1) +
  # Displaying the choice of k with the highest accuracy
  ggrepel::geom_text_repel(
    data = knn_df |> slice_max(accuracy),
    mapping = aes(label = k),
    show.legend = FALSE
  ) +
  labs(color = "Rescale\nMethod") +
  # Placing the legend inside the plotting area
  theme(legend.position = "inside",
        legend.position.inside = c(0.9, 0.8))
## table of the best choice of k
knn_df |>
  slice_max(accuracy) |>
  gt::gt()
k | rescale_method | accuracy |
---|---|---|
36 | standardize | 0.9831461 |
37 | standardize | 0.9831461 |
39 | standardize | 0.9831461 |
41 | standardize | 0.9831461 |
It looks like k = 36, 37, 39, or 41 (with standardized predictors) gives the highest accuracy, and therefore the lowest error rate. We'll use k = 41 since, all else equal, a larger choice of k gives a simpler (smoother) model.
# Standardizing the data:
wines_sc <-
  wines |>
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = scale
    )
  )

wine_knn <-
  class::knn.cv(
    train = wines_sc |> select(where(is.numeric)), # Getting the expl vars
    cl = wines_sc$Cultivar,                        # Correct answer
    k = 41                                         # Choice of k
  )

data.frame(
  predicted = wine_knn,
  actual = wines$Cultivar
) |>
  table() |>
  confusionMatrix()
## Confusion Matrix and Statistics
##
## actual
## predicted 1 2 3
## 1 59 2 0
## 2 0 68 0
## 3 0 1 48
##
## Overall Statistics
##
## Accuracy : 0.9831
## 95% CI : (0.9515, 0.9965)
## No Information Rate : 0.3989
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9745
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 0.9577 1.0000
## Specificity 0.9832 1.0000 0.9923
## Pos Pred Value 0.9672 1.0000 0.9796
## Neg Pred Value 1.0000 0.9727 1.0000
## Prevalence 0.3315 0.3989 0.2697
## Detection Rate 0.3315 0.3820 0.2697
## Detection Prevalence 0.3427 0.3820 0.2753
## Balanced Accuracy 0.9916 0.9789 0.9962
With k = 41, only 3 wines were misclassified (all 3 misclassified wines were cultivar 2)!
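Equivalently (a quick sketch), the leave-one-out error rate can be computed straight from the knn.cv() predictions:
# Sketch: leave-one-out error rate for the k = 41 kNN fit (should be 3/178)
mean(wine_knn != wines_sc$Cultivar)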
Create the full classification tree by specifying minsplit = 2, minbucket = 1, and cp = -1.
The function will keep splitting nodes until they are all pure (or a limit on tree size has been reached).
RNGversion("4.1.0"); set.seed(5230)
# Using rpart to fit the classification tree
wine_fullct <- rpart(
  Cultivar ~ .,
  data = wines,
  method = "class",
  parms = list(split = "information"),
  minsplit = 2,
  cp = -1,
  minbucket = 1
)
wine_fullct$cptable
## CP nsplit rel error xerror xstd
## 1 0.411214953 0 1.000000000 1.0000000 0.06105585
## 2 0.121495327 2 0.177570093 0.2523364 0.04472770
## 3 0.037383178 3 0.056074766 0.1962617 0.04022219
## 4 0.009345794 4 0.018691589 0.1121495 0.03126446
## 5 0.004672897 5 0.009345794 0.1028037 0.03002346
## 6 -1.000000000 7 0.000000000 0.1028037 0.03002346
Since there are 7 splits, there will be 8 total leaf (terminal) nodes: each binary split turns one leaf into two, so the number of leaves is the number of splits plus one.
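As a quick check (a sketch), the terminal nodes of the full tree can be counted directly from the rpart frame:
# Sketch: terminal nodes are flagged as "<leaf>" in the rpart frame
sum(wine_fullct$frame$var == "<leaf>")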
Prune the classification tree made in part 5a and graph your pruned tree. Make sure to briefly justify your answer!
# Converting the cp table to a data frame
cp_df <- data.frame(wine_fullct$cptable)
# Finding where to prune the tree
cp_df |>
  # Finding all the rows of the table with xerror below min(xerror) + xstd
  filter(
    xerror < cp_df |>
      slice_min(xerror, with_ties = F) |>
      mutate(xcutoff = xerror + xstd) |>
      pull(xcutoff)
  ) |>
  slice(1) |>
  pull(CP) ->
  cp_cutoff
cp_cutoff
## [1] 0.009345794
# Or we can create a graph
plotcp(wine_fullct)
## Warning in sqrt(cp0 * c(Inf, cp0[-length(cp0)])): NaNs produced
# Using the graph, a cp around 0.019 would also work: any cp between about
# 0.0093 and 0.037 prunes to the same 4-split tree
# Pruning the tree using the prune() function at the cp cutoff found above (about 0.0093)
wine_prunect <-
  prune(
    wine_fullct,
    cp = cp_cutoff
  )
Based on the 1-SE rule, we want to prune the tree at a cp value of about 0.0093.
Plotting the pruned tree
rpart.plot(wine_prunect,
           type = 5,
           extra = 101,
           box.palette = list("lightblue", "#CC79A7", "orange"),
           legend.x = NA)

data.frame(actual = wines$Cultivar,
           predicted = rpart.predict(wine_prunect,
                                     type = "class")) |>
  table() |>
  t()
## actual
## predicted 1 2 3
## 1 58 0 0
## 2 1 70 0
## 3 0 1 48
Only 2 misclassifications using the classification tree. However, that estimate comes from resubstitution (predicting the same data used to grow the tree). For a less biased error rate, we can multiply the root node (no-information) error rate by the cross-validated relative error (xerror) given by the cptable.
# Root node (no-information) error rate: cultivar 2 occurs the most (71 of 178 wines)
root_error <-
  wines |>
  count(Cultivar) |>
  summarize(no_info_error = 1 - max(n)/sum(n)) |>
  pull(no_info_error)
# Looking at the xerror for our pruned tree: about 0.112
prune_xerror <-
  wine_prunect |>
  pluck("cptable") |>
  data.frame() |>
  slice_tail() |>
  pull(xerror)
c(
  "NIR error" = root_error,
  "xerror" = prune_xerror,
  "CV error" = root_error * prune_xerror
)
## NIR error xerror CV error
## 0.60112360 0.11214953 0.06741573
So the estimated cross-validated error rate for the pruned tree is 0.601 × 0.112 ≈ 0.067, or about 6.7%.
Between the 3 methods used to classify the wines, which approach would you recommend when classifying future wines?
QDA had the lowest cross-validated error rate, with only 1 of the 178 wines assigned to the wrong cultivar (about 0.6%), compared with 3 misclassifications for kNN with k = 41 (about 1.7%) and an estimated 6.7% error rate for the pruned classification tree. Since it has the lowest estimated error rate, QDA is the method I would recommend for classifying future wines.
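For reference, a small summary of the cross-validated error rates reported above (a sketch; the numbers are copied from the earlier output rather than recomputed):
# Sketch: collecting the cross-validated error rates reported earlier
data.frame(
  method   = c("QDA", "kNN (k = 41)", "Pruned classification tree"),
  cv_error = c(1/178, 3/178, 0.0674)
) |>
  arrange(cv_error)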