Reading in the data:

# Import the star data, turn `type` into a factor with explicitly listed
# levels, log10-transform lumin, radius and temp, then drop unused columns.
stars <-
  read_excel("stars.xlsx") |>
  dplyr::mutate(
    type = factor(
      type,
      levels = c(
        "Brown Dwarf", "Red Dwarf", "White Dwarf",
        "Main Sequence", "Supergiant", "Hypergiant"
      )
    ),
    # NOTE(review): lumin is log-transformed here but removed just below
    across(c(lumin, radius, temp), log10)
  ) |>
  dplyr::select(-c(lumin, color, class))

# Number of rows in the cleaned data
N <- nrow(stars)

K-Nearest-Neighbors

We’ll use KNN to predict the type of star using temp, mag, and radius.

Since KNN uses Euclidean distance to measure how far apart observations are, a variable measured on a larger scale would dominate the distance, so we need to standardize the data first:

# Standardize every numeric column: subtract its mean, divide by its sd.
# Non-numeric columns (the `type` factor) are left untouched.
star_sc <- mutate(
  stars,
  across(where(is.numeric), \(x) (x - mean(x)) / sd(x))
)

# Print the standardized data
tibble(star_sc)
## # A tibble: 240 × 4
##      temp radius   mag type       
##     <dbl>  <dbl> <dbl> <fct>      
##  1 -0.994 -0.628  1.11 Brown Dwarf
##  2 -1.00  -0.653  1.16 Brown Dwarf
##  3 -1.19  -0.758  1.36 Brown Dwarf
##  4 -1.10  -0.644  1.16 Brown Dwarf
##  5 -1.53  -0.755  1.49 Brown Dwarf
##  6 -1.08  -0.739  1.20 Brown Dwarf
##  7 -1.17  -0.702  1.22 Brown Dwarf
##  8 -1.19  -0.773  1.24 Brown Dwarf
##  9 -1.17  -0.739  1.24 Brown Dwarf
## 10 -1.14  -0.696  1.11 Brown Dwarf
## # ℹ 230 more rows

An important consideration when running KNN is which value of k to choose (how many nearest neighbors should be used to classify a future observation).

Choosing k

We often look at a range of values of k to find the choice of k with the lowest misclassification rate.

Let’s consider values of k from 5 to 50:

# Candidate numbers of nearest neighbors
k_choice <- 5:50

# One row per candidate k; accuracy holds the placeholder -1 until it is
# overwritten with the computed value
knn_predictions <- data.frame(
  k = k_choice,
  accuracy = rep_len(-1, length(k_choice))
)

The function knn.cv() in the class package performs KNN using leave-one-out cross-validation and returns the predicted class based on the nearest neighbors.

We’ll use a for loop to calculate the accuracy for the different choices of k

# Run leave-one-out KNN once per candidate k and record the share of
# observations whose type is predicted correctly.
# seq_along() is used instead of 1:length(), which would iterate over
# c(1, 0) if k_choice were ever empty.
for (i in seq_along(k_choice)) {

  # Cross-validated class predictions using k_choice[i] neighbors;
  # dplyr::select is namespaced to match the other select() calls in this file
  knn_temp <-
    class::knn.cv(
      train = star_sc |> dplyr::select(-type),
      cl = star_sc$type,
      k = k_choice[i]
    )

  # Accuracy of the predictions for this k, stored by column name rather
  # than by column position
  knn_predictions$accuracy[i] <-
    confusionMatrix(data = knn_temp, reference = star_sc$type) |>
    pluck("overall") |>
    pluck("Accuracy")

}

# Print the accuracy for each k
tibble(knn_predictions)
## # A tibble: 46 × 2
##        k accuracy
##    <int>    <dbl>
##  1     5    0.988
##  2     6    0.988
##  3     7    0.988
##  4     8    0.988
##  5     9    0.983
##  6    10    0.983
##  7    11    0.988
##  8    12    0.983
##  9    13    0.983
## 10    14    0.975
## # ℹ 36 more rows

Let’s create a graph to see how the accuracy changes for the different values of k

# Line plot of the accuracy obtained for each choice of k
ggplot(
  data = knn_predictions,
  mapping = aes(x = k, y = accuracy)
) +
  geom_line(color = "darkred", linewidth = 1) +
  labs(
    x = "Choice of k",
    y = "Correct Prediction Percentage"
  ) +
  # Ten evenly spaced tick positions across the k values, rounded to integers
  scale_x_continuous(
    breaks = round(seq(min(k_choice), max(k_choice), length.out = 10)),
    expand = c(0, 0)
  ) +
  # Label the accuracy proportions as percentages
  scale_y_continuous(labels = scales::percent)

The choices of k from 5 to 17 all have an accuracy of at least 97.5%!

We can use slice_max() to find which k has the highest accuracy

# Row(s) with the highest accuracy; tied rows are all returned
slice_max(knn_predictions, order_by = accuracy, n = 1)
##    k accuracy
## 1  5    0.988
## 2  6    0.988
## 3  7    0.988
## 4  8    0.988
## 5 11    0.988

The find_best_k() function in the additional R script calculates the accuracy for each supplied choice of k. Its arguments are:

  1. x = the original (not rescaled) explanatory variables

  2. y = a vector of the response variable

  3. k_vec = a vector of k to loop through

  4. rescale = how to rescale x: “normalize”, “standardize”, or “both” (default)

# Load find_best_k() from the helper script
source("find best k function.R")

# Fix the random number generator state before the search
set.seed(5230)

# Accuracy for k = 1, ..., 50; every column of stars except type is the
# predictor set, and type is the response
knn_df <- find_best_k(
  x = dplyr::select(stars, -type),
  y = stars$type,
  k_vec = 1:50
)

# Accuracy against k, with one colored line per rescaling method
knn_df |>
  ggplot(aes(x = k, y = accuracy, color = rescale_method)) +
  geom_line(linewidth = 1)

# Row(s) of knn_df with the highest accuracy
slice_max(knn_df, order_by = accuracy, n = 1)
## # A tibble: 1 × 3
##       k rescale_method accuracy
##   <int> <chr>             <dbl>
## 1     4 normalize             1