stars <-
  readxl::read_excel("stars.xlsx") |>
  dplyr::mutate(
    # convert type to a factor with the levels in a fixed order
    type = factor(
      type,
      levels = c("Brown Dwarf", "Red Dwarf",
                 "White Dwarf", "Main Sequence",
                 "Supergiant", "Hypergiant")
    )
  ) |>
  dplyr::mutate(
    # log10-transform luminosity, radius, and temperature
    across(
      .cols = c(lumin, radius, temp),
      .fns = log10
    )
  ) |>
  dplyr::select(-lumin, -color, -class)
# Total sample size:
N <- nrow(stars)
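Before modeling, a quick look at the class balance can be helpful. This count isn't part of the original pipeline, just a sanity check:

# how many stars of each type are in the data?
stars |>
  count(type)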
We’ll use KNN to predict the type of star from temp, mag, and radius.
Since KNN uses Euclidean distance to measure how far apart observations are, we need to standardize the data first:
star_sc <-
  stars |>
  mutate(
    # standardize each numeric column: (x - mean) / sd
    across(
      .cols = where(is.numeric),
      .fns = ~ (.x - mean(.x)) / sd(.x)
    )
  )
tibble(star_sc)
## # A tibble: 240 × 4
## temp radius mag type
## <dbl> <dbl> <dbl> <fct>
## 1 -0.994 -0.628 1.11 Brown Dwarf
## 2 -1.00 -0.653 1.16 Brown Dwarf
## 3 -1.19 -0.758 1.36 Brown Dwarf
## 4 -1.10 -0.644 1.16 Brown Dwarf
## 5 -1.53 -0.755 1.49 Brown Dwarf
## 6 -1.08 -0.739 1.20 Brown Dwarf
## 7 -1.17 -0.702 1.22 Brown Dwarf
## 8 -1.19 -0.773 1.24 Brown Dwarf
## 9 -1.17 -0.739 1.24 Brown Dwarf
## 10 -1.14 -0.696 1.11 Brown Dwarf
## # ℹ 230 more rows
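As a quick sanity check (an extra step, not part of the original code), each standardized column should now have a mean of roughly 0 and a standard deviation of 1:

# confirm the rescaled columns have mean ~ 0 and sd ~ 1
star_sc |>
  summarize(
    across(
      .cols = where(is.numeric),
      .fns = list(mean = mean, sd = sd)
    )
  )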
An important consideration when running KNN is which value of k to choose: how many nearest neighbors should be used to classify a future observation.
We often examine a range of k values and pick the one with the lowest misclassification rate.
Let’s look at k = 5 to 50:
k_choice <- 5:50
# data.frame to store the accuracy for the different choices of k
knn_predictions <-
data.frame(
k = k_choice,
accuracy = rep(-1, length(k_choice))
)
The knn.cv() function in the class package performs KNN using leave-one-out cross-validation and returns the predicted class for each observation based on its nearest neighbors.
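For example, a single call with k = 5 (shown only as a quick illustration, not part of the original analysis) returns one predicted class per star, which we can cross-tabulate against the true types:

# one LOOCV run of KNN with k = 5 (illustration only)
pred_k5 <-
  class::knn.cv(
    train = star_sc |> select(-type),
    cl = star_sc$type,
    k = 5
  )

# rows = predicted type, columns = actual type
table(predicted = pred_k5, actual = star_sc$type)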
We’ll use a for loop to calculate the accuracy for the different choices of k:
for (i in seq_along(k_choice)) {
  knn_temp <-
    class::knn.cv(
      train = star_sc |> select(-type),
      cl = star_sc$type,
      k = k_choice[i]
    )

  # calculating the accuracy of the predictions for this choice of k
  knn_predictions[i, 2] <-
    caret::confusionMatrix(data = knn_temp, reference = star_sc$type) |>
    purrr::pluck("overall") |>
    purrr::pluck("Accuracy")
}
tibble(knn_predictions)
## # A tibble: 46 × 2
## k accuracy
## <int> <dbl>
## 1 5 0.988
## 2 6 0.988
## 3 7 0.988
## 4 8 0.988
## 5 9 0.983
## 6 10 0.983
## 7 11 0.988
## 8 12 0.983
## 9 13 0.983
## 10 14 0.975
## # ℹ 36 more rows
Let’s create a graph to see how the accuracy changes for the different values of k:
# Graphing the accuracy for each choice of k:
knn_predictions |>
ggplot(
mapping = aes(
x = k,
y = accuracy
)
) +
geom_line(
color = "darkred",
linewidth = 1
) +
labs(
x = "Choice of k",
y = "Correct Prediction Percentage"
) +
scale_x_continuous(
breaks = round(seq(min(k_choice), max(k_choice), length.out = 10)),
expand = c(0, 0)
) +
scale_y_continuous(labels = scales::percent)
The choices of k from 5 to 17 all have an accuracy of at least 97.5%!
We can use slice_max() to find which values of k have the highest accuracy:
knn_predictions |>
slice_max(accuracy, n = 1)
## k accuracy
## 1 5 0.988
## 2 6 0.988
## 3 7 0.988
## 4 8 0.988
## 5 11 0.988
The find_best_k() function in the additional R script will calculate the accuracy for the supplied choices of k:

x = the original (not rescaled) explanatory variables
y = a vector of the response variable
k_vec = a vector of k values to loop through
rescale = how to rescale x: “normalize”, “standardize”, or “both” (default)
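The actual implementation lives in the course script, but here is a minimal sketch of what such a function could look like, assuming it wraps class::knn.cv() the same way as the loop above; the helper name find_best_k_sketch and the min-max normalization formula are assumptions for illustration only:

find_best_k_sketch <- function(x, y, k_vec, rescale = "both") {
  # two common rescaling choices (assumed, for illustration)
  standardize <- function(v) (v - mean(v)) / sd(v)
  normalize   <- function(v) (v - min(v)) / (max(v) - min(v))

  methods <- if (rescale == "both") c("normalize", "standardize") else rescale

  purrr::map_dfr(
    methods,
    function(m) {
      f <- if (m == "normalize") normalize else standardize
      x_sc <- dplyr::mutate(x, dplyr::across(dplyr::everything(), f))

      tibble::tibble(
        k = k_vec,
        rescale_method = m,
        # LOOCV accuracy for each k: proportion of stars classified correctly
        accuracy = purrr::map_dbl(
          k_vec,
          \(k) mean(class::knn.cv(train = x_sc, cl = y, k = k) == y)
        )
      )
    }
  )
}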
source("find best k function.R")
set.seed(5230)
knn_df <-
find_best_k(
x = stars |> dplyr::select(-type),
y = stars$type,
k_vec = 1:50
)
# creating a graph comparing normalized and standardized accuracy
ggplot(
data = knn_df,
mapping = aes(
x = k,
y = accuracy,
color = rescale_method
)
) +
geom_line(linewidth = 1)
knn_df |>
slice_max(accuracy, n = 1)
## # A tibble: 1 × 3
## k rescale_method accuracy
## <int> <chr> <dbl>
## 1 4 normalize 1
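As a quick follow-up (assuming the min-max normalization sketched above; the script’s exact rescaling may differ), we can re-run knn.cv() ourselves with the winning k = 4 on normalized data and compute the accuracy directly:

# normalize each numeric column to [0, 1], then LOOCV with k = 4
star_norm <-
  stars |>
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = ~ (.x - min(.x)) / (max(.x) - min(.x))
    )
  )

pred_best <-
  class::knn.cv(
    train = star_norm |> select(-type),
    cl = star_norm$type,
    k = 4
  )

# proportion of stars classified correctly
mean(pred_best == stars$type)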