# Library
## tidyverse for data wrangling
library(tidyverse)
## plotly for 3D plots
library(plotly)
## class for classification functions
library(class)
## caret for train() function
library(caret)
## scales for rescaling
library(scales)

The Data

# loading data
fish <- read_csv("fishcatch.csv")
## Rows: 159 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): std_name, common_name, sex
## dbl (8): weight_g, length_nose2tail_base_cm, length_nose2tail_notch_cm, leng...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(fish)
## # A tibble: 6 × 11
##   std_name    common_name weight_g length_nose2tail_bas…¹ length_nose2tail_not…²
##   <chr>       <chr>          <dbl>                  <dbl>                  <dbl>
## 1 Abramis_br… bream            242                   23.2                   25.4
## 2 Abramis_br… bream            290                   24                     26.3
## 3 Abramis_br… bream            340                   23.9                   26.5
## 4 Abramis_br… bream            363                   26.3                   29  
## 5 Abramis_br… bream            430                   26.5                   29  
## 6 Abramis_br… bream            450                   26.8                   29.7
## # ℹ abbreviated names: ¹​length_nose2tail_base_cm, ²​length_nose2tail_notch_cm
## # ℹ 6 more variables: length_nose2tail_end_cm <dbl>, height_cm <dbl>,
## #   width_cm <dbl>, height2length_pct <dbl>, width2length_pct <dbl>, sex <chr>
str(fish)
## spc_tbl_ [159 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ std_name                 : chr [1:159] "Abramis_brama" "Abramis_brama" "Abramis_brama" "Abramis_brama" ...
##  $ common_name              : chr [1:159] "bream" "bream" "bream" "bream" ...
##  $ weight_g                 : num [1:159] 242 290 340 363 430 450 500 390 450 500 ...
##  $ length_nose2tail_base_cm : num [1:159] 23.2 24 23.9 26.3 26.5 26.8 26.8 27.6 27.6 28.5 ...
##  $ length_nose2tail_notch_cm: num [1:159] 25.4 26.3 26.5 29 29 29.7 29.7 30 30 30.7 ...
##  $ length_nose2tail_end_cm  : num [1:159] 30 31.2 31.1 33.5 34 34.7 34.5 35 35.1 36.2 ...
##  $ height_cm                : num [1:159] 11.5 12.5 12.4 12.7 12.4 13.6 14.2 12.7 14 14.2 ...
##  $ width_cm                 : num [1:159] 4 4.3 4.7 4.5 5.1 4.9 5.3 4.7 4.8 5 ...
##  $ height2length_pct        : num [1:159] 38.4 40 39.8 38 36.6 39.2 41.1 36.2 39.9 39.3 ...
##  $ width2length_pct         : num [1:159] 13.4 13.8 15.1 13.3 15.1 14.2 15.3 13.4 13.8 13.7 ...
##  $ sex                      : chr [1:159] NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   std_name = col_character(),
##   ..   common_name = col_character(),
##   ..   weight_g = col_double(),
##   ..   length_nose2tail_base_cm = col_double(),
##   ..   length_nose2tail_notch_cm = col_double(),
##   ..   length_nose2tail_end_cm = col_double(),
##   ..   height_cm = col_double(),
##   ..   width_cm = col_double(),
##   ..   height2length_pct = col_double(),
##   ..   width2length_pct = col_double(),
##   ..   sex = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
unique(fish$std_name)
## [1] "Abramis_brama"     "Leusiscus_idus"    "Leuciscus_rutilus"
## [4] "Abramis_bjrkna"    "Osmerus_eperlanus" "Esox_lucius"      
## [7] "Perca_fluviatilis"

These data consist of fish catch records in Lake Laengelmavesi, Finland, from 1917. Seven species of fish were caught and multiple measurements were taken, including weight (g) as well as height, weight, and three measures of length (cm). The data also include ratios of height to length and width to length, as a percent.

In order to categorize fish based on these measurements, we need to first standardize the measurements and tidy the data. I will do this by converting the measurements into z-scores, which subtracts the mean and divides by the standard deviation to transform the data into a standard normal distribution range.

fish_std <- fish |>
  # remove columns we don't need
  select(!common_name & !sex) |>
  # remove NAs
  drop_na()

z_score <- function(x) {
  return((x - mean(x)) / sd(x))
}

fish_std <- tibble(data.frame(fish_std[,1],
                              apply(fish_std[,-1], 2, FUN = z_score),
                              id = seq(nrow(fish_std)))) |>
  mutate(std_name = as.factor(std_name))

Now we can visualize the some of the data in the same reference space. For instance, width, height, and weight:

p1 <- plot_ly(fish_std,
             x = ~width_cm,
             y = ~height_cm,
             z = ~weight_g,
             color = ~std_name,
             size = I(150)) |>
  add_markers() |>
  layout(scene = list(xaxis = list(title = "Width"),
                      yaxis = list(title = "Height"),
                      zaxis = list(title = "Weight"))) 
p1

There are some distinct groupings by species, as well as potential linear or logarithmic trends in these variables.

kNN

Single Case

First, I ran single k-Nearest Neighbors run on a train and test subset of the fish catch data. I started with a k equal to the square-root of the data size, as recommended as a potential starting point in the reading. Ultimately I expanded that to multiple k’s. I set the training dataset to be 67% of the original 158 rows, sampling randomly from each species to be sure of a representative subset.

set.seed(42)
# sample random rows from each species
fish_train <- fish_std |>
  group_by(std_name) |>
  slice_sample(prop = 0.67) 

# pull non-sampled rows for test dataset
fish_test <- filter(fish_std, !(id %in% fish_train$id))

# determine initial k value
k1 <- round(sqrt(nrow(fish_train)))

# run kNN
fish_knn <- knn(test = fish_test[,-c(1,10), drop = FALSE], 
                train = fish_train[,-c(1,10), drop = FALSE],
                cl = fish_train$std_name,
                k = k1)

A confusion matrix helps compare the real classifications of our test dataset to the predicted classes from kNN.

# create confusion matrix (caret package)
cm <- confusionMatrix(data = fish_knn, reference = fish_test$std_name)
cm
## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Abramis_bjrkna Abramis_brama Esox_lucius Leuciscus_rutilus
##   Abramis_bjrkna                 3             0           0                 0
##   Abramis_brama                  1            12           0                 0
##   Esox_lucius                    0             0           6                 0
##   Leuciscus_rutilus              0             0           0                 0
##   Leusiscus_idus                 0             0           0                 0
##   Osmerus_eperlanus              0             0           0                 0
##   Perca_fluviatilis              0             0           0                 7
##                    Reference
## Prediction          Leusiscus_idus Osmerus_eperlanus Perca_fluviatilis
##   Abramis_bjrkna                 0                 0                 0
##   Abramis_brama                  0                 0                 0
##   Esox_lucius                    0                 0                 0
##   Leuciscus_rutilus              0                 0                 1
##   Leusiscus_idus                 0                 0                 0
##   Osmerus_eperlanus              0                 5                 0
##   Perca_fluviatilis              2                 0                18
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.6703, 0.8957)
##     No Information Rate : 0.3455          
##     P-Value [Acc > NIR] : 6.331e-12       
##                                           
##                   Kappa : 0.7342          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Abramis_bjrkna Class: Abramis_brama
## Sensitivity                        0.75000               1.0000
## Specificity                        1.00000               0.9767
## Pos Pred Value                     1.00000               0.9231
## Neg Pred Value                     0.98077               1.0000
## Prevalence                         0.07273               0.2182
## Detection Rate                     0.05455               0.2182
## Detection Prevalence               0.05455               0.2364
## Balanced Accuracy                  0.87500               0.9884
##                      Class: Esox_lucius Class: Leuciscus_rutilus
## Sensitivity                      1.0000                  0.00000
## Specificity                      1.0000                  0.97917
## Pos Pred Value                   1.0000                  0.00000
## Neg Pred Value                   1.0000                  0.87037
## Prevalence                       0.1091                  0.12727
## Detection Rate                   0.1091                  0.00000
## Detection Prevalence             0.1091                  0.01818
## Balanced Accuracy                1.0000                  0.48958
##                      Class: Leusiscus_idus Class: Osmerus_eperlanus
## Sensitivity                        0.00000                  1.00000
## Specificity                        1.00000                  1.00000
## Pos Pred Value                         NaN                  1.00000
## Neg Pred Value                     0.96364                  1.00000
## Prevalence                         0.03636                  0.09091
## Detection Rate                     0.00000                  0.09091
## Detection Prevalence               0.00000                  0.09091
## Balanced Accuracy                  0.50000                  1.00000
##                      Class: Perca_fluviatilis
## Sensitivity                            0.9474
## Specificity                            0.7500
## Pos Pred Value                         0.6667
## Neg Pred Value                         0.9643
## Prevalence                             0.3455
## Detection Rate                         0.3273
## Detection Prevalence                   0.4909
## Balanced Accuracy                      0.8487

With a Kappa of 0.734 and an accuracy of 0.8, the initial run of the kNN did surprisingly well at predicting species. Interestingly, many of the mismatches were between species that shared a genus, just as Abramis bjrkna and Abramis brama, as well as Leuciscus rutilus and Leuscis idus in some runs.

Repeated Runs

We can get a confidence interval by repeating the sampling process multiple times and evaluating the spread of our kappa and accuracy values.

# create a function to produce the stats of interest
fish_knn_fxn <- function(x) {
  fish_train <- fish_std |>
    group_by(std_name) |>
    slice_sample(prop = 0.67) 

  fish_test <- filter(fish_std, !(id %in% fish_train$id))
  k1 <- round(sqrt(nrow(fish_train)))
  
  fish_knn <- knn(test = fish_test[,-c(1,10), drop = FALSE], 
                  train = fish_train[,-c(1,10), drop = FALSE],
                  cl = fish_train$std_name,
                  k = k1)
  cm <- confusionMatrix(data = fish_knn, reference = fish_test$std_name)
  
  return(cm$overall[1:2])
}
# test function on a single run
fish_knn_fxn(fish_std)
##  Accuracy     Kappa 
## 0.8545455 0.8071867
# establish loop to repeat function multiple times
n <- 100
fish_knn_df <- matrix(nrow = n, ncol = 2)
for(i in seq(n)) {
  fish_knn_df[i,] <- fish_knn_fxn(fish_std)
}
fish_knn_df
##             [,1]      [,2]
##   [1,] 0.8363636 0.7845953
##   [2,] 0.7818182 0.7130435
##   [3,] 0.8727273 0.8324630
##   [4,] 0.8363636 0.7825132
##   [5,] 0.8363636 0.7866379
##   [6,] 0.8181818 0.7572816
##   [7,] 0.8181818 0.7621107
##   [8,] 0.8181818 0.7614918
##   [9,] 0.8181818 0.7641509
##  [10,] 0.8727273 0.8323171
##  [11,] 0.8363636 0.7825132
##  [12,] 0.8181818 0.7641509
##  [13,] 0.8363636 0.7842197
##  [14,] 0.8000000 0.7360384
##  [15,] 0.7818182 0.7056200
##  [16,] 0.8181818 0.7596154
##  [17,] 0.8000000 0.7364983
##  [18,] 0.8363636 0.7832750
##  [19,] 0.8000000 0.7318262
##  [20,] 0.8363636 0.7825132
##  [21,] 0.7818182 0.7105263
##  [22,] 0.8181818 0.7594051
##  [23,] 0.8363636 0.7844077
##  [24,] 0.8181818 0.7608696
##  [25,] 0.8363636 0.7819383
##  [26,] 0.8545455 0.8073555
##  [27,] 0.8000000 0.7318262
##  [28,] 0.7818182 0.7089947
##  [29,] 0.8000000 0.7367276
##  [30,] 0.8727273 0.8349057
##  [31,] 0.8181818 0.7568523
##  [32,] 0.8000000 0.7315883
##  [33,] 0.8363636 0.7825132
##  [34,] 0.8181818 0.7621107
##  [35,] 0.8181818 0.7596154
##  [36,] 0.8363636 0.7847826
##  [37,] 0.7818182 0.7135417
##  [38,] 0.8545455 0.8103448
##  [39,] 0.8000000 0.7380952
##  [40,] 0.8363636 0.7820343
##  [41,] 0.8000000 0.7378683
##  [42,] 0.8181818 0.7591944
##  [43,] 0.8181818 0.7621107
##  [44,] 0.8363636 0.7866379
##  [45,] 0.8181818 0.7583480
##  [46,] 0.8545455 0.8075241
##  [47,] 0.8000000 0.7360384
##  [48,] 0.8545455 0.8081953
##  [49,] 0.7818182 0.7115385
##  [50,] 0.7818182 0.7130435
##  [51,] 0.7636364 0.6855761
##  [52,] 0.8545455 0.8076923
##  [53,] 0.8181818 0.7596154
##  [54,] 0.8181818 0.7568523
##  [55,] 0.8545455 0.8073555
##  [56,] 0.8181818 0.7600349
##  [57,] 0.8363636 0.7842197
##  [58,] 0.7818182 0.7115385
##  [59,] 0.8181818 0.7616984
##  [60,] 0.8000000 0.7368421
##  [61,] 0.8181818 0.7591944
##  [62,] 0.8181818 0.7626241
##  [63,] 0.8363636 0.7888225
##  [64,] 0.8363636 0.7866379
##  [65,] 0.8727273 0.8323171
##  [66,] 0.8545455 0.8091934
##  [67,] 0.8363636 0.7830850
##  [68,] 0.8181818 0.7596154
##  [69,] 0.8363636 0.7825132
##  [70,] 0.8181818 0.7596154
##  [71,] 0.8181818 0.7570671
##  [72,] 0.8181818 0.7587719
##  [73,] 0.8909091 0.8570191
##  [74,] 0.8545455 0.8076923
##  [75,] 0.7818182 0.7097625
##  [76,] 0.8181818 0.7591944
##  [77,] 0.8181818 0.7564216
##  [78,] 0.7818182 0.7082228
##  [79,] 0.8181818 0.7572816
##  [80,] 0.8545455 0.8073555
##  [81,] 0.8181818 0.7596154
##  [82,] 0.8363636 0.7823219
##  [83,] 0.8000000 0.7392241
##  [84,] 0.8000000 0.7392241
##  [85,] 0.7818182 0.7130435
##  [86,] 0.8181818 0.7589833
##  [87,] 0.8727273 0.8323171
##  [88,] 0.8000000 0.7339490
##  [89,] 0.8181818 0.7568523
##  [90,] 0.8181818 0.7568523
##  [91,] 0.8363636 0.7860847
##  [92,] 0.8181818 0.7568523
##  [93,] 0.8545455 0.8073555
##  [94,] 0.8363636 0.7821303
##  [95,] 0.8181818 0.7568523
##  [96,] 0.8363636 0.7871883
##  [97,] 0.8181818 0.7596154
##  [98,] 0.7818182 0.7180692
##  [99,] 0.8181818 0.7568523
## [100,] 0.8545455 0.8075241
fish_cv <- data.frame(k = c(mean(fish_knn_df[,1]), sd(fish_knn_df[,1])),
                      acc = c(mean(fish_knn_df[,2]), sd(fish_knn_df[,2])))
row.names(fish_cv) <- c("mean", "sd")

head(fish_cv)
##               k       acc
## mean 0.82236364 0.7652655
## sd   0.02530126 0.0338257

Our kappa and accuracy are consistently high (k = 0.8223636, sd = 0.0253013; accuracy = 0.7652655, sd = 0.0338257) with relatively little spread. Given the sample size of some species in the dataset, there may be some bias in the classification, but I am impressed with the consistently high accuracy.