suppressPackageStartupMessages({
  library(tidyverse)
  library(knitr)
  library(Hmisc)
  library(reshape2)
  library(psych)
})

Data Import and Exploration

We first import the data directly from the URL, then extract the ShuckedWeight values into a separate data frame to serve as the target.

df <- read.csv("https://s3.us-east-2.amazonaws.com/artificium.us/datasets/abalone.csv")
target_data <- select(df, ShuckedWeight)
train_data <- select(df, -ShuckedWeight)

We do some initial exploration below of the data distributions, missing values, and collinearity. We observe that there are no missing values in this dataset and that the data contain many highly correlated features.

str(df)
## 'data.frame':    4178 obs. of  9 variables:
##  $ Length       : num  0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ Diameter     : num  0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ Height       : num  0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ ShuckedWeight: num  0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ VisceraWeight: num  0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ ShellWeight  : num  0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ WholeWeight  : num  0.514 0.226 0.677 0.516 0.205 ...
##  $ NumRings     : int  15 7 9 10 7 8 20 16 9 19 ...
##  $ Sex          : chr  "M" "M" "F" "F" ...
summary(df)
##      Length         Diameter          Height       ShuckedWeight   
##  Min.   :0.075   Min.   :0.0550   Min.   :0.0000   Min.   :0.0010  
##  1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150   1st Qu.:0.1861  
##  Median :0.545   Median :0.4250   Median :0.1400   Median :0.3360  
##  Mean   :0.524   Mean   :0.4079   Mean   :0.1395   Mean   :0.3595  
##  3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650   3rd Qu.:0.5020  
##  Max.   :0.815   Max.   :0.6500   Max.   :1.1300   Max.   :1.4880  
##  VisceraWeight     ShellWeight      WholeWeight        NumRings     
##  Min.   :0.0005   Min.   :0.0015   Min.   :0.0020   Min.   : 1.000  
##  1st Qu.:0.0935   1st Qu.:0.1300   1st Qu.:0.4416   1st Qu.: 8.000  
##  Median :0.1710   Median :0.2340   Median :0.7997   Median : 9.000  
##  Mean   :0.1806   Mean   :0.2389   Mean   :0.8290   Mean   : 9.934  
##  3rd Qu.:0.2530   3rd Qu.:0.3290   3rd Qu.:1.1538   3rd Qu.:11.000  
##  Max.   :0.7600   Max.   :1.0050   Max.   :2.8255   Max.   :29.000  
##      Sex           
##  Length:4178       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Distribution
hist.data.frame(df)

# Check for missing values
missing <- apply(df, MARGIN=2, function(x) sum(is.na(x))) %>%
  as.data.frame() %>%
  rename(NumMissing = 1) %>%
  mutate(PctMissing = NumMissing / nrow(df) * 100) %>%
  arrange(-PctMissing)
kable(missing)
               NumMissing  PctMissing
Length                  0           0
Diameter                0           0
Height                  0           0
ShuckedWeight           0           0
VisceraWeight           0           0
ShellWeight             0           0
WholeWeight             0           0
NumRings                0           0
Sex                     0           0
# Inspect collinearity
pairs.panels(df[-9])
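
As a complement to the plot, a quick numeric check (base R only) of the pairwise correlations among the numeric columns confirms the strong interrelationships:

# Correlation matrix of the numeric columns (Sex, column 9, excluded)
round(cor(df[-9]), 2)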

Data Preparation

Encoding Categorical Variables

Next, we encode the single non-numeric column, Sex, as a 0/1 indicator (dummy) variable. This encoding is especially amenable to the min-max normalization we perform in the next step because it is already on a scale of 0 to 1, and it is a straightforward way to convert a binary categorical variable (male and female) into a 1 or a 0.

train_data$Sex <- ifelse(train_data$Sex == "M", 1, 0)
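
If Sex had more than two levels, a full one-hot encoding would be needed instead. A minimal sketch using base R's model.matrix (illustrative only; it is not used in the pipeline below, since our column is binary):

# One 0/1 indicator column per level of Sex (here just SexF and SexM)
sex_onehot <- model.matrix(~ Sex - 1, data = df)
head(sex_onehot)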

Min-Max Normalization

Then, we normalize all of the columns (now all numeric) using min-max normalization: subtract the column's minimum, then divide by the column's range. Note that ShuckedWeight is not normalized, so we do not need to invert the normalization after predicting a value.

train_data.norm <- lapply(train_data, function(x) {
  (x - min(x)) / (max(x) - min(x))
}) %>% as.data.frame()
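
One subtlety of min-max scaling is that new observations should be transformed with the training columns' minima and ranges rather than their own. A minimal sketch of storing those parameters for reuse (in the forecasting section below we take the simpler route of binding the new case to the training data before normalizing):

# Record each training column's min and range so the identical transformation
# can be applied to future observations
norm_params <- data.frame(
  col_min   = sapply(train_data, min),
  col_range = sapply(train_data, function(x) max(x) - min(x))
)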

Data Modeling with a Custom KNN Regression Function

Then, we create a custom KNN regression function to predict the shucked weight, a continuous variable, given the other features. This involves the following:

  1. Check inputs: column names match between the new data and training data, row counts match between the target data and training data, and no NAs in any input.
  2. For each row of new data, calculate its Euclidean distance to each row of the training set. Obtain the smallest k distances (the k nearest neighbors), then calculate their average Shucked Weight.
  3. Return the predicted Shucked Weight for each row of new data, as a vector.
euclidean_dist <- function(v1, v2) {
  sqrt(sum((v1-v2)^2))
}
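
# Quick sanity check of the distance helper (illustrative): a 3-4-5 triangle
euclidean_dist(c(0, 0), c(3, 4))  # expect 5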

knn.reg <- function(new_data, target_data, train_data, k) {
  # new_data: df of new cases to predict
  # target_data: single-column df with the outcome for each row of train_data
  # train_data: df of normalized, encoded features corresponding to target_data
  # k: number of nearest neighbors to average over
  
  # Check that column names match
  if (!identical(colnames(new_data), colnames(train_data))) {
    stop("Column names of new_data and train_data must match")
  }
  
  # Check target_data length matches train_data length
  if (nrow(target_data) != nrow(train_data)) {
    stop("Row count of target_data and train_data must match")
  }
  
  # Check for any NA
  if (anyNA(new_data) || anyNA(target_data) || anyNA(train_data)) {
    stop("Inputs must not contain any NA values")
  }
  
  # Calculate a distance vector for each new data point to the train data values
  predictions <- map(seq(1,nrow(new_data)), function(ii) {
    distances <- apply(train_data, MARGIN=1, function(row) {
      euclidean_dist(as.numeric(row), as.numeric(new_data[ii,]))
    })
    # Get the indices of the k nearest neighbors
    nn_ind <- order(distances)[1:k]
    # Get the values of those k nearest neighbors
    nn_vals <- target_data[nn_ind,]
    # Return a simple average (regression version of kNN)
    mean(nn_vals)
  })
  
  # Return as a vector
  return(unlist(predictions))
}

We demonstrate our custom function on 10 randomly generated new data points whose feature values fall within the range of their respective (normalized) columns.

set.seed(-1)
# Generate random data
random_data <- lapply(train_data.norm, function(column, n=10) {
    # Generate n random values within the min and max of the column
    runif(n, min = min(column), max = max(column))
  }) %>%
  as.data.frame() %>%
  mutate(Sex = round(Sex)) # convert back to 0 or 1
# Run knn.reg
predictions <- knn.reg(random_data, target_data, train_data.norm, k=5)
cbind(random_data, predictions) %>% round(digits=2) %>% kable()
Length  Diameter  Height  VisceraWeight  ShellWeight  WholeWeight  NumRings  Sex  predictions
  0.49      0.79    0.40           0.08         0.54         0.79      0.26    1         0.60
  0.19      0.26    0.97           0.71         0.01         0.65      0.87    1         0.40
  0.99      0.27    0.38           0.18         0.39         0.07      0.14    1         0.38
  0.15      0.54    0.61           0.14         0.85         0.38      0.01    0         0.40
  0.24      0.29    0.25           0.72         0.10         0.42      0.43    1         0.41
  0.54      0.95    0.28           0.76         0.50         0.19      0.31    1         0.64
  0.36      0.06    0.34           0.37         0.83         0.20      0.34    1         0.41
  0.87      0.09    0.41           0.40         0.29         0.89      0.10    0         0.70
  0.39      0.31    0.57           0.31         0.80         0.33      0.65    1         0.38
  0.22      0.77    0.02           0.43         0.22         0.89      0.62    0         0.65

Forecasting the Shucked Weight of a New Abalone

Next, we forecast the Shucked Weight of a new abalone with the following characteristics, using a k of 3, with our custom function:

Sex: M | Length: 0.38 | Diameter: 0.490 | Height: 0.231 | Whole weight: 0.4653 | Viscera weight: 0.0847 | Shell weight: 0.17 | Rings: 11

new_abalone <- data.frame(
  Length = 0.38,
  Diameter = 0.490,
  Height = 0.231,
  VisceraWeight = 0.0847,
  ShellWeight = 0.17,
  WholeWeight = 0.4653,
  NumRings = 11,
  Sex = 1
)
# Normalize the new case together with the training data so both use the same min/max
temp_train <- lapply(rbind(new_abalone, train_data), function(x) {
  (x - min(x)) / (max(x) - min(x))
}) %>% as.data.frame()
# Run knn.reg
prediction <- knn.reg(temp_train[1,], target_data, temp_train[-1,], 3)

Note that we first normalize the values with the same min-max scheme (binding the new case to the training data before scaling) and only then run KNN. Our model predicts that this abalone will have a shucked weight of 0.21 grams.
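
Had we normalized the target as well, the prediction would need to be mapped back to the original scale. A minimal sketch of that inverse transform (not needed here, since ShuckedWeight was left unscaled):

# Map a min-max-normalized prediction back to the original scale of x_orig
unnormalize <- function(x_norm, x_orig) {
  x_norm * (max(x_orig) - min(x_orig)) + min(x_orig)
}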

Assessing Model Accuracy

Lastly, we assess the accuracy of our model by doing the following:

  1. Split data into training (80%) and testing (20%)
  2. Run our custom algorithm, making sure to separate out the target variable and feeding the inputs correctly. We use a k value of 3 here.
  3. Compare the predicted output back to the actual values and calculate the sum of squared errors (SSE) and the mean squared error (MSE = SSE / n).
# Split data
split_train_val <- function(x, train_pct) {
  # Generate random indices
  rand_ind <- sample(seq(1,nrow(x)),
         size = floor(train_pct * nrow(x)),
         replace = FALSE)
  # Split
  df.train <- x[rand_ind,]
  df.val <- x[-rand_ind,]
  list(train = df.train, val = df.val)
}
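# Note: sample() above is not seeded, so this split is not reproducible across
# runs; calling set.seed() (e.g., set.seed(1)) before split_train_val() would
# fix the partition.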
split_data <- split_train_val(cbind(target_data, train_data.norm), train_pct=0.8)
df.train <- split_data$train
df.val <- split_data$val
# Run knn.reg
start <- proc.time()
predictions <- knn.reg(df.val[,-1], df.train[1], df.train[,-1], 3)
elap <- (proc.time() - start)["elapsed"]
# Assess accuracy with SSE and MSE
assess <- cbind(df.val[1], predictions)
sse <- assess %>%
  mutate(SqErr = (predictions - ShuckedWeight)^2) %>%
  pull(SqErr) %>%
  sum()
mse <- sse / nrow(assess)
assess %>%
  ggplot(aes(x=ShuckedWeight, y=predictions)) +
  geom_point() +
  theme_bw() +
  geom_abline(slope=1, linetype="dashed", color="red") +
  labs(x = "Actual Shucked Weight",
       y = "KNN Predicted Shucked Weight",
       title = "Actual vs Predicted shucked abalone weight using KNN regression, k=3")

We trained the model with 3342 data points and validated with 836 data points. The total time elapsed to compute predictions for the validation set was 257.93 seconds. The sum of squared errors (SSE) of our validation predictions against the actual values was 1.7089774, giving an MSE of roughly 0.002. The plot above shows the actual versus predicted values from running our model.
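
Most of that runtime comes from the per-row apply() loop inside knn.reg. As a possible optimization (a sketch under the same inputs, not benchmarked here), all pairwise distances can be computed at once with matrix algebra via the identity ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2:

knn.reg.fast <- function(new_data, target_data, train_data, k) {
  X <- as.matrix(train_data)  # training features
  Y <- as.matrix(new_data)    # new cases
  # All pairwise squared distances in one shot: |y|^2 + |x|^2 - 2*y.x
  d2 <- outer(rowSums(Y^2), rowSums(X^2), "+") - 2 * (Y %*% t(X))
  # Ranking by squared distance is equivalent to ranking by distance,
  # so we average the target over the k smallest entries in each row
  apply(d2, 1, function(d) mean(target_data[order(d)[1:k], 1]))
}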