suppressPackageStartupMessages({
library(tidyverse)
library(knitr)
library(Hmisc)
library(reshape2)
library(psych)
})
We first import the data directly from the URL, then extract the ShuckedWeight column into a separate single-column data frame as the target data.
df <- read.csv("https://s3.us-east-2.amazonaws.com/artificium.us/datasets/abalone.csv")
target_data <- select(df, ShuckedWeight)
train_data <- select(df, -ShuckedWeight)
We perform some initial exploration below of the data distributions, missing values, and collinearity. We observe that there are no missing values in this dataset and that the data contain many highly correlated features.
str(df)
## 'data.frame': 4178 obs. of 9 variables:
## $ Length : num 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
## $ Diameter : num 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
## $ Height : num 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
## $ ShuckedWeight: num 0.2245 0.0995 0.2565 0.2155 0.0895 ...
## $ VisceraWeight: num 0.101 0.0485 0.1415 0.114 0.0395 ...
## $ ShellWeight : num 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
## $ WholeWeight : num 0.514 0.226 0.677 0.516 0.205 ...
## $ NumRings : int 15 7 9 10 7 8 20 16 9 19 ...
## $ Sex : chr "M" "M" "F" "F" ...
summary(df)
## Length Diameter Height ShuckedWeight
## Min. :0.075 Min. :0.0550 Min. :0.0000 Min. :0.0010
## 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150 1st Qu.:0.1861
## Median :0.545 Median :0.4250 Median :0.1400 Median :0.3360
## Mean :0.524 Mean :0.4079 Mean :0.1395 Mean :0.3595
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650 3rd Qu.:0.5020
## Max. :0.815 Max. :0.6500 Max. :1.1300 Max. :1.4880
## VisceraWeight ShellWeight WholeWeight NumRings
## Min. :0.0005 Min. :0.0015 Min. :0.0020 Min. : 1.000
## 1st Qu.:0.0935 1st Qu.:0.1300 1st Qu.:0.4416 1st Qu.: 8.000
## Median :0.1710 Median :0.2340 Median :0.7997 Median : 9.000
## Mean :0.1806 Mean :0.2389 Mean :0.8290 Mean : 9.934
## 3rd Qu.:0.2530 3rd Qu.:0.3290 3rd Qu.:1.1538 3rd Qu.:11.000
## Max. :0.7600 Max. :1.0050 Max. :2.8255 Max. :29.000
## Sex
## Length:4178
## Class :character
## Mode :character
##
##
##
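Since Sex is the only categorical column, we can also inspect its level counts directly; a quick sketch:
# Count the levels of the lone categorical column
table(df$Sex)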
# Distribution
hist.data.frame(df)
# Check for missing values
missing <- apply(df, MARGIN=2, function(x) sum(is.na(x))) %>%
as.data.frame() %>%
rename(NumMissing = 1) %>%
mutate(PctMissing = NumMissing / nrow(df) * 100) %>%
arrange(-PctMissing)
kable(missing)
|   | NumMissing | PctMissing |
|---|---|---|
| Length | 0 | 0 |
| Diameter | 0 | 0 |
| Height | 0 | 0 |
| ShuckedWeight | 0 | 0 |
| VisceraWeight | 0 | 0 |
| ShellWeight | 0 | 0 |
| WholeWeight | 0 | 0 |
| NumRings | 0 | 0 |
| Sex | 0 | 0 |
# Inspect collinearity
pairs.panels(df[-9])
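As a numeric complement to the scatterplot matrix, the pairwise Pearson correlations can be computed directly with base R's cor(); a minimal sketch:
# Pairwise Pearson correlations among the numeric columns
round(cor(df[sapply(df, is.numeric)]), 2)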
Next, we encode the single non-numeric column, Sex, as a 0/1 indicator (dummy) variable, with 1 for male and 0 otherwise. This encoding is especially amenable to the min-max normalization we will perform in the next step because it already sits on a 0-to-1 scale, and it is a straightforward way to represent a binary categorical variable (male and female) numerically.
train_data$Sex <- ifelse(train_data$Sex == "M", 1, 0)
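Were Sex to contain more than two levels, a full one-hot encoding could be built with base R's model.matrix(); the sketch below (sex_onehot is a hypothetical name) drops the intercept so every level gets its own indicator column.
# A sketch: full one-hot encoding of Sex; removing the intercept ("- 1")
# yields one 0/1 indicator column per level rather than a baseline contrast
sex_onehot <- model.matrix(~ Sex - 1, data = df)
head(sex_onehot)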
Then, we normalize all the remaining columns (all numeric) using min-max normalization: subtract each column's minimum, then divide by the column's range. Note that ShuckedWeight is left unnormalized so that we do not need to invert the normalization after predicting a value.
train_data.norm <- lapply(train_data, function(x) {
(x - min(x)) / (max(x) - min(x))
}) %>% as.data.frame()
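As a quick sanity check, every normalized column should now span exactly 0 to 1:
# Verify the min-max normalization: each column should range from 0 to 1
sapply(train_data.norm, range)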
Then, we create a custom KNN regression function to predict the shucked weight, a continuous variable, given the other features. This involves the following: computing the Euclidean distance from each new case to every training case, selecting the k training cases with the smallest distances, and averaging their target values to produce the prediction.
euclidean_dist <- function(v1, v2) {
sqrt(sum((v1-v2)^2))
}
knn.reg <- function(new_data, target_data, train_data, k) {
# new_data: df with new cases
# target_data: df with single column of expected outcome of train_data
# train_data: df with normalized and encoded features that correspond to values in target_data
# k: number of nearest neighbors to perform KNN regression on
# Check that column names match
if (!identical(colnames(new_data), colnames(train_data))) {
stop("Column names of new_data and train_data must match")
}
# Check target_data length matches train_data length
if (nrow(target_data) != nrow(train_data)) {
stop("Row count of target_data and train_data must match")
}
# Check for any NA
if (anyNA(new_data) || anyNA(target_data) || anyNA(train_data)) {
stop("Inputs must not contain any NA values")
}
# Calculate a distance vector for each new data point to the train data values
predictions <- map(seq(1,nrow(new_data)), function(ii) {
distances <- apply(train_data, MARGIN=1, function(row) {
euclidean_dist(as.numeric(row), as.numeric(new_data[ii,]))
})
# Get the indices of the k nearest neighbors
nn_ind <- order(distances)[1:k]
# Get the values of those k nearest neighbors
nn_vals <- target_data[nn_ind,]
# Return a simple average (regression version of kNN)
mean(nn_vals)
})
# Return as a vector
return(unlist(predictions))
}
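Because the per-row apply() loop dominates the runtime, a vectorized variant is worth noting. The sketch below (knn.reg.fast is a hypothetical name) computes all pairwise squared distances at once with matrix arithmetic; since squaring is monotonic, ordering by squared distance picks the same neighbors as ordering by distance.
# A sketch of a vectorized alternative, using the identity
# ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * (x . y)
knn.reg.fast <- function(new_data, target_data, train_data, k) {
  X <- as.matrix(train_data)
  Y <- as.matrix(new_data)
  # m x n matrix of squared distances: new case i (rows) to training case j (columns)
  d2 <- outer(rowSums(Y^2), rowSums(X^2), "+") - 2 * Y %*% t(X)
  # For each new case, average the target values of its k nearest neighbors
  apply(d2, MARGIN = 1, function(row) mean(target_data[[1]][order(row)[1:k]]))
}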
We demonstrate our custom function on 10 randomly generated new data points whose feature values fall within the range of their respective columns.
set.seed(-1)
# Generate random data
random_data <- lapply(train_data.norm, function(column, n=10) {
# Generate n random values within the min and max of the column
runif(n, min = min(column), max = max(column))
}) %>%
as.data.frame() %>%
mutate(Sex = round(Sex)) # convert back to 0 or 1
# Run knn.reg
predictions <- knn.reg(random_data, target_data, train_data.norm, k=5)
cbind(random_data, predictions) %>% round(digits=2) %>% kable()
| Length | Diameter | Height | VisceraWeight | ShellWeight | WholeWeight | NumRings | Sex | predictions |
|---|---|---|---|---|---|---|---|---|
| 0.49 | 0.79 | 0.40 | 0.08 | 0.54 | 0.79 | 0.26 | 1 | 0.60 |
| 0.19 | 0.26 | 0.97 | 0.71 | 0.01 | 0.65 | 0.87 | 1 | 0.40 |
| 0.99 | 0.27 | 0.38 | 0.18 | 0.39 | 0.07 | 0.14 | 1 | 0.38 |
| 0.15 | 0.54 | 0.61 | 0.14 | 0.85 | 0.38 | 0.01 | 0 | 0.40 |
| 0.24 | 0.29 | 0.25 | 0.72 | 0.10 | 0.42 | 0.43 | 1 | 0.41 |
| 0.54 | 0.95 | 0.28 | 0.76 | 0.50 | 0.19 | 0.31 | 1 | 0.64 |
| 0.36 | 0.06 | 0.34 | 0.37 | 0.83 | 0.20 | 0.34 | 1 | 0.41 |
| 0.87 | 0.09 | 0.41 | 0.40 | 0.29 | 0.89 | 0.10 | 0 | 0.70 |
| 0.39 | 0.31 | 0.57 | 0.31 | 0.80 | 0.33 | 0.65 | 1 | 0.38 |
| 0.22 | 0.77 | 0.02 | 0.43 | 0.22 | 0.89 | 0.62 | 0 | 0.65 |
Next, we forecast the Shucked Weight of a new abalone with the following characteristics, using a k of 3, with our custom function:
Sex: M | Length: 0.38 | Diameter: 0.490 | Height: 0.231 | Whole weight: 0.4653 | Viscera weight: 0.0847 | Shell weight: 0.17 | Rings: 11
new_abalone <- data.frame(
Length = 0.38,
Diameter = 0.490,
Height = 0.231,
VisceraWeight = 0.0847,
ShellWeight = 0.17,
WholeWeight = 0.4653,
NumRings = 11,
Sex = 1
)
# Normalize the new case together with the training data so that the same min/max values apply
temp_train <- lapply(rbind(new_abalone, train_data), function(x) {
(x - min(x)) / (max(x) - min(x))
}) %>% as.data.frame()
# Run knn.reg
prediction <- knn.reg(temp_train[1,], target_data, temp_train[-1,], 3)
Note that we first normalize the values using the same min/max normalization before running KNN. Our model predicts that this abalone will have a shucked weight of 0.21 grams.
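An equivalent approach that avoids re-binding the new case to the training data is to reuse the training columns' min and range directly; a minimal sketch (mins, ranges, and prediction_alt are hypothetical names, and it assumes the new values fall within the training ranges, as they do here):
# A sketch: scale the new case with the training columns' min and range
mins <- sapply(train_data, min)
ranges <- sapply(train_data, max) - mins
new_abalone.norm <- as.data.frame(Map(function(x, mn, rg) (x - mn) / rg,
                                      new_abalone, mins, ranges))
prediction_alt <- knn.reg(new_abalone.norm, target_data, train_data.norm, 3)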
Lastly, we assess the accuracy of our model by doing the following: splitting the data into training and validation sets (80/20), generating predictions for the validation set with k = 3, and computing the squared error between the predicted and actual values.
# Split data
split_train_val <- function(x, train_pct) {
# Generate random indices
rand_ind <- sample(seq(1,nrow(x)),
size = floor(train_pct * nrow(x)),
replace = FALSE)
# Split
df.train <- x[rand_ind,]
df.val <- x[-rand_ind,]
list(train = df.train, val = df.val)
}
split_data <- split_train_val(cbind(target_data, train_data.norm), train_pct=0.8)
df.train <- split_data$train
df.val <- split_data$val
# Run knn.reg
start <- proc.time()
predictions <- knn.reg(df.val[,-1], df.train[1], df.train[,-1], 3)
elap <- (proc.time() - start)["elapsed"]
# Assess accuracy with squared error: the pipeline below sums the squared
# errors (SSE); dividing by the validation set size gives the MSE
assess <- cbind(df.val[1], predictions)
sse <- assess %>%
  mutate(SqErr = (predictions - ShuckedWeight)^2) %>%
  pull(SqErr) %>%
  sum()
mse <- sse / nrow(df.val)
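# Additional metrics (a sketch): RMSE is in the same units as ShuckedWeight,
# and MAE is less sensitive to outliers than squared-error metrics
rmse <- sqrt(mean((assess$predictions - assess$ShuckedWeight)^2))
mae <- mean(abs(assess$predictions - assess$ShuckedWeight))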
assess %>%
ggplot(aes(x=ShuckedWeight, y=predictions)) +
geom_point() +
theme_bw() +
geom_abline(slope=1, linetype="dashed", color="red") +
labs(x = "Actual Shucked Weight",
y = "KNN Predicted Shucked Weight",
title = "Actual vs Predicted shucked oyster weight using KNN regression, k=3")
We trained the model with 3342 data points and validated with 836 data points. The total elapsed time to generate predictions for the validation set (KNN is a lazy learner, so there is no separate training step) was 257.93 seconds. The sum of squared errors (SSE) of our validation predictions against the actual values was 1.7089774, which corresponds to an MSE of approximately 0.002. The plot above shows the actual versus predicted values from running our model.
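As a natural follow-up, the same split can be reused to compare candidate values of k and pick the one with the lowest validation MSE; a minimal sketch (k_grid and mse_by_k are hypothetical names):
# A sketch: evaluate candidate k values on the same train/validation split
k_grid <- c(1, 3, 5, 7, 9)
mse_by_k <- sapply(k_grid, function(k) {
  preds <- knn.reg(df.val[,-1], df.train[1], df.train[,-1], k)
  mean((preds - df.val$ShuckedWeight)^2)
})
data.frame(k = k_grid, MSE = mse_by_k)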