1 Problem statement

Dataset:
https://archive.ics.uci.edu/ml/datasets/Abalone

Create a tool which can be used by researchers in the field to estimate the age of an abalone species based on measurements. This problem may be treated as regression or classification. Both approaches are considered for deployment.

2 Data understanding and data preparation

library(dplyr)
library(data.table)
library(caret)
library(mlbench)
library(Metrics)
library(ggfortify)
library(Cubist)
library(pls)
library(randomForest)

abalone <- read.csv("data/abalone.data", header=FALSE, stringsAsFactors=TRUE)
colnames(abalone) <- c("Sex", "Length", "Diameter", 
                       "Height", "Whole weight", "Shucked weight", 
                       "Viscera weight", "Shell weight", "Rings")
dim(abalone); head(abalone)

## [1] 4177    9

##   Sex Length Diameter Height Whole weight Shucked weight Viscera weight
## 1   M  0.455    0.365  0.095       0.5140         0.2245         0.1010
## 2   M  0.350    0.265  0.090       0.2255         0.0995         0.0485
## 3   F  0.530    0.420  0.135       0.6770         0.2565         0.1415
## 4   M  0.440    0.365  0.125       0.5160         0.2155         0.1140
## 5   I  0.330    0.255  0.080       0.2050         0.0895         0.0395
## 6   I  0.425    0.300  0.095       0.3515         0.1410         0.0775
##   Shell weight Rings
## 1        0.150    15
## 2        0.070     7
## 3        0.210     9
## 4        0.155    10
## 5        0.055     7
## 6        0.120     8

cat_EDA <- function(dataframe, i){
  sum_na <- sum(is.na(dataframe[[i]]))
  cat("===========", 
      "\nEDA RESULTS FOR '", colnames(dataframe[i]), "' VECTOR (CATEGORICAL):", 
      "\n===========", 
      "\nCount:\t", length(dataframe[[i]]), 
      "\nMissing values (%):\t", ((sum(is.na(dataframe[[i]])))/length(dataframe[[i]]))*100, 
      "\nCardinality:\t", length(names(table(dataframe[[i]]))),
      "\n1st Mode:\t", names(sort(table(dataframe[[i]]), 
                                decreasing=T))[1], 
      "\n1st Mode Freq.:\t", sort(table(dataframe[[i]]), 
                                decreasing=T)[[1]],
      "\n1st Mode (%):\t", sort(table(dataframe[[i]]), 
                              decreasing=T)[[1]]/length(dataframe[[i]])*100, 
      "\n2nd Mode:\t", names(sort(table(dataframe[[i]]), 
                                decreasing=T))[2], 
      "\n2nd Mode Freq.:\t", sort(table(dataframe[[i]]), 
                                decreasing=T)[[2]],
      "\n2nd Mode (%):\t", sort(table(dataframe[[i]]), 
                              decreasing=T)[[2]]/length(dataframe[[i]])*100, 
      "\n\n")
  
  cat("Summary:\n")
  print(summary(dataframe[[i]]))
  cat("\nDistinct values:")
  print(table(dataframe[[i]]))
  barplot(table(dataframe[[i]]), 
       xlab=colnames(dataframe[i]), 
       main=i)
}

cont_EDA <- function(dataframe, i){
  check_null <- any(is.null(dataframe))
  check_na <- any(is.na(dataframe))
  cat("===========", 
      "\nEDA RESULTS FOR '", colnames(dataframe[i]), "' VECTOR (CONTINUOUS):", 
      "\n===========",
      "\nCount:\t", length(dataframe[[i]]) , 
      "\nMissing values (%):\t", ((sum(is.na(dataframe[[i]]))) / length(dataframe[[i]])) * 100, 
      "\nCardinality:\t", length(names(table(dataframe[[i]]))),  
      "\nSD:\t", sd(dataframe[[i]]),
      "\n\n")
  
  cat("Summary:\n")
  print(summary(dataframe[[i]]))
  cat("\n")
  hist(dataframe[[i]], 
       xlab=colnames(dataframe[i]), 
       main=i)
}

# pre-cleaning EDA - categorical
cat_EDA(abalone, 1)

## =========== 
## EDA RESULTS FOR ' Sex ' VECTOR (CATEGORICAL): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  3 
## 1st Mode:     M 
## 1st Mode Freq.:   1528 
## 1st Mode (%):     36.58128 
## 2nd Mode:     I 
## 2nd Mode Freq.:   1342 
## 2nd Mode (%):     32.12832 
## 
## Summary:
##    F    I    M 
## 1307 1342 1528 
## 
## Distinct values:
##    F    I    M 
## 1307 1342 1528

# pre-cleaning EDA - continuous
par(mfrow=c(3, 3)); for(i in 2:9){cont_EDA(abalone, i)}

## =========== 
## EDA RESULTS FOR ' Length ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  134 
## SD:   0.1200929 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.075   0.450   0.545   0.524   0.615   0.815

## =========== 
## EDA RESULTS FOR ' Diameter ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  111 
## SD:   0.09923987 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0550  0.3500  0.4250  0.4079  0.4800  0.6500

## =========== 
## EDA RESULTS FOR ' Height ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  51 
## SD:   0.04182706 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1150  0.1400  0.1395  0.1650  1.1300

## =========== 
## EDA RESULTS FOR ' Whole weight ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  2429 
## SD:   0.490389 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0020  0.4415  0.7995  0.8287  1.1530  2.8255

## =========== 
## EDA RESULTS FOR ' Shucked weight ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  1515 
## SD:   0.2219629 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0010  0.1860  0.3360  0.3594  0.5020  1.4880

## =========== 
## EDA RESULTS FOR ' Viscera weight ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  880 
## SD:   0.1096143 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0005  0.0935  0.1710  0.1806  0.2530  0.7600

## =========== 
## EDA RESULTS FOR ' Shell weight ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  926 
## SD:   0.1392027 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0015  0.1300  0.2340  0.2388  0.3290  1.0050

## =========== 
## EDA RESULTS FOR ' Rings ' VECTOR (CONTINUOUS): 
## =========== 
## Count:    4177 
## Missing values (%):   0 
## Cardinality:  28 
## SD:   3.224169 
## 
## Summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   9.934  11.000  29.000

# check for outliers - continuous features
par(mfrow=c(2, 2))

for(i in c(2:9)){
  if(i == 6){par(mfrow=c(2, 2))}
  boxplot(abalone[i], xlab=colnames(abalone[i]), main=i)
  cat("Outliers for column", i, colnames(abalone[i]), ":", length(boxplot.stats(abalone[[i]])$out), "\n")
}

## Outliers for column 2 Length : 49

## Outliers for column 3 Diameter : 59

## Outliers for column 4 Height : 29

## Outliers for column 5 Whole weight : 30

## Outliers for column 6 Shucked weight : 48

## Outliers for column 7 Viscera weight : 26

## Outliers for column 8 Shell weight : 35

## Outliers for column 9 Rings : 278

# height may have outliers worth examining more as seen with the boxplot
out_ind <- c()

for(i in c(2:9)){
  vals_out <- boxplot.stats(abalone[[i]])$out
  out_ind <- unique(c(out_ind, which(abalone[[i]] %in% c(vals_out))))
}

dim(abalone)

## [1] 4177    9

abalone <- abalone[-c(out_ind), ]
dim(abalone)

## [1] 3781    9

# cat feature(s) vs target
# abalone Sex is only cat feature

# plot paired plots for each categorical variable, useful as reference
# get combinations, only using 2 cols here but can take many to combine at scale
combs <- expand.grid(c(1, 9), 
                     c(1, 9))
combs <- data.frame(with(combs, cbind(Var2, Var1)))
colnames(combs) <- c("Vec1", "Vec2")

# remove duplicates (avoid plotting a column against itself)
to_remove <- c()

for(row in 1:nrow(combs)){
  if(combs[row, "Vec1"]==combs[row, "Vec2"]){
    to_remove <- c(to_remove, row)
  }
}

combs <- combs[-c(to_remove), ]

for(row in 1:nrow(combs)){
  cont_table <- table(abalone[[combs[row, ][[2]]]], abalone[[combs[row, ][[1]]]])
  barplot(prop.table(cont_table, 2), names.arg=names(table(abalone[combs[row, ][[1]]])), 
          legend.text=names(table(abalone[combs[row, ][[2]]])), 
          main=colnames(abalone[combs[row, ][[2]]]), 
          xlab=colnames(abalone[combs[row, ][[1]]]))
}

# continuous + categorical pairwise visualisations
combs_2 <- expand.grid(c(2, 3, 4, 5, 6, 7, 8, 9), 
                       c(1))
combs_2 <- data.frame(with(combs_2, cbind(Var2, Var1)))
colnames(combs_2) <- c("Vec1", 
                       "Vec2")

for(row in 1:nrow(combs_2)){
  boxplot(data=abalone, 
          abalone[[combs_2[row, 2]]] ~ abalone[[combs_2[row, 1]]],
          xlab=colnames(abalone[combs_2[row, 1]]),
          ylab=colnames(abalone[combs_2[row, 2]]))
}

3 Modeling and evaluation

3.1 Checking dimensionality

my_pca <- prcomp(purrr::keep(abalone, is.numeric), center=TRUE, scale=TRUE)
autoplot(my_pca, loadings=TRUE, loadings.label=TRUE, loadings.label.size=5, loadings.colour="blue", alpha=.2)

my_pca; summary(my_pca)

## Standard deviations (1, .., p=8):
## [1] 2.62963153 0.80463242 0.39212562 0.35768425 0.26837925 0.25269855 0.11552974
## [8] 0.08168883
## 
## Rotation (n x k) = (8 x 8):
##                      PC1          PC2         PC3         PC4         PC5
## Length         0.3692998  0.068229541 -0.22876433  0.54468523 -0.03947722
## Diameter       0.3697852  0.037921905 -0.26242072  0.52687270  0.04782258
## Height         0.3563192 -0.044043974 -0.71150748 -0.54731524 -0.24043484
## Whole weight   0.3737130  0.146432242  0.28144434 -0.12355157  0.04415563
## Shucked weight 0.3600032  0.274032415  0.36066129  0.01878924 -0.43572884
## Viscera weight 0.3646312  0.145844646  0.33928165 -0.27190051 -0.15132171
## Shell weight   0.3689327  0.005239726  0.06609293 -0.18906614  0.84078456
## Rings          0.2481577 -0.934950537  0.20926430  0.02958511 -0.12930340
##                        PC6           PC7          PC8
## Length          0.14111046  0.6988854528  0.005978851
## Diameter        0.08826142 -0.7108011231  0.006788153
## Height         -0.08536094  0.0132861083  0.005288835
## Whole weight   -0.14096349  0.0003293175  0.850058654
## Shucked weight -0.56559810 -0.0206214836 -0.393304291
## Viscera weight  0.76873007 -0.0459814425 -0.201923364
## Shell weight   -0.17801317  0.0594036227 -0.285676054
## Rings          -0.05103672  0.0087137312 -0.014777277

## Importance of components:
##                           PC1     PC2     PC3     PC4    PC5     PC6     PC7
## Standard deviation     2.6296 0.80463 0.39213 0.35768 0.2684 0.25270 0.11553
## Proportion of Variance 0.8644 0.08093 0.01922 0.01599 0.0090 0.00798 0.00167
## Cumulative Proportion  0.8644 0.94530 0.96452 0.98051 0.9895 0.99750 0.99917
##                            PC8
## Standard deviation     0.08169
## Proportion of Variance 0.00083
## Cumulative Proportion  1.00000

3.2 Exploring feature importance

corm <- round(cor(abalone[, -1]), 2); corm

##                Length Diameter Height Whole weight Shucked weight
## Length           1.00     0.99   0.89         0.94           0.92
## Diameter         0.99     1.00   0.90         0.94           0.91
## Height           0.89     0.90   1.00         0.89           0.85
## Whole weight     0.94     0.94   0.89         1.00           0.97
## Shucked weight   0.92     0.91   0.85         0.97           1.00
## Viscera weight   0.91     0.91   0.87         0.97           0.93
## Shell weight     0.92     0.93   0.90         0.96           0.90
## Rings            0.59     0.60   0.62         0.56           0.47
##                Viscera weight Shell weight Rings
## Length                   0.91         0.92  0.59
## Diameter                 0.91         0.93  0.60
## Height                   0.87         0.90  0.62
## Whole weight             0.97         0.96  0.56
## Shucked weight           0.93         0.90  0.47
## Viscera weight           1.00         0.92  0.55
## Shell weight             0.92         1.00  0.62
## Rings                    0.55         0.62  1.00

set.seed(1)
highcorr <- findCorrelation(corm, cutoff=.5)
abalone$Rings <- as.factor(abalone$Rings)

# feature importance w/ Learning Vector Quantization (LVQ) model
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Rings ~ Length + Diameter + Height + `Whole weight` + `Shucked weight` + `Viscera weight` + `Shell weight`,
               data=abalone, method="lvq", preProcess="scale", trControl=control)
# model <- caret::train(Rings ~ Length + Diameter + Height + `Whole weight` + `Shucked weight` + `Viscera weight` + `Shell weight`,
#                data=abalone, method="lvq", preProcess="scale", trControl=control)
importance <- varImp(model, scale=F)
print(importance)

## ROC curve variable importance
## 
##   variables are sorted by maximum importance across the classes
##                    X4     X5     X6     X7     X8     X9    X10    X11 X12 X13
## Viscera weight 0.9327 0.9772 0.9940 0.9995 0.9994 0.9993 1.0000 0.9996   1   1
## Length         0.9315 0.9780 0.9953 0.9979 0.9990 0.9992 0.9999 0.9995   1   1
## Shucked weight 0.9287 0.9771 0.9927 0.9978 0.9989 0.9989 0.9994 0.9998   1   1
## Shell weight   0.9297 0.9786 0.9956 0.9989 0.9995 0.9995 1.0000 0.9996   1   1
## Height         0.8995 0.9618 0.9921 0.9974 0.9976 0.9979 0.9994 0.9990   1   1
## Whole weight   0.9385 0.9793 0.9952 0.9989 0.9995 0.9995 0.9999 0.9998   1   1
## Diameter       0.9282 0.9764 0.9951 0.9980 0.9991 0.9996 0.9999 0.9996   1   1
##                   X14    X15
## Viscera weight 0.7921 0.9772
## Length         0.7962 0.9780
## Shucked weight 0.7741 0.9771
## Shell weight   0.8053 0.9786
## Height         0.7769 0.9618
## Whole weight   0.8026 0.9793
## Diameter       0.8004 0.9764

plot(importance)

# average importance for each feature (across the 12 distinct target values)
# mostly all equal, height least important yet still significant
imp_df <- importance$importance

for(r in 1:nrow(imp_df)){
    cat(rownames(imp_df[r, ]),
        mean(as.numeric(imp_df[r, ])),
        "\n")
}

## Length 0.9728498 
## Diameter 0.9727228 
## Height 0.965291 
## Whole weight 0.9743772 
## Shucked weight 0.9703844 
## Viscera weight 0.9725956 
## Shell weight 0.9737819

3.3 Feature selecton (caret)

# using random forest as selection function
set.seed(1)
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
results <- rfe(abalone[, 2:8], abalone[, 9], sizes=c(2:8), rfeControl=control)
print(results)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          2   0.2438 0.1280    0.02345 0.02662         
##          3   0.2406 0.1255    0.02482 0.02838         
##          4   0.2491 0.1355    0.02085 0.02331         
##          5   0.2594 0.1467    0.02016 0.02238         
##          6   0.2644 0.1515    0.02303 0.02634        *
##          7   0.2631 0.1495    0.02045 0.02421         
## 
## The top 5 variables (out of 6):
##    Shell weight, Diameter, Whole weight, Viscera weight, Shucked weight

predictors(results)

## [1] "Shell weight"   "Diameter"       "Whole weight"   "Viscera weight"
## [5] "Shucked weight" "Length"

plot(results, type=c("g", "o"))

3.4 Multiple linear regression

Initially we consider treating the problem of Rings prediction as a regression (predicting a quantitative value).

abalone$Rings <- as.integer(abalone$Rings)
corrplot::corrplot(purrr::keep(abalone, is.numeric) %>% cor(), addCoef.col="black"); dim(abalone)

## [1] 3781    9

# reduce collinearity by identifying extremes - 95% cutoff
to_remove <- caret::findCorrelation(corm, cutoff=.95, exact=FALSE)
abalone <- abalone[, -c(to_remove)]
head(abalone)

##   Sex Diameter Whole weight Shucked weight Viscera weight Shell weight Rings
## 1   M    0.365       0.5140         0.2245         0.1010        0.150    12
## 2   M    0.265       0.2255         0.0995         0.0485        0.070     4
## 3   F    0.420       0.6770         0.2565         0.1415        0.210     6
## 4   M    0.365       0.5160         0.2155         0.1140        0.155     7
## 5   I    0.255       0.2050         0.0895         0.0395        0.055     4
## 6   I    0.300       0.3515         0.1410         0.0775        0.120     5

# perform stratified random split
set.seed(1)
train_index <- createDataPartition(abalone$Rings, p=.8, list=FALSE)
abalone_train <- abalone[ train_index, ]
abalone_test  <- abalone[-train_index, ]

dim(abalone_train); dim(abalone_test)

## [1] 3026    7

## [1] 755   7

lm_mod <- lm(Rings ~ ., data=abalone_train)
summary(lm_mod)

## 
## Call:
## lm(formula = Rings ~ ., data = abalone_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3997 -1.1149 -0.2354  0.8444  6.7469 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.58355    0.27786   5.699 1.32e-08 ***
## SexI              -0.79257    0.09070  -8.738  < 2e-16 ***
## SexM               0.09432    0.07531   1.252    0.210    
## Diameter          11.81163    1.02522  11.521  < 2e-16 ***
## `Whole weight`     4.61279    0.73357   6.288 3.68e-10 ***
## `Shucked weight` -13.06539    0.82924 -15.756  < 2e-16 ***
## `Viscera weight`  -3.20802    1.26159  -2.543    0.011 *  
## `Shell weight`     7.72029    1.18064   6.539 7.24e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.679 on 3018 degrees of freedom
## Multiple R-squared:  0.4889, Adjusted R-squared:  0.4877 
## F-statistic: 412.5 on 7 and 3018 DF,  p-value: < 2.2e-16

plot(cbind(abalone_train$Rings, lm_mod$fitted.values))

# evaluate on test set
lm_mod_pred <- predict(lm_mod, abalone_test)
cat("RMSE:", rmse(abalone_test$Rings, lm_mod_pred), "\n")

## RMSE: 1.526494

cat("Accuracy:", accuracy(abalone_test$Rings, round(lm_mod_pred)))

## Accuracy: 0.2582781

# lr preds
cbind(abalone_test$Rings, lm_mod_pred) %>% round(2) %>% head(10)

##       lm_mod_pred
## 4   7        6.38
## 5   4        3.88
## 17  4        4.96
## 18  7        5.95
## 25  7        7.30
## 27  8        7.66
## 31  7        8.46
## 41  6        5.66
## 42 11        7.26
## 51  5        6.37

3.5 Model tree

head(abalone)

##   Sex Diameter Whole weight Shucked weight Viscera weight Shell weight Rings
## 1   M    0.365       0.5140         0.2245         0.1010        0.150    12
## 2   M    0.265       0.2255         0.0995         0.0485        0.070     4
## 3   F    0.420       0.6770         0.2565         0.1415        0.210     6
## 4   M    0.365       0.5160         0.2155         0.1140        0.155     7
## 5   I    0.255       0.2050         0.0895         0.0395        0.055     4
## 6   I    0.300       0.3515         0.1410         0.0775        0.120     5

preds <- c("Sex", "Whole weight", "Viscera weight", "Shell weight")
train_pred <- abalone[train_index, preds]
test_pred <- abalone[-train_index, preds]
train_resp <- abalone$Rings[train_index]
test_resp <- abalone$Rings[-train_index]
model_tree <- cubist(y=train_resp, x=train_pred)
model_tree; summary(model_tree)

## 
## Call:
## cubist.default(x = train_pred, y = train_resp)
## 
## Number of samples: 3026 
## Number of predictors: 4 
## 
## Number of committees: 1 
## Number of rules: 5

## 
## Call:
## cubist.default(x = train_pred, y = train_resp)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Wed May 26 23:39:10 2021
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 3026 cases (5 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [699 cases, mean 4.1, range 1 to 12, est err 1.0]
## 
##     if
##  Sex = I
##  Shell weight <= 0.165
##     then
##  outcome = 1.9 + 38.8 Shell weight - 7.01 Whole weight
##            + 11.2 Viscera weight
## 
##   Rule 2: [378 cases, mean 5.7, range 1 to 12, est err 1.5]
## 
##     if
##  Sex in {F, M}
##  Shell weight <= 0.165
##     then
##  outcome = 3.4 + 38.1 Shell weight - 7.52 Whole weight
##            + 6.6 Viscera weight
## 
##   Rule 3: [296 cases, mean 6.7, range 3 to 12, est err 1.3]
## 
##     if
##  Sex = I
##  Shell weight > 0.165
##     then
##  outcome = 1.2 + 26.5 Shell weight - 1.18 Whole weight
##            + 1.5 Viscera weight
## 
##   Rule 4: [1721 cases, mean 7.4, range 3 to 12, est err 1.3]
## 
##     if
##  Whole weight > 0.6605
##     then
##  outcome = 4.9 + 16.3 Shell weight - 3.07 Whole weight
##            + 2.3 Viscera weight
## 
##   Rule 5: [143 cases, mean 7.8, range 4 to 12, est err 1.8]
## 
##     if
##  Sex in {F, M}
##  Whole weight <= 0.6605
##  Shell weight > 0.165
##     then
##  outcome = 4.2 - 14.05 Whole weight + 45.8 Shell weight
##            + 22.9 Viscera weight
## 
## 
## Evaluation on training data (3026 cases):
## 
##     Average  |error|                1.5
##     Relative |error|               0.77
##     Correlation coefficient        0.61
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##     58%   100%    Whole weight
##     47%   100%    Shell weight
##     47%           Sex
##           100%    Viscera weight
## 
## 
## Time: 0.1 secs

model_tree_pred <- predict(model_tree, test_pred)
cat("RMSE:", rmse(abalone_test$Rings, round(model_tree_pred)), "\n")

## RMSE: 1.596768

cat("Accuracy:", accuracy(abalone_test$Rings, round(model_tree_pred)))

## Accuracy: 0.3059603

cbind(abalone_test$Rings, model_tree_pred) %>% round(2) %>% head(10)

##          model_tree_pred
##  [1,]  7            6.18
##  [2,]  4            3.04
##  [3,]  4            4.77
##  [4,]  7            5.54
##  [5,]  7            7.00
##  [6,]  8            7.11
##  [7,]  7            7.77
##  [8,]  6            5.19
##  [9,] 11            7.14
## [10,]  5            5.70

3.6 Principal components regression

set.seed(1)
pcr_fit <- pcr(Rings~., data=abalone, subset=train_index, scale=T, validation="CV")
summary(pcr_fit)

## Data:    X dimension: 3026 7 
##  Y dimension: 3026 1
## Fit method: svdpc
## Number of components considered: 7
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           2.346    1.897    1.896     1.88    1.717    1.693    1.693
## adjCV        2.346    1.897    1.896     1.88    1.717    1.693    1.693
##        7 comps
## CV       1.681
## adjCV    1.681
## 
## TRAINING: % variance explained
##        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X        74.36    90.62    96.25    97.84    99.01    99.91   100.00
## Rings    34.65    34.77    35.88    46.54    48.06    48.13    48.89

validationplot(pcr_fit, val.type="MSEP")

pcr_pred <- predict(pcr_fit, abalone_test[-7], ncomp=5)
cat("RMSE:", rmse(abalone_test$Rings, c(pcr_pred)), "\n")

## RMSE: 1.533576

cat("Accuracy:", accuracy(abalone_test$Rings, round(c(pcr_pred))))

## Accuracy: 0.2503311

3.7 Partial least squares

set.seed(1)
pls_fit <- plsr(Rings~., data=abalone, subset=train_index, scale=T, validation="CV")
summary(pls_fit)

## Data:    X dimension: 3026 7 
##  Y dimension: 3026 1
## Fit method: kernelpls
## Number of components considered: 7
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           2.346    1.886    1.771     1.72    1.691    1.689    1.682
## adjCV        2.346    1.886    1.771     1.72    1.691    1.688    1.681
##        7 comps
## CV       1.681
## adjCV    1.681
## 
## TRAINING: % variance explained
##        1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
## X        74.32    81.12    92.77    97.79    98.92    99.15   100.00
## Rings    35.36    43.24    46.44    48.20    48.39    48.86    48.89

validationplot(pls_fit, val.type="MSEP")

pls_pred <- predict(pls_fit, abalone_test[-7], ncomp=4)
cat("RMSE:", rmse(abalone_test$Rings, c(pls_pred)), "\n")

## RMSE: 1.528913

cat("Accuracy:", accuracy(abalone_test$Rings, round(c(pls_pred))))

## Accuracy: 0.2450331

3.8 Random forest

3.8.1 Regression approach

rf_train <- abalone_train # train
colnames(rf_train)[3:6] <- c("Whole_weight", "Shucked_weight", 
                               "Viscera_weight", "Shell_weight")

rf_test <- abalone_test[-7] # test
colnames(rf_test)[3:6] <- c("Whole_weight", "Shucked_weight", 
                       "Viscera_weight", "Shell_weight")

set.seed(1)
rf_mod <- randomForest(Rings ~., data=rf_train, 
                           importance=TRUE, proximity=TRUE)
print(rf_mod)

## 
## Call:
##  randomForest(formula = Rings ~ ., data = rf_train, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 2.605371
##                     % Var explained: 52.63

rf_pred <- predict(rf_mod, newdata=rf_test, type="response")
cat("RMSE:", rmse(abalone_test$Rings, c(rf_pred)), "\n")

## RMSE: 1.499617

cat("Accuracy:", accuracy(abalone_test$Rings, round(c(rf_pred))))

## Accuracy: 0.2649007

# rf preds
cbind(abalone_test$Rings, rf_pred) %>% round(2) %>% head(10)

##       rf_pred
## 4   7    6.45
## 5   4    3.25
## 17  4    5.51
## 18  7    6.23
## 25  7    6.86
## 27  8    8.78
## 31  7    8.92
## 41  6    5.30
## 42 11    7.11
## 51  5    5.93

Tune the model using tuneRF from randomForest

set.seed(1)
bestmtry <- tuneRF(rf_train[, -7], rf_train$Rings, stepFactor=1.5, improve=1e-5, ntree=500)

## mtry = 2  OOB error = 2.607128 
## Searching left ...
## Searching right ...
## mtry = 3     OOB error = 2.62075 
## -0.00522479 1e-05

print(bestmtry)

##   mtry OOBError
## 2    2 2.607128
## 3    3 2.620750

Use best mtry of 2 for improved model

set.seed(1)
rf_mod <- randomForest(Rings ~., data=rf_train, 
                           importance=TRUE, proximity=TRUE, mtry=2)
print(rf_mod)

## 
## Call:
##  randomForest(formula = Rings ~ ., data = rf_train, importance = TRUE,      proximity = TRUE, mtry = 2) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 2.605371
##                     % Var explained: 52.63

rf_pred <- predict(rf_mod, newdata=rf_test, type="response")
cat("RMSE:", rmse(abalone_test$Rings, c(rf_pred)), "\n")

## RMSE: 1.499617

cat("Accuracy:", accuracy(abalone_test$Rings, round(c(rf_pred))))

## Accuracy: 0.2649007

# rf preds
cbind(abalone_test$Rings, rf_pred) %>% round(2) %>% head(10)

##       rf_pred
## 4   7    6.45
## 5   4    3.25
## 17  4    5.51
## 18  7    6.23
## 25  7    6.86
## 27  8    8.78
## 31  7    8.92
## 41  6    5.30
## 42 11    7.11
## 51  5    5.93

3.8.2 Classification approach

The dataset ‘as is’ makes it challenging to predict with high accuracy the exact number of Rings the dataset info sheet mentions similar scores as benchmarks to those predicted below. These new predictions were obtained after encoding the Rings as range bins. This allowed a higher prediction accuracy and improved RMSE. The Shiny Web App should allow for selection of predictions based on either regression or classification.

Binning Rings:

<5 (young age)
5 <= x < 10 (middle age)
x >= 10 (old age)

abalone_binned <- abalone
abalone_binned$Rings <- cut(abalone_binned$Rings, breaks=c(-Inf, 5, 10, +Inf))
head(abalone_binned)

##   Sex Diameter Whole weight Shucked weight Viscera weight Shell weight
## 1   M    0.365       0.5140         0.2245         0.1010        0.150
## 2   M    0.265       0.2255         0.0995         0.0485        0.070
## 3   F    0.420       0.6770         0.2565         0.1415        0.210
## 4   M    0.365       0.5160         0.2155         0.1140        0.155
## 5   I    0.255       0.2050         0.0895         0.0395        0.055
## 6   I    0.300       0.3515         0.1410         0.0775        0.120
##       Rings
## 1 (10, Inf]
## 2  (-Inf,5]
## 3    (5,10]
## 4    (5,10]
## 5  (-Inf,5]
## 6  (-Inf,5]

rf_mod2_dat <- abalone_binned[train_index, ]
colnames(rf_mod2_dat)[3:6] <- c("Whole_weight", "Shucked_weight",
                               "Viscera_weight", "Shell_weight")
rf_mod2_dat$Rings <- factor(rf_mod2_dat$Rings)
set.seed(1)
rf_mod2 <- randomForest::randomForest(Rings ~., data=rf_mod2_dat,
                           importance=TRUE, proximity=TRUE)
print(rf_mod2)

## 
## Call:
##  randomForest(formula = Rings ~ ., data = rf_mod2_dat, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 23.27%
## Confusion matrix:
##           (-Inf,5] (5,10] (10, Inf] class.error
## (-Inf,5]       769    304         0   0.2833178
## (5,10]         211   1541        15   0.1279004
## (10, Inf]        7    167        12   0.9354839

rf_mod2_dat_test <- abalone_binned[-train_index, ]
colnames(rf_mod2_dat_test)[3:6] <- c("Whole_weight", "Shucked_weight",
                               "Viscera_weight", "Shell_weight")
rf_pred2 <- predict(rf_mod2, newdata=rf_mod2_dat_test, type="response")
cat("Accuracy:", accuracy(as.numeric(rf_mod2_dat_test$Rings), as.numeric(rf_pred2)))

## Accuracy: 0.7774834

cat("RMSE:", rmse(as.numeric(rf_mod2_dat_test$Rings), as.numeric(rf_pred2)))

## RMSE: 0.4800662

# rf_mod2 preds
cbind(rf_mod2_dat_test$Rings, rf_pred2) %>% head(10)

##      rf_pred2
## 4  2        2
## 5  1        1
## 17 1        1
## 18 2        2
## 25 2        2
## 27 2        2
## 31 2        2
## 41 2        1
## 42 3        2
## 51 1        2

3.9 Model summary

The Random Forest algorithm provided the best fit in terms of RMSE. The Random Forest model can be used in production. Additional feature engineering is required, as mentioned by the researchers. See ‘abalone.names’ supplied in /data folder (GitHub). Here is a summary of the results:

Multiple Linear Regression - Accuracy= 0.258; RMSE=1.526
Model Tree - Accuracy=0.305; RMSE=1.596
Principal Components Regression - Accuracy=0.250; RMSE=1.533
Partial Least Squares - Accuracy=0.245; RMSE=1.528
Random Forest Regression (untuned) - Accuracy=0.264; RMSE=1.499
Random Forest Regression (tuned) - Accuracy=0.264; RMSE=1.499 (no improvement)
Random Forest Classification - Accuracy=0.777; RMSE=0.480

4 Deployment

View Shiny app at:
davidmh.shinyapps.io/abalone_age_predictor
Download shinytestdata.csv in data/ folder (GitHub) to upload to app and get age predictions.

saveRDS(rf_mod, "Abalone_Age_Predictor/rf_mod.rds") # save rf regression model
saveRDS(rf_mod2, "Abalone_Age_Predictor/rf_mod2.rds") # save rf classification model

Abalone Age (CRISP-DM)

David

May 2021