Dataset:
https://archive.ics.uci.edu/ml/datasets/Abalone
Create a tool that researchers in the field can use to estimate the age of an abalone from its physical measurements. The problem may be treated as regression or classification; both approaches are considered for deployment.
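Note that per the dataset documentation ('abalone.names'), the age in years is the ring count plus 1.5, so any Rings prediction converts directly to an age estimate. A minimal helper along those lines (the function name is ours, not part of the dataset tooling):
# convert a predicted ring count to an estimated age in years (rings + 1.5)
rings_to_age <- function(rings) rings + 1.5
rings_to_age(9)  # 10.5 years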
library(dplyr)
library(data.table)
library(caret)
library(mlbench)
library(Metrics)
library(ggfortify)
library(Cubist)
library(pls)
library(randomForest)
abalone <- read.csv("data/abalone.data", header=FALSE, stringsAsFactors=TRUE)
colnames(abalone) <- c("Sex", "Length", "Diameter",
"Height", "Whole weight", "Shucked weight",
"Viscera weight", "Shell weight", "Rings")
dim(abalone); head(abalone)
## [1] 4177 9
## Sex Length Diameter Height Whole weight Shucked weight Viscera weight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## Shell weight Rings
## 1 0.150 15
## 2 0.070 7
## 3 0.210 9
## 4 0.155 10
## 5 0.055 7
## 6 0.120 8
cat_EDA <- function(dataframe, i){
  sum_na <- sum(is.na(dataframe[[i]]))
  freqs <- sort(table(dataframe[[i]]), decreasing=TRUE)  # tabulate once, reuse below
  cat("===========",
      "\nEDA RESULTS FOR '", colnames(dataframe[i]), "' VECTOR (CATEGORICAL):",
      "\n===========",
      "\nCount:\t", length(dataframe[[i]]),
      "\nMissing values (%):\t", sum_na / length(dataframe[[i]]) * 100,
      "\nCardinality:\t", length(freqs),
      "\n1st Mode:\t", names(freqs)[1],
      "\n1st Mode Freq.:\t", freqs[[1]],
      "\n1st Mode (%):\t", freqs[[1]] / length(dataframe[[i]]) * 100,
      "\n2nd Mode:\t", names(freqs)[2],
      "\n2nd Mode Freq.:\t", freqs[[2]],
      "\n2nd Mode (%):\t", freqs[[2]] / length(dataframe[[i]]) * 100,
      "\n\n")
  cat("Summary:\n")
  print(summary(dataframe[[i]]))
  cat("\nDistinct values:")
  print(table(dataframe[[i]]))
  barplot(table(dataframe[[i]]),
          xlab=colnames(dataframe[i]),
          main=i)
}
cont_EDA <- function(dataframe, i){
  sum_na <- sum(is.na(dataframe[[i]]))  # used for the missing-value rate below
  cat("===========",
      "\nEDA RESULTS FOR '", colnames(dataframe[i]), "' VECTOR (CONTINUOUS):",
      "\n===========",
      "\nCount:\t", length(dataframe[[i]]),
      "\nMissing values (%):\t", sum_na / length(dataframe[[i]]) * 100,
      "\nCardinality:\t", length(names(table(dataframe[[i]]))),
      "\nSD:\t", sd(dataframe[[i]]),
      "\n\n")
  cat("Summary:\n")
  print(summary(dataframe[[i]]))
  cat("\n")
  hist(dataframe[[i]],
       xlab=colnames(dataframe[i]),
       main=i)
}
# pre-cleaning EDA - categorical
cat_EDA(abalone, 1)
## ===========
## EDA RESULTS FOR ' Sex ' VECTOR (CATEGORICAL):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 3
## 1st Mode: M
## 1st Mode Freq.: 1528
## 1st Mode (%): 36.58128
## 2nd Mode: I
## 2nd Mode Freq.: 1342
## 2nd Mode (%): 32.12832
##
## Summary:
## F I M
## 1307 1342 1528
##
## Distinct values:
## F I M
## 1307 1342 1528
# pre-cleaning EDA - continuous
par(mfrow=c(3, 3)); for(i in 2:9){cont_EDA(abalone, i)}
## ===========
## EDA RESULTS FOR ' Length ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 134
## SD: 0.1200929
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.075 0.450 0.545 0.524 0.615 0.815
## ===========
## EDA RESULTS FOR ' Diameter ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 111
## SD: 0.09923987
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0550 0.3500 0.4250 0.4079 0.4800 0.6500
## ===========
## EDA RESULTS FOR ' Height ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 51
## SD: 0.04182706
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1150 0.1400 0.1395 0.1650 1.1300
## ===========
## EDA RESULTS FOR ' Whole weight ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 2429
## SD: 0.490389
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0020 0.4415 0.7995 0.8287 1.1530 2.8255
## ===========
## EDA RESULTS FOR ' Shucked weight ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 1515
## SD: 0.2219629
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0010 0.1860 0.3360 0.3594 0.5020 1.4880
## ===========
## EDA RESULTS FOR ' Viscera weight ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 880
## SD: 0.1096143
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0005 0.0935 0.1710 0.1806 0.2530 0.7600
## ===========
## EDA RESULTS FOR ' Shell weight ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 926
## SD: 0.1392027
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0015 0.1300 0.2340 0.2388 0.3290 1.0050
## ===========
## EDA RESULTS FOR ' Rings ' VECTOR (CONTINUOUS):
## ===========
## Count: 4177
## Missing values (%): 0
## Cardinality: 28
## SD: 3.224169
##
## Summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 8.000 9.000 9.934 11.000 29.000
# check for outliers - continuous features
par(mfrow=c(2, 2))
for(i in c(2:9)){
if(i == 6){par(mfrow=c(2, 2))}
boxplot(abalone[i], xlab=colnames(abalone[i]), main=i)
cat("Outliers for column", i, colnames(abalone[i]), ":", length(boxplot.stats(abalone[[i]])$out), "\n")
}
## Outliers for column 2 Length : 49
## Outliers for column 3 Diameter : 59
## Outliers for column 4 Height : 29
## Outliers for column 5 Whole weight : 30
## Outliers for column 6 Shucked weight : 48
## Outliers for column 7 Viscera weight : 26
## Outliers for column 8 Shell weight : 35
## Outliers for column 9 Rings : 278
# the Height boxplot shows extreme values worth a closer look before removal
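Before dropping anything, the extreme heights can be inspected directly; a quick check based on the summary above (min 0, max 1.13, 3rd quartile 0.165):
# zero heights look like data-entry errors; values far above the 3rd quartile are extreme
subset(abalone, Height == 0 | Height > 0.5)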
out_ind <- c()
for(i in c(2:9)){
vals_out <- boxplot.stats(abalone[[i]])$out
out_ind <- unique(c(out_ind, which(abalone[[i]] %in% c(vals_out))))
}
dim(abalone)
## [1] 4177 9
abalone <- abalone[-c(out_ind), ]
dim(abalone)
## [1] 3781 9
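Dropping every row flagged by any feature's boxplot removes 396 of the 4177 observations, roughly 9.5% of the data:
length(out_ind)               # 396 row indices flagged across all features
length(out_ind) / 4177 * 100  # ~9.5% of observations removed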
# categorical feature(s) vs target
# Sex is the only categorical feature in abalone
# plot paired barplots for each categorical/target pair, useful as reference
# build the column-index combinations; only 2 columns are used here, but this scales to many
combs <- expand.grid(c(1, 9), c(1, 9))
combs <- combs[, c("Var2", "Var1")]
colnames(combs) <- c("Vec1", "Vec2")
# remove self-pairings (avoid plotting a column against itself)
combs <- combs[combs$Vec1 != combs$Vec2, ]
for(row in 1:nrow(combs)){
  v1 <- combs[row, 1]; v2 <- combs[row, 2]  # column indices for the pair
  cont_table <- table(abalone[[v2]], abalone[[v1]])
  barplot(prop.table(cont_table, 2),
          names.arg=names(table(abalone[[v1]])),
          legend.text=names(table(abalone[[v2]])),
          main=colnames(abalone[v2]),
          xlab=colnames(abalone[v1]))
}
# continuous + categorical pairwise visualisations
combs_2 <- expand.grid(c(2, 3, 4, 5, 6, 7, 8, 9), c(1))
combs_2 <- combs_2[, c("Var2", "Var1")]
colnames(combs_2) <- c("Vec1", "Vec2")
for(row in 1:nrow(combs_2)){
  boxplot(abalone[[combs_2[row, 2]]] ~ abalone[[combs_2[row, 1]]],
          xlab=colnames(abalone[combs_2[row, 1]]),
          ylab=colnames(abalone[combs_2[row, 2]]))
}
my_pca <- prcomp(purrr::keep(abalone, is.numeric), center=TRUE, scale=TRUE)
autoplot(my_pca, loadings=TRUE, loadings.label=TRUE, loadings.label.size=5, loadings.colour="blue", alpha=.2)
my_pca; summary(my_pca)
## Standard deviations (1, .., p=8):
## [1] 2.62963153 0.80463242 0.39212562 0.35768425 0.26837925 0.25269855 0.11552974
## [8] 0.08168883
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Length 0.3692998 0.068229541 -0.22876433 0.54468523 -0.03947722
## Diameter 0.3697852 0.037921905 -0.26242072 0.52687270 0.04782258
## Height 0.3563192 -0.044043974 -0.71150748 -0.54731524 -0.24043484
## Whole weight 0.3737130 0.146432242 0.28144434 -0.12355157 0.04415563
## Shucked weight 0.3600032 0.274032415 0.36066129 0.01878924 -0.43572884
## Viscera weight 0.3646312 0.145844646 0.33928165 -0.27190051 -0.15132171
## Shell weight 0.3689327 0.005239726 0.06609293 -0.18906614 0.84078456
## Rings 0.2481577 -0.934950537 0.20926430 0.02958511 -0.12930340
## PC6 PC7 PC8
## Length 0.14111046 0.6988854528 0.005978851
## Diameter 0.08826142 -0.7108011231 0.006788153
## Height -0.08536094 0.0132861083 0.005288835
## Whole weight -0.14096349 0.0003293175 0.850058654
## Shucked weight -0.56559810 -0.0206214836 -0.393304291
## Viscera weight 0.76873007 -0.0459814425 -0.201923364
## Shell weight -0.17801317 0.0594036227 -0.285676054
## Rings -0.05103672 0.0087137312 -0.014777277
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6296 0.80463 0.39213 0.35768 0.2684 0.25270 0.11553
## Proportion of Variance 0.8644 0.08093 0.01922 0.01599 0.0090 0.00798 0.00167
## Cumulative Proportion 0.8644 0.94530 0.96452 0.98051 0.9895 0.99750 0.99917
## PC8
## Standard deviation 0.08169
## Proportion of Variance 0.00083
## Cumulative Proportion 1.00000
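The cumulative proportions in the summary can be reproduced directly from the component standard deviations, confirming PC1 alone explains about 86% of the variance:
# variance explained per component = sdev^2 / total variance
cumsum(my_pca$sdev^2) / sum(my_pca$sdev^2)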
corm <- round(cor(abalone[, -1]), 2); corm
## Length Diameter Height Whole weight Shucked weight
## Length 1.00 0.99 0.89 0.94 0.92
## Diameter 0.99 1.00 0.90 0.94 0.91
## Height 0.89 0.90 1.00 0.89 0.85
## Whole weight 0.94 0.94 0.89 1.00 0.97
## Shucked weight 0.92 0.91 0.85 0.97 1.00
## Viscera weight 0.91 0.91 0.87 0.97 0.93
## Shell weight 0.92 0.93 0.90 0.96 0.90
## Rings 0.59 0.60 0.62 0.56 0.47
## Viscera weight Shell weight Rings
## Length 0.91 0.92 0.59
## Diameter 0.91 0.93 0.60
## Height 0.87 0.90 0.62
## Whole weight 0.97 0.96 0.56
## Shucked weight 0.93 0.90 0.47
## Viscera weight 1.00 0.92 0.55
## Shell weight 0.92 1.00 0.62
## Rings 0.55 0.62 1.00
set.seed(1)
highcorr <- findCorrelation(corm, cutoff=.5)  # flagged here for reference only; the .95-cutoff filter below is what gets applied
abalone$Rings <- as.factor(abalone$Rings)  # treat Rings as classes for the importance ranking below
# feature importance w/ Learning Vector Quantization (LVQ) model
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Rings ~ Length + Diameter + Height + `Whole weight` + `Shucked weight` + `Viscera weight` + `Shell weight`,
data=abalone, method="lvq", preProcess="scale", trControl=control)
importance <- varImp(model, scale=F)
print(importance)
## ROC curve variable importance
##
## variables are sorted by maximum importance across the classes
## X4 X5 X6 X7 X8 X9 X10 X11 X12 X13
## Viscera weight 0.9327 0.9772 0.9940 0.9995 0.9994 0.9993 1.0000 0.9996 1 1
## Length 0.9315 0.9780 0.9953 0.9979 0.9990 0.9992 0.9999 0.9995 1 1
## Shucked weight 0.9287 0.9771 0.9927 0.9978 0.9989 0.9989 0.9994 0.9998 1 1
## Shell weight 0.9297 0.9786 0.9956 0.9989 0.9995 0.9995 1.0000 0.9996 1 1
## Height 0.8995 0.9618 0.9921 0.9974 0.9976 0.9979 0.9994 0.9990 1 1
## Whole weight 0.9385 0.9793 0.9952 0.9989 0.9995 0.9995 0.9999 0.9998 1 1
## Diameter 0.9282 0.9764 0.9951 0.9980 0.9991 0.9996 0.9999 0.9996 1 1
## X14 X15
## Viscera weight 0.7921 0.9772
## Length 0.7962 0.9780
## Shucked weight 0.7741 0.9771
## Shell weight 0.8053 0.9786
## Height 0.7769 0.9618
## Whole weight 0.8026 0.9793
## Diameter 0.8004 0.9764
plot(importance)
# average importance for each feature (across the 12 distinct target classes)
# values are nearly equal; Height ranks lowest but remains informative
imp_df <- importance$importance
for(r in 1:nrow(imp_df)){
cat(rownames(imp_df[r, ]),
mean(as.numeric(imp_df[r, ])),
"\n")
}
## Length 0.9728498
## Diameter 0.9727228
## Height 0.965291
## Whole weight 0.9743772
## Shucked weight 0.9703844
## Viscera weight 0.9725956
## Shell weight 0.9737819
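The loop above can be condensed to a single vectorised call; an equivalent sketch:
# mean ROC-based importance per feature across the target classes, sorted
sort(rowMeans(importance$importance), decreasing=TRUE)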
# using random forest as selection function
set.seed(1)
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
results <- rfe(abalone[, 2:8], abalone[, 9], sizes=c(2:8), rfeControl=control)
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.2438 0.1280 0.02345 0.02662
## 3 0.2406 0.1255 0.02482 0.02838
## 4 0.2491 0.1355 0.02085 0.02331
## 5 0.2594 0.1467 0.02016 0.02238
## 6 0.2644 0.1515 0.02303 0.02634 *
## 7 0.2631 0.1495 0.02045 0.02421
##
## The top 5 variables (out of 6):
## Shell weight, Diameter, Whole weight, Viscera weight, Shucked weight
predictors(results)
## [1] "Shell weight" "Diameter" "Whole weight" "Viscera weight"
## [5] "Shucked weight" "Length"
plot(results, type=c("g", "o"))
Initially we treat Rings prediction as a regression problem (predicting a quantitative value).
# NB: as.integer() on a factor returns level codes, not the original values;
# here the 12 retained ring levels (4-15) map onto 1-12. Use
# as.integer(as.character(abalone$Rings)) to recover the raw ring counts.
abalone$Rings <- as.integer(abalone$Rings)
corrplot::corrplot(purrr::keep(abalone, is.numeric) %>% cor(), addCoef.col="black"); dim(abalone)
## [1] 3781 9
# reduce collinearity by removing highly correlated features - .95 cutoff
to_remove <- caret::findCorrelation(corm, cutoff=.95, exact=FALSE)
# NB: corm excludes Sex, so these indices are shifted by one relative to
# abalone's columns; applied as-is the filter drops Length and Height
abalone <- abalone[, -c(to_remove)]
head(abalone)
## Sex Diameter Whole weight Shucked weight Viscera weight Shell weight Rings
## 1 M 0.365 0.5140 0.2245 0.1010 0.150 12
## 2 M 0.265 0.2255 0.0995 0.0485 0.070 4
## 3 F 0.420 0.6770 0.2565 0.1415 0.210 6
## 4 M 0.365 0.5160 0.2155 0.1140 0.155 7
## 5 I 0.255 0.2050 0.0895 0.0395 0.055 4
## 6 I 0.300 0.3515 0.1410 0.0775 0.120 5
# perform stratified random split
set.seed(1)
train_index <- createDataPartition(abalone$Rings, p=.8, list=FALSE)
abalone_train <- abalone[ train_index, ]
abalone_test <- abalone[-train_index, ]
dim(abalone_train); dim(abalone_test)
## [1] 3026 7
## [1] 755 7
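A quick sanity check that the stratified split kept the Rings distributions of the two sets comparable:
# train and test should show similar quartiles if stratification worked
summary(abalone_train$Rings); summary(abalone_test$Rings)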
lm_mod <- lm(Rings ~ ., data=abalone_train)
summary(lm_mod)
##
## Call:
## lm(formula = Rings ~ ., data = abalone_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3997 -1.1149 -0.2354 0.8444 6.7469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.58355 0.27786 5.699 1.32e-08 ***
## SexI -0.79257 0.09070 -8.738 < 2e-16 ***
## SexM 0.09432 0.07531 1.252 0.210
## Diameter 11.81163 1.02522 11.521 < 2e-16 ***
## `Whole weight` 4.61279 0.73357 6.288 3.68e-10 ***
## `Shucked weight` -13.06539 0.82924 -15.756 < 2e-16 ***
## `Viscera weight` -3.20802 1.26159 -2.543 0.011 *
## `Shell weight` 7.72029 1.18064 6.539 7.24e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.679 on 3018 degrees of freedom
## Multiple R-squared: 0.4889, Adjusted R-squared: 0.4877
## F-statistic: 412.5 on 7 and 3018 DF, p-value: < 2.2e-16
plot(cbind(abalone_train$Rings, lm_mod$fitted.values))  # actual (x) vs fitted (y) on the training set
# evaluate on test set
lm_mod_pred <- predict(lm_mod, abalone_test)
cat("RMSE:", rmse(abalone_test$Rings, lm_mod_pred), "\n")
## RMSE: 1.526494
cat("Accuracy:", accuracy(abalone_test$Rings, round(lm_mod_pred)))
## Accuracy: 0.2582781
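The standard lm diagnostic plots are also worth a glance before trusting these scores:
# residuals vs fitted, normal Q-Q, scale-location and leverage plots
par(mfrow=c(2, 2)); plot(lm_mod)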
# linear regression predictions: actual vs predicted
cbind(abalone_test$Rings, lm_mod_pred) %>% round(2) %>% head(10)
## lm_mod_pred
## 4 7 6.38
## 5 4 3.88
## 17 4 4.96
## 18 7 5.95
## 25 7 7.30
## 27 8 7.66
## 31 7 8.46
## 41 6 5.66
## 42 11 7.26
## 51 5 6.37
head(abalone)
## Sex Diameter Whole weight Shucked weight Viscera weight Shell weight Rings
## 1 M 0.365 0.5140 0.2245 0.1010 0.150 12
## 2 M 0.265 0.2255 0.0995 0.0485 0.070 4
## 3 F 0.420 0.6770 0.2565 0.1415 0.210 6
## 4 M 0.365 0.5160 0.2155 0.1140 0.155 7
## 5 I 0.255 0.2050 0.0895 0.0395 0.055 4
## 6 I 0.300 0.3515 0.1410 0.0775 0.120 5
preds <- c("Sex", "Whole weight", "Viscera weight", "Shell weight")
train_pred <- abalone[train_index, preds]
test_pred <- abalone[-train_index, preds]
train_resp <- abalone$Rings[train_index]
test_resp <- abalone$Rings[-train_index]
model_tree <- cubist(y=train_resp, x=train_pred)
model_tree; summary(model_tree)
##
## Call:
## cubist.default(x = train_pred, y = train_resp)
##
## Number of samples: 3026
## Number of predictors: 4
##
## Number of committees: 1
## Number of rules: 5
##
## Call:
## cubist.default(x = train_pred, y = train_resp)
##
##
## Cubist [Release 2.07 GPL Edition] Wed May 26 23:39:10 2021
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 3026 cases (5 attributes) from undefined.data
##
## Model:
##
## Rule 1: [699 cases, mean 4.1, range 1 to 12, est err 1.0]
##
## if
## Sex = I
## Shell weight <= 0.165
## then
## outcome = 1.9 + 38.8 Shell weight - 7.01 Whole weight
## + 11.2 Viscera weight
##
## Rule 2: [378 cases, mean 5.7, range 1 to 12, est err 1.5]
##
## if
## Sex in {F, M}
## Shell weight <= 0.165
## then
## outcome = 3.4 + 38.1 Shell weight - 7.52 Whole weight
## + 6.6 Viscera weight
##
## Rule 3: [296 cases, mean 6.7, range 3 to 12, est err 1.3]
##
## if
## Sex = I
## Shell weight > 0.165
## then
## outcome = 1.2 + 26.5 Shell weight - 1.18 Whole weight
## + 1.5 Viscera weight
##
## Rule 4: [1721 cases, mean 7.4, range 3 to 12, est err 1.3]
##
## if
## Whole weight > 0.6605
## then
## outcome = 4.9 + 16.3 Shell weight - 3.07 Whole weight
## + 2.3 Viscera weight
##
## Rule 5: [143 cases, mean 7.8, range 4 to 12, est err 1.8]
##
## if
## Sex in {F, M}
## Whole weight <= 0.6605
## Shell weight > 0.165
## then
## outcome = 4.2 - 14.05 Whole weight + 45.8 Shell weight
## + 22.9 Viscera weight
##
##
## Evaluation on training data (3026 cases):
##
## Average |error| 1.5
## Relative |error| 0.77
## Correlation coefficient 0.61
##
##
## Attribute usage:
## Conds Model
##
## 58% 100% Whole weight
## 47% 100% Shell weight
## 47% Sex
## 100% Viscera weight
##
##
## Time: 0.1 secs
model_tree_pred <- predict(model_tree, test_pred)
cat("RMSE:", rmse(abalone_test$Rings, round(model_tree_pred)), "\n")
## RMSE: 1.596768
cat("Accuracy:", accuracy(abalone_test$Rings, round(model_tree_pred)))
## Accuracy: 0.3059603
cbind(abalone_test$Rings, model_tree_pred) %>% round(2) %>% head(10)
## model_tree_pred
## [1,] 7 6.18
## [2,] 4 3.04
## [3,] 4 4.77
## [4,] 7 5.54
## [5,] 7 7.00
## [6,] 8 7.11
## [7,] 7 7.77
## [8,] 6 5.19
## [9,] 11 7.14
## [10,] 5 5.70
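Cubist also supports boosted committees and a nearest-neighbour correction at prediction time; a possible tuning sketch (committee and neighbour counts are illustrative, not tuned here):
# an ensemble of 10 rule-based models, corrected with the 5 nearest training cases
model_tree_c <- cubist(x=train_pred, y=train_resp, committees=10)
model_tree_c_pred <- predict(model_tree_c, test_pred, neighbors=5)
cat("RMSE:", rmse(abalone_test$Rings, round(model_tree_c_pred)), "\n")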
set.seed(1)
pcr_fit <- pcr(Rings~., data=abalone, subset=train_index, scale=T, validation="CV")
summary(pcr_fit)
## Data: X dimension: 3026 7
## Y dimension: 3026 1
## Fit method: svdpc
## Number of components considered: 7
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 2.346 1.897 1.896 1.88 1.717 1.693 1.693
## adjCV 2.346 1.897 1.896 1.88 1.717 1.693 1.693
## 7 comps
## CV 1.681
## adjCV 1.681
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 74.36 90.62 96.25 97.84 99.01 99.91 100.00
## Rings 34.65 34.77 35.88 46.54 48.06 48.13 48.89
validationplot(pcr_fit, val.type="MSEP")
pcr_pred <- predict(pcr_fit, abalone_test[-7], ncomp=5)
cat("RMSE:", rmse(abalone_test$Rings, c(pcr_pred)), "\n")
## RMSE: 1.533576
cat("Accuracy:", accuracy(abalone_test$Rings, round(c(pcr_pred))))
## Accuracy: 0.2503311
set.seed(1)
pls_fit <- plsr(Rings~., data=abalone, subset=train_index, scale=T, validation="CV")
summary(pls_fit)
## Data: X dimension: 3026 7
## Y dimension: 3026 1
## Fit method: kernelpls
## Number of components considered: 7
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 2.346 1.886 1.771 1.72 1.691 1.689 1.682
## adjCV 2.346 1.886 1.771 1.72 1.691 1.688 1.681
## 7 comps
## CV 1.681
## adjCV 1.681
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 74.32 81.12 92.77 97.79 98.92 99.15 100.00
## Rings 35.36 43.24 46.44 48.20 48.39 48.86 48.89
validationplot(pls_fit, val.type="MSEP")
pls_pred <- predict(pls_fit, abalone_test[-7], ncomp=4)
cat("RMSE:", rmse(abalone_test$Rings, c(pls_pred)), "\n")
## RMSE: 1.528913
cat("Accuracy:", accuracy(abalone_test$Rings, round(c(pls_pred))))
## Accuracy: 0.2450331
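Rather than reading the component count off the validation plot, the pls package can choose it from the CV curve; a sketch using the one-sigma heuristic (selectNcomp() is available in recent pls versions):
# smallest ncomp whose CV error is within one standard error of the minimum
selectNcomp(pls_fit, method="onesigma", plot=TRUE)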
rf_train <- abalone_train # train
colnames(rf_train)[3:6] <- c("Whole_weight", "Shucked_weight",
"Viscera_weight", "Shell_weight")
rf_test <- abalone_test[-7] # test
colnames(rf_test)[3:6] <- c("Whole_weight", "Shucked_weight",
"Viscera_weight", "Shell_weight")
set.seed(1)
rf_mod <- randomForest(Rings ~., data=rf_train,
importance=TRUE, proximity=TRUE)
print(rf_mod)
##
## Call:
## randomForest(formula = Rings ~ ., data = rf_train, importance = TRUE, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 2.605371
## % Var explained: 52.63
rf_pred <- predict(rf_mod, newdata=rf_test, type="response")
cat("RMSE:", rmse(abalone_test$Rings, c(rf_pred)), "\n")
## RMSE: 1.499617
cat("Accuracy:", accuracy(abalone_test$Rings, round(c(rf_pred))))
## Accuracy: 0.2649007
# random forest predictions: actual vs predicted
cbind(abalone_test$Rings, rf_pred) %>% round(2) %>% head(10)
## rf_pred
## 4 7 6.45
## 5 4 3.25
## 17 4 5.51
## 18 7 6.23
## 25 7 6.86
## 27 8 8.78
## 31 7 8.92
## 41 6 5.30
## 42 11 7.11
## 51 5 5.93
Tune the model using tuneRF() from the randomForest package:
set.seed(1)
bestmtry <- tuneRF(rf_train[, -7], rf_train$Rings, stepFactor=1.5, improve=1e-5, ntree=500)
## mtry = 2 OOB error = 2.607128
## Searching left ...
## Searching right ...
## mtry = 3 OOB error = 2.62075
## -0.00522479 1e-05
print(bestmtry)
## mtry OOBError
## 2 2 2.607128
## 3 3 2.620750
Refit using the best mtry of 2. Note this equals the regression default of floor(p/3) = 2, so the refit model and its scores are identical to those above.
set.seed(1)
rf_mod <- randomForest(Rings ~., data=rf_train,
importance=TRUE, proximity=TRUE, mtry=2)
print(rf_mod)
##
## Call:
## randomForest(formula = Rings ~ ., data = rf_train, importance = TRUE, proximity = TRUE, mtry = 2)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 2.605371
## % Var explained: 52.63
rf_pred <- predict(rf_mod, newdata=rf_test, type="response")
cat("RMSE:", rmse(abalone_test$Rings, c(rf_pred)), "\n")
## RMSE: 1.499617
cat("Accuracy:", accuracy(abalone_test$Rings, round(c(rf_pred))))
## Accuracy: 0.2649007
# tuned rf predictions: identical to the untuned model above
cbind(abalone_test$Rings, rf_pred) %>% round(2) %>% head(10)
## rf_pred
## 4 7 6.45
## 5 4 3.25
## 17 4 5.51
## 18 7 6.23
## 25 7 6.86
## 27 8 8.78
## 31 7 8.92
## 41 6 5.30
## 42 11 7.11
## 51 5 5.93
The dataset ‘as is’ makes it challenging to predict the exact number of Rings with high accuracy; the dataset info sheet reports benchmark scores similar to those obtained above. The predictions below were instead obtained after encoding Rings into range bins, which gave higher accuracy and an improved RMSE. The Shiny web app should allow users to choose between regression- and classification-based predictions.
Binning Rings:
abalone_binned <- abalone
abalone_binned$Rings <- cut(abalone_binned$Rings, breaks=c(-Inf, 5, 10, +Inf))
head(abalone_binned)
## Sex Diameter Whole weight Shucked weight Viscera weight Shell weight
## 1 M 0.365 0.5140 0.2245 0.1010 0.150
## 2 M 0.265 0.2255 0.0995 0.0485 0.070
## 3 F 0.420 0.6770 0.2565 0.1415 0.210
## 4 M 0.365 0.5160 0.2155 0.1140 0.155
## 5 I 0.255 0.2050 0.0895 0.0395 0.055
## 6 I 0.300 0.3515 0.1410 0.0775 0.120
## Rings
## 1 (10, Inf]
## 2 (-Inf,5]
## 3 (5,10]
## 4 (5,10]
## 5 (-Inf,5]
## 6 (-Inf,5]
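The resulting class balance explains much of what follows; the oldest bin is comparatively rare:
# class counts for the binned target
table(abalone_binned$Rings)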
rf_mod2_dat <- abalone_binned[train_index, ]
colnames(rf_mod2_dat)[3:6] <- c("Whole_weight", "Shucked_weight",
"Viscera_weight", "Shell_weight")
rf_mod2_dat$Rings <- factor(rf_mod2_dat$Rings)  # re-factor to drop any unused bin levels after the split
set.seed(1)
rf_mod2 <- randomForest::randomForest(Rings ~., data=rf_mod2_dat,
importance=TRUE, proximity=TRUE)
print(rf_mod2)
##
## Call:
## randomForest(formula = Rings ~ ., data = rf_mod2_dat, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.27%
## Confusion matrix:
## (-Inf,5] (5,10] (10, Inf] class.error
## (-Inf,5] 769 304 0 0.2833178
## (5,10] 211 1541 15 0.1279004
## (10, Inf] 7 167 12 0.9354839
rf_mod2_dat_test <- abalone_binned[-train_index, ]
colnames(rf_mod2_dat_test)[3:6] <- c("Whole_weight", "Shucked_weight",
"Viscera_weight", "Shell_weight")
rf_pred2 <- predict(rf_mod2, newdata=rf_mod2_dat_test, type="response")
cat("Accuracy:", accuracy(as.numeric(rf_mod2_dat_test$Rings), as.numeric(rf_pred2)))
## Accuracy: 0.7774834
cat("RMSE:", rmse(as.numeric(rf_mod2_dat_test$Rings), as.numeric(rf_pred2)))
## RMSE: 0.4800662
# rf_mod2 preds
cbind(rf_mod2_dat_test$Rings, rf_pred2) %>% head(10)
## rf_pred2
## 4 2 2
## 5 1 1
## 17 1 1
## 18 2 2
## 25 2 2
## 27 2 2
## 31 2 2
## 41 2 1
## 42 3 2
## 51 1 2
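The confusion matrix above shows the rare (10, Inf] bin is almost never recovered (93.5% class error). One option worth trying is a balanced bootstrap sample per tree; a sketch, with per-class sample sizes that are illustrative and capped below the smallest class (186 training cases):
# stratified down-sampling within each tree to counter the class imbalance
set.seed(1)
rf_mod2_bal <- randomForest(Rings ~ ., data=rf_mod2_dat,
                            strata=rf_mod2_dat$Rings,
                            sampsize=c(180, 180, 180))
print(rf_mod2_bal)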
The Random Forest algorithm provided the best fit in terms of RMSE, so the Random Forest models are the ones used in production. Additional feature engineering is still required, as noted by the original researchers; see ‘abalone.names’ supplied in the /data folder (GitHub). A summary of the test-set results:

Model                                  RMSE   Accuracy
Linear regression                      1.53   0.26
Cubist                                 1.60   0.31
PCR (5 components)                     1.53   0.25
PLS (4 components)                     1.53   0.25
Random forest (regression)             1.50   0.26
Random forest (binned classification)  0.48*  0.78

*computed on the 1-3 class codes, so not directly comparable with the regression RMSEs.
View Shiny app at:
davidmh.shinyapps.io/abalone_age_predictor
Download shinytestdata.csv from the data/ folder (GitHub) and upload it to the app to get age predictions.
saveRDS(rf_mod, "Abalone_Age_Predictor/rf_mod.rds") # save rf regression model
saveRDS(rf_mod2, "Abalone_Age_Predictor/rf_mod2.rds") # save rf classification model
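Inside the Shiny app, the saved models can be loaded once at startup and applied to the uploaded CSV; a minimal sketch, assuming the upload uses the same underscore column names as rf_test and that the .rds files sit in the app directory:
rf_reg <- readRDS("rf_mod.rds")    # regression model: predicted ring count
rf_cls <- readRDS("rf_mod2.rds")   # classification model: predicted ring bin
new_data <- read.csv("shinytestdata.csv")  # uploaded measurements
predict(rf_reg, newdata=new_data)  # quantitative ring estimate
predict(rf_cls, newdata=new_data)  # binned age-range estimate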