1.1 Exploratory Analysis
library(ggplot2)
library(dplyr)
library(GGally)
library(tidyr)
library(MASS)
library(caret)
library(plotly)
library(caTools)
library(heplots)
library(MVN)
library(e1071)
library(MLmetrics)
# Import data
abalone <- read.csv("C:/Users/Tahmina/Downloads/abalone/abalone.csv")
head(abalone)
# Explore data
abalone$Sex <- factor(abalone$Sex)
dim(abalone)
[1] 4177 9
str(abalone)
'data.frame': 4177 obs. of 9 variables:
$ Sex : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ...
$ Length : int 91 70 106 88 66 85 106 109 95 110 ...
$ Diameter : int 73 53 84 73 51 60 83 85 74 88 ...
$ Height : int 19 18 27 25 16 19 30 25 25 30 ...
$ Whole.weight : num 102.8 45.1 135.4 103.2 41 ...
$ Shucked.weight: num 44.9 19.9 51.3 43.1 17.9 28.2 47.4 58.8 43.3 62.9 ...
$ Viscera.weight: num 20.2 9.7 28.3 22.8 7.9 15.5 28.3 29.9 22.5 30.2 ...
$ Shell.weight : num 30 14 42 31 11 24 66 52 33 64 ...
$ Rings : int 15 7 9 10 7 8 20 16 9 19 ...
any(is.na(abalone))
[1] FALSE
summary(abalone[,-1])
Length Diameter Height Whole.weight Shucked.weight Viscera.weight
Min. : 15.0 Min. : 11.00 Min. : 0.0 Min. : 0.4 Min. : 0.20 Min. : 0.10
1st Qu.: 90.0 1st Qu.: 70.00 1st Qu.: 23.0 1st Qu.: 88.3 1st Qu.: 37.20 1st Qu.: 18.70
Median :109.0 Median : 85.00 Median : 28.0 Median :159.9 Median : 67.20 Median : 34.20
Mean :104.8 Mean : 81.58 Mean : 27.9 Mean :165.7 Mean : 71.87 Mean : 36.12
3rd Qu.:123.0 3rd Qu.: 96.00 3rd Qu.: 33.0 3rd Qu.:230.6 3rd Qu.:100.40 3rd Qu.: 50.60
Max. :163.0 Max. :130.00 Max. :226.0 Max. :565.1 Max. :297.60 Max. :152.00
Shell.weight Rings
Min. : 0.30 Min. : 1.000
1st Qu.: 26.00 1st Qu.: 8.000
Median : 46.80 Median : 9.000
Mean : 47.77 Mean : 9.934
3rd Qu.: 65.80 3rd Qu.:11.000
Max. :201.00 Max. :29.000
# Column means:
colMeans(abalone[,-1])
Length Diameter Height Whole.weight Shucked.weight Viscera.weight Shell.weight
104.798420 81.576251 27.903280 165.748432 71.873498 36.118722 47.766172
Rings
9.933684
# Column Standard deviations:
apply(abalone[-1],2,sd)
Length Diameter Height Whole.weight Shucked.weight Viscera.weight Shell.weight
24.018583 19.847973 8.365411 98.077804 44.392590 21.922850 27.840534
Rings
3.224169
# Column ranges (max - min):
apply(abalone[-1], 2, function(x) diff(range(x)))
Length Diameter Height Whole.weight Shucked.weight Viscera.weight Shell.weight
148.0 119.0 226.0 564.7 297.4 151.9 200.7
Rings
28.0
## Multivariate summaries:
# Variance-covariance matrix
cov(abalone[,-1])
Length Diameter Height Whole.weight Shucked.weight Viscera.weight Shell.weight Rings
Length 576.89231 470.43300 166.27648 2179.6283 957.39782 475.48919 600.28688 43.11235
Diameter 470.43300 393.94204 138.42189 1801.5273 786.96808 391.49182 500.26548 36.77433
Height 166.27648 138.42189 69.98011 672.1388 287.79547 146.40670 190.35599 15.03573
Whole.weight 2179.62832 1801.52728 672.13883 9619.2556 4220.72128 2077.84653 2608.63474 170.88171
Shucked.weight 957.39782 786.96808 287.79547 4220.7213 1970.70203 906.99603 1090.83825 60.24075
Viscera.weight 475.48919 391.49182 146.40670 2077.8465 906.99603 480.61135 553.98245 35.61144
Shell.weight 600.28688 500.26548 190.35599 2608.6347 1090.83825 553.98245 775.09533 56.33267
Rings 43.11235 36.77433 15.03573 170.8817 60.24075 35.61144 56.33267 10.39527
# Correlation matrix
cor(abalone[,-1])
Length Diameter Height Whole.weight Shucked.weight Viscera.weight Shell.weight Rings
Length 1.0000000 0.9868116 0.8275536 0.9252612 0.8979137 0.9030177 0.8977056 0.5567196
Diameter 0.9868116 1.0000000 0.8336837 0.9254521 0.8931625 0.8997244 0.9053298 0.5746599
Height 0.8275536 0.8336837 1.0000000 0.8192208 0.7749723 0.7983193 0.8173380 0.5574673
Whole.weight 0.9252612 0.9254521 0.8192208 1.0000000 0.9694055 0.9663751 0.9553554 0.5403897
Shucked.weight 0.8979137 0.8931625 0.7749723 0.9694055 1.0000000 0.9319613 0.8826171 0.4208837
Viscera.weight 0.9030177 0.8997244 0.7983193 0.9663751 0.9319613 1.0000000 0.9076563 0.5038192
Shell.weight 0.8977056 0.9053298 0.8173380 0.9553554 0.8826171 0.9076563 1.0000000 0.6275740
Rings 0.5567196 0.5746599 0.5574673 0.5403897 0.4208837 0.5038192 0.6275740 1.0000000
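Each correlation above is simply the corresponding covariance rescaled by the two standard deviations, r_ij = s_ij / (s_i s_j). A minimal consistency check against the matrices just printed (S is an illustrative name; cov2cor() is base R):
S <- cov(abalone[, -1])
cov2cor(S)["Length", "Diameter"]
# equivalently 470.433 / sqrt(576.892 * 393.942), roughly 0.9868, matching the correlation entry above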
# Visualization:
ggpairs(abalone, aes(colour = Sex, alpha = 0.7), legend = 1, upper = list(continuous = wrap("cor",size=2)), title="Pair plot for abalone dataset") +
theme(plot.title = element_text(hjust = 0.5))

abalone %>% ggplot(aes(Sex)) +
geom_bar(fill = 'blue') +
ggtitle('Countplot of Sex') +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.4)

1.4 Multiclass Classification
1.4.1 Linear Discriminant Analysis (LDA)
# Fit LDA model with the original data; CV = TRUE returns leave-one-out cross-validated class predictions
raw.lda <- lda(Sex ~ Height + Length + Diameter, data = abalone, CV = TRUE)
# Confusion matrix
conf_matrix <- confusionMatrix(raw.lda$class, abalone$Sex)
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction F I M
F 337 19 323
I 195 903 283
M 731 365 857
Overall Statistics
Accuracy : 0.5226
95% CI : (0.507, 0.5381)
No Information Rate : 0.3646
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.2755
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: F Class: I Class: M
Sensitivity 0.26683 0.7016 0.5858
Specificity 0.87564 0.8247 0.5702
Pos Pred Value 0.49632 0.6539 0.4388
Neg Pred Value 0.72226 0.8541 0.7058
Prevalence 0.31473 0.3207 0.3646
Detection Rate 0.08398 0.2250 0.2136
Detection Prevalence 0.16920 0.3441 0.4867
Balanced Accuracy 0.57123 0.7631 0.5780
# Extract accuracy from confusion matrix
accuracy <- conf_matrix$overall["Accuracy"]
paste('Accuracy:', accuracy)
[1] "Accuracy: 0.522551706952405"
# Fit LDA model with transformed data
scaled.lda <- lda(Sex~Height+length2+diameter2, data = scaled, CV = TRUE)
# Confusion matrix
conf_matrix <- confusionMatrix(scaled.lda$class, scaled$Sex)
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction F I M
F 345 15 329
I 212 933 322
M 706 339 812
Overall Statistics
Accuracy : 0.5208
95% CI : (0.5052, 0.5364)
No Information Rate : 0.3646
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.274
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: F Class: I Class: M
Sensitivity 0.27316 0.7249 0.5550
Specificity 0.87491 0.8041 0.5902
Pos Pred Value 0.50073 0.6360 0.4373
Neg Pred Value 0.72383 0.8610 0.6981
Prevalence 0.31473 0.3207 0.3646
Detection Rate 0.08597 0.2325 0.2023
Detection Prevalence 0.17169 0.3656 0.4627
Balanced Accuracy 0.57403 0.7645 0.5726
# Extract accuracy from confusion matrix
accuracy <- conf_matrix$overall["Accuracy"]
paste('Accuracy:', accuracy)
[1] "Accuracy: 0.520807376027909"
# Covariance equality testing: Box's M test on the LDA predictors
scaled.lm <- lm(cbind(Height, length2, diameter2)~Sex, data = scaled)
boxM(scaled.lm)
Box's M-test for Homogeneity of Covariance Matrices
data: Y
Chi-Sq (approx.) = 631.25, df = 12, p-value < 2.2e-16
# Normality testing
mvn(scaled[2:4])
(mvn output omitted here; the call returns $multivariateNormality, $univariateNormality and $Descriptives tables)
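Box's M overwhelmingly rejects equal covariance matrices across the three sexes, and the mvn() call checks the multivariate-normality assumption that both discriminant methods share. The covariance assumption is exactly what separates LDA from QDA: under the usual Gaussian model an observation x is assigned to the class k with the largest score

\[
\delta_k(x) = -\tfrac{1}{2}\log\lvert\Sigma_k\rvert - \tfrac{1}{2}\,(x-\mu_k)^{\top}\Sigma_k^{-1}(x-\mu_k) + \log\pi_k ,
\]

and LDA is the special case \(\Sigma_k = \Sigma\) for every class, in which the quadratic term is common to all classes and drops out, leaving linear boundaries. Since equal covariances are rejected here, relaxing that restriction with QDA in the next subsection is a natural follow-up.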
1.4.2 Quadratic Discriminant Analysis (QDA)
# Fit QDA model with original data
raw.qda <- qda(Sex~Height+Length+Diameter, data = abalone, CV = TRUE)
# Confusion matrix
conf_matrix <- confusionMatrix(raw.qda$class, abalone$Sex)
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction F I M
F 418 74 452
I 205 915 325
M 640 298 686
Overall Statistics
Accuracy : 0.5031
95% CI : (0.4875, 0.5187)
No Information Rate : 0.3646
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.2505
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: F Class: I Class: M
Sensitivity 0.3310 0.7110 0.4689
Specificity 0.8087 0.8056 0.6322
Pos Pred Value 0.4428 0.6332 0.4224
Neg Pred Value 0.7247 0.8551 0.6748
Prevalence 0.3147 0.3207 0.3646
Detection Rate 0.1042 0.2280 0.1709
Detection Prevalence 0.2352 0.3601 0.4047
Balanced Accuracy 0.5698 0.7583 0.5505
# Extract accuracy from confusion matrix
accuracy <- conf_matrix$overall["Accuracy"]
paste('Accuracy:', accuracy)
[1] "Accuracy: 0.503114876650885"
# Fit QDA model with transformed data
scaled.qda <- qda(Sex~Height+length2+diameter2, data = scaled, CV = TRUE)
# Confusion matrix
conf_matrix <- confusionMatrix(scaled.qda$class, scaled$Sex)
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction F I M
F 362 52 352
I 252 977 381
M 649 258 730
Overall Statistics
Accuracy : 0.5156
95% CI : (0.5, 0.5311)
No Information Rate : 0.3646
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.2688
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: F Class: I Class: M
Sensitivity 0.28662 0.7591 0.4990
Specificity 0.85309 0.7678 0.6443
Pos Pred Value 0.47258 0.6068 0.4459
Neg Pred Value 0.72251 0.8710 0.6915
Prevalence 0.31473 0.3207 0.3646
Detection Rate 0.09021 0.2435 0.1819
Detection Prevalence 0.19088 0.4012 0.4079
Balanced Accuracy 0.56986 0.7635 0.5716
# Extract accuracy from confusion matrix
accuracy <- conf_matrix$overall["Accuracy"]
paste('Accuracy:', accuracy)
[1] "Accuracy: 0.515574383254423"
1.4.3 Support Vector Machine (SVM)
# Support Vector Machine (SVM) with the transformed dataset
F1_Score <- function(confusion) {
  # F1 score from a confusion table (rows = truth, columns = predictions); first level = positive class
  precision <- confusion[1, 1] / sum(confusion[, 1])
  recall <- confusion[1, 1] / sum(confusion[1, ])
  2 * precision * recall / (precision + recall)
}
svm.cv <- function(k, scaled, acc = TRUE) {
  # k-fold cross-validation of an SVM (radial kernel by default) predicting Sex;
  # returns the per-fold accuracy (acc = TRUE) or per-fold F1 score (acc = FALSE)
  kfolds <- createFolds(scaled$Sex, k)
  cv <- function(x) {
    tr <- scaled[-x, ]   # training folds
    tst <- scaled[x, ]   # held-out fold
    fit <- svm(factor(Sex) ~ ., data = tr)
    pred <- predict(fit, tst[-1])
    confusion <- table(tst$Sex, pred)   # rows = truth, columns = predictions
    if (acc) {
      correct <- sum(diag(confusion))
      total <- sum(confusion)
      return(correct / total)
    } else {
      return(F1_Score(confusion))
    }
  }
  cv.eval <- lapply(kfolds, cv)
  return(cv.eval)
}
set.seed(123)
# Call svm.cv with the correct arguments
results <- svm.cv(10, scaled, TRUE)
mean_accuracy <- mean(unlist(results)) # Unlist the results and calculate the mean
paste('Accuracy with 10 folds:', mean_accuracy)
[1] "Accuracy with 10 folds: 0.520279532487375"
# Tuning the SVM: grid search over gamma and cost
#tuned.svm <- tune.svm(Sex~Height+length2+diameter2,data=scaled, kernel="radial", gamma = 10^(-1:1), cost = 10^(-1:1))
#tuned.svm$best.model
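If the tuning above is uncommented and run, tune.svm() stores the winning fit in its best.model component, which can then be used like any other svm object. A sketch under that assumption, kept commented out as well since it depends on tuned.svm existing:
#best.svm <- tuned.svm$best.model
#mean(predict(best.svm, scaled) == scaled$Sex)   # resubstitution accuracy of the tuned model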
1.5 Binary Classification
1.5.1 Classification for Infant
# Binary classification for Infant
I.df <- scaled
levels(I.df$Sex) = c('Non I', 'I', 'Non I')
##QDA
inf.qda <- qda(Sex~., data = I.df, CV = TRUE)
table(I.df$Sex, inf.qda$class)
Non I I
Non I 2272 454
I 421 866
# Confusion matrix to show how well the QDA classifies the data
confusion_matrix <- table(inf.qda$class, I.df$Sex)
# Calculate accuracy and F1-score
cm <- confusionMatrix(as.factor(inf.qda$class), I.df$Sex)
accuracy_value <- cm$overall['Accuracy']
f1_score_value <- cm$byClass['F1']
paste('Accuracy of Infants vs Non-Infants QDA:', accuracy_value)
[1] "Accuracy of Infants vs Non-Infants QDA: 0.781958634438076"
paste('F1 Score of Infants vs Non-Infants QDA:', f1_score_value)
[1] "F1 Score of Infants vs Non-Infants QDA: 0.838531094297841"
#SVM
set.seed(123)
paste('SVM Model accuracy', mean(as.numeric(svm.cv(10,I.df))))
[1] "SVM Model accuracy 0.787953003064478"
paste('SVM F1 Score accuracy', mean(as.numeric(svm.cv(10,I.df, TRUE))))
[1] "SVM F1 Score accuracy 0.788681632361881"
#logistic regression
set.seed (123)
tr.idx <- createDataPartition (I.df$Sex, p = 0.8, list=FALSE)
ds.train <- I.df[tr.idx, ]
ds.tst <- I.df[-tr.idx, ]
log.reg <- glm(Sex~., data = ds.train, family= 'binomial')
summary (log.reg)
Call:
glm(formula = Sex ~ ., family = "binomial", data = ds.train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.12228 0.05188 -21.633 < 2e-16 ***
Height -0.87499 0.11134 -7.859 3.88e-15 ***
length2 0.94160 0.27395 3.437 0.000588 ***
diameter2 -1.75102 0.28715 -6.098 1.08e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4029.5 on 3210 degrees of freedom
Residual deviance: 2861.8 on 3207 degrees of freedom
AIC: 2869.8
Number of Fisher Scoring iterations: 5
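The coefficients are on the log-odds scale for the second level of Sex ('I' here); exponentiating gives odds ratios (a one-line sketch):
exp(coef(log.reg))   # multiplicative change in the odds of being an infant per unit of each transformed predictor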
pred <- predict(log.reg, ds.tst)
pred.class <- ifelse (pred>0.5, 'I', 'Non I')
table(ds.tst$Sex, pred.class)
pred.class
I Non I
Non I 34 511
I 118 139
# Confusion matrix to show how well the logistic regression classifies the data
confusion_matrix <- table(pred.class, ds.tst$Sex)
# Calculate accuracy and F1-score
cm <- confusionMatrix(as.factor(pred.class), ds.tst$Sex)
Warning: Levels are not in the same order for reference and data. Refactoring data to match.
accuracy_value <- cm$overall['Accuracy']
f1_score_value <- cm$byClass['F1']
paste('Accuracy of Infants vs Non-Infants Logistic regression:', accuracy_value)
[1] "Accuracy of Infants vs Non-Infants Logistic regression: 0.78428927680798"
paste('F1 Score of Infants vs Non-Infants Logistic regression:', f1_score_value)
[1] "F1 Score of Infants vs Non-Infants Logistic regression: 0.855230125523013"
1.5.2 Classification for Female
# Binary classification for Female
F.df <- scaled
levels(F.df$Sex) = c('F', 'Non F', 'Non F')
##QDA
fm.qda <- qda(Sex~., data = F.df, CV = TRUE)
table(F.df$Sex, fm.qda$class)
F Non F
F 343 920
Non F 365 2385
# Confusion matrix to show how well the QDA classifies the data
confusion_matrix <- table(fm.qda$class, F.df$Sex)
# Calculate accuracy and F1-score
cm <- confusionMatrix(as.factor(fm.qda$class), F.df$Sex)
accuracy_value <- cm$overall['Accuracy']
f1_score_value <- cm$byClass['F1']
paste('Accuracy of Female vs Non-Female QDA:', accuracy_value)
[1] "Accuracy of Female vs Non-Female QDA: 0.679790680289061"
paste('F1 Score of Female vs Non-Female QDA:', f1_score_value)
[1] "F1 Score of Female vs Non-Female QDA: 0.3480466768138"
#SVM
set.seed(123)
paste('SVM Model accuracy', mean(as.numeric(svm.cv(10,F.df))))
[1] "SVM Model accuracy 0.689005719532016"
paste('SVM F1 Score accuracy', mean(as.numeric(svm.cv(10,F.df, TRUE))))
[1] "SVM F1 Score accuracy 0.687522487314053"
#logistic regression
set.seed (123)
tr.idx <- createDataPartition (F.df$Sex, p = 0.8, list=FALSE)
ds.train <- F.df[tr.idx, ]
ds.tst <- F.df[-tr.idx, ]
log.reg <- glm(Sex~., data = ds.train, family= 'binomial')
summary (log.reg)
Call:
glm(formula = Sex ~ ., family = "binomial", data = ds.train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.88862 0.04200 21.157 < 2e-16 ***
Height -0.46637 0.08441 -5.525 3.3e-08 ***
length2 0.18196 0.19364 0.940 0.34737
diameter2 -0.51679 0.19764 -2.615 0.00893 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4000.5 on 3210 degrees of freedom
Residual deviance: 3647.6 on 3207 degrees of freedom
AIC: 3655.6
Number of Fisher Scoring iterations: 4
pred <- predict(log.reg, ds.tst)
pred.class <- ifelse (pred>0.5, 'F', 'Non F')
table(ds.tst$Sex, pred.class)
pred.class
F Non F
F 122 130
Non F 411 139
# Confusion matrix to show how well the logistic regression classifies the data
confusion_matrix <- table(pred.class, ds.tst$Sex)
# Calculate accuracy and F1-score
cm <- confusionMatrix(as.factor(pred.class), ds.tst$Sex)
accuracy_value <- cm$overall['Accuracy']
f1_score_value <- cm$byClass['F1']
paste('Accuracy of Female vs Non-Female Logistic regression:', accuracy_value)
[1] "Accuracy of Female vs Non-Female Logistic regression: 0.325436408977556"
paste('F1 Score of Female vs Non-Female Logistic regression:', f1_score_value)
[1] "F1 Score of Female vs Non-Female Logistic regression: 0.310828025477707"
1.5.3 Classification for Male
# Binary classification for Male
M.df <- scaled
levels(M.df$Sex) = c('Non M', 'Non M', 'M')
##QDA
m.qda <- qda(Sex~., data = M.df, CV = TRUE)
table(M.df$Sex, m.qda$class)
Non M M
Non M 2185 365
M 1182 281
# Confusion matrix to show how well the QDA classifies the data
confusion_matrix <- table(m.qda$class, M.df$Sex)
# Calculate accuracy and F1-score
cm <- confusionMatrix(as.factor(m.qda$class), M.df$Sex)
accuracy_value <- cm$overall['Accuracy']
f1_score_value <- cm$byClass['F1']
paste('Accuracy of Male vs Non-Male QDA:', accuracy_value)
[1] "Accuracy of Male vs Non-Male QDA: 0.614502865686519"
paste('F1 Score of Male vs Non-Male QDA:', f1_score_value)
[1] "F1 Score of Male vs Non-Male QDA: 0.738549940848403"
#SVM
set.seed(123)
paste('SVM Model accuracy', mean(as.numeric(svm.cv(10,M.df))))
[1] "SVM Model accuracy 0.635435664569919"
paste('SVM F1 Score accuracy', mean(as.numeric(svm.cv(10,M.df, TRUE))))
[1] "SVM F1 Score accuracy 0.635435664569919"
#logistic regression
set.seed (123)
tr.idx <- createDataPartition (M.df$Sex, p = 0.8, list=FALSE)
ds.train <- M.df[tr.idx, ]
ds.tst <- M.df[-tr.idx, ]
log.reg <- glm(Sex~., data = ds.train, family= 'binomial')
summary (log.reg)
Call:
glm(formula = Sex ~ ., family = "binomial", data = ds.train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.59378 0.03819 -15.549 <2e-16 ***
Height 0.24859 0.08083 3.075 0.0021 **
length2 0.10510 0.18804 0.559 0.5762
diameter2 0.18335 0.19317 0.949 0.3425
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4213.3 on 3210 degrees of freedom
Residual deviance: 4027.5 on 3207 degrees of freedom
AIC: 4035.5
Number of Fisher Scoring iterations: 4
pred <- predict(log.reg, ds.tst)
pred.class <- ifelse (pred>0.5, 'M', 'Non M')
table(ds.tst$Sex, pred.class)
pred.class
M Non M
Non M 0 510
M 2 290
# Confusion matrix to show how well the logistic regression classifies the data
confusion_matrix <- table(pred.class, ds.tst$Sex)
# Calculate accuracy and F1-score
cm <- confusionMatrix(as.factor(pred.class), ds.tst$Sex)
Warning: Levels are not in the same order for reference and data. Refactoring data to match.
accuracy_value <- cm$overall['Accuracy']
f1_score_value <- cm$byClass['F1']
paste('Accuracy of Male vs Non-Male Logistic regression:', accuracy_value)
[1] "Accuracy of Male vs Non-Male Logistic regression: 0.638403990024938"
paste('F1 Score of Male vs Non-Male Logistic regression:', f1_score_value)
[1] "F1 Score of Male vs Non-Male Logistic regression: 0.778625954198473"