#Problem
# Predicting the age of abalone from physical measurements.
#
#Contents
#
# * Import the Dataset
# * Exploratory Data Analysis
# * Corelation Between The Data
# * Spliting the Dataset
# * Building the Model
# * Accuracy of Model
# * Conclusion
#Import the Dataset
#
# Import the dataset and assign the column names for each fields and view the dataset
dataset=read.csv("abalone(1).data",header = F)
colnames(dataset)=c("Sex","Length","Diameter","Height","Whole weight","Shucked weight","Viscera weight","Shell weight","Rings")
head(dataset)
## Sex Length Diameter Height Whole weight Shucked weight Viscera weight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## Shell weight Rings
## 1 0.150 15
## 2 0.070 7
## 3 0.210 9
## 4 0.155 10
## 5 0.055 7
## 6 0.120 8
# Exploratory Data Analysis
#
# Target Attributes
#
# Target attribute is Rings
#
# FREQUENCY BASED ON RINGS
#
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
str(dataset)
## 'data.frame': 4177 obs. of 9 variables:
## $ Sex : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ...
## $ Length : num 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
## $ Diameter : num 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
## $ Height : num 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
## $ Whole weight : num 0.514 0.226 0.677 0.516 0.205 ...
## $ Shucked weight: num 0.2245 0.0995 0.2565 0.2155 0.0895 ...
## $ Viscera weight: num 0.101 0.0485 0.1415 0.114 0.0395 ...
## $ Shell weight : num 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
## $ Rings : int 15 7 9 10 7 8 20 16 9 19 ...
summary(dataset)
## Sex Length Diameter Height
## F:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## I:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## M:1528 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## Whole weight Shucked weight Viscera weight Shell weight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.934
## 3rd Qu.:11.000
## Max. :29.000
fivenum(dataset$Rings)
## [1] 1 8 9 11 29
quantile(dataset$Rings)
## 0% 25% 50% 75% 100%
## 1 8 9 11 29
library(ggplot2)
dataset %>% ggplot(aes(x=Rings))+geom_histogram(fill="lightblue",color="Darkblue",binwidth = 2,alpha=0.5)

dataset %>% ggplot(aes(x=Rings))+geom_bar(fill="steelblue",alpha=0.6)

boxplot(dataset$Rings,col = "steelblue",horizontal = TRUE,border="black",xlab=("Rings"))

# Thus the most frequent values of the attribute Rings are highly concentrated around the median of the distribution value of 9.
#
# Sex Attribute
table(dataset$Sex)
##
## F I M
## 1307 1342 1528
ggplot(dataset) + aes(Rings, fill = Sex) + geom_density(alpha=0.6)

ggplot(dataset,aes(x=Rings,y=Sex,fill=Sex))+geom_col()+facet_grid(~Sex)

# From the above plots we know that the female abalone has longer lifespan than others but the count of male abalone is maximum than others.
# Weight Attribute
# Whole weight
#
summary(dataset$`Whole weight`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0020 0.4415 0.7995 0.8287 1.1530 2.8255
vect1<-round(dataset$`Whole weight`,0)
ggplot(dataset,aes(y=`Whole weight`,x=2,fill="Orange"))+geom_boxplot()

ggplot(dataset,aes(y=factor(vect1),x=Rings,fill=factor(vect1)))+geom_col()+facet_grid(~factor(vect1))+ylab("Whole weight")

#
#Shell weight
#
boxplot(dataset$`Shell weight`,col = "darkgreen",border="black",ylab="Shell weight")

ggplot(dataset,aes(x=`Shell weight`,fill="Orange"))+geom_bar(binwidth = 0.02)
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.

# From the above plots we know that the weight of abalone (1.5 to 2.4)gm has longer lifespan than others but the weight of abalone (0.5 to 1.4)gm has maximum count than others.
#
#Corelation Between The Data
library(corrplot)
## corrplot 0.84 loaded
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
corrgram(dataset[dataset$Rings,],order=T,lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Abalone data")

# Spliting the Dataset
#
clean_dataset<-dataset %>% mutate(age=case_when(Rings %in% 1:5~"young",Rings %in% 6:13~"adult",Rings %in% 14:30~"old"))
clean_dataset<-clean_dataset %>% select(c(2,3,5,8,10)) %>% na.omit()
head(clean_dataset)
## Length Diameter Whole weight Shell weight age
## 1 0.455 0.365 0.5140 0.150 old
## 2 0.350 0.265 0.2255 0.070 adult
## 3 0.530 0.420 0.6770 0.210 adult
## 4 0.440 0.365 0.5160 0.155 adult
## 5 0.330 0.255 0.2050 0.055 adult
## 6 0.425 0.300 0.3515 0.120 adult
library(caTools)
set.seed(123)
sample=sample.split(clean_dataset,SplitRatio = 0.8)
traindata<-subset(clean_dataset,sample==T)
head(traindata)
## Length Diameter Whole weight Shell weight age
## 1 0.455 0.365 0.5140 0.150 old
## 2 0.350 0.265 0.2255 0.070 adult
## 3 0.530 0.420 0.6770 0.210 adult
## 4 0.440 0.365 0.5160 0.155 adult
## 6 0.425 0.300 0.3515 0.120 adult
## 7 0.530 0.415 0.7775 0.330 old
testdata<-subset(clean_dataset,sample==F)
# Building the Model
library(rpart)
model <-rpart(formula=age~.,data=traindata,method="class")
summary(model)
## Call:
## rpart(formula = age ~ ., data = traindata, method = "class")
## n= 3342
##
## CP nsplit rel error xerror xstd
## 1 0.11992620 0 1.0000000 1.0000000 0.03931662
## 2 0.01537515 1 0.8800738 0.8985240 0.03763267
## 3 0.01291513 5 0.8136531 0.9059041 0.03776042
## 4 0.01000000 6 0.8007380 0.8874539 0.03743941
##
## Variable importance
## Shell weight Diameter Whole weight Length
## 27 25 25 23
##
## Node number 1: 3342 observations, complexity param=0.1199262
## predicted class=adult expected loss=0.1621783 P(node) =1
## class counts: 2800 390 152
## probabilities: 0.838 0.117 0.045
## left son=2 (3169 obs) right son=3 (173 obs)
## Primary splits:
## Diameter < 0.2225 to the right, improve=128.1876, (0 missing)
## Shell weight < 0.03875 to the right, improve=128.1826, (0 missing)
## Whole weight < 0.12375 to the right, improve=127.3570, (0 missing)
## Length < 0.2925 to the right, improve=124.2091, (0 missing)
## Surrogate splits:
## Shell weight < 0.03825 to the right, agree=0.994, adj=0.879, (0 split)
## Length < 0.2975 to the right, agree=0.993, adj=0.861, (0 split)
## Whole weight < 0.12375 to the right, agree=0.992, adj=0.844, (0 split)
##
## Node number 2: 3169 observations, complexity param=0.01537515
## predicted class=adult expected loss=0.1334806 P(node) =0.9482346
## class counts: 2746 390 33
## probabilities: 0.867 0.123 0.010
## left son=4 (2541 obs) right son=5 (628 obs)
## Primary splits:
## Shell weight < 0.35975 to the left, improve=36.30877, (0 missing)
## Whole weight < 0.71925 to the left, improve=24.35723, (0 missing)
## Diameter < 0.3775 to the left, improve=20.48195, (0 missing)
## Length < 0.4775 to the left, improve=17.86371, (0 missing)
## Surrogate splits:
## Whole weight < 1.29475 to the left, agree=0.932, adj=0.656, (0 split)
## Diameter < 0.5025 to the left, agree=0.916, adj=0.578, (0 split)
## Length < 0.6325 to the left, agree=0.914, adj=0.567, (0 split)
##
## Node number 3: 173 observations
## predicted class=young expected loss=0.3121387 P(node) =0.05176541
## class counts: 54 0 119
## probabilities: 0.312 0.000 0.688
##
## Node number 4: 2541 observations
## predicted class=adult expected loss=0.09720582 P(node) =0.7603232
## class counts: 2294 214 33
## probabilities: 0.903 0.084 0.013
##
## Node number 5: 628 observations, complexity param=0.01537515
## predicted class=adult expected loss=0.2802548 P(node) =0.1879114
## class counts: 452 176 0
## probabilities: 0.720 0.280 0.000
## left son=10 (549 obs) right son=11 (79 obs)
## Primary splits:
## Shell weight < 0.54875 to the left, improve=15.133450, (0 missing)
## Whole weight < 1.141 to the right, improve=14.238410, (0 missing)
## Length < 0.6475 to the right, improve=11.376540, (0 missing)
## Diameter < 0.4875 to the right, improve= 8.246593, (0 missing)
## Surrogate splits:
## Whole weight < 1.93575 to the left, agree=0.904, adj=0.241, (0 split)
## Length < 0.7275 to the left, agree=0.903, adj=0.228, (0 split)
## Diameter < 0.5825 to the left, agree=0.898, adj=0.190, (0 split)
##
## Node number 10: 549 observations, complexity param=0.01537515
## predicted class=adult expected loss=0.2386157 P(node) =0.1642729
## class counts: 418 131 0
## probabilities: 0.761 0.239 0.000
## left son=20 (497 obs) right son=21 (52 obs)
## Primary splits:
## Whole weight < 1.141 to the right, improve=18.015200, (0 missing)
## Length < 0.6475 to the right, improve=15.687500, (0 missing)
## Diameter < 0.4925 to the right, improve=11.447150, (0 missing)
## Shell weight < 0.36075 to the right, improve= 1.591267, (0 missing)
## Surrogate splits:
## Diameter < 0.4675 to the right, agree=0.927, adj=0.231, (0 split)
## Length < 0.5675 to the right, agree=0.923, adj=0.192, (0 split)
## Shell weight < 0.36025 to the right, agree=0.907, adj=0.019, (0 split)
##
## Node number 11: 79 observations, complexity param=0.01537515
## predicted class=old expected loss=0.4303797 P(node) =0.02363854
## class counts: 34 45 0
## probabilities: 0.430 0.570 0.000
## left son=22 (51 obs) right son=23 (28 obs)
## Primary splits:
## Length < 0.6975 to the right, improve=9.063309, (0 missing)
## Diameter < 0.5525 to the right, improve=6.534747, (0 missing)
## Whole weight < 1.992 to the right, improve=6.138023, (0 missing)
## Shell weight < 0.68775 to the left, improve=3.888023, (0 missing)
## Surrogate splits:
## Diameter < 0.5525 to the right, agree=0.911, adj=0.750, (0 split)
## Whole weight < 1.85225 to the right, agree=0.886, adj=0.679, (0 split)
## Shell weight < 0.556 to the right, agree=0.696, adj=0.143, (0 split)
##
## Node number 20: 497 observations
## predicted class=adult expected loss=0.1971831 P(node) =0.1487133
## class counts: 399 98 0
## probabilities: 0.803 0.197 0.000
##
## Node number 21: 52 observations
## predicted class=old expected loss=0.3653846 P(node) =0.01555955
## class counts: 19 33 0
## probabilities: 0.365 0.635 0.000
##
## Node number 22: 51 observations, complexity param=0.01291513
## predicted class=adult expected loss=0.3921569 P(node) =0.01526032
## class counts: 31 20 0
## probabilities: 0.608 0.392 0.000
## left son=44 (42 obs) right son=45 (9 obs)
## Primary splits:
## Shell weight < 0.69275 to the left, improve=5.3930910, (0 missing)
## Diameter < 0.5975 to the right, improve=2.3986680, (0 missing)
## Whole weight < 1.992 to the right, improve=1.2403660, (0 missing)
## Length < 0.7325 to the right, improve=0.9155773, (0 missing)
##
## Node number 23: 28 observations
## predicted class=old expected loss=0.1071429 P(node) =0.008378217
## class counts: 3 25 0
## probabilities: 0.107 0.893 0.000
##
## Node number 44: 42 observations
## predicted class=adult expected loss=0.2857143 P(node) =0.01256732
## class counts: 30 12 0
## probabilities: 0.714 0.286 0.000
##
## Node number 45: 9 observations
## predicted class=old expected loss=0.1111111 P(node) =0.002692998
## class counts: 1 8 0
## probabilities: 0.111 0.889 0.000
library(rpart.plot)
rpart.plot(model)

predicted_model<-predict(object=model,newdata=testdata,type="class")
confusion.matrix<-table(testdata$age,predicted_model)
confusion.matrix
## predicted_model
## adult old young
## adult 675 6 17
## old 89 11 0
## young 8 0 29
# Accuracy of Model
#
accuracy<-sum(diag(confusion.matrix))/sum(confusion.matrix)
accuracy
## [1] 0.8562874
#
# Conclusion
#
# *By observing the correlation between the target attribute Rings and the indepent variables, we conclude that it is possible to build a model to predict the target value in function of the independent attributes.
# *The weight of the Abalones varies proportional to their sizes
# *There's no significant differences in size, weight and numbers of rigns between male/female abalones
# *The Infant Abalones groups presents lower mean values of size, weight and number of rings