Problem
Predicting the age of abalone from physical measurements.
Contents
Import the Dataset
Import the dataset and assign the column names for each fields and view the dataset
dataset=read.csv(file="abalone(1).data")
colnames(dataset)=c("Sex","Length","Diameter","Height","Whole weight","Shucked weight"
,"Viscera weight","Shell weight","Rings")
head(dataset)
## Sex Length Diameter Height Whole weight Shucked weight Viscera weight
## 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 5 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## 6 F 0.530 0.415 0.150 0.7775 0.2370 0.1415
## Shell weight Rings
## 1 0.070 7
## 2 0.210 9
## 3 0.155 10
## 4 0.055 7
## 5 0.120 8
## 6 0.330 20
There is no missing values in the data set Lets see the Structure and Summary of the given Dataset
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
str(dataset)
## 'data.frame': 4176 obs. of 9 variables:
## $ Sex : Factor w/ 3 levels "F","I","M": 3 1 3 2 2 1 1 3 1 1 ...
## $ Length : num 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 0.525 ...
## $ Diameter : num 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 0.38 ...
## $ Height : num 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 0.14 ...
## $ Whole weight : num 0.226 0.677 0.516 0.205 0.351 ...
## $ Shucked weight: num 0.0995 0.2565 0.2155 0.0895 0.141 ...
## $ Viscera weight: num 0.0485 0.1415 0.114 0.0395 0.0775 ...
## $ Shell weight : num 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 0.21 ...
## $ Rings : int 7 9 10 7 8 20 16 9 19 14 ...
summary(dataset)
## Sex Length Diameter Height
## F:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## I:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## M:1527 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## Whole weight Shucked weight Viscera weight Shell weight
## Min. :0.0020 Min. :0.0010 Min. :0.00050 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.09337 1st Qu.:0.1300
## Median :0.7997 Median :0.3360 Median :0.17100 Median :0.2340
## Mean :0.8288 Mean :0.3594 Mean :0.18061 Mean :0.2389
## 3rd Qu.:1.1533 3rd Qu.:0.5020 3rd Qu.:0.25300 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.76000 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.932
## 3rd Qu.:11.000
## Max. :29.000
quantile(dataset$Rings)
## 0% 25% 50% 75% 100%
## 1 8 9 11 29
Exploratory Data Analysis
Target Attributes
Target attribute is Rings
FREQUENCY BASED ON RINGS
library(ggplot2)
dataset %>% ggplot(aes(x=Rings))+geom_histogram(fill="lightblue",color="Darkblue",binwidth = 2,alpha=0.5)
dataset %>% ggplot(aes(x=Rings))+geom_bar(fill="steelblue",alpha=0.6)
boxplot(dataset$Rings,col = "steelblue",horizontal = TRUE,border="black",xlab=("Rings"))
Thus the most frequent values of the attribute Rings are highly concentrated around the median of the distribution value of 9.
Sex Attribute
table(dataset$Sex)
##
## F I M
## 1307 1342 1527
ggplot(dataset) + aes(Rings, fill = Sex) + geom_density(alpha=0.6)
ggplot(dataset,aes(x=Rings,y=Sex,fill=Sex))+geom_col()+facet_grid(~Sex)
From the above plots we know that the female abalone has longer lifespan than others but the count of male abalone is maximum than others.
Weight Attribute Whole weight
summary(dataset$`Whole weight`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0020 0.4415 0.7997 0.8288 1.1533 2.8255
vect1<-round(dataset$`Whole weight`,0)
ggplot(dataset,aes(y=`Whole weight`,x=2,fill="Orange"))+geom_boxplot()
ggplot(dataset,aes(y=factor(vect1),x=Rings,fill=factor(vect1)))+geom_col()+facet_grid(~factor(vect1))+ylab("Whole weight")
Shell weight
boxplot(dataset$`Shell weight`,col = "darkgreen",border="black",ylab="Shell weight")
ggplot(dataset,aes(x=`Shell weight`,fill="Orange"))+geom_bar(binwidth = 0.02)
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.
From the above plots we know that the weight of abalone (1.5 to 2.4)gm has longer lifespan than others but the weight of abalone (0.5 to 1.4)gm has maximum count than others.
Corelation Between The Data
library(corrplot)
## corrplot 0.84 loaded
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
corrgram(dataset,order=T,lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Abalone data")
Spliting the Dataset
clean_dataset<-dataset %>% mutate(age=case_when(Rings %in% 1:5~"young",Rings %in% 6:13~"adult",Rings %in% 14:30~"old"))
clean_dataset<-clean_dataset %>% select(c(2,3,5,8,10)) %>% na.omit()
head(clean_dataset)
## Length Diameter Whole weight Shell weight age
## 1 0.350 0.265 0.2255 0.070 adult
## 2 0.530 0.420 0.6770 0.210 adult
## 3 0.440 0.365 0.5160 0.155 adult
## 4 0.330 0.255 0.2050 0.055 adult
## 5 0.425 0.300 0.3515 0.120 adult
## 6 0.530 0.415 0.7775 0.330 old
library(caTools)
set.seed(123)
sample=sample.split(clean_dataset,SplitRatio = 0.8)
traindata<-subset(clean_dataset,sample==T)
head(traindata)
## Length Diameter Whole weight Shell weight age
## 1 0.350 0.265 0.2255 0.070 adult
## 2 0.530 0.420 0.6770 0.210 adult
## 3 0.440 0.365 0.5160 0.155 adult
## 4 0.330 0.255 0.2050 0.055 adult
## 6 0.530 0.415 0.7775 0.330 old
## 7 0.545 0.425 0.7680 0.260 old
testdata<-subset(clean_dataset,sample==F)
Building the Model
library(rpart)
model <-rpart(formula=age~.,data=traindata,method="class")
summary(model)
## Call:
## rpart(formula = age ~ ., data = traindata, method = "class")
## n= 3341
##
## CP nsplit rel error xerror xstd
## 1 0.12343470 0 1.0000000 1.0000000 0.03859530
## 2 0.01118068 1 0.8765653 0.8855098 0.03673415
## 3 0.01073345 7 0.8067979 0.8729875 0.03651832
## 4 0.01000000 9 0.7853309 0.8729875 0.03651832
##
## Variable importance
## Shell weight Diameter Whole weight Length
## 27 25 24 24
##
## Node number 1: 3341 observations, complexity param=0.1234347
## predicted class=adult expected loss=0.1673152 P(node) =1
## class counts: 2782 403 156
## probabilities: 0.833 0.121 0.047
## left son=2 (3160 obs) right son=3 (181 obs)
## Primary splits:
## Diameter < 0.2225 to the right, improve=134.5335, (0 missing)
## Length < 0.2875 to the right, improve=131.4100, (0 missing)
## Shell weight < 0.03525 to the right, improve=131.1960, (0 missing)
## Whole weight < 0.12375 to the right, improve=129.8779, (0 missing)
## Surrogate splits:
## Shell weight < 0.03625 to the right, agree=0.994, adj=0.895, (0 split)
## Length < 0.2975 to the right, agree=0.993, adj=0.878, (0 split)
## Whole weight < 0.1265 to the right, agree=0.993, adj=0.878, (0 split)
##
## Node number 2: 3160 observations, complexity param=0.01118068
## predicted class=adult expected loss=0.1373418 P(node) =0.9458246
## class counts: 2726 403 31
## probabilities: 0.863 0.128 0.010
## left son=4 (2538 obs) right son=5 (622 obs)
## Primary splits:
## Shell weight < 0.35775 to the left, improve=45.62611, (0 missing)
## Whole weight < 0.75875 to the left, improve=26.06479, (0 missing)
## Diameter < 0.3775 to the left, improve=21.87421, (0 missing)
## Length < 0.4775 to the left, improve=19.01801, (0 missing)
## Surrogate splits:
## Whole weight < 1.341 to the left, agree=0.930, adj=0.646, (0 split)
## Diameter < 0.5025 to the left, agree=0.914, adj=0.563, (0 split)
## Length < 0.6325 to the left, agree=0.913, adj=0.556, (0 split)
##
## Node number 3: 181 observations
## predicted class=young expected loss=0.3093923 P(node) =0.0541754
## class counts: 56 0 125
## probabilities: 0.309 0.000 0.691
##
## Node number 4: 2538 observations
## predicted class=adult expected loss=0.0965327 P(node) =0.7596528
## class counts: 2293 214 31
## probabilities: 0.903 0.084 0.012
##
## Node number 5: 622 observations, complexity param=0.01118068
## predicted class=adult expected loss=0.3038585 P(node) =0.1861718
## class counts: 433 189 0
## probabilities: 0.696 0.304 0.000
## left son=10 (387 obs) right son=11 (235 obs)
## Primary splits:
## Length < 0.6475 to the right, improve=14.53104, (0 missing)
## Shell weight < 0.68775 to the left, improve=13.18727, (0 missing)
## Diameter < 0.4875 to the right, improve=12.85866, (0 missing)
## Whole weight < 1.10375 to the right, improve=11.71015, (0 missing)
## Surrogate splits:
## Diameter < 0.5125 to the right, agree=0.852, adj=0.609, (0 split)
## Whole weight < 1.38375 to the right, agree=0.797, adj=0.464, (0 split)
## Shell weight < 0.3905 to the right, agree=0.717, adj=0.251, (0 split)
##
## Node number 10: 387 observations, complexity param=0.01118068
## predicted class=adult expected loss=0.2196382 P(node) =0.1158336
## class counts: 302 85 0
## probabilities: 0.780 0.220 0.000
## left son=20 (338 obs) right son=21 (49 obs)
## Primary splits:
## Shell weight < 0.579 to the left, improve=15.5442400, (0 missing)
## Diameter < 0.5275 to the left, improve= 1.9524010, (0 missing)
## Whole weight < 1.67575 to the left, improve= 1.3805140, (0 missing)
## Length < 0.6675 to the right, improve= 0.5388445, (0 missing)
## Surrogate splits:
## Diameter < 0.5875 to the left, agree=0.902, adj=0.224, (0 split)
## Whole weight < 2.23725 to the left, agree=0.899, adj=0.204, (0 split)
## Length < 0.7425 to the left, agree=0.884, adj=0.082, (0 split)
##
## Node number 11: 235 observations, complexity param=0.01118068
## predicted class=adult expected loss=0.4425532 P(node) =0.07033822
## class counts: 131 104 0
## probabilities: 0.557 0.443 0.000
## left son=22 (161 obs) right son=23 (74 obs)
## Primary splits:
## Shell weight < 0.418 to the left, improve=5.920902, (0 missing)
## Whole weight < 1.10375 to the right, improve=5.394776, (0 missing)
## Length < 0.5925 to the right, improve=3.646195, (0 missing)
## Diameter < 0.4675 to the right, improve=3.646195, (0 missing)
## Surrogate splits:
## Whole weight < 1.43125 to the left, agree=0.740, adj=0.176, (0 split)
## Diameter < 0.5225 to the left, agree=0.698, adj=0.041, (0 split)
##
## Node number 20: 338 observations
## predicted class=adult expected loss=0.1656805 P(node) =0.1011673
## class counts: 282 56 0
## probabilities: 0.834 0.166 0.000
##
## Node number 21: 49 observations
## predicted class=old expected loss=0.4081633 P(node) =0.01466627
## class counts: 20 29 0
## probabilities: 0.408 0.592 0.000
##
## Node number 22: 161 observations, complexity param=0.01118068
## predicted class=adult expected loss=0.3664596 P(node) =0.04818916
## class counts: 102 59 0
## probabilities: 0.634 0.366 0.000
## left son=44 (86 obs) right son=45 (75 obs)
## Primary splits:
## Whole weight < 1.24275 to the right, improve=9.1193140, (0 missing)
## Length < 0.5925 to the right, improve=8.2848600, (0 missing)
## Diameter < 0.4675 to the right, improve=7.5577640, (0 missing)
## Shell weight < 0.41025 to the right, improve=0.7317899, (0 missing)
## Surrogate splits:
## Length < 0.6225 to the right, agree=0.770, adj=0.507, (0 split)
## Diameter < 0.4775 to the right, agree=0.714, adj=0.387, (0 split)
## Shell weight < 0.36025 to the right, agree=0.621, adj=0.187, (0 split)
##
## Node number 23: 74 observations, complexity param=0.01073345
## predicted class=old expected loss=0.3918919 P(node) =0.02214906
## class counts: 29 45 0
## probabilities: 0.392 0.608 0.000
## left son=46 (16 obs) right son=47 (58 obs)
## Primary splits:
## Length < 0.6425 to the right, improve=3.5676840, (0 missing)
## Diameter < 0.4925 to the right, improve=2.7804740, (0 missing)
## Shell weight < 0.54 to the left, improve=1.9702700, (0 missing)
## Whole weight < 1.4285 to the right, improve=0.4676095, (0 missing)
##
## Node number 44: 86 observations
## predicted class=adult expected loss=0.2093023 P(node) =0.0257408
## class counts: 68 18 0
## probabilities: 0.791 0.209 0.000
##
## Node number 45: 75 observations, complexity param=0.01118068
## predicted class=old expected loss=0.4533333 P(node) =0.02244837
## class counts: 34 41 0
## probabilities: 0.453 0.547 0.000
## left son=90 (53 obs) right son=91 (22 obs)
## Primary splits:
## Diameter < 0.4625 to the right, improve=4.5901430, (0 missing)
## Length < 0.5925 to the right, improve=4.2482960, (0 missing)
## Whole weight < 1.10375 to the right, improve=2.6625110, (0 missing)
## Shell weight < 0.36425 to the left, improve=0.8030326, (0 missing)
## Surrogate splits:
## Length < 0.5825 to the right, agree=0.867, adj=0.545, (0 split)
## Whole weight < 0.993 to the right, agree=0.827, adj=0.409, (0 split)
## Shell weight < 0.4025 to the left, agree=0.720, adj=0.045, (0 split)
##
## Node number 46: 16 observations
## predicted class=adult expected loss=0.3125 P(node) =0.004788985
## class counts: 11 5 0
## probabilities: 0.688 0.312 0.000
##
## Node number 47: 58 observations
## predicted class=old expected loss=0.3103448 P(node) =0.01736007
## class counts: 18 40 0
## probabilities: 0.310 0.690 0.000
##
## Node number 90: 53 observations, complexity param=0.01073345
## predicted class=adult expected loss=0.4339623 P(node) =0.01586351
## class counts: 30 23 0
## probabilities: 0.566 0.434 0.000
## left son=180 (41 obs) right son=181 (12 obs)
## Primary splits:
## Whole weight < 1.2105 to the left, improve=3.0987110, (0 missing)
## Diameter < 0.5075 to the left, improve=2.8886680, (0 missing)
## Length < 0.6275 to the left, improve=1.1338900, (0 missing)
## Shell weight < 0.36425 to the left, improve=0.3285855, (0 missing)
## Surrogate splits:
## Diameter < 0.5025 to the left, agree=0.811, adj=0.167, (0 split)
## Length < 0.6375 to the left, agree=0.792, adj=0.083, (0 split)
##
## Node number 91: 22 observations
## predicted class=old expected loss=0.1818182 P(node) =0.006584855
## class counts: 4 18 0
## probabilities: 0.182 0.818 0.000
##
## Node number 180: 41 observations
## predicted class=adult expected loss=0.3414634 P(node) =0.01227177
## class counts: 27 14 0
## probabilities: 0.659 0.341 0.000
##
## Node number 181: 12 observations
## predicted class=old expected loss=0.25 P(node) =0.003591739
## class counts: 3 9 0
## probabilities: 0.250 0.750 0.000
library(rpart.plot)
rpart.plot(model)
predicted_model<-predict(object=model,newdata=testdata,type="class")
confusion.matrix<-table(testdata$age,predicted_model)
confusion.matrix
## predicted_model
## adult old young
## adult 681 20 15
## old 72 14 0
## young 10 0 23
Accuracy of Model
accuracy<-sum(diag(confusion.matrix))/sum(confusion.matrix)
accuracy
## [1] 0.8598802
Conclusion
*By observing the correlation between the target attribute Rings and the indepent variables, we conclude that it is possible to build a model to predict the target value in function of the independent attributes.
*The weight of the Abalones varies proportional to their sizes
*There’s no significant differences in size, weight and numbers of rigns between male/female abalones
*The Infant Abalones groups presents lower mean values of size, weight and number of rings