arunpro.utf8.md

#Problem
#   Predicting the age of abalone from physical measurements.
# 
#Contents
#   
# * Import the Dataset
# * Exploratory Data Analysis
# * Corelation Between The Data
# * Spliting the Dataset
# * Building the Model
# * Accuracy of Model
# * Conclusion 

#Import the Dataset
#   
#   Import the dataset and assign the column names for each fields and view the dataset

dataset=read.csv("abalone(1).data",header = F)
colnames(dataset)=c("Sex","Length","Diameter","Height","Whole weight","Shucked weight","Viscera weight","Shell weight","Rings")
head(dataset)

##   Sex Length Diameter Height Whole weight Shucked weight Viscera weight
## 1   M  0.455    0.365  0.095       0.5140         0.2245         0.1010
## 2   M  0.350    0.265  0.090       0.2255         0.0995         0.0485
## 3   F  0.530    0.420  0.135       0.6770         0.2565         0.1415
## 4   M  0.440    0.365  0.125       0.5160         0.2155         0.1140
## 5   I  0.330    0.255  0.080       0.2050         0.0895         0.0395
## 6   I  0.425    0.300  0.095       0.3515         0.1410         0.0775
##   Shell weight Rings
## 1        0.150    15
## 2        0.070     7
## 3        0.210     9
## 4        0.155    10
## 5        0.055     7
## 6        0.120     8

# Exploratory Data Analysis
#   
# Target Attributes
#   
#   Target attribute is Rings
# 
# FREQUENCY BASED ON  RINGS
# 
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

str(dataset)

## 'data.frame':    4177 obs. of  9 variables:
##  $ Sex           : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ...
##  $ Length        : num  0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ Diameter      : num  0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ Height        : num  0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ Whole weight  : num  0.514 0.226 0.677 0.516 0.205 ...
##  $ Shucked weight: num  0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ Viscera weight: num  0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ Shell weight  : num  0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ Rings         : int  15 7 9 10 7 8 20 16 9 19 ...

summary(dataset)

##  Sex          Length         Diameter          Height      
##  F:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  I:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  M:1528   Median :0.545   Median :0.4250   Median :0.1400  
##           Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##           3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##           Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##   Whole weight    Shucked weight   Viscera weight    Shell weight   
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.0005   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300  
##  Median :0.7995   Median :0.3360   Median :0.1710   Median :0.2340  
##  Mean   :0.8287   Mean   :0.3594   Mean   :0.1806   Mean   :0.2388  
##  3rd Qu.:1.1530   3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.7600   Max.   :1.0050  
##      Rings       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.934  
##  3rd Qu.:11.000  
##  Max.   :29.000

fivenum(dataset$Rings)

## [1]  1  8  9 11 29

quantile(dataset$Rings)

##   0%  25%  50%  75% 100% 
##    1    8    9   11   29

library(ggplot2)
dataset %>% ggplot(aes(x=Rings))+geom_histogram(fill="lightblue",color="Darkblue",binwidth = 2,alpha=0.5)

dataset %>% ggplot(aes(x=Rings))+geom_bar(fill="steelblue",alpha=0.6)

boxplot(dataset$Rings,col = "steelblue",horizontal = TRUE,border="black",xlab=("Rings"))

# Thus the most frequent values of the attribute Rings are highly concentrated around the median of the distribution value  of 9.
# 
# Sex Attribute
table(dataset$Sex)

## 
##    F    I    M 
## 1307 1342 1528

ggplot(dataset) + aes(Rings, fill = Sex) +  geom_density(alpha=0.6)

ggplot(dataset,aes(x=Rings,y=Sex,fill=Sex))+geom_col()+facet_grid(~Sex)

# From the above plots we know that the female abalone has longer lifespan than others but the count of male abalone is maximum than others.

# Weight Attribute
#   Whole weight
# 
  
summary(dataset$`Whole weight`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0020  0.4415  0.7995  0.8287  1.1530  2.8255

vect1<-round(dataset$`Whole weight`,0)   
ggplot(dataset,aes(y=`Whole weight`,x=2,fill="Orange"))+geom_boxplot()

ggplot(dataset,aes(y=factor(vect1),x=Rings,fill=factor(vect1)))+geom_col()+facet_grid(~factor(vect1))+ylab("Whole weight")

# 
#Shell  weight
# 
boxplot(dataset$`Shell weight`,col = "darkgreen",border="black",ylab="Shell weight")

ggplot(dataset,aes(x=`Shell weight`,fill="Orange"))+geom_bar(binwidth = 0.02)

## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.

# From the above plots we know that the weight of abalone (1.5 to 2.4)gm  has longer lifespan than others but the weight of abalone (0.5 to 1.4)gm has maximum  count than others.
# 
#Corelation Between The Data

library(corrplot)

## corrplot 0.84 loaded

library(corrgram)

## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus

corrgram(dataset[dataset$Rings,],order=T,lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Abalone data")

# Spliting the Dataset
#  

clean_dataset<-dataset %>% mutate(age=case_when(Rings %in% 1:5~"young",Rings %in% 6:13~"adult",Rings %in% 14:30~"old"))
clean_dataset<-clean_dataset %>% select(c(2,3,5,8,10)) %>% na.omit()
head(clean_dataset)

##   Length Diameter Whole weight Shell weight   age
## 1  0.455    0.365       0.5140        0.150   old
## 2  0.350    0.265       0.2255        0.070 adult
## 3  0.530    0.420       0.6770        0.210 adult
## 4  0.440    0.365       0.5160        0.155 adult
## 5  0.330    0.255       0.2050        0.055 adult
## 6  0.425    0.300       0.3515        0.120 adult

library(caTools)
set.seed(123)
sample=sample.split(clean_dataset,SplitRatio = 0.8)
traindata<-subset(clean_dataset,sample==T)
head(traindata)

##   Length Diameter Whole weight Shell weight   age
## 1  0.455    0.365       0.5140        0.150   old
## 2  0.350    0.265       0.2255        0.070 adult
## 3  0.530    0.420       0.6770        0.210 adult
## 4  0.440    0.365       0.5160        0.155 adult
## 6  0.425    0.300       0.3515        0.120 adult
## 7  0.530    0.415       0.7775        0.330   old

testdata<-subset(clean_dataset,sample==F) 


# Building the Model

library(rpart)
model <-rpart(formula=age~.,data=traindata,method="class")
summary(model)

## Call:
## rpart(formula = age ~ ., data = traindata, method = "class")
##   n= 3342 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.11992620      0 1.0000000 1.0000000 0.03931662
## 2 0.01537515      1 0.8800738 0.8985240 0.03763267
## 3 0.01291513      5 0.8136531 0.9059041 0.03776042
## 4 0.01000000      6 0.8007380 0.8874539 0.03743941
## 
## Variable importance
## Shell weight     Diameter Whole weight       Length 
##           27           25           25           23 
## 
## Node number 1: 3342 observations,    complexity param=0.1199262
##   predicted class=adult  expected loss=0.1621783  P(node) =1
##     class counts:  2800   390   152
##    probabilities: 0.838 0.117 0.045 
##   left son=2 (3169 obs) right son=3 (173 obs)
##   Primary splits:
##       Diameter     < 0.2225  to the right, improve=128.1876, (0 missing)
##       Shell weight < 0.03875 to the right, improve=128.1826, (0 missing)
##       Whole weight < 0.12375 to the right, improve=127.3570, (0 missing)
##       Length       < 0.2925  to the right, improve=124.2091, (0 missing)
##   Surrogate splits:
##       Shell weight < 0.03825 to the right, agree=0.994, adj=0.879, (0 split)
##       Length       < 0.2975  to the right, agree=0.993, adj=0.861, (0 split)
##       Whole weight < 0.12375 to the right, agree=0.992, adj=0.844, (0 split)
## 
## Node number 2: 3169 observations,    complexity param=0.01537515
##   predicted class=adult  expected loss=0.1334806  P(node) =0.9482346
##     class counts:  2746   390    33
##    probabilities: 0.867 0.123 0.010 
##   left son=4 (2541 obs) right son=5 (628 obs)
##   Primary splits:
##       Shell weight < 0.35975 to the left,  improve=36.30877, (0 missing)
##       Whole weight < 0.71925 to the left,  improve=24.35723, (0 missing)
##       Diameter     < 0.3775  to the left,  improve=20.48195, (0 missing)
##       Length       < 0.4775  to the left,  improve=17.86371, (0 missing)
##   Surrogate splits:
##       Whole weight < 1.29475 to the left,  agree=0.932, adj=0.656, (0 split)
##       Diameter     < 0.5025  to the left,  agree=0.916, adj=0.578, (0 split)
##       Length       < 0.6325  to the left,  agree=0.914, adj=0.567, (0 split)
## 
## Node number 3: 173 observations
##   predicted class=young  expected loss=0.3121387  P(node) =0.05176541
##     class counts:    54     0   119
##    probabilities: 0.312 0.000 0.688 
## 
## Node number 4: 2541 observations
##   predicted class=adult  expected loss=0.09720582  P(node) =0.7603232
##     class counts:  2294   214    33
##    probabilities: 0.903 0.084 0.013 
## 
## Node number 5: 628 observations,    complexity param=0.01537515
##   predicted class=adult  expected loss=0.2802548  P(node) =0.1879114
##     class counts:   452   176     0
##    probabilities: 0.720 0.280 0.000 
##   left son=10 (549 obs) right son=11 (79 obs)
##   Primary splits:
##       Shell weight < 0.54875 to the left,  improve=15.133450, (0 missing)
##       Whole weight < 1.141   to the right, improve=14.238410, (0 missing)
##       Length       < 0.6475  to the right, improve=11.376540, (0 missing)
##       Diameter     < 0.4875  to the right, improve= 8.246593, (0 missing)
##   Surrogate splits:
##       Whole weight < 1.93575 to the left,  agree=0.904, adj=0.241, (0 split)
##       Length       < 0.7275  to the left,  agree=0.903, adj=0.228, (0 split)
##       Diameter     < 0.5825  to the left,  agree=0.898, adj=0.190, (0 split)
## 
## Node number 10: 549 observations,    complexity param=0.01537515
##   predicted class=adult  expected loss=0.2386157  P(node) =0.1642729
##     class counts:   418   131     0
##    probabilities: 0.761 0.239 0.000 
##   left son=20 (497 obs) right son=21 (52 obs)
##   Primary splits:
##       Whole weight < 1.141   to the right, improve=18.015200, (0 missing)
##       Length       < 0.6475  to the right, improve=15.687500, (0 missing)
##       Diameter     < 0.4925  to the right, improve=11.447150, (0 missing)
##       Shell weight < 0.36075 to the right, improve= 1.591267, (0 missing)
##   Surrogate splits:
##       Diameter     < 0.4675  to the right, agree=0.927, adj=0.231, (0 split)
##       Length       < 0.5675  to the right, agree=0.923, adj=0.192, (0 split)
##       Shell weight < 0.36025 to the right, agree=0.907, adj=0.019, (0 split)
## 
## Node number 11: 79 observations,    complexity param=0.01537515
##   predicted class=old    expected loss=0.4303797  P(node) =0.02363854
##     class counts:    34    45     0
##    probabilities: 0.430 0.570 0.000 
##   left son=22 (51 obs) right son=23 (28 obs)
##   Primary splits:
##       Length       < 0.6975  to the right, improve=9.063309, (0 missing)
##       Diameter     < 0.5525  to the right, improve=6.534747, (0 missing)
##       Whole weight < 1.992   to the right, improve=6.138023, (0 missing)
##       Shell weight < 0.68775 to the left,  improve=3.888023, (0 missing)
##   Surrogate splits:
##       Diameter     < 0.5525  to the right, agree=0.911, adj=0.750, (0 split)
##       Whole weight < 1.85225 to the right, agree=0.886, adj=0.679, (0 split)
##       Shell weight < 0.556   to the right, agree=0.696, adj=0.143, (0 split)
## 
## Node number 20: 497 observations
##   predicted class=adult  expected loss=0.1971831  P(node) =0.1487133
##     class counts:   399    98     0
##    probabilities: 0.803 0.197 0.000 
## 
## Node number 21: 52 observations
##   predicted class=old    expected loss=0.3653846  P(node) =0.01555955
##     class counts:    19    33     0
##    probabilities: 0.365 0.635 0.000 
## 
## Node number 22: 51 observations,    complexity param=0.01291513
##   predicted class=adult  expected loss=0.3921569  P(node) =0.01526032
##     class counts:    31    20     0
##    probabilities: 0.608 0.392 0.000 
##   left son=44 (42 obs) right son=45 (9 obs)
##   Primary splits:
##       Shell weight < 0.69275 to the left,  improve=5.3930910, (0 missing)
##       Diameter     < 0.5975  to the right, improve=2.3986680, (0 missing)
##       Whole weight < 1.992   to the right, improve=1.2403660, (0 missing)
##       Length       < 0.7325  to the right, improve=0.9155773, (0 missing)
## 
## Node number 23: 28 observations
##   predicted class=old    expected loss=0.1071429  P(node) =0.008378217
##     class counts:     3    25     0
##    probabilities: 0.107 0.893 0.000 
## 
## Node number 44: 42 observations
##   predicted class=adult  expected loss=0.2857143  P(node) =0.01256732
##     class counts:    30    12     0
##    probabilities: 0.714 0.286 0.000 
## 
## Node number 45: 9 observations
##   predicted class=old    expected loss=0.1111111  P(node) =0.002692998
##     class counts:     1     8     0
##    probabilities: 0.111 0.889 0.000

library(rpart.plot)
rpart.plot(model)

predicted_model<-predict(object=model,newdata=testdata,type="class")
confusion.matrix<-table(testdata$age,predicted_model)
confusion.matrix

##        predicted_model
##         adult old young
##   adult   675   6    17
##   old      89  11     0
##   young     8   0    29

# Accuracy of Model
# 

accuracy<-sum(diag(confusion.matrix))/sum(confusion.matrix)
accuracy

## [1] 0.8562874

# 
# Conclusion
#   
# *By observing the correlation between the target attribute Rings and the indepent variables, we conclude that it is possible to build a model to predict the target value in function of the independent attributes.
# *The weight of the Abalones varies proportional to their sizes
# *There's no significant differences in size, weight and numbers of rigns between male/female abalones
# *The Infant Abalones groups presents lower mean values of size, weight and number of rings

arunpro.R

ELCOT

2019-10-09