Description of Title:
Predicting the age of abalone from physical measurements. The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope – a boring and time-consuming task. Other measurements, which are easier to obtain, are used to predict the age. Further information, such as weather patterns and location (hence food availability), may be required to solve the problem.
Missing values have been removed from the original data set. Various plots are drawn to analyse the dataset, and a calculation is finally made to predict the age of the abalone.
# Load the abalone measurements from the CSV file in the working directory
# and list the columns that were read.
Abalone_data <- read.csv("abalone.csv")
colnames(Abalone_data)
## [1] "Sex" "Length" "Diameter" "Height"
## [5] "Whole.weight" "Shucked.weight" "Viscera.weight" "Shell.weight"
## [9] "Rings"
# Number of rows and columns in the dataset
dim(Abalone_data)
## [1] 4177 9
# Column names
colnames(Abalone_data)
## [1] "Sex" "Length" "Diameter" "Height"
## [5] "Whole.weight" "Shucked.weight" "Viscera.weight" "Shell.weight"
## [9] "Rings"
# Compact overview of every column: its type and first few values
str(object = Abalone_data)
## 'data.frame': 4177 obs. of 9 variables:
## $ Sex : chr "M" "M" "F" "M" ...
## $ Length : num 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
## $ Diameter : num 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
## $ Height : num 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
## $ Whole.weight : num 0.514 0.226 0.677 0.516 0.205 ...
## $ Shucked.weight: num 0.2245 0.0995 0.2565 0.2155 0.0895 ...
## $ Viscera.weight: num 0.101 0.0485 0.1415 0.114 0.0395 ...
## $ Shell.weight : num 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
## $ Rings : int 15 7 9 10 7 8 20 16 9 19 ...
# Per-column summary statistics (min, quartiles, mean, max for numeric columns)
summary(object = Abalone_data)
## Sex Length Diameter Height
## Length:4177 Min. :0.075 Min. :0.0550 Min. :0.0000
## Class :character 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## Mode :character Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## Whole.weight Shucked.weight Viscera.weight Shell.weight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.934
## 3rd Qu.:11.000
## Max. :29.000
# First six rows of the dataset
head(x = Abalone_data, n = 6L)
## Sex Length Diameter Height Whole.weight Shucked.weight Viscera.weight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## Shell.weight Rings
## 1 0.150 15
## 2 0.070 7
## 3 0.210 9
## 4 0.155 10
## 5 0.055 7
## 6 0.120 8
# Last six rows of the dataset
tail(x = Abalone_data, n = 6L)
## Sex Length Diameter Height Whole.weight Shucked.weight Viscera.weight
## 4172 M 0.560 0.430 0.155 0.8675 0.4000 0.1720
## 4173 F 0.565 0.450 0.165 0.8870 0.3700 0.2390
## 4174 M 0.590 0.440 0.135 0.9660 0.4390 0.2145
## 4175 M 0.600 0.475 0.205 1.1760 0.5255 0.2875
## 4176 F 0.625 0.485 0.150 1.0945 0.5310 0.2610
## 4177 M 0.710 0.555 0.195 1.9485 0.9455 0.3765
## Shell.weight Rings
## 4172 0.2290 8
## 4173 0.2490 11
## 4174 0.2605 10
## 4175 0.3080 9
## 4176 0.2960 10
## 4177 0.4950 12
# Convert the character Sex column to a factor so it is treated as categorical
class(Abalone_data[["Sex"]])
## [1] "character"
Abalone_data[["Sex"]] <- factor(Abalone_data[["Sex"]])
class(Abalone_data[["Sex"]])
## [1] "factor"
levels(Abalone_data[["Sex"]])
## [1] "F" "I" "M"
# Two-way contingency table: Sex (rows) against ring count (columns)
table(Abalone_data[["Sex"]], Abalone_data[["Rings"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## F 0 0 0 0 4 16 44 122 238 248 200 128 88 56 41 30 26 19 15
## I 1 1 12 51 100 216 267 274 173 92 62 21 24 14 10 7 7 5 2
## M 0 0 3 6 11 27 80 172 278 294 225 118 91 56 52 30 25 18 15
##
## 20 21 22 23 24 25 26 27 29
## F 12 7 3 6 1 1 0 1 1
## I 2 1 0 0 0 0 0 0 0
## M 12 6 3 3 1 0 1 1 0
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
# Number of observations recorded for each Sex level
Abalone_data %>%
  group_by(Sex) %>%
  summarise(No_of_Observation = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
## Sex No_of_Observation
## <fct> <int>
## 1 F 1307
## 2 I 1342
## 3 M 1528
# Bar chart of the number of observations per Sex level
ggplot(Abalone_data, aes(Sex, fill = Sex)) +
  geom_bar()
# Difference between the whole weight and the sum of the three component
# weights (viscera, shucked, shell). A negative value means the components
# add up to more than the whole weight; such rows are counted further below.
Abalone_data$weight.diff <- with(
  Abalone_data,
  Whole.weight - (Viscera.weight + Shucked.weight + Shell.weight)
)
# Distribution of the difference over ALL observations. The previous title
# claimed the plot showed only rows with a negative difference, which it
# does not, so the title now describes what is actually drawn.
ggplot(Abalone_data, aes(x = weight.diff)) +
  geom_histogram(colour = "dodgerblue", fill = rgb(1, .54, 0, .7), bins = 30) +
  scale_y_continuous(name = "count") +
  labs(title = "Histogram of abalone weight difference (whole - components)")
# Implausible records in the dataset
# Rows whose component weights exceed the whole weight
negative_diff <- Abalone_data$weight.diff < 0
nrow(Abalone_data[negative_diff, ])
## [1] 155
# Rows with a recorded height of exactly zero
zero_height <- Abalone_data$Height == 0
nrow(Abalone_data[zero_height, ])
## [1] 2
# Correlation of each physical measurement with the ring count, printed in
# the order the measurements are listed.
measurement_cols <- c(
  "Length", "Diameter", "Height", "Whole.weight",
  "Shucked.weight", "Viscera.weight", "Shell.weight"
)
for (measurement in measurement_cols) {
  print(cor(Abalone_data[[measurement]], Abalone_data$Rings))
}
## [1] 0.5567196
## [1] 0.5746599
## [1] 0.5574673
## [1] 0.5403897
## [1] 0.4208837
## [1] 0.5038192
## [1] 0.627574
# Rings against shell weight, coloured by Sex, with a linear fit per Sex
ggplot(Abalone_data, aes(x = Shell.weight, y = Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Rings against length, coloured by Sex, with a linear fit per Sex
ggplot(Abalone_data, aes(x = Length, y = Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# ************************Simple Linear Regression**************************************************
#Splitting the dataset into the
#Training Set and Testing set
library(caTools)
## Warning: package 'caTools' was built under R version 4.0.2
# Fix the RNG seed so the split below is reproducible
set.seed(123)
# Logical mask from caTools: TRUE marks rows assigned to the training set
# (SplitRatio = 0.8), FALSE marks rows held out for testing.
split <- sample.split(Abalone_data$Diameter, SplitRatio = 0.8)
training_set <- Abalone_data[split, ]
test_set <- Abalone_data[!split, ]
# Fit a simple linear regression on the training set.
# NOTE(review): the response here is Diameter and the predictor is Length;
# neither Rings nor age enters this model - confirm that is intended given
# the stated goal of predicting age.
regressor <- lm(Diameter ~ Length, data = training_set)
summary(regressor)
##
## Call:
## lm(formula = Diameter ~ Length, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.089312 -0.008774 -0.000443 0.008653 0.243101
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.018853 0.001240 -15.2 <2e-16 ***
## Length 0.814873 0.002308 353.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01602 on 3336 degrees of freedom
## Multiple R-squared: 0.9739, Adjusted R-squared: 0.9739
## F-statistic: 1.247e+05 on 1 and 3336 DF, p-value: < 2.2e-16
# Predicted Diameter for every row of the held-out test set
y_pred <- predict(regressor, newdata = test_set)
# Visualising the training set: observed points in red, fitted line on top.
library(ggplot2)
# Columns are mapped by name with `data =` instead of `training_set$...`
# inside aes(), so the axes are labelled "Length" / "Diameter".
# The title previously read "diameter Vs Exp", but there is no "Exp"
# variable in this analysis; the plot shows Diameter against Length.
ggplot(training_set, aes(x = Length, y = Diameter)) +
  geom_point(colour = "red") +
  geom_line(aes(y = predict(regressor, newdata = training_set))) +
  ggtitle("Diameter vs Length (Training)")
# Visualising the test set: observed points in red, model predictions in blue.
# Columns are mapped by name with `data =` instead of `test_set$...` inside
# aes(), so the axes are labelled "Length" / "Diameter"; a title is added to
# match the training-set plot.
ggplot(test_set, aes(x = Length, y = Diameter)) +
  geom_point(colour = "red") +
  geom_line(aes(y = predict(regressor, newdata = test_set)), colour = "blue") +
  ggtitle("Diameter vs Length (Test)")
# *********************************************Predicting the age**************************************
# Age in years is the ring count plus 1.5.
# The value is deliberately NOT passed through round(): R rounds halves to
# the even digit, so round(Rings + 1.5) gave 16 for 15 rings (+1) but 12 for
# 10 rings (+2), i.e. an offset that depended on the parity of the ring count.
Abalone_data$age <- Abalone_data$Rings + 1.5
# NOTE(review): the "predicted_" columns below hold the observed Rings column
# and the age derived from it, not the output of a fitted model - confirm the
# labels are intended.
head(
  data.frame(
    Sex = Abalone_data$Sex,
    predicted_rings = Abalone_data$Rings,
    predicted_age = Abalone_data$age
  )
)
## Sex predicted_rings predicted_age
## 1 M 15 16.5
## 2 M 7 8.5
## 3 F 9 10.5
## 4 M 10 11.5
## 5 I 7 8.5
## 6 I 8 9.5
The task of predicting the age of abalone was approached systematically in this report. First the dataset was explored and unusual observations were discovered; then the variables were appropriately standardized, linear regression was performed, and the age was predicted.