Analysing the Abalone Dataset: Exploratory Plots and Predicting the Age of Abalone

Description of Title:

Predicting the age of abalone from physical measurements. The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope – a boring and time-consuming task. Other measurements, which are easier to obtain, are used to predict the age. Further information, such as weather patterns and location (hence food availability), may be required to solve the problem.

Missing values have been removed from the original data set. Various plots are drawn to analyse the dataset, and a calculation is finally made to predict the age of abalone.

# Load the abalone measurements from the CSV file in the working directory.
Abalone_data <- read.csv("abalone.csv")
colnames(Abalone_data)
## [1] "Sex"            "Length"         "Diameter"       "Height"        
## [5] "Whole.weight"   "Shucked.weight" "Viscera.weight" "Shell.weight"  
## [9] "Rings"
# Number of rows and columns
dim(Abalone_data)
## [1] 4177    9
# Column names
colnames(Abalone_data)
## [1] "Sex"            "Length"         "Diameter"       "Height"        
## [5] "Whole.weight"   "Shucked.weight" "Viscera.weight" "Shell.weight"  
## [9] "Rings"
# Type and first few values of every column
str(Abalone_data)
## 'data.frame':    4177 obs. of  9 variables:
##  $ Sex           : chr  "M" "M" "F" "M" ...
##  $ Length        : num  0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ Diameter      : num  0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ Height        : num  0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ Whole.weight  : num  0.514 0.226 0.677 0.516 0.205 ...
##  $ Shucked.weight: num  0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ Viscera.weight: num  0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ Shell.weight  : num  0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ Rings         : int  15 7 9 10 7 8 20 16 9 19 ...
# Per-column summary statistics
summary(object = Abalone_data)
##      Sex                Length         Diameter          Height      
##  Length:4177        Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  Class :character   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  Mode  :character   Median :0.545   Median :0.4250   Median :0.1400  
##                     Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##                     3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##                     Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##   Whole.weight    Shucked.weight   Viscera.weight    Shell.weight   
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.0005   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300  
##  Median :0.7995   Median :0.3360   Median :0.1710   Median :0.2340  
##  Mean   :0.8287   Mean   :0.3594   Mean   :0.1806   Mean   :0.2388  
##  3rd Qu.:1.1530   3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.7600   Max.   :1.0050  
##      Rings       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.934  
##  3rd Qu.:11.000  
##  Max.   :29.000
# First six rows of the dataset
head(Abalone_data, n = 6L)
##   Sex Length Diameter Height Whole.weight Shucked.weight Viscera.weight
## 1   M  0.455    0.365  0.095       0.5140         0.2245         0.1010
## 2   M  0.350    0.265  0.090       0.2255         0.0995         0.0485
## 3   F  0.530    0.420  0.135       0.6770         0.2565         0.1415
## 4   M  0.440    0.365  0.125       0.5160         0.2155         0.1140
## 5   I  0.330    0.255  0.080       0.2050         0.0895         0.0395
## 6   I  0.425    0.300  0.095       0.3515         0.1410         0.0775
##   Shell.weight Rings
## 1        0.150    15
## 2        0.070     7
## 3        0.210     9
## 4        0.155    10
## 5        0.055     7
## 6        0.120     8
# Last six rows of the dataset
tail(Abalone_data, n = 6L)
##      Sex Length Diameter Height Whole.weight Shucked.weight Viscera.weight
## 4172   M  0.560    0.430  0.155       0.8675         0.4000         0.1720
## 4173   F  0.565    0.450  0.165       0.8870         0.3700         0.2390
## 4174   M  0.590    0.440  0.135       0.9660         0.4390         0.2145
## 4175   M  0.600    0.475  0.205       1.1760         0.5255         0.2875
## 4176   F  0.625    0.485  0.150       1.0945         0.5310         0.2610
## 4177   M  0.710    0.555  0.195       1.9485         0.9455         0.3765
##      Shell.weight Rings
## 4172       0.2290     8
## 4173       0.2490    11
## 4174       0.2605    10
## 4175       0.3080     9
## 4176       0.2960    10
## 4177       0.4950    12
# Sex is read in as a character column; turn it into a factor so that it is
# treated as a categorical variable.
class(Abalone_data[["Sex"]])
## [1] "character"
Abalone_data[["Sex"]] <- factor(Abalone_data[["Sex"]])
class(Abalone_data[["Sex"]])
## [1] "factor"
levels(Abalone_data[["Sex"]])
## [1] "F" "I" "M"
# Two-way contingency table: Sex in rows, ring count in columns
table(Abalone_data[["Sex"]], Abalone_data[["Rings"]])
##    
##       1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
##   F   0   0   0   0   4  16  44 122 238 248 200 128  88  56  41  30  26  19  15
##   I   1   1  12  51 100 216 267 274 173  92  62  21  24  14  10   7   7   5   2
##   M   0   0   3   6  11  27  80 172 278 294 225 118  91  56  52  30  25  18  15
##    
##      20  21  22  23  24  25  26  27  29
##   F  12   7   3   6   1   1   0   1   1
##   I   2   1   0   0   0   0   0   0   0
##   M  12   6   3   3   1   0   1   1   0
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
# Number of observations recorded for each sex level
Abalone_data %>%
  group_by(Sex) %>%
  summarise(No_of_Observation = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 2
##   Sex   No_of_Observation
##   <fct>             <int>
## 1 F                  1307
## 2 I                  1342
## 3 M                  1528
# The same counts as a bar chart, one filled bar per sex level
ggplot(Abalone_data, aes(x = Sex, fill = Sex)) +
  geom_bar()

# Difference between the whole weight and the sum of the three component
# weights (viscera, shucked and shell). A negative value means the components
# add up to more than the recorded whole weight.
Abalone_data$weight.diff <- Abalone_data$Whole.weight -
  (Abalone_data$Viscera.weight + Abalone_data$Shucked.weight +
     Abalone_data$Shell.weight)

# Histogram of the difference over ALL observations. The previous title said
# "less than zero", but no filtering is applied here, so the title now
# describes what is actually plotted.
ggplot(Abalone_data, aes(x = weight.diff)) +
  geom_histogram(colour = "dodgerblue", fill = rgb(1, .54, 0, .7), bins = 30) +
  scale_y_continuous(name = "count") +
  labs(title = "Histogram of abalone weight difference (whole weight minus component weights)")

# Count the rows this analysis treats as false values:
# rows whose component weights exceed the whole weight ...
nrow(Abalone_data[Abalone_data[["weight.diff"]] < 0, ])
## [1] 155
# ... and rows whose recorded height is exactly zero.
nrow(Abalone_data[Abalone_data[["Height"]] == 0, ])
## [1] 2
# Correlation of each physical measurement with the ring count, printed one
# value per line in the order the columns are listed below.
invisible(lapply(
  c("Length", "Diameter", "Height", "Whole.weight",
    "Shucked.weight", "Viscera.weight", "Shell.weight"),
  function(measure) print(cor(Abalone_data[[measure]], Abalone_data$Rings))
))
## [1] 0.5567196
## [1] 0.5746599
## [1] 0.5574673
## [1] 0.5403897
## [1] 0.4208837
## [1] 0.5038192
## [1] 0.627574
# Rings against shell weight, coloured by sex, with a linear fit per sex
ggplot(Abalone_data, aes(x = Shell.weight, y = Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Rings against length, coloured by sex, with a linear fit per sex
ggplot(Abalone_data, aes(x = Length, y = Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# ************************Simple Linear Regression**************************************************


# Divide the rows into a training part and a testing part (split ratio 0.8),
# using Diameter as the label vector for the split. The seed is fixed so the
# same rows are selected on every run.
library(caTools)
## Warning: package 'caTools' was built under R version 4.0.2
set.seed(123)
split <- sample.split(Abalone_data$Diameter, SplitRatio = 0.8)
training_set <- Abalone_data[split, ]
test_set <- Abalone_data[!split, ]

# Simple linear regression of Diameter on Length, fitted on the training rows
# only.
regressor <- lm(formula = Diameter ~ Length, data = training_set)
summary(regressor)
## 
## Call:
## lm(formula = Diameter ~ Length, data = training_set)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.089312 -0.008774 -0.000443  0.008653  0.243101 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.018853   0.001240   -15.2   <2e-16 ***
## Length       0.814873   0.002308   353.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01602 on 3336 degrees of freedom
## Multiple R-squared:  0.9739, Adjusted R-squared:  0.9739 
## F-statistic: 1.247e+05 on 1 and 3336 DF,  p-value: < 2.2e-16
#Predicting the Test Set result
# Fitted Diameter for every row of the held-out test set; reused for the
# test-set plot below instead of calling predict() a second time.
y_pred <- predict(regressor, newdata = test_set)

#Visualising the Training set
# Observed (Length, Diameter) points in red with the fitted regression line.
# The data frame is passed to ggplot() so the axes are labelled "Length" and
# "Diameter" rather than with the raw `training_set$...` expressions, and the
# title names the variables that are actually plotted.
library(ggplot2)
ggplot(training_set, aes(x = Length, y = Diameter)) +
  geom_point(colour = "red") +
  geom_line(aes(y = predict(regressor, newdata = training_set))) +
  ggtitle("Diameter vs Length (Training)")

#Visualizing Test set
# Same layout for the held-out rows: observed points in red, the line fitted
# on the training set in blue.
ggplot(test_set, aes(x = Length, y = Diameter)) +
  geom_point(colour = "red") +
  geom_line(aes(y = y_pred), colour = "blue") +
  ggtitle("Diameter vs Length (Test)")

# *********************************************Predicting the age**************************************


# Age is taken as the ring count plus 1.5.
# The value is deliberately NOT rounded: Rings is an integer, so Rings + 1.5
# always ends in .5, and round() resolves such ties to the nearest even
# number. That made the offset flip between +1 and +2 depending on the ring
# count (e.g. 15 -> 16 but 10 -> 12).
Abalone_data$age <- Abalone_data$Rings + 1.5

# NOTE(review): the "predicted_rings" column holds the ring counts stored in
# the dataset, not the output of a fitted model; the column names are kept as
# they were.
head(data.frame(Sex = Abalone_data$Sex,
                predicted_rings = Abalone_data$Rings,
                predicted_age = Abalone_data$age))
##   Sex predicted_rings predicted_age
## 1   M              15          16.5
## 2   M               7           8.5
## 3   F               9          10.5
## 4   M              10          11.5
## 5   I               7           8.5
## 6   I               8           9.5

The task of predicting the age of abalone was systematically approached in this report. First the dataset was explored and unusual observations were discovered; then the variables were appropriately standardized, linear regression was performed, and the age was predicted.