Due Date: August 15, 2022
# Switch to the author's local project folder.
# NOTE(review): this absolute path is machine-specific; setwd() fails when the
# directory does not exist, so the script will not run unchanged elsewhere.
setwd("C:/Users/MButr/OneDrive/AI4OPT/Data Processing")
# Attach the packages for the analysis. The visible code uses dplyr (pipes and
# grouped summaries), ggplot2 (plots) and corrplot (correlation plot).
# NOTE(review): mlbench, caret and lattice are not called in the visible part
# of the file -- confirm they are needed further on.
library(dplyr)
library(ggplot2)
library(tidyverse)
library(corrplot)
library(mlbench)
library(caret)
library(lattice)
# Read the abalone data. header=FALSE because the first row of the file is
# already an observation, so the columns arrive with default names V1..V9.
# NOTE(review): the data path is also absolute and machine-specific.
abalone <- read.csv("C:/Users/MButr/OneDrive/AI4OPT/Datasets-master/abalone.csv", header=FALSE)
# Preview the first rows of the raw data.
head(abalone)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 8
We notice that the column (attribute) names are not very descriptive, so we rename them to the names given in the .names file associated with the dataset.
# Replace the default V1..V9 column names with descriptive attribute names in
# one vectorised assignment, then preview the renamed data.
names(abalone)[1:9] <- c(
  "Sex", "Length", "Diameter", "Height",
  "Whole_Weight", "Shucked_Weight", "Viscera_Weight", "Shell_Weight",
  "Rings"
)
head(abalone, n = 6L)
## Sex Length Diameter Height Whole_Weight Shucked_Weight Viscera_Weight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## Shell_Weight Rings
## 1 0.150 15
## 2 0.070 7
## 3 0.210 9
## 4 0.155 10
## 5 0.055 7
## 6 0.120 8
Using the fact that the age of the abalone is given by the number of rings plus 1.5 years, we can create a new column in our dataset called Age.
# Derive the age in years as the ring count (column 9) plus 1.5. The result is
# stored as a new column and also kept as the standalone vector `Age`.
abalone$Age <- abalone[[9]] + 1.5
Age <- abalone$Age
head(abalone, n = 6L)
## Sex Length Diameter Height Whole_Weight Shucked_Weight Viscera_Weight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## Shell_Weight Rings Age
## 1 0.150 15 16.5
## 2 0.070 7 8.5
## 3 0.210 9 10.5
## 4 0.155 10 11.5
## 5 0.055 7 8.5
## 6 0.120 8 9.5
Let us find out the dimensions of the dataset and whether there are any missing values (NA)
# Report the number of rows and columns.
c(nrow(abalone), ncol(abalone))
## [1] 4177 10
# Show the type and the first few values of every column.
str(object = abalone)
## 'data.frame': 4177 obs. of 10 variables:
## $ Sex : chr "M" "M" "F" "M" ...
## $ Length : num 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
## $ Diameter : num 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
## $ Height : num 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
## $ Whole_Weight : num 0.514 0.226 0.677 0.516 0.205 ...
## $ Shucked_Weight: num 0.2245 0.0995 0.2565 0.2155 0.0895 ...
## $ Viscera_Weight: num 0.101 0.0485 0.1415 0.114 0.0395 ...
## $ Shell_Weight : num 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
## $ Rings : int 15 7 9 10 7 8 20 16 9 19 ...
## $ Age : num 16.5 8.5 10.5 11.5 8.5 9.5 21.5 17.5 10.5 20.5 ...
# List the names of the columns that contain at least one missing value
# (character(0) means that no column has any).
names(abalone)[vapply(abalone, anyNA, logical(1))]
## character(0)
We notice that we have one nominal variable “Sex”, one integer variable “Rings”, and the remaining variables are numbers (continuous). We also note that there are no missing values in our dataset.
Now, we obtain the summary of the numerical-valued variables in the dataset
# Five-number summary and mean of the three size measurements.
summary(abalone[c("Length", "Diameter", "Height")])
## Length Diameter Height
## Min. :0.075 Min. :0.0550 Min. :0.0000
## 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
# Five-number summary and mean of the four weight measurements.
summary(abalone[c("Whole_Weight", "Shucked_Weight", "Viscera_Weight", "Shell_Weight")])
## Whole_Weight Shucked_Weight Viscera_Weight Shell_Weight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
# Five-number summary and mean of the ring count and the derived age.
summary(abalone[c("Rings", "Age")])
## Rings Age
## Min. : 1.000 Min. : 2.50
## 1st Qu.: 8.000 1st Qu.: 9.50
## Median : 9.000 Median :10.50
## Mean : 9.934 Mean :11.43
## 3rd Qu.:11.000 3rd Qu.:12.50
## Max. :29.000 Max. :30.50
Now, we change the Sex variable to a factor and create a two way contingency table between Sex and Rings
# Treat Sex as a categorical variable and list its categories.
abalone$Sex <- factor(abalone$Sex)
levels(x = abalone$Sex)
## [1] "F" "I" "M"
# Cross-tabulate Sex (rows) against Rings (columns).
table(abalone$Sex, abalone$Rings)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## F 0 0 0 0 4 16 44 122 238 248 200 128 88 56 41 30 26 19 15
## I 1 1 12 51 100 216 267 274 173 92 62 21 24 14 10 7 7 5 2
## M 0 0 3 6 11 27 80 172 278 294 225 118 91 56 52 30 25 18 15
##
## 20 21 22 23 24 25 26 27 29
## F 12 7 3 6 1 1 0 1 1
## I 2 1 0 0 0 0 0 0 0
## M 12 6 3 3 1 0 1 1 0
# Pairwise correlations between the seven size and weight columns
# (Length through Shell_Weight, i.e. columns 2 to 8).
cor(subset(abalone, select = Length:Shell_Weight))
## Length Diameter Height Whole_Weight Shucked_Weight
## Length 1.0000000 0.9868116 0.8275536 0.9252612 0.8979137
## Diameter 0.9868116 1.0000000 0.8336837 0.9254521 0.8931625
## Height 0.8275536 0.8336837 1.0000000 0.8192208 0.7749723
## Whole_Weight 0.9252612 0.9254521 0.8192208 1.0000000 0.9694055
## Shucked_Weight 0.8979137 0.8931625 0.7749723 0.9694055 1.0000000
## Viscera_Weight 0.9030177 0.8997244 0.7983193 0.9663751 0.9319613
## Shell_Weight 0.8977056 0.9053298 0.8173380 0.9553554 0.8826171
## Viscera_Weight Shell_Weight
## Length 0.9030177 0.8977056
## Diameter 0.8997244 0.9053298
## Height 0.7983193 0.8173380
## Whole_Weight 0.9663751 0.9553554
## Shucked_Weight 0.9319613 0.8826171
## Viscera_Weight 1.0000000 0.9076563
## Shell_Weight 0.9076563 1.0000000
We notice that there are some very strong correlations between the attributes of abalone. Let us look deeper into the dataset. Let us show these strong correlations via a correlation plot
# Store the correlation matrix of columns 2 to 8 and draw its upper triangle
# with each coefficient printed as a number.
correlations <- cor(abalone[2:8])
corrplot(corr = correlations, method = "number", type = "upper")
As we can see, the attributes are heavily correlated with one another. This is a problem if we attempt analysis on these numbers, so we will have to use various methods to try to reduce the apparent correlation.
Let us now look at some plots associated with different variables in the dataset:
# Count the observations recorded for each sex.
abalone %>%
  group_by(Sex) %>%
  summarise(No_of_Observation = n())
## # A tibble: 3 × 2
## Sex No_of_Observation
## <fct> <int>
## 1 F 1307
## 2 I 1342
## 3 M 1528
# Number of observations per sex.
ggplot(abalone, aes(x = Sex, fill = Sex)) +
  geom_bar()
# Whole weight by sex.
ggplot(abalone, aes(x = Sex, y = Whole_Weight, fill = Sex)) +
  geom_boxplot()
# Counts of each distinct Length, Diameter and Height value. The bars use a
# constant colour set on the layer, which overrides any mapped `fill`, so the
# unused `fill` mappings have been dropped; the plots are unchanged.
ggplot(abalone, aes(x = Length)) +
  geom_bar(fill = "red")
ggplot(abalone, aes(x = Diameter)) +
  geom_bar(fill = "purple")
ggplot(abalone, aes(x = Height)) +
  geom_bar(fill = "green")
It seems from the distribution above that there might be some abalone with a height of 0, which is not possible. Let us find out whether that is the case and how many incorrect observations we have.
# Count the observations recorded with a height of exactly zero.
sum(abalone$Height == 0)
## [1] 2
# Keep only the observations with a non-zero height, then confirm that no
# zero-height rows remain.
abalone <- subset(abalone, Height != 0)
sum(abalone$Height == 0)
## [1] 0
We see that rows 1258 and 3997 contained heights equal to 0 and have been removed.
# Distribution of whole weight over all abalone. The bars use a constant
# colour, so the unused `fill` mapping is dropped. The bin count is stated
# explicitly (30 is the default, so the plot is unchanged), which avoids the
# "pick better value" message and matches the weight-difference histogram.
ggplot(abalone, aes(x = Whole_Weight)) +
  geom_histogram(fill = "Dark Blue", bins = 30)
The graph above gives the distribution of the whole weight without taking into account the sex of the abalone.
# Whole-weight histograms, one panel row per sex.
ggplot(abalone, aes(Whole_Weight, fill = Sex)) +
  geom_histogram() +
  facet_grid(rows = vars(Sex))
# Whole weight by sex as box plots.
ggplot(abalone, aes(Sex, Whole_Weight, fill = Sex)) +
  geom_boxplot()
Let us now compute and plot the distribution of the difference between the Whole Weight and the sum of the Viscera Weight, Shucked Weight, Shell Weight.
# Whole weight minus the sum of the viscera, shucked and shell weights. The
# result is stored as a new column and also kept as the standalone vector
# `Weight_Difference`.
abalone$Weight_Difference <- with(
  abalone,
  Whole_Weight - (Viscera_Weight + Shucked_Weight + Shell_Weight)
)
Weight_Difference <- abalone$Weight_Difference
head(abalone, n = 6L)
## Sex Length Diameter Height Whole_Weight Shucked_Weight Viscera_Weight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## Shell_Weight Rings Age Weight_Difference
## 1 0.150 15 16.5 0.0385
## 2 0.070 7 8.5 0.0075
## 3 0.210 9 10.5 0.0690
## 4 0.155 10 11.5 0.0315
## 5 0.055 7 8.5 0.0210
## 6 0.120 8 9.5 0.0130
# Distribution of the weight differences in 30 bins.
ggplot(abalone, aes(Weight_Difference)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "red") +
  scale_y_continuous(name = "count") +
  labs(title = "Histogram of Abalone Weight Differences")
We note that some of the differences above are negative, which does not make physical sense, since the whole weight cannot be less than the sum of its parts. Let us find out how many of those negative weight differences we have in our dataset:
# Count the observations whose parts outweigh the whole (negative difference).
sum(abalone$Weight_Difference < 0)
## [1] 154
# Remove exactly those observations. The filter is `>= 0` rather than `> 0` so
# that rows with a difference of exactly zero are kept; with `> 0` they would
# be dropped silently and the rows removed would no longer match the count.
abalone <- subset(abalone, Weight_Difference >= 0)
# Confirm that no negative differences remain.
sum(abalone$Weight_Difference < 0)
## [1] 0
# Number of observations for each ring count. The bars use a constant colour
# set on the layer, which overrides any mapped `fill`, so the unused
# `fill = Rings` mapping has been dropped; the plot is unchanged.
ggplot(abalone, aes(x = Rings)) +
  geom_bar(fill = "darkblue", alpha = 0.6)
Let us look at the density plot between Sex and Rings
# ggplot2 is attached again here; repeating library() is harmless.
library(ggplot2)
# Density of the ring count, one curve per sex.
ggplot(abalone, aes(x = Rings, colour = Sex)) +
  geom_density()
We note from the above that the density plots for females and males are almost identical.
Let us plot a histogram for each attribute
# Draw one histogram per numeric attribute (columns 2 to 9) on a 3 x 3 grid.
# The x axis is labelled with the column name; by default hist() would label
# it with the literal expression used in the call, which is uninformative.
old_par <- par(mfrow = c(3, 3))
for (col_name in names(abalone)[2:9]) {
  hist(abalone[[col_name]], main = col_name, xlab = col_name)
}
# Restore the previous graphics layout so later base plots are unaffected.
par(old_par)
Let us plot a box plot for each attribute to check for outliers
# Draw one box plot per numeric attribute (columns 2 to 9) on a 3 x 3 grid,
# titled with the column name.
par(mfrow = c(3, 3))
for (col_name in names(abalone)[2:9]) {
  boxplot(abalone[[col_name]], main = col_name)
}
Based on the box plots above, we note that all attributes have at least one outlier.
Now, let us plot the density plot for each attribute
# Draw one kernel density estimate per numeric attribute (columns 2 to 9) on a
# 3 x 3 grid, titled with the column name.
par(mfrow = c(3, 3))
for (col_name in names(abalone)[2:9]) {
  plot(density(abalone[[col_name]]), main = col_name)
}
Let’s move on, and see the relationship between Length and Rings, and how that is affected by Sex.
# Rings against Length, coloured by sex, with a linear fit per sex.
ggplot(abalone, aes(Length, Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
We can also visualize the relationship between Shell Weight, Rings and Sex using
# Rings against Shell_Weight, coloured by sex, with a linear fit per sex.
ggplot(abalone, aes(Shell_Weight, Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
We can also visualize the relationship between Whole Weight, Rings and Sex using
# Rings against Whole_Weight, coloured by sex, with a linear fit per sex.
ggplot(abalone, aes(Whole_Weight, Rings, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "lm")
Finally, we will look at the scatterplot matrix of the attributes of the dataset, which shows every pairwise relationship at once.
# Scatterplot matrix of the numeric attributes Length through Rings
# (columns 2 to 9).
pairs(subset(abalone, select = Length:Rings))