Reading the data and taking care of missing values

# Read the raw measurements into ExDataSet. stringsAsFactors = TRUE keeps
# Species a factor on R >= 4.0 as well (it was the default before 4.0); the
# per-species summary and the colour-by-species plot rely on that.
ExDataSet <- read.csv("exercise.csv", stringsAsFactors = TRUE)

# Impute missing (NA) measurements with the median of their own column.
# The median is taken with na.rm = TRUE so the gaps do not turn it into NA.
# Note: this handles missing values only; outliers are dealt with later.
measurement_columns <- c(
  "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width"
)
for (column in measurement_columns) {
  values <- ExDataSet[[column]]
  values[is.na(values)] <- median(values, na.rm = TRUE)
  ExDataSet[[column]] <- values
}

# Drop the rows that are still incomplete. The four measurement columns were
# imputed above, so in this dataset these are the rows with a missing Species.
ExDataSet <- ExDataSet[complete.cases(ExDataSet), ]

# Display the structure of the dataset (dimensions, column types, first values)
str(ExDataSet)
## 'data.frame':    147 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5.8 5.4 5.8 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Boxplot of Sepal.Width; points drawn beyond the whiskers are the outliers
boxplot(ExDataSet$Sepal.Width)

# Display the per-column summary of the dataset
summary(ExDataSet)
##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.150   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.539   Mean   :  4.415   Mean   :  5.829   Mean   :  3.947  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##        Species  
##  setosa    :48  
##  versicolor:50  
##  virginica :49  
##                 
##                 
## 

Find and remove outliers

# Collect the Sepal.Width outliers. plot = FALSE makes boxplot() return its
# statistics without drawing anything; $out holds the values lying beyond the
# whiskers.
outliers <- boxplot(ExDataSet$Sepal.Width, plot = FALSE)$out
# Keep the rows that are NOT outliers. A logical mask is used instead of
# -which(...): when nothing is flagged, which() returns integer(0) and
# ExDataSet[-integer(0), ] would silently drop every row.
ExDataSet <- ExDataSet[!(ExDataSet$Sepal.Width %in% outliers), ]

# Boxplot after removing outliers
boxplot(ExDataSet$Sepal.Width)

# Remove outliers from the rest of the columns. Each step recomputes the
# outliers on the rows that survived the previous step.
outliers <- boxplot(ExDataSet$Petal.Length, plot = FALSE)$out
ExDataSet <- ExDataSet[!(ExDataSet$Petal.Length %in% outliers), ]

outliers <- boxplot(ExDataSet$Sepal.Length, plot = FALSE)$out
ExDataSet <- ExDataSet[!(ExDataSet$Sepal.Length %in% outliers), ]

outliers <- boxplot(ExDataSet$Petal.Width, plot = FALSE)$out
ExDataSet <- ExDataSet[!(ExDataSet$Petal.Width %in% outliers), ]

# Summary of the cleaned dataset
summary(ExDataSet)
##   Sepal.Length    Sepal.Width    Petal.Length    Petal.Width  
##  Min.   :4.300   Min.   :2.20   Min.   :1.000   Min.   :0.10  
##  1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.600   1st Qu.:0.30  
##  Median :5.800   Median :3.00   Median :4.300   Median :1.30  
##  Mean   :5.871   Mean   :3.04   Mean   :3.835   Mean   :1.24  
##  3rd Qu.:6.400   3rd Qu.:3.25   3rd Qu.:5.100   3rd Qu.:1.80  
##  Max.   :7.900   Max.   :4.00   Max.   :6.900   Max.   :2.50  
##        Species  
##  setosa    :44  
##  versicolor:47  
##  virginica :48  
##                 
##                 
## 
# Show the current column names (Species is renamed to Plants right after)
colnames(ExDataSet)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
# Rename the Species column to Plants
colnames(ExDataSet)[colnames(ExDataSet) == "Species"] <- "Plants"

# Add the Petal.Area column: petal width multiplied by petal length
ExDataSet$Petal.Area <- with(ExDataSet, Petal.Width * Petal.Length)

Creating a correlation matrix (excluding non-numeric features)

# Correlation matrix of the numeric columns only; non-numeric columns are
# masked out first. vapply() guarantees the mask is a logical vector whatever
# the columns are, which sapply() does not.
numeric_columns <- vapply(ExDataSet, is.numeric, logical(1))
ExDataSet.cor <- cor(ExDataSet[numeric_columns])
ExDataSet.cor
##              Sepal.Length Sepal.Width Petal.Length Petal.Width Petal.Area
## Sepal.Length   1.00000000 -0.09106535    0.8536314   0.7988864  0.8438345
## Sepal.Width   -0.09106535  1.00000000   -0.3769157  -0.3061186 -0.2417391
## Petal.Length   0.85363142 -0.37691575    1.0000000   0.9365146  0.9483009
## Petal.Width    0.79888643 -0.30611859    0.9365146   1.0000000  0.9726569
## Petal.Area     0.84383446 -0.24173915    0.9483009   0.9726569  1.0000000

Creating a heatmap with ggplot2

## Warning: package 'reshape2' was built under R version 3.5.3
## Warning: package 'ggplot2' was built under R version 3.5.3

Creating a Histogram

# Histogram of Petal.Width with ggplot2 (bins 0.2 wide). The column is
# referenced by its bare name inside aes(): ggplot2 looks it up in the data
# argument, whereas ExDataSet$Petal.Width bypasses that lookup and ends up as
# the axis label.
ggplot(ExDataSet, aes(x = Petal.Width)) +
  geom_histogram(binwidth = 0.2, color = "black", fill = "white")

# Base-graphics histogram of the same column
hist(ExDataSet$Petal.Width)

Plot with colours based on variables

# Scatter plot of petal width against petal length, one colour per plant type.
# as.factor() makes the colour lookup use the integer level codes even when
# Plants was read as character (the read.csv() default since R 4.0); indexing
# the unnamed colour vector with strings would yield NA colours.
plot(ExDataSet$Petal.Width, ExDataSet$Petal.Length,
     col = c("red", "blue", "green")[as.factor(ExDataSet$Plants)])