Load libraries

library(dplyr)
library(tidyverse)
library(tidyr)
library(ggplot2)
library(corrplot)

Import dataset exercise.csv

# Read the CSV into `exercise` with readr's read_csv().
# NOTE(review): the path is absolute and machine-specific, so this line only
# works on a machine with this exact folder layout; consider a relative path.
exercise <- read_csv("C:/Users/Y.S. Kim/Desktop/Ubiqum/Exercise/exercise.csv")

Inspect dataset

# Show the dimensions plus each column's type and first few values
glimpse(exercise)
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, NA, 5.4, NA, 5.0, 4.4, 4.9, 5.4, 4...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Species      <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "set...
# Per-column summary: each numeric column reports 3 NA's, and the maxima
# (106.8 to 401.5) sit far above the 3rd quartiles, which hints at outliers.
summary(exercise) # there are NA's
##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.100   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.529   Mean   :  4.418   Mean   :  5.796   Mean   :  3.939  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Transform datatype of Species to factor

# Store Species as a factor instead of plain character strings
exercise <- exercise %>%
  mutate(Species = as.factor(Species))

Inspect missing values (NA’s) and find position of NA’s

# TRUE if at least one cell anywhere in the data frame is missing
any(is.na(exercise)) 
## [1] TRUE
# The positions printed here are linear (column-major) indices into the
# logical matrix returned by is.na(), not row numbers: with 150 rows, index
# 175 means row 25 of column 2.
# which(is.na(exercise), arr.ind = TRUE) would report row/column pairs instead.
which(is.na(exercise)) 
##  [1]   5   7 126 175 231 295 329 379 409 478 491 495 644 650 702

Replace NA’s with the column median in the numeric columns

# Fill the missing measurements with the median of their own column.
# coalesce() keeps every non-missing value and substitutes the median only
# where the original value is NA.
exercise <- exercise %>%
  mutate(
    Sepal.Length = coalesce(Sepal.Length, median(Sepal.Length, na.rm = TRUE)),
    Sepal.Width = coalesce(Sepal.Width, median(Sepal.Width, na.rm = TRUE)),
    Petal.Length = coalesce(Petal.Length, median(Petal.Length, na.rm = TRUE)),
    Petal.Width = coalesce(Petal.Width, median(Petal.Width, na.rm = TRUE))
  )

Remove row if there is NA in Species

# Drop every row that still contains an NA. The numeric columns have already
# been imputed at this point, so only rows with a missing Species are removed.
# drop_na() keeps the same rows as na.omit() but does not attach an
# `na.action` attribute to the resulting tibble.
exercise_complete <- drop_na(exercise)
summary(exercise_complete)
##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.150   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.539   Mean   :  4.415   Mean   :  5.829   Mean   :  3.947  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##        Species  
##  setosa    :48  
##  versicolor:50  
##  virginica :49  
##                 
##                 
## 

Making boxplot

Remove outliers

# Keep only rows where all four measurements are below 100; conditions given
# to filter() as separate arguments are combined with AND.
exercise_no_outliers <- exercise_complete %>%
  filter(
    Sepal.Length < 100,
    Sepal.Width < 100,
    Petal.Length < 100,
    Petal.Width < 100
  ) %>%
  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species)

# Boxplot of the four numeric measurement columns (Species is left out)
boxplot(select(exercise_no_outliers, Sepal.Length:Petal.Width))

Rename column Species to Plants

# rename() takes new_name = old_name, so Species becomes Plants
exercise_renamed <- rename(exercise_no_outliers, Plants = Species)

Create extra column: Petal.Area

# Petal.Area is the row-wise product of petal length and petal width
exercise_extra <- mutate(
  exercise_renamed,
  Petal.Area = Petal.Length * Petal.Width
)

Glimpse dataset

# Check the result: Species is now called Plants and Petal.Area was added
glimpse(exercise_extra)
## Observations: 143
## Variables: 6
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.8, 5.4, 5.8, 5.0, 4.4, 4.9, 5.4,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Plants       <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa...
## $ Petal.Area   <dbl> 0.28, 0.28, 0.26, 0.30, 0.28, 0.68, 0.42, 0.30, 0.28, ...

Correlation matrix

# Pairwise correlations of the four original measurements
# (Plants and Petal.Area are not included).
corrData <- exercise_extra %>%
  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
  cor()
corrplot(
  corrData,
  method = "color",
  order = "original",
  addCoef.col = "gray"
)

# Scatterplot matrix of the same columns, one colour per Plants level:
# the factor's integer codes pick the matching entry of my_cols.
my_cols <- c("#00AFBB", "#E7B800", "#FC4E07")
pairs(
  select(exercise_extra, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width),
  pch = 19,
  col = my_cols[as.integer(exercise_extra$Plants)]
)

There is a high correlation between Petal.Length and Petal.Width, and between Sepal.Length and Petal.Length.

Exercise with ggplot