Exercise with iris dataset

Load libraries

library(dplyr)
library(tidyverse)
library(tidyr)
library(ggplot2)
library(corrplot)

Import dataset exercise.csv

# Read exercise.csv into `exercise`.
# NOTE(review): the path is absolute and points into one Windows user
# profile, so this line must be edited before the report can be run on
# another machine.
exercise <- read_csv("C:/Users/Y.S. Kim/Desktop/Ubiqum/Exercise/exercise.csv")

Inspect dataset

# Print the number of rows and columns plus the type and first values of
# every column.
glimpse(exercise)

## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, NA, 5.4, NA, 5.0, 4.4, 4.9, 5.4, 4...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Species      <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "set...

# Per-column summary. In the output below every numeric column reports
# 3 NA's, and each maximum (106.8 to 401.5) lies far above its third
# quartile, which hints at outliers.
summary(exercise)

##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.100   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.529   Mean   :  4.418   Mean   :  5.796   Mean   :  3.939  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Transform datatype of Species to factor

# Species is read in as character; store it as a factor so that it is
# treated as a categorical variable.
exercise[["Species"]] <- factor(exercise[["Species"]])

Inspect missing values (NA’s) and find position of NA’s

# TRUE when at least one cell anywhere in the data set is NA.
any(is.na(exercise))

## [1] TRUE

# is.na() on a data frame gives a logical matrix, so which() returns
# column-major linear indices rather than row numbers. With 150 rows,
# index i is row ((i - 1) %% 150) + 1 of column ((i - 1) %/% 150) + 1;
# e.g. 644 is row 44 of the fifth column (Species).
# which(is.na(exercise), arr.ind = TRUE) would give row/column pairs.
which(is.na(exercise))

##  [1]   5   7 126 175 231 295 329 379 409 478 491 495 644 650 702

Replace NA’s by median for columns with numerical values

# Fill the missing entries of each numeric measurement with that column's
# median, computed from the non-missing values only.
for (measure in c("Sepal.Length", "Sepal.Width",
                  "Petal.Length", "Petal.Width")) {
  observed_median <- median(exercise[[measure]], na.rm = TRUE)
  exercise[[measure]][is.na(exercise[[measure]])] <- observed_median
}

Remove row if there is NA in Species

# na.omit() drops every row that still contains an NA in any column. The
# numeric columns were imputed above, so only rows with a missing Species
# are removed here (150 rows in, 147 rows out).
exercise_complete <- exercise %>% na.omit()
exercise_complete %>% summary()

##   Sepal.Length      Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :  4.300   Min.   :  2.000   Min.   :  1.000   Min.   :  0.100  
##  1st Qu.:  5.150   1st Qu.:  2.800   1st Qu.:  1.600   1st Qu.:  0.300  
##  Median :  5.800   Median :  3.000   Median :  4.300   Median :  1.300  
##  Mean   :  6.539   Mean   :  4.415   Mean   :  5.829   Mean   :  3.947  
##  3rd Qu.:  6.400   3rd Qu.:  3.300   3rd Qu.:  5.100   3rd Qu.:  1.800  
##  Max.   :106.800   Max.   :203.500   Max.   :304.400   Max.   :401.500  
##        Species  
##  setosa    :48  
##  versicolor:50  
##  virginica :49  
##                 
##                 
##

Making boxplot

Remove outliers

# Treat any measurement of 100 or more as an outlier: keep only the rows
# in which all four measurements are below 100, then draw one box per
# measurement column.
exercise_no_outliers <- filter(
  select(
    exercise_complete,
    Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
  ),
  Sepal.Length < 100,
  Sepal.Width < 100,
  Petal.Length < 100,
  Petal.Width < 100
)
boxplot(exercise_no_outliers[, 1:4])

Rename column Species to Plants

# Same data with the Species column exposed under the name Plants.
exercise_renamed <- rename(exercise_no_outliers, Plants = Species)

Create extra column: Petal.Area

# Append Petal.Area, the row-wise product of petal length and petal width.
exercise_extra <- mutate(
  exercise_renamed,
  Petal.Area = Petal.Length * Petal.Width
)

Glimpse dataset

# Check the final data set: Species now appears as the factor Plants and
# Petal.Area is present as a sixth column.
glimpse(exercise_extra)

## Observations: 143
## Variables: 6
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.8, 5.4, 5.8, 5.0, 4.4, 4.9, 5.4,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Plants       <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa...
## $ Petal.Area   <dbl> 0.28, 0.28, 0.26, 0.30, 0.28, 0.68, 0.42, 0.30, 0.28, ...

Correlation matrix

# Correlation matrix of the four original measurements (Petal.Area and
# Plants are left out), drawn as a coloured grid with the coefficients
# printed in gray and the variables kept in their original order.
corrData <- exercise_extra %>%
  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
  cor()
corrplot(
  corrData,
  method = "color",
  order = "original",
  addCoef.col = "gray"
)

# Scatterplot matrix of the four measurements. Each point takes the colour
# whose position in my_cols equals the integer code of its Plants level.
my_cols <- c("#00AFBB", "#E7B800", "#FC4E07")
pairs(
  exercise_extra[, 1:4],
  col = my_cols[as.integer(exercise_extra$Plants)],
  pch = 19
)

There is a high correlation between Petal.Length and Petal.Width, and between Sepal.Length and Petal.Length.

Exercise with iris dataset

Y.S. Kim

4/28/2020

Load libraries

Import dataset exercise.csv

Inspect dataset

Transform datatype of Species to factor

Inspect missing values (NA’s) and find position of NA’s

Replace NA’s by median for columns with numerical values

Remove row if there is NA in Species

Making boxplot

Remove outliers

Rename column Species to Plants

Create extra column: Petal.Area

Glimpse dataset

Correlation matrix

Exercise with ggplot