library(dplyr)
library(tidyverse)
library(tidyr)
library(ggplot2)
library(corrplot)
# Import the raw exercise measurements and take a first look at their
# structure and value ranges.
exercise <- read_csv(
  file = "C:/Users/Y.S. Kim/Desktop/Ubiqum/Exercise/exercise.csv"
)
glimpse(exercise)
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, NA, 5.4, NA, 5.0, 4.4, 4.9, 5.4, 4...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Species <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "set...

# The summary below reports NA's in every numeric column, and maxima far
# above the third quartile of each column.
summary(exercise)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 4.300 Min. : 2.000 Min. : 1.000 Min. : 0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.600 1st Qu.: 0.300
## Median : 5.800 Median : 3.000 Median : 4.300 Median : 1.300
## Mean : 6.529 Mean : 4.418 Mean : 5.796 Mean : 3.939
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.100 3rd Qu.: 1.800
## Max. :106.800 Max. :203.500 Max. :304.400 Max. :401.500
## NA's :3 NA's :3 NA's :3 NA's :3
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Treat the species label as a categorical variable.
exercise$Species <- as.factor(exercise$Species)

# Confirm that missing values are present and locate them. which() on the
# logical matrix returned by is.na() gives column-major linear positions,
# not row numbers.
any(is.na(exercise))
## [1] TRUE
which(is.na(exercise))
## [1] 5 7 126 175 231 295 329 379 409 478 491 495 644 650 702

# Impute missing measurements with the median of their own column. The median
# is computed on the observed (non-missing) values only.
measure_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
exercise[measure_cols] <- lapply(exercise[measure_cols], function(values) {
  values[is.na(values)] <- median(values, na.rm = TRUE)
  values
})

# The numeric columns are now complete, so na.omit() only removes rows whose
# Species is still missing (the species counts below add up to 147 of 150).
exercise_complete <- na.omit(exercise)
summary(exercise_complete)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 4.300 Min. : 2.000 Min. : 1.000 Min. : 0.100
## 1st Qu.: 5.150 1st Qu.: 2.800 1st Qu.: 1.600 1st Qu.: 0.300
## Median : 5.800 Median : 3.000 Median : 4.300 Median : 1.300
## Mean : 6.539 Mean : 4.415 Mean : 5.829 Mean : 3.947
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.100 3rd Qu.: 1.800
## Max. :106.800 Max. :203.500 Max. :304.400 Max. :401.500
## Species
## setosa :48
## versicolor:50
## virginica :49
##
##
##
# Keep only rows in which all four measurements are below 100; the summary of
# exercise_complete shows a maximum above 100 in each of these columns.
# select() pins the column order that the positional [, 1:4] indexing relies on.
exercise_no_outliers <- exercise_complete %>%
  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species) %>%
  filter(
    Sepal.Length < 100,
    Sepal.Width < 100,
    Petal.Length < 100,
    Petal.Width < 100
  )

# Box plots of the four measurement columns after filtering.
boxplot(exercise_no_outliers[, 1:4])
# Rename the Species column to Plants.
exercise_renamed <- rename(exercise_no_outliers, Plants = Species)

# Add Petal.Area as the product of petal length and petal width.
exercise_extra <- mutate(
  exercise_renamed,
  Petal.Area = Petal.Length * Petal.Width
)
glimpse(exercise_extra)
## Observations: 143
## Variables: 6
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.8, 5.4, 5.8, 5.0, 4.4, 4.9, 5.4,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Plants <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa...
## $ Petal.Area <dbl> 0.28, 0.28, 0.26, 0.30, 0.28, 0.68, 0.42, 0.30, 0.28, ...
# Correlation matrix of the four original measurements (columns 1 to 4);
# Plants and Petal.Area are not included.
corrData <- cor(exercise_extra[, 1:4])
corrplot(
  corrData,
  method = "color",
  order = "original",
  addCoef.col = "gray"
)

# Scatterplot matrix of the same columns. Indexing my_cols by the Plants
# factor uses its integer level codes, so each level gets its own colour.
my_cols <- c("#00AFBB", "#E7B800", "#FC4E07")
pairs(
  exercise_extra[, 1:4],
  pch = 19,
  col = my_cols[exercise_extra$Plants]
)
# There is a high correlation between Petal.Length and Petal.Width, and
# between Sepal.Length and Petal.Length.