pacman::p_load(lubridate, stringr, VIM)
# Load the raw, uncleaned iris data from the datacleaning repository.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing Petal.Length values.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
#Q4
# Complete rows: absolute count first, then their share of all rows in percent.
is_complete <- complete.cases(dirty_iris)
sum(is_complete)
## [1] 96
100 * mean(is_complete)
## [1] 64
#Q5
# Row positions holding an infinite Petal.Width.
inf_pw <- is.infinite(dirty_iris$Petal.Width)
which(inf_pw)
## [1] 86
#Q6
# Recode the infinite entries as missing, then summarise the column; the
# NA count in the summary now includes the recoded entry.
dirty_iris$Petal.Width[inf_pw] <- NA
summary(dirty_iris$Petal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.100 0.300 1.300 1.207 1.800 2.500 13
#Q7
# Collect the rows where Sepal.Width is not positive or Sepal.Length exceeds 30.
# which() discards rows whose test result is NA, so only definite violations
# are kept and the original row names are preserved.
breaks_rule <- with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
violations <- dirty_iris[which(breaks_rule), ]
violations
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Number of violating rows.
nrow(violations)
## [1] 4
#Q8
# Work from a snapshot of the column; which() ignores NA comparisons, so no
# explicit is.na() guard is needed to locate the offending rows.
sw <- dirty_iris$Sepal.Width
which(sw <= 0)
## [1] 16 130
# Negative widths are replaced by their absolute value; exact zeros are
# recoded as missing. Both index sets come from the snapshot, which is safe
# because a flipped negative is strictly positive and can never become a zero.
neg_rows <- which(sw < 0)
zero_rows <- which(sw == 0)
dirty_iris$Sepal.Width[neg_rows] <- abs(sw[neg_rows])
dirty_iris$Sepal.Width[zero_rows] <- NA
#Q9
# Imputation runs in the order A, B, D, C so that every predictor used by the
# regression in C has already been filled in.
# NOTE(review): the rows listed under Q7 with Sepal.Length > 30 are not
# corrected anywhere in the visible script, so they still contribute to the
# mean below and to the regression fit -- confirm this is intended.

# A) Sepal.Width: fill gaps with the mean of the observed values
sw_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- sw_mean

# B) Petal.Length: fill gaps with the median of the observed values
pl_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- pl_median

# D) Petal.Width: k-nearest-neighbour imputation with k = 5
dirty_iris <- VIM::kNN(dirty_iris, variable = "Petal.Width", k = 5)
# Recorded console output:
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0
# Drop the Petal.Width_imp column that the call above leaves behind.
dirty_iris[["Petal.Width_imp"]] <- NULL

# C) Sepal.Length: predict the missing values from a linear model on the
# remaining four columns
miss_sl <- is.na(dirty_iris$Sepal.Length)
m_sl <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = dirty_iris
)
rows_to_fill <- dirty_iris[miss_sl, , drop = FALSE]
dirty_iris$Sepal.Length[miss_sl] <- predict(m_sl, newdata = rows_to_fill)