pacman::p_load(lubridate, stringr, VIM)

#Q3

# Read the dirty iris data set straight from the datacleaning repository.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Number of missing Petal.Length values.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19

#Q4

# Rows without a missing value in any column: first the count, then the
# share of all rows expressed in percent.
nrow(na.omit(dirty_iris))
## [1] 96
100 * mean(complete.cases(dirty_iris))
## [1] 64

#Q5

# Row positions whose Petal.Width is infinite (+Inf or -Inf).
with(dirty_iris, which(is.infinite(Petal.Width)))
## [1] 86

#Q6

# Turn every infinite Petal.Width into a missing value, then summarise the
# column (the NA's entry counts the missing values).
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)

summary(dirty_iris[["Petal.Width"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.100   0.300   1.300   1.207   1.800   2.500      13

#Q7

# Keep the rows where Sepal.Width is not positive or Sepal.Length exceeds 30.
# which() discards rows whose test result is NA, so they are not selected.
violations <- dirty_iris[which(dirty_iris$Sepal.Width <= 0 |
                                 dirty_iris$Sepal.Length > 30), ]

violations
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
nrow(violations)
## [1] 4

#Q8

# Positions of the non-positive Sepal.Width values; which() skips NA entries.
which(dirty_iris$Sepal.Width <= 0)
## [1]  16 130

# Negative widths are replaced by their absolute value.
negative_rows <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_rows] <-
  abs(dirty_iris$Sepal.Width[negative_rows])

# Widths of exactly zero are turned into missing values.
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)

#Q9

# A) Sepal.Width: missing values take the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# B) Petal.Length: missing values take the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# D) Petal.Width: kNN imputation with k = 5.
# imp_var = FALSE stops kNN() from appending the Petal.Width_imp indicator
# column, so nothing has to be removed afterwards.
dirty_iris <- VIM::kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0

# C) Sepal.Length: linear regression
# Flag the rows to fill, fit Sepal.Length on the other four columns, then
# write the model's predictions into the flagged rows only.
miss_sl <- is.na(dirty_iris$Sepal.Length)
m_sl <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = dirty_iris
)
dirty_iris$Sepal.Length[miss_sl] <- predict(
  m_sl,
  newdata = dirty_iris[miss_sl, , drop = FALSE]
)