Question 3 Missing Values

# Count the observations whose Petal.Length is missing.
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19

Question 4 Number and percentage of complete observations

# Flag the rows that have no missing value in any column, then report how
# many such rows there are and what share of all rows they represent.
complete_rows <- stats::complete.cases(dirty_iris)
c(
  Number = length(which(complete_rows)),
  Percentage = 100 * mean(complete_rows)
)
##     Number Percentage 
##         96         64

Question 5

# Count the NaN entries of each column; is.nan() is applied column by column
# and the per-column totals come back as a named numeric vector.
vapply(dirty_iris, function(column) sum(is.nan(column)), numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Question 6

# Replace NaN with NA, one numeric column at a time. Testing
# as.matrix(dirty_iris) cannot work here: the non-numeric Species column makes
# as.matrix() return a character matrix, and is.nan() is FALSE for every
# character value, so nothing would ever be replaced.
# NOTE(review): is.nan() does not flag Inf/-Inf. If the question also counts
# infinite values as special values, test with is.infinite() too -- confirm
# against the exercise text.
dirty_iris[vapply(dirty_iris, is.numeric, logical(1))] <- lapply(
  dirty_iris[vapply(dirty_iris, is.numeric, logical(1))],
  function(column) replace(column, is.nan(column), NA)
)
# Confirm that no NaN is left in any column.
vapply(dirty_iris, function(column) sum(is.nan(column)), numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Question 7

# Both sepal measurements must be numeric before the range rules are applied.
dirty_iris[c("Sepal.Width", "Sepal.Length")] <- lapply(
  dirty_iris[c("Sepal.Width", "Sepal.Length")],
  as.numeric
)

# Rule violations: Sepal.Width must be positive and Sepal.Length at most 30.
# which() discards comparisons that evaluate to NA, so a row with a missing
# measurement is only kept when the other rule is definitely broken.
violations <- dirty_iris[
  with(dirty_iris, which(Sepal.Width <= 0 | Sepal.Length > 30)),
]

# Number of observations that break at least one rule.
nrow(violations)
## [1] 4

Question 8

# Make sure the column is numeric.
dirty_iris[["Sepal.Width"]] <- as.numeric(dirty_iris[["Sepal.Width"]])
# Locate the observations that violate Sepal.Width > 0. which() skips NA
# comparisons, so missing widths are never reported as violations.
bad_sw <- which(dirty_iris[["Sepal.Width"]] <= 0)
bad_sw
## [1]  16 130
dirty_iris[bad_sw, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
# Correct the errors.
# Negative values -> absolute value. abs() leaves positive and missing widths
# untouched, so applying it to the whole column only changes the negatives.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# Zero values -> NA. The column already contains NA, so `== 0` yields NA for
# those rows; which() keeps them out of the assignment index.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA
# Show the corrected rows.
dirty_iris[bad_sw, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa

Question 9

# Mean imputation for Sepal.Width: every missing width is replaced by the
# mean of the observed widths.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Median imputation for Petal.Length: every missing length is replaced by the
# median of the observed lengths.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# k-nearest-neighbour imputation (k = 5) of the missing Petal.Width values.
# imp_var = FALSE stops kNN() from appending the Petal.Width_imp indicator
# column, so there is no helper column to remove afterwards.
dirty_iris <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)