Question 3 Missing Values
# Count the missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
Question 4 Number and percentage of complete observations
# Flag every row that has no missing value in any column.
complete_rows <- complete.cases(dirty_iris)
# Report the count of complete rows and their share of all rows (in percent).
setNames(
  c(sum(complete_rows), mean(complete_rows) * 100),
  c("Number", "Percentage")
)
## Number Percentage
## 96 64
Question 5
# Count NaN entries per column. vapply() with a fixed numeric(1) result keeps
# the output a named numeric vector regardless of how many rows the data has,
# whereas colSums(sapply(...)) fails when sapply() cannot build a matrix
# (for example a one-row or zero-row data frame).
# NOTE(review): is.nan() flags only NaN, not Inf/-Inf or NA. If this question
# is meant to cover every special value, confirm whether an is.finite()-based
# check is wanted instead.
vapply(dirty_iris, function(column) sum(is.nan(column)), numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
Question 6
# Replace NaN with NA column by column. Converting the whole data frame with
# as.matrix() would coerce everything to character because Species is not
# numeric, and is.nan() is always FALSE on character data, so a matrix-based
# mask would never match anything.
# NOTE(review): Inf/-Inf are left untouched here; confirm whether they should
# also be turned into NA.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (is.numeric(column)) {
    column[is.nan(column)] <- NA
  }
  column
})
# Verify: every column should now report zero NaN entries.
vapply(dirty_iris, function(column) sum(is.nan(column)), numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
Question 7
# Coerce both sepal measurements to numeric before applying the rules.
# NOTE(review): if either column were a factor, as.numeric() would return the
# level codes rather than the printed values - confirm the column types.
dirty_iris[["Sepal.Width"]] <- as.numeric(dirty_iris[["Sepal.Width"]])
dirty_iris[["Sepal.Length"]] <- as.numeric(dirty_iris[["Sepal.Length"]])
# Keep rows that break either rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30. which() drops comparisons that evaluate to
# NA, so rows are selected only when a violation is certain.
violations <- dirty_iris[
  which(with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)),
]
nrow(violations)
## [1] 4
Question 8
Make sure column is numeric
# Guarantee a numeric column; the values of an already-numeric column are
# left unchanged by as.numeric().
dirty_iris[["Sepal.Width"]] <- as.numeric(dirty_iris[["Sepal.Width"]])
Locate observations violating Sepal.Width > 0
# Row positions where Sepal.Width is zero or negative. which() discards
# comparisons that evaluate to NA, so missing widths are never reported.
bad_sw <- which(dirty_iris$Sepal.Width <= 0)
bad_sw
## [1] 16 130
# Inspect the offending rows before correcting them.
dirty_iris[bad_sw, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
Correct the errors:
Negative values -> absolute value
# Locate strictly negative widths once (which() skips NA entries), then
# overwrite just those positions with their absolute value.
negative_sw <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_sw] <- abs(dirty_iris$Sepal.Width[negative_sw])
Zero values -> NA
# Mark widths that are exactly zero as missing; which() ignores entries that
# are already NA.
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)
Show corrected rows
# Re-display the rows located earlier to confirm the corrections took effect.
dirty_iris[bad_sw, , drop = FALSE]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
Question 9
Sepal Width Mean Imputation
# Fill every missing Sepal.Width with the mean of the observed widths. The
# mean is computed from the column as it stands before any value is replaced.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
Remove helper column created by kNN
# Assigning NULL removes the Petal.Width_imp column from the data frame.
dirty_iris[["Petal.Width_imp"]] <- NULL