Question 3:

Counting Missing Values in Petal.Length

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

sum(is.na(dirty_iris$Petal.Length))
## [1] 19

Question 4:

Identifying Complete Observations in the Dataset

sum(complete.cases(dirty_iris))
## [1] 96
sum(complete.cases(dirty_iris)) / nrow(dirty_iris) * 100
## [1] 64

Question 5:

Identification of Special Numeric Values

numeric_df <- dirty_iris[, sapply(dirty_iris, is.numeric)]

sum(is.na(numeric_df))
## [1] 58
sum(sapply(numeric_df, function(x) sum(is.nan(x))))
## [1] 0
sum(numeric_df == Inf, na.rm = TRUE)
## [1] 1
sum(numeric_df == -Inf, na.rm = TRUE)
## [1] 0

Question 6:

Handling Special Numeric Values Through NA Replacement

# Locate
num_cols <- sapply(dirty_iris, is.numeric)
inf_pos <- which(dirty_iris[, num_cols] == Inf, arr.ind = TRUE)
inf_pos
##      row col
## [1,]  86   4
# Replace
dirty_iris[, num_cols][dirty_iris[, num_cols] == Inf] <- NA

# Check
sum(dirty_iris[, num_cols] == Inf, na.rm = TRUE)
## [1] 0

Question 7:

Identification of Rule-Based Data Errors

violations <- dirty_iris[
  (!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0) |
    (!is.na(dirty_iris$Sepal.Length) & dirty_iris$Sepal.Length > 30),
]

violations # 0 is non-positive
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
nrow(violations)
## [1] 4

Question 8:

Correction of Sepal.Width Rule Violations

# Locate
violations_width <- which(!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0)
violations_width
## [1]  16 130
# Correct
neg_idx <- which(!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_idx] <- abs(dirty_iris$Sepal.Width[neg_idx])

zero_idx <- which(!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_idx] <- NA

# Check
sum(!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0)
## [1] 0

Question 9:

Imputation of Missing Values Using Multiple Methods

# 1) Sepal.Width: mean
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# 2) Petal.Length: median
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# 3) Sepal.Length: linear regression
fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
          data = dirty_iris)

sl_miss <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[sl_miss] <- predict(fit, newdata = dirty_iris[sl_miss, ])

# 4) Petal.Width: kNN
if (!requireNamespace("VIM", quietly = TRUE)) install.packages("VIM")
library(VIM)

dirty_iris <- VIM::kNN(dirty_iris, variable = "Petal.Width", k = 5, imp_var = FALSE)