# Load the raw data set straight from its public repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# How many Petal.Length values are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# How many rows have no missing value in any column?
sum(complete.cases(dirty_iris))
## [1] 96
# The same count expressed as a percentage of all rows.
sum(complete.cases(dirty_iris)) / nrow(dirty_iris) * 100
## [1] 64
# Special-value census over the numeric columns ----
# vapply() pins the column mask to a logical vector, and `drop = FALSE` keeps
# `numeric_df` a data frame even when only one column is numeric.
numeric_df <- dirty_iris[, vapply(dirty_iris, is.numeric, logical(1)),
                         drop = FALSE]
# All missing cells (is.na() is TRUE for NaN as well as NA).
sum(is.na(numeric_df))
## [1] 58
# NaN cells only. vapply() returns integer(0) when there is no numeric
# column, so sum() yields 0 instead of failing on an empty list.
sum(vapply(numeric_df, function(x) sum(is.nan(x)), integer(1)))
## [1] 0
# Positive infinities; NA comparisons are dropped by na.rm.
sum(numeric_df == Inf, na.rm = TRUE)
## [1] 1
# Negative infinities.
sum(numeric_df == -Inf, na.rm = TRUE)
## [1] 0
# Locate
# vapply() guarantees a logical column mask whatever the input looks like.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))
# is.infinite() is FALSE for NA/NaN, so the mask contains no NA, and it flags
# -Inf as well as +Inf.
inf_pos <- which(is.infinite(as.matrix(dirty_iris[num_cols])), arr.ind = TRUE)
inf_pos
## row col
## [1,] 86 4
# Replace
# Work column by column so the assignment never relies on indexing a data
# frame with a logical matrix that contains NA.
dirty_iris[num_cols] <- lapply(dirty_iris[num_cols], function(x) {
  x[is.infinite(x)] <- NA
  x
})
# Check
sum(is.infinite(as.matrix(dirty_iris[num_cols])))
## [1] 0
# Rows breaking either rule: Sepal.Width must be strictly positive and
# Sepal.Length must not exceed 30. which() discards NA comparisons, so a
# missing measurement never counts as a violation on its own.
violations <- dirty_iris[
  with(dirty_iris, which(Sepal.Width <= 0 | Sepal.Length > 30)),
]
violations # a width of exactly 0 also counts as non-positive
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Number of offending rows.
nrow(violations)
## [1] 4
# Locate
# which() already skips NA, so no explicit is.na() guard is needed.
violations_width <- which(dirty_iris[["Sepal.Width"]] <= 0)
violations_width
## [1] 16 130
# Correct
# Negative widths are treated as sign errors: keep the magnitude.
neg_idx <- which(dirty_iris[["Sepal.Width"]] < 0)
dirty_iris[["Sepal.Width"]][neg_idx] <- abs(dirty_iris[["Sepal.Width"]][neg_idx])
# A width of exactly zero carries no usable information: mark it missing.
zero_idx <- which(dirty_iris[["Sepal.Width"]] == 0)
is.na(dirty_iris$Sepal.Width) <- zero_idx
# Check
sum(dirty_iris[["Sepal.Width"]] <= 0, na.rm = TRUE)
## [1] 0
# 1) Sepal.Width: mean
# The right-hand side is evaluated before the assignment, so the mean is
# taken over the observed values only.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
# 2) Petal.Length: median
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)
# 3) Sepal.Length: linear regression
# NOTE(review): the rows flagged earlier for Sepal.Length > 30 are not
# corrected anywhere in the visible script, so they take part in this fit
# (and in the mean above) -- confirm this is intended.
fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
          data = dirty_iris)
sl_miss <- is.na(dirty_iris$Sepal.Length)
# Petal.Width is not imputed yet, so predict() returns NA for any row that is
# also missing Petal.Width; those rows are handled by the second pass below.
dirty_iris$Sepal.Length[sl_miss] <- predict(fit, newdata = dirty_iris[sl_miss, ])
# 4) Petal.Width: kNN
if (!requireNamespace("VIM", quietly = TRUE)) install.packages("VIM")
library(VIM)
dirty_iris <- VIM::kNN(dirty_iris, variable = "Petal.Width", k = 5, imp_var = FALSE)
# 3b) Sepal.Length, second pass: now that Petal.Width is complete, predict the
# values that step 3 had to leave missing. This is a no-op when step 3
# already filled every gap.
sl_left <- is.na(dirty_iris$Sepal.Length)
if (any(sl_left)) {
  dirty_iris$Sepal.Length[sl_left] <-
    predict(fit, newdata = dirty_iris[sl_left, ])
}