# Read dirty_iris.csv from the edwindj/datacleaning GitHub repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing (NA) Petal.Length values
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# Number of rows without a missing value in any column
sum(complete.cases(dirty_iris))
## [1] 96
# NA count for each of the four measurement columns
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
sum(is.na(dirty_iris[["Sepal.Length"]]))
## [1] 10
sum(is.na(dirty_iris[["Sepal.Width"]]))
## [1] 17
sum(is.na(dirty_iris[["Petal.Width"]]))
## [1] 12
# NaN count for the same columns (is.nan() is TRUE only for NaN, not for NA)
sum(is.nan(dirty_iris[["Petal.Length"]]))
## [1] 0
sum(is.nan(dirty_iris[["Sepal.Length"]]))
## [1] 0
sum(is.nan(dirty_iris[["Sepal.Width"]]))
## [1] 0
sum(is.nan(dirty_iris[["Petal.Width"]]))
## [1] 0
# Locate the special (invalid) value: count the negative sepal widths.
# na.rm = TRUE is required here: the column contains NAs, and a single NA in
# the comparison would otherwise make the whole sum NA instead of a count.
sum(dirty_iris$Sepal.Width < 0, na.rm = TRUE)
## [1] 1
# Replace the negative value with NA
# which() keeps only the positions where the comparison is TRUE, so NA
# comparison results never end up in the assignment index.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)] <- NA
# Verify replacement: no negative value may remain
sum(dirty_iris$Sepal.Width < 0, na.rm = TRUE)
## [1] 0
# The NA count grows by one (the replaced negative value)
sum(is.na(dirty_iris$Sepal.Width))
## [1] 18
# Count rows with a non-positive Sepal.Width or a Sepal.Length above 30
with(
  dirty_iris,
  sum(Sepal.Width <= 0 | Sepal.Length > 30, na.rm = TRUE)
)
## [1] 3
# Count the non-positive Sepal.Width values on their own
sum(dirty_iris[["Sepal.Width"]] <= 0, na.rm = TRUE)
## [1] 1
# Replace negative values with absolute value
# (negating a negative number yields its absolute value; NA stays NA)
dirty_iris$Sepal.Width <- with(
  dirty_iris,
  ifelse(Sepal.Width < 0, -Sepal.Width, Sepal.Width)
)
# Replace 0 with NA
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)
# Verify the correction
sum(dirty_iris[["Sepal.Width"]] <= 0, na.rm = TRUE)
## [1] 0
# Sepal.Width: fill every NA with the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Petal.Length: fill every NA with the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Install VIM only when it is not available yet, so that re-running the
# script does not download the package again. The package name must be in
# plain ASCII double quotes; typographic quotes are a syntax error in R.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# k-nearest-neighbour imputation (k = 5) of the remaining missing values.
# imp_var = FALSE stops kNN() from appending the logical "<column>_imp"
# indicator columns, so no name-based column filtering is needed afterwards.
dirty_iris <- kNN(dirty_iris, k = 5, imp_var = FALSE)
# NA count for every column of the data frame
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
# The four measurement columns inspected by the checks below
measure_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
# NA count restricted to the measurement columns
colSums(is.na(dirty_iris[measure_cols]))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
# Number of infinite values (Inf / -Inf) per measurement column
vapply(
  dirty_iris[measure_cols],
  function(col) sum(is.infinite(col)),
  integer(1)
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 1
# Class of each measurement column
sapply(dirty_iris[measure_cols], class)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## "numeric" "numeric" "numeric" "numeric"
# Mark infinite Petal.Width entries as missing ...
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)
# ... and fill every missing Petal.Width with the median of the finite rest
dirty_iris$Petal.Width <- replace(
  dirty_iris$Petal.Width,
  is.na(dirty_iris$Petal.Width),
  median(dirty_iris$Petal.Width, na.rm = TRUE)
)
# Re-count the infinite values per measurement column to confirm the fix
vapply(
  dirty_iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  function(col) sum(is.infinite(col)),
  integer(1)
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
# Rows in which all four measurements are present; these rows fit the model.
complete_rows <- complete.cases(
  dirty_iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
)
# Linear model predicting Sepal.Length from the other three measurements.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris[complete_rows, ]
)
# Fill each missing Sepal.Length with the model's prediction for that row.
missing_SL <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  missing_SL,
  predict(lm_model, newdata = dirty_iris[missing_SL, ])
)