# Read the "dirty" iris data set directly from its GitHub location.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# How many Petal.Length entries are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Number of rows without any missing value ...
nrow(na.omit(dirty_iris))
## [1] 96

# ... and the same figure as a percentage of all rows.
100 * mean(complete.cases(dirty_iris))
## [1] 64
# Tally the special values in every column and print them as a matrix
# (one column per variable, one row per kind of special value).
# Note that is.na() is also TRUE for NaN, so NA_count includes any NaN entries.
# vapply() is used instead of sapply() so the result is always an integer
# matrix with these three named rows, regardless of what the columns contain.
vapply(
  dirty_iris,
  function(col) {
    c(
      NA_count = sum(is.na(col)),
      NaN_count = sum(is.nan(col)),
      Inf_count = sum(is.infinite(col))
    )
  },
  FUN.VALUE = c(NA_count = 0L, NaN_count = 0L, Inf_count = 0L)
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## NA_count 10 17 19 12 0
## NaN_count 0 0 0 0 0
## Inf_count 0 0 0 1 0
# Logical mask of the numeric columns; only these are scanned for Inf / NaN.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))

# Recode every infinite or NaN entry as NA so that later steps only have to
# deal with a single kind of missing value.
dirty_iris[num_cols] <- lapply(dirty_iris[num_cols], function(values) {
  is.na(values) <- is.infinite(values) | is.nan(values)
  values
})
# Row numbers that break at least one of the two plausibility rules:
#   * Sepal.Width must be strictly positive
#   * Sepal.Length must not exceed 30
# which() ignores comparisons that evaluate to NA, so a row with a missing
# measurement is flagged only when its other measurement violates a rule.
viol_idx <- with(
  dirty_iris,
  which(Sepal.Width <= 0 | Sepal.Length > 30)
)

# Number of offending rows, followed by the rows themselves.
length(viol_idx)
## [1] 4
dirty_iris[viol_idx, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Keep only the rows whose Sepal.Width is zero or negative; rows where the
# width is missing are not selected because which() skips NA comparisons.
violations <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
violations
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
# Repair the Sepal.Width violations in two steps:
#   1. negative widths are replaced by their absolute value,
#   2. widths of exactly zero are set to NA (they are filled in by imputation).
# local() keeps the scratch variables out of the global workspace.
dirty_iris$Sepal.Width <- local({
  width <- dirty_iris$Sepal.Width
  negative <- which(width < 0)
  width[negative] <- abs(width[negative])
  width[which(width == 0)] <- NA
  width
})
# Sepal.Width: fill the missing entries with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill the missing entries with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Sepal.Length: impute by linear regression on the other three measurements.
# lm() silently drops rows with a missing value in any model variable when
# fitting, so the model is estimated on the complete rows only.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)

# Predict Sepal.Length for exactly those rows where it is missing.
# NOTE(review): Petal.Width has not been imputed at this point, so any row
# that lacks both Sepal.Length and Petal.Width receives an NA prediction and
# stays missing -- confirm whether such rows exist / whether that is intended.
pred_vals <- predict(
  lm_model,
  newdata = subset(dirty_iris, is.na(Sepal.Length))
)

# Write the predictions back into the gaps (same row order as newdata).
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  pred_vals
)
# VIM supplies kNN() for k-nearest-neighbour imputation.
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep

# Petal.Width: impute the missing entries from the 5 nearest neighbours.
# Only Petal.Width is imputed; the result replaces dirty_iris.
# NOTE(review): with VIM's defaults the returned data frame presumably gains
# a Petal.Width_imp indicator column -- confirm that this is wanted.
dirty_iris <- kNN(
  data = dirty_iris,
  variable = "Petal.Width",
  k = 5
)
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0