# Load the "dirty" iris data set straight from the datacleaning repository.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing values in Petal.Length.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
# Number of rows that have no missing value in any column.
sum(complete.cases(dirty_iris))
## [1] 96
# Share of complete rows. The mean of the logical vector is computed from the
# data itself, so it stays correct if the number of rows or of complete cases
# changes (the recorded value corresponds to 96 complete rows out of 150).
mean(complete.cases(dirty_iris))
## [1] 0.64
# Count the infinite entries in every column; each column yields exactly one
# integer count.
vapply(dirty_iris, function(column) sum(is.infinite(column)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 1 0
# Recode the infinite Petal.Width entries as missing.
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)
# Confirm that no infinite value is left in Petal.Width.
sum(is.infinite(dirty_iris$Petal.Width))
## [1] 0
# Collect the rows that break a consistency rule: a sepal width that is not
# strictly positive, or a sepal length above 30. which() discards rows where
# the condition is NA, so only definite violations are kept.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]
print(violations)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Number of violating rows.
nrow(violations)
## [1] 4
# Negative sepal widths: keep the magnitude, drop the sign.
neg_indices <- which(dirty_iris[["Sepal.Width"]] < 0)
dirty_iris[neg_indices, "Sepal.Width"] <- abs(dirty_iris[neg_indices, "Sepal.Width"])
# Sepal widths of exactly zero: recode as missing.
zero_indices <- which(dirty_iris[["Sepal.Width"]] == 0)
dirty_iris[zero_indices, "Sepal.Width"] <- NA
# Inspect rows 16 and 130 after the correction.
dirty_iris[c(16, 130), ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
# Check whether any non-missing sepal width is still zero or negative.
!all(dirty_iris[["Sepal.Width"]] > 0, na.rm = TRUE)
## [1] FALSE
# Imputation plan -- Sepal.Width: mean; Petal.Length: median; Sepal.Length: linear regression; Petal.Width: kNN.
if(!require(VIM)) install.packages("VIM")
library(VIM)
library(VIM)
# Sepal.Width: fill the missing entries with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
# Petal.Length: fill the missing entries with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)
# Sepal.Length: predict the missing entries from a linear regression on the
# other three measurements.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris)
na_rows <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[na_rows] <- predict(model, newdata = dirty_iris[na_rows, ])
# Petal.Width has not been imputed yet at this point, so a row that lacks both
# Sepal.Length and Petal.Width gets an NA prediction. Report that explicitly
# rather than leaving the gap unnoticed.
if (anyNA(dirty_iris$Sepal.Length)) {
  warning(
    "Regression imputation left ", sum(is.na(dirty_iris$Sepal.Length)),
    " missing Sepal.Length value(s); a predictor is missing in those rows.",
    call. = FALSE
  )
}
# Petal.Width: k-nearest-neighbour imputation (k = 5) from the VIM package;
# imp_var = FALSE suppresses the extra indicator column.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5, imp_var = FALSE)
# Remaining missing values per column.
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0