# Read the dirty iris data set from its GitHub location into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length values.
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19

# Rows with no missing value in any column, and the total number of rows.
complete_obs <- length(which(complete.cases(dirty_iris)))
total_obs <- nrow(dirty_iris)
complete_obs
## [1] 96

# Complete rows expressed as a percentage of all rows.
complete_obs / total_obs * 100
## [1] 64
# Per-column count of infinite entries (is.infinite() flags Inf and -Inf).
vapply(dirty_iris, function(column) sum(is.infinite(column)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 1 0

# Overwrite every infinite entry with NA, one column at a time; columns
# without infinite entries are written back unchanged.
dirty_iris[] <- lapply(
  dirty_iris,
  function(column) replace(column, is.infinite(column), NA)
)
# Count Sepal.Width values that are zero or negative. which() skips the
# positions where the comparison is NA, so missing values are not counted.
length(which(dirty_iris$Sepal.Width <= 0))
## [1] 2

# Count Sepal.Length values greater than 30.
length(which(dirty_iris$Sepal.Length > 30))
## [1] 2

# Count rows that break at least one of the two conditions above.
with(dirty_iris, length(which(Sepal.Width <= 0 | Sepal.Length > 30)))
## [1] 4
# Show the rows whose Sepal.Width is zero or negative.
# which() keeps only the positions where the comparison is TRUE. Indexing
# with the raw logical vector instead returns an all-NA placeholder row for
# every missing Sepal.Width, which buries the rows of interest.
dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
# Replace negative Sepal.Width values with their absolute value by flipping
# the sign of exactly those entries.
neg_index <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_index] <- -dirty_iris$Sepal.Width[neg_index]

# Mark Sepal.Width values that are exactly zero as missing.
zero_index <- which(dirty_iris$Sepal.Width == 0)
is.na(dirty_iris$Sepal.Width) <- zero_index

# Re-count zero or negative widths after the corrections.
length(which(dirty_iris$Sepal.Width <= 0))
## [1] 0
# Sepal.Width: fill missing values with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill missing values with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Sepal.Length: fit a linear model on the other three measurements and use
# its predictions for the rows where Sepal.Length is missing.
# NOTE(review): Petal.Width may still contain NA at this point; predict()
# returns NA for rows with a missing predictor, so some Sepal.Length values
# could stay NA -- confirm against the data.
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
missing_index <- which(is.na(dirty_iris$Sepal.Length))
dirty_iris$Sepal.Length[missing_index] <- predict(
  model,
  newdata = dirty_iris[missing_index, ]
)
# Install VIM only when it is not already available. The package name must be
# wrapped in plain ASCII double quotes; typographic quotes are a parse error.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Petal.Width: impute missing values with k-nearest-neighbours (k = 5).
# imp_var = FALSE stops kNN() from appending the logical Petal.Width_imp
# indicator column, so there is nothing to delete afterwards.
dirty_iris <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)