##6
# Load the raw data and convert NaN entries in the numeric columns to NA,
# then report how many NaN / NA values each numeric column holds.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
numeric_cols <- sapply(dirty_iris, is.numeric)
# Only NaN is rewritten here; is.nan() is FALSE for Inf/-Inf, so those
# values are left untouched by this step.
for (idx in which(numeric_cols)) {
  column <- dirty_iris[[idx]]
  dirty_iris[[idx]] <- replace(column, is.nan(column), NA)
}
numeric_data <- dirty_iris[, numeric_cols]
# NaN count per numeric column (expected to be zero after the replacement).
sapply(numeric_data, function(values) sum(is.nan(values)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
# NA count per numeric column.
sapply(numeric_data, function(values) sum(is.na(values)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 10 17 19 12
##7
# Count the observations that break at least one of the rules
#   Sepal.Width > 0   and   Sepal.Length <= 30.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
# The comparison yields NA wherever the tested value is missing (unless the
# other rule is already violated, since NA | TRUE is TRUE).
is_violation <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
# Index with which() so that only rows where the condition is TRUE are kept.
# Indexing a data frame directly with a logical vector containing NA returns
# one all-NA row per NA, which inflates the count with rows that could not
# be evaluated at all.
violations <- dirty_iris[which(is_violation), ]
nrow(violations)
# NOTE: the previously recorded result here was 31; that figure was produced
# by NA-containing logical indexing and therefore also counted the
# unevaluable rows. Re-run to obtain the corrected count.
##8
# Repair Sepal.Width: negative values are replaced by their absolute value,
# exact zeros are replaced by NA. The affected cells are printed at the end.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
sepal_width <- as.numeric(dirty_iris$Sepal.Width)
# which() drops NA comparisons, so missing widths are never selected.
neg_rows <- which(sepal_width < 0)
zero_rows <- which(sepal_width == 0)
# Negating a strictly negative number is the same as taking abs() of it.
sepal_width[neg_rows] <- -sepal_width[neg_rows]
sepal_width[zero_rows] <- NA
dirty_iris$Sepal.Width <- sepal_width
# Show the corrected cells: former negatives first, then former zeros.
dirty_iris[c(neg_rows, zero_rows), "Sepal.Width"]
## [1] 3 NA
# Build a copy of iris with five randomly chosen values blanked out in each
# measurement column. The seed makes the selection reproducible.
data(iris)
iris_missing <- iris
set.seed(123)
# One independent draw of five row numbers per column; the column order
# below determines which draw from the RNG stream each column receives.
for (target in c("Sepal.Width", "Petal.Length", "Sepal.Length", "Petal.Width")) {
  drop_rows <- sample(nrow(iris_missing), 5)
  iris_missing[drop_rows, target] <- NA
}
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Impute each measurement column of iris_missing with a different method.

# Sepal.Width: replace missing values with the mean of the observed ones.
sepal_width_mean <- mean(iris_missing$Sepal.Width, na.rm = TRUE)
iris_missing$Sepal.Width[is.na(iris_missing$Sepal.Width)] <- sepal_width_mean

# Petal.Length: replace missing values with the median of the observed ones.
petal_length_median <- median(iris_missing$Petal.Length, na.rm = TRUE)
iris_missing$Petal.Length[is.na(iris_missing$Petal.Length)] <- petal_length_median

# Sepal.Length: linear regression on the remaining columns; incomplete rows
# are dropped when fitting (na.action = na.omit).
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = iris_missing,
  na.action = na.omit
)
missing_idx <- which(is.na(iris_missing$Sepal.Length))
rows_to_predict <- iris_missing[missing_idx, ]
# Petal.Width has not been imputed yet at this point, so a row that lacks
# both Sepal.Length and Petal.Width gets an NA prediction and stays missing.
iris_missing$Sepal.Length[missing_idx] <- predict(model, newdata = rows_to_predict)

# Petal.Width: k-nearest-neighbour imputation (k = 5); the result is stored
# in a separate data frame, iris_missing itself is not modified.
iris_imputed <- kNN(data = iris_missing, k = 5, variable = "Petal.Width")
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 4.3 2.0 1.0 7.9 4.4 6.9