##6
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Flag the numeric columns, then rewrite each of them with NaN turned into NA.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(values) replace(values, is.nan(values), NA)
)

# Per-column NaN count after the replacement.
vapply(dirty_iris[, numeric_cols], function(x) sum(is.nan(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0
# Per-column NA count after the replacement.
vapply(dirty_iris[, numeric_cols], function(x) sum(is.na(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##           10           17           19           12
##7
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Rows that break the rules Sepal.Width > 0 and Sepal.Length <= 30.
# The comparison evaluates to NA wherever a measurement is missing, and
# indexing a data frame with an NA logical yields an all-NA placeholder row.
# which() keeps only the rows where the condition is actually TRUE, so rows
# with missing values are not reported as violations.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]

# Number of rows that truly violate a rule.
# NOTE(review): the result of 31 recorded earlier for this step also counted
# the rows where the condition was NA; re-run to record the corrected count.
nrow(violations)
##8
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Repair Sepal.Width: negative values get their sign flipped, exact zeros
# become NA. which() drops NA comparisons, so missing values are left alone.
dirty_iris$Sepal.Width <- as.numeric(dirty_iris$Sepal.Width)

# Both index sets are taken up front; flipping a negative can never produce a
# zero, so this matches locating the zeros after the sign fix.
neg_rows <- which(dirty_iris$Sepal.Width < 0)
zero_rows <- which(dirty_iris$Sepal.Width == 0)

dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  neg_rows,
  abs(dirty_iris$Sepal.Width[neg_rows])
)
dirty_iris$Sepal.Width <- replace(dirty_iris$Sepal.Width, zero_rows, NA)

# Show the repaired entries (negatives first, then zeros).
dirty_iris[c(neg_rows, zero_rows), "Sepal.Width"]
## [1]  3 NA
data(iris)
iris_missing <- iris

# Blank out five randomly chosen values in each measurement column.
# The four draws stay in this order so the seeded random stream is consumed
# in the same sequence.
set.seed(123)
iris_missing$Sepal.Width[sample(nrow(iris_missing), 5)] <- NA
iris_missing$Petal.Length[sample(nrow(iris_missing), 5)] <- NA
iris_missing$Sepal.Length[sample(nrow(iris_missing), 5)] <- NA
iris_missing$Petal.Width[sample(nrow(iris_missing), 5)] <- NA

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

# Sepal.Width: fill the gaps with the mean of the observed values.
iris_missing$Sepal.Width <- replace(
  iris_missing$Sepal.Width,
  is.na(iris_missing$Sepal.Width),
  mean(iris_missing$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill the gaps with the median of the observed values.
iris_missing$Petal.Length <- replace(
  iris_missing$Petal.Length,
  is.na(iris_missing$Petal.Length),
  median(iris_missing$Petal.Length, na.rm = TRUE)
)

# Sepal.Length: fit a linear model on the complete rows and predict the
# missing entries from the remaining columns.
# NOTE(review): Petal.Width still contains NAs at this point; a row missing
# both Sepal.Length and Petal.Width would presumably get an NA prediction.
# Confirm that no such overlap occurs for this seed.
missing_idx <- which(is.na(iris_missing$Sepal.Length))
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = iris_missing,
  na.action = na.omit
)
iris_missing$Sepal.Length[missing_idx] <- predict(
  model,
  newdata = iris_missing[missing_idx, ]
)

# Petal.Width: k-nearest-neighbour imputation (k = 5) via VIM::kNN.
iris_imputed <- kNN(data = iris_missing, variable = "Petal.Width", k = 5)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          4.3          2.0          1.0          7.9          4.4          6.9