pacman:: p_load(lubridate, stringr, VIM)
# Read the dirty_iris CSV straight from the datacleaning repository on GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of rows in which Petal.Length is missing.
with(dirty_iris, sum(is.na(Petal.Length)))
## [1] 19
# Number of rows without an NA in any column. complete.cases() only looks at
# NA, so rows holding Inf still count as complete at this point.
num_complete <- sum(complete.cases(dirty_iris))

# Share of complete rows, derived from the data instead of a hand-typed
# fraction so it stays correct if the input file changes.
num_complete / nrow(dirty_iris)
## [1] 0.64
# Per-column overview (range, quartiles, NA counts) of the raw data.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Infinite Petal.Width readings are turned into NA so that summary statistics
# of the column become finite again.
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA

# Row indices that break the plausibility rules:
#   viol_sw - Sepal.Width must be strictly positive
#   viol_sl - Sepal.Length must not exceed 30
# which() silently drops NA comparisons, so only rows with a recorded,
# rule-breaking value are returned. (arr.ind is only meaningful for arrays and
# is therefore not passed for these plain vectors.)
viol_sw <- which(dirty_iris$Sepal.Width <= 0)
viol_sl <- which(dirty_iris$Sepal.Length > 30)

# Distinct offending rows across both rules, in ascending row order.
violations <- sort(union(viol_sw, viol_sl))
length(violations)
## [1] 4
# Positions of the rows whose Sepal.Width is zero or negative; rows where the
# width is NA are not selected because which() ignores NA.
viol_idx <- with(dirty_iris, which(Sepal.Width <= 0))

# Show the offending rows before they are repaired.
dirty_iris[viol_idx, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
# Negative widths are replaced by their absolute value. which() already skips
# NA comparisons, so no separate NA guard is required.
neg_idx <- with(dirty_iris, which(Sepal.Width < 0))
dirty_iris$Sepal.Width[neg_idx] <- abs(dirty_iris$Sepal.Width[neg_idx])

# Widths that are exactly zero are marked as missing.
zero_idx <- with(dirty_iris, which(Sepal.Width == 0))
dirty_iris$Sepal.Width[zero_idx] <- NA

# Re-display the previously violating rows to check both repairs.
dirty_iris[viol_idx, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa
# Fill missing Sepal.Width values with the mean of the observed widths.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Fill missing Petal.Length values with the median of the observed lengths.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Overview after the two simple imputations.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.000   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.750   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.559   Mean   : 3.462   Mean   : 4.456   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##  NA's   :10                                         NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# k-nearest-neighbour imputation (VIM) restricted to Petal.Width. The returned
# data frame carries an extra logical column, Petal.Width_imp, flagging the
# rows that were filled in.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width")
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
# Inspect the structure of the result, including the added indicator column.
str(dirty_iris)
## 'data.frame':    150 obs. of  6 variables:
##  $ Sepal.Length   : num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width    : num  3.2 3.3 3.46 3.4 2.6 ...
##  $ Petal.Length   : num  4.5 6 5.4 1.6 3.5 4.5 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width    : num  1.5 2.5 2.3 0.4 1 0.2 1.9 1.8 1 0.2 ...
##  $ Species        : chr  "versicolor" "virginica" "virginica" "setosa" ...
##  $ Petal.Width_imp: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
# Linear model explaining Sepal.Length by Sepal.Width and Petal.Length; lm()
# leaves out the rows in which Sepal.Length is NA when fitting.
fit <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length,
  data = dirty_iris
)

# Replace each missing Sepal.Length with the model's prediction for that row.
# Predictions are computed for every row and then restricted to the missing
# ones, so observed values are never overwritten.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predict(fit, newdata = dirty_iris)[is.na(dirty_iris$Sepal.Length)]
)

# Final overview once every numeric column has been imputed.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.000   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.800   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.528   Mean   : 3.462   Mean   : 4.456   Mean   :1.208  
##  3rd Qu.: 6.400   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species          Petal.Width_imp
##  Length:150         Mode :logical  
##  Class :character   FALSE:137      
##  Mode  :character   TRUE :13       
##                                    
##                                    
##