Load the data

# Read the dirty_iris CSV directly from GitHub (requires network access).
# stringsAsFactors = FALSE keeps Species as a character column on every R
# version (it only became the default in R 4.0.0), so the str() output is
# reproducible regardless of the interpreter used.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)
# Show the column types and the first few values of each column.
str(dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
# Per-column summary statistics. The output already exposes the problems that
# are handled later: Petal.Width has an infinite mean/max, Sepal.Width has a
# negative minimum, the maxima (73, 30, 63) lie far above the third quartiles,
# and every numeric column contains NAs.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Missing Values (Including Petal Length)

# Total number of missing cells in the whole data frame.
sum(is.na(dirty_iris))
## [1] 58
# Missing cells per column. is.na() on a data frame yields a logical matrix,
# so colSums() counts the TRUEs column by column without going through apply().
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           17           19           12            0

Complete Observations Calculation

# Number of rows without any missing value.
sum(complete.cases(dirty_iris))
## [1] 96
# Share of such rows: the mean of a logical vector is the proportion of TRUEs.
mean(complete.cases(dirty_iris))
## [1] 0.64

Special Values

# Keep only the numeric columns. Filter() always returns a data frame, even
# when a single column qualifies (plain `[ , mask]` would collapse to a vector).
numeric_columns <- Filter(is.numeric, dirty_iris)

# Count the special values in each numeric column. vapply() fixes the result
# shape (four integer counts per column), so the outcome is a 4 x ncol matrix
# even for unusual inputs, unlike sapply().
# Note: is.na() is also TRUE for NaN, so NA_count includes any NaN values.
special_values <- vapply(
  numeric_columns,
  function(col) {
    c(
      NA_count     = sum(is.na(col)),
      NaN_count    = sum(is.nan(col)),
      Inf_count    = sum(is.infinite(col) & col > 0),
      NegInf_count = sum(is.infinite(col) & col < 0)
    )
  },
  FUN.VALUE = c(NA_count = 0L, NaN_count = 0L, Inf_count = 0L, NegInf_count = 0L)
)

special_values
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA_count               10          17           19          12
## NaN_count               0           0            0           0
## Inf_count               0           0            0           1
## NegInf_count            0           0            0           0

Replace Special Values

# Work on a copy and turn every infinite Petal.Width (either sign) into NA.
na_iris <- dirty_iris
na_iris$Petal.Width[is.infinite(na_iris$Petal.Width)] <- NA

# Verify that no infinite value of either sign is left in any column.
# is.infinite() is FALSE for every element of the character column Species,
# so it is safe to run over the whole data frame. vapply() guarantees one
# integer count per column.
special_values2 <- vapply(
  na_iris,
  function(col) sum(is.infinite(col)),
  integer(1)
)
print(special_values2)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Find Errors

# Row indices that break either plausibility rule: a sepal width that is not
# strictly positive, or a sepal length above 30. which() silently drops rows
# where the comparison is NA.
violations <- which(with(na_iris, Sepal.Width <= 0 | Sepal.Length > 30))

# Show the offending observations.
na_iris[violations, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Correct Errors

# Start from the Inf-free data and blank out the values that break the rules.
acc_iris <- na_iris

# Non-positive sepal widths become NA.
is.na(acc_iris$Sepal.Width) <- which(acc_iris$Sepal.Width <= 0)

# Sepal lengths above 30 become NA.
is.na(acc_iris$Sepal.Length) <- which(acc_iris$Sepal.Length > 30)

# Re-run the rule check; an empty result means no violations remain.
which(with(acc_iris, Sepal.Width <= 0 | Sepal.Length > 30))
## integer(0)

Imputations

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Copy the rule-corrected data; the mean imputation that follows modifies this
# copy, so acc_iris itself keeps its NAs.
imputed_iris <- acc_iris

Mean Imputation

# Fill every missing Sepal.Width with the mean of the observed widths.
# NOTE(review): the widths 29 and 30 flagged in the violation listing are still
# present (only Sepal.Width <= 0 was set to NA), so they inflate this mean.
imputed_iris$Sepal.Width <- replace(
  imputed_iris$Sepal.Width,
  is.na(imputed_iris$Sepal.Width),
  mean(imputed_iris$Sepal.Width, na.rm = TRUE)
)

# Confirm that no missing Sepal.Width remains.
sum(is.na(imputed_iris$Sepal.Width))
## [1] 0
iris_2 <- imputed_iris

Median Imputation

# Fill every missing Petal.Length with the median of the observed lengths.
iris_2$Petal.Length <- replace(
  iris_2$Petal.Length,
  is.na(iris_2$Petal.Length),
  median(iris_2$Petal.Length, na.rm = TRUE)
)

# Confirm that no missing Petal.Length remains.
sum(is.na(iris_2$Petal.Length))
## [1] 0
iris_3 <- iris_2

Linear Regression Imputation

# Regress Sepal.Length on the two predictors that were already imputed above
# (Sepal.Width by mean, Petal.Length by median). Rows whose Sepal.Length is
# missing are left out of the fit by lm()'s default NA handling.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris_3)

# Logical mask of the rows that still lack a Sepal.Length.
na_rows <- is.na(iris_3[["Sepal.Length"]])

# Fill exactly those rows with the model's predictions.
iris_3[na_rows, "Sepal.Length"] <- predict(model, newdata = iris_3[na_rows, ])

# Confirm that no missing Sepal.Length remains.
sum(is.na(iris_3[["Sepal.Length"]]))
## [1] 0
iris_4 <- iris_3

KNN Imputation

# Impute the remaining missing Petal.Width values with k-nearest-neighbour
# imputation from the VIM package; only Petal.Width is imputed.
# NOTE(review): kNN() presumably uses its default number of neighbours and all
# other columns as distance variables — confirm against the VIM documentation.
# NOTE(review): by default kNN() appears to append a logical Petal.Width_imp
# indicator column to the result — confirm, or pass imp_var = FALSE if unwanted.
iris_5 <- kNN(iris_4, variable = "Petal.Width")
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##      0.00000      2.20000      0.00000     18.40128     30.00000     63.00000
# Confirm that no missing Petal.Width remains after the kNN step.
sum(is.na(iris_5$Petal.Width))
## [1] 0

````