Load the Dataset

# Read the deliberately messy iris data straight from its GitHub location.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

Check the number of missing values in Petal.Length

# Count the entries of Petal.Length that are NA.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Number of complete observations

# A row is "complete" when none of its columns is NA.
num_complete <- length(which(complete.cases(dirty_iris)))
print(num_complete)
## [1] 96

Percent of complete observations

# Share of complete rows, expressed as a percentage of all rows.
percent_complete <- 100 * (num_complete / nrow(dirty_iris))
print(percent_complete)
## [1] 64

Inspect the structure to identify the numeric columns

# Show each column's type and first values to see which columns are numeric.
utils::str(object = dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...

List the unique values of each numeric column to spot special values (NA, Inf, implausible numbers)

# List the distinct values of every numeric column so that special values
# (NA, Inf, implausible magnitudes) stand out.
# Filter() keeps a data frame even when only one column is numeric, and
# lapply() always returns a named list; sapply() would silently switch to a
# matrix whenever all columns happened to have equally many distinct values.
lapply(Filter(is.numeric, dirty_iris), unique)
## $Sepal.Length
##  [1]  6.4  6.3  6.2  5.0  5.7  5.3  5.9  5.8  4.8  6.0  6.8   NA  5.5  4.7  5.6
## [16]  4.9  5.4  6.7  4.6 73.0  6.5  4.4  6.6  0.0  7.7  7.0  5.2  5.1  7.4  4.3
## [31]  7.2  6.1  6.9  7.6  7.9  4.5 49.0
## 
## $Sepal.Width
##  [1]  3.2  3.3   NA  3.4  2.6  2.7  3.0  3.1  3.5  2.8  3.9 -3.0  4.0  3.6 29.0
## [16]  2.9  2.2  2.5  4.2  3.8  2.3  3.7  4.1 30.0  0.0
## 
## $Petal.Length
##  [1]  4.500  6.000  5.400  1.600  3.500     NA  5.300  5.100  4.100  4.800
## [11]  1.700  4.000  1.300  4.200  5.700  5.900  1.400  1.500 63.000  0.820
## [21] 23.000  5.500  5.800  1.200  3.900  6.700  4.700  4.400  5.600  3.300
## [31]  6.100  1.100  4.900  4.600  6.600  0.000  6.400  5.200  1.900  4.300
## [41]  5.000  0.925  6.900 14.000  3.600  3.800
## 
## $Petal.Width
##  [1] 1.5 2.5 2.3 0.4 1.0 0.2  NA 1.8 0.6 1.6 1.4 1.3 0.1 2.1 2.0 1.2 1.9 2.2 0.3
## [20] 1.1 Inf 1.7 2.4 0.5

Name the numeric columns to check for special and missing values

# Names of the four measurement columns that the cleaning steps operate on.
num_cols <- c(
  "Sepal.Length",
  "Sepal.Width",
  "Petal.Length",
  "Petal.Width"
)

Replace ? with NA

# Turn special values in the measurement columns into NA.
# The columns are already numeric after read.csv(), so a "?" comparison alone
# never matches; the special value that is actually present is Inf
# (Petal.Width). Both cases are handled column by column.
dirty_iris[num_cols] <- lapply(dirty_iris[num_cols], function(column) {
  # "?" placeholders can only occur when a column was read as text.
  column[!is.na(column) & column == "?"] <- NA
  # Inf, -Inf and NaN are numeric special values; treat them as missing too.
  if (is.numeric(column)) {
    column[!is.finite(column)] <- NA
  }
  column
})

Preview the first rows of the dataset

# Print the first six rows as a quick visual check.
head(x = dirty_iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4         3.2          4.5         1.5 versicolor
## 2          6.3         3.3          6.0         2.5  virginica
## 3          6.2          NA          5.4         2.3  virginica
## 4          5.0         3.4          1.6         0.4     setosa
## 5          5.7         2.6          3.5         1.0 versicolor
## 6          5.3          NA           NA         0.2     setosa

Convert the numeric columns from character to numeric

# Coerce every measurement column to numeric (a no-op for columns that
# already are numeric).
dirty_iris[num_cols] <- lapply(
  X = dirty_iris[num_cols],
  FUN = function(column) as.numeric(column)
)

Find rows that violate the rules (Sepal.Width must be > 0 and Sepal.Length must be <= 30)

# Rows that break a rule: Sepal.Width must be positive and Sepal.Length must
# not exceed 30. which() drops rows whose condition evaluates to NA, exactly
# as subset() does.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30), ,
  drop = FALSE
]

Count how many violations

# Number of rows that violate at least one rule.
dim(violations)[1L]
## [1] 4

Impute missing Sepal.Width values with the mean

# Fill missing Sepal.Width entries with the mean of the observed values.
# NOTE(review): the rule-violating values found earlier are still in the
# column at this point, so they contribute to the mean -- confirm intended.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

Impute missing Petal.Length values with the median

# Fill missing Petal.Length entries with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

Impute missing Petal.Width values with k-nearest neighbours (kNN, k = 5)

# Install VIM only if it is missing. requireNamespace() tests availability
# without attaching the package; library() then attaches it and stops with an
# error if loading fails, instead of returning FALSE the way require() does.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)
## Loading required package: VIM
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

# Impute missing Petal.Width values from the 5 nearest neighbours.
# NOTE(review): VIM's kNN() appends an indicator column by default
# (imp_var = TRUE), so the reassigned data frame gains a column -- confirm
# that this is wanted.
dirty_iris <- VIM::kNN(data = dirty_iris, variable = "Petal.Width", k = 5)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##            0           -3            0           73           30           63