# Read the dirty_iris CSV from its GitHub URL (first row holds column names).
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  header = TRUE
)
# Structure overview: column names, storage types and leading values.
str(object = dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
# Per-column summary statistics, including the NA count of each numeric column.
summary(object = dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 3

# Number of missing values in Petal.Length alone.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
# Missing values per column: column sums of the logical NA matrix.
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           17           19           12            0
library(stringr)
# Which columns contain at least one NA? Test for real missing values with
# anyNA(); a text search for "NA" (e.g. str_detect() on the data frame) would
# coerce each column to a single string and also match ordinary text
# containing the letters "NA".
vapply(dirty_iris, anyNA, logical(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##         TRUE         TRUE         TRUE         TRUE        FALSE

Question 4

# Share of observations that have no missing value in any column.
total_rows <- dim(dirty_iris)[1L]

num_complete <- length(which(complete.cases(dirty_iris)))

percent_complete <- 100 * (num_complete / total_rows)

print(num_complete)
## [1] 96
print(percent_complete)
## [1] 64

Question 6

# Drop only the rows whose Sepal.Length is missing. A logical mask is used
# rather than a negative which() index: with no missing values which() returns
# integer(0), and indexing by -integer(0) selects zero rows instead of all rows.
dirty_iris_complete <- dirty_iris[!is.na(dirty_iris$Sepal.Length), ]
str(dirty_iris_complete)
## 'data.frame':    140 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
# Keep only the rows with no missing value in any column.
dirty_iris_complete1 <- dirty_iris[complete.cases(dirty_iris), ]
str(dirty_iris_complete1)
## 'data.frame':    96 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
##  $ Sepal.Width : num  3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
##  $ Petal.Length: num  4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
##  $ Petal.Width : num  1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
##  $ Species     : chr  "versicolor" "virginica" "setosa" "versicolor" ...
# Same filter via na.omit(), which also records the dropped rows in the
# "na.action" attribute.
dirty_iris_complete2 <- na.omit(dirty_iris)
str(dirty_iris_complete2)
## 'data.frame':    96 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
##  $ Sepal.Width : num  3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
##  $ Petal.Length: num  4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
##  $ Petal.Width : num  1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
##  $ Species     : chr  "versicolor" "virginica" "setosa" "versicolor" ...
##  - attr(*, "na.action")= 'omit' Named int [1:54] 3 6 7 13 15 17 19 20 21 22 ...
##   ..- attr(*, "names")= chr [1:54] "3" "6" "7" "13" ...

Question 7

# Frequency tables of the sepal measurements make out-of-range values visible.
table(dirty_iris[["Sepal.Width"]])
## 
##  -3   0 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 
##   1   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1 
## 4.1 4.2  29  30 
##   1   1   1   1
table(dirty_iris[["Sepal.Length"]])
## 
##   0 4.3 4.4 4.5 4.6 4.7 4.8 4.9   5 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9   6 6.1 
##   1   1   3   1   4   2   5   5  10   9   4   1   4   7   6   7   6   3   5   4 
## 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9   7 7.2 7.4 7.6 7.7 7.9  49  73 
##   4   9   7   5   2   7   2   3   1   3   1   1   4   1   1   1
# Collect the suspicious rows: non-positive widths and lengths above 30.
# which() discards comparisons that evaluate to NA, so rows with a missing
# measurement are not selected.
bad_width  <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
bad_length <- dirty_iris[which(dirty_iris$Sepal.Length > 30), ]
bad_data <- do.call(rbind, list(bad_width, bad_length))

Question 8

# Treat a sepal width of exactly 0 as missing. %in% never yields NA, so entries
# that are already NA do not disturb the assignment.
# NOTE(review): the negative width (-3) still shows up in the table below and
# is not replaced here -- confirm whether it should be set to NA as well.
dirty_iris$Sepal.Width[dirty_iris$Sepal.Width %in% 0] <- NA
table(dirty_iris[["Sepal.Width"]])
## 
##  -3 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 4.1 
##   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1   1 
## 4.2  29  30 
##   1   1   1
dirty_iris[["Sepal.Width"]]
##   [1]  3.2  3.3   NA  3.4  2.6   NA  2.7  3.0  2.7  3.1  3.5  2.7  3.0  2.8  3.9
##  [16] -3.0   NA  3.2  4.0   NA  3.6   NA  2.8  3.3  3.0  3.2  3.1 29.0  3.2  2.8
##  [31]  3.2  3.2  2.8  2.9  2.9  3.0  3.0  2.2  2.5  3.0   NA  2.7   NA  2.7  4.2
##  [46]  2.8   NA  3.2  3.0  3.4  2.6  3.1  2.7  3.4  3.3  3.8  3.8  2.9  2.8  2.8
##  [61]  2.3  2.8  3.0  3.3  3.0  2.5  2.5  3.2  3.5  3.5  3.0  3.1  3.5   NA  2.8
##  [76]  2.5  3.5  3.0  3.8  3.8  2.6  3.4  2.9  3.7  3.0  3.8  2.9  2.9  2.9  2.5
##  [91]  3.2   NA  3.4  2.7  2.2  3.1  2.3   NA  3.0  2.8  3.4  3.6  2.7  3.0  3.7
## [106]   NA  3.0  3.0  2.8  3.4  3.4  3.4  3.4  3.3  3.1  2.6   NA  3.1  3.0  2.8
## [121]  3.0  2.3  3.2  4.1 30.0  2.9  3.2   NA  3.6   NA  2.5  3.1   NA  3.3  3.0
## [136]  3.0  3.2  3.0  3.1  2.2   NA   NA  3.0  2.9  2.5  3.1  3.0  3.5  3.1  2.6

Question 9

# install.packages("Hmisc")
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# install.packages("VIM")
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

Mean and Median Imputation

# Fill missing sepal widths with the column mean and missing petal lengths with
# the column median. Hmisc::impute() returns a vector of class "impute" that
# remembers which positions were filled in.
dirty_iris[["Sepal.Width"]] <- Hmisc::impute(
  dirty_iris[["Sepal.Width"]],
  fun = mean
)

dirty_iris[["Petal.Length"]] <- Hmisc::impute(
  dirty_iris[["Petal.Length"]],
  fun = median
)

Linear Regression Imputation

# dirty_iris is an object read from CSV, not a packaged data set, so there is
# nothing to load with data(); inspect the current state directly.
str(dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : 'impute' num  3.2 3.3 3.42 3.4 2.6 ...
##   ..- attr(*, "imputed")= int [1:18] 3 6 17 20 22 41 43 47 74 92 ...
##  $ Petal.Length: 'impute' num  4.5 6 5.4 1.6 3.5 4.5 5.3 5.1 4.1 1.6 ...
##   ..- attr(*, "imputed")= int [1:19] 6 19 21 23 31 34 47 52 67 69 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
head(dirty_iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4    3.200000          4.5         1.5 versicolor
## 2          6.3    3.300000          6.0         2.5  virginica
## 3          6.2    3.416667          5.4         2.3  virginica
## 4          5.0    3.400000          1.6         0.4     setosa
## 5          5.7    2.600000          3.5         1.0 versicolor
## 6          5.3    3.416667          4.5         0.2     setosa
# Blank out the first ten sepal lengths so the regression has values to predict.
# NOTE(review): this overwrites observed measurements in the working data --
# confirm that is intended rather than working on a copy.
dirty_iris$Sepal.Length[1:10] <- NA
# Inspect the data frame that was just modified.
head(dirty_iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           NA    3.200000          4.5         1.5 versicolor
## 2           NA    3.300000          6.0         2.5  virginica
## 3           NA    3.416667          5.4         2.3  virginica
## 4           NA    3.400000          1.6         0.4     setosa
## 5           NA    2.600000          3.5         1.0 versicolor
## 6           NA    3.416667          4.5         0.2     setosa
# Regress sepal length on the two predictors that were imputed earlier.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length,
            data = dirty_iris)
# Replace every missing sepal length with the model's prediction for that row.
I <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[I] <- predict(model, newdata = dirty_iris[I,])
head(dirty_iris,10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1      6.261069    3.200000          4.5         1.5 versicolor
## 2      7.125084    3.300000          6.0         2.5  virginica
## 3      7.000148    3.416667          5.4         2.3  virginica
## 4      5.144661    3.400000          1.6         0.4     setosa
## 5      4.933841    2.600000          3.5         1.0 versicolor
## 6      6.566251    3.416667          4.5         0.2     setosa
## 7      5.942489    2.700000          5.3          NA  virginica
## 8      6.268627    3.000000          5.1         1.8  virginica
## 9      5.363959    2.700000          4.1         1.0 versicolor
## 10     4.722101    3.100000          1.6         0.2     setosa

kNN Imputation

library(VIM)
summary(dirty_iris)
## 
##  18 values imputed to 3.416667 
## 
## 
##  19 values imputed to 4.5
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.000   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.700   1st Qu.:0.3  
##  Median : 5.800   Median : 3.100   Median : 4.500   Median :1.3  
##  Mean   : 6.545   Mean   : 3.417   Mean   : 4.456   Mean   :Inf  
##  3rd Qu.: 6.500   3rd Qu.: 3.417   3rd Qu.: 5.100   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :Inf  
##                                                     NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Impute the 12 missing Petal.Width values from the nearest neighbours within
# dirty_iris itself; the built-in iris data has no NA, so there would be
# nothing to impute there.
# NOTE(review): Sepal.Width and Petal.Length still carry class "impute" and
# Petal.Width contains Inf (see the summary above) -- confirm kNN() handles both.
dirty_iris2 <- kNN(dirty_iris, variable = "Petal.Width")
# Keep only the five original columns, dropping anything kNN() appended.
dirty_iris2 <- subset(dirty_iris2, select=Sepal.Length:Species)
head(dirty_iris2)
## (output not reproduced here: re-run this chunk to regenerate it)