# Show the built-in iris data set in full as a clean reference.
print(iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica

question 3

# Read the "dirty" iris CSV straight from the datacleaning repository.
dirty_iris_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(dirty_iris_url)
# Number of missing (NA) entries in Petal.Length
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

question 4: Determine the number of complete observations and the percentage of observations that are complete.

# Count the rows of dirty_iris that have no missing value in any column,
# then express that count as a percentage of all rows.
num_complete_obs <- sum(complete.cases(dirty_iris))
num_complete_obs
(num_complete_obs / nrow(dirty_iris)) * 100

question 5: Still based on the dirty_iris data, besides missing values, is there another type of special value contained in the numeric columns? Choose the one you found.

# Count the infinite entries (Inf / -Inf) in each of the first four columns.
vapply(dirty_iris[, 1:4], function(col) sum(is.infinite(col)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1

question 6: Write R code to locate the special value identified above and replace it with a missing-value placeholder.

# Overwrite every infinite entry in the first four columns with NA, one
# column at a time, then re-count the infinite entries to confirm none remain.
dirty_iris[1:4] <- lapply(
  dirty_iris[1:4],
  function(col) replace(col, is.infinite(col), NA)
)
vapply(dirty_iris[, 1:4], function(col) sum(is.infinite(col)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0

question 7: Write R code to find out the observations that violate these rules. How many observations violate the above rules?

# Rule check 1: flag observations whose sepal width is not strictly positive.
# Observations with a missing Sepal.Width come out as NA, not TRUE/FALSE.
dirty_iris[["Sepal.Width"]] <= 0
##   [1] FALSE FALSE    NA FALSE FALSE    NA FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE  TRUE    NA FALSE FALSE    NA FALSE    NA FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE    NA FALSE    NA FALSE FALSE FALSE    NA FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE    NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE FALSE FALSE
##  [97] FALSE    NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE  TRUE FALSE FALSE
## [133]    NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA    NA FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE
# Rule check 2: flag observations whose sepal length is greater than 30.
# Observations with a missing Sepal.Length come out as NA, not TRUE/FALSE.
dirty_iris[["Sepal.Length"]] > 30
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE    NA FALSE FALSE FALSE    NA FALSE FALSE FALSE FALSE FALSE
##  [25]    NA FALSE FALSE  TRUE FALSE    NA FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE FALSE FALSE    NA    NA
## [121] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE
# Flag each rule violation separately. Observations with a missing
# measurement yield NA here rather than TRUE/FALSE.
rule1_violation <- dirty_iris$Sepal.Width <= 0
rule2_violation <- dirty_iris$Sepal.Length > 30
# which() keeps only the positions that are TRUE and drops the NA ones.
# Indexing rows with the raw logical vector would instead return one
# all-NA row for every NA in the condition.
violation_idx <- which(rule1_violation | rule2_violation)
violations <- dirty_iris[violation_idx, ]
violations
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
violation_idx
## [1]  16  28 125 130
# Number of observations that violate at least one rule
nrow(violations)
## [1] 4

question 8: Write R code to achieve the error correction task

# Positions of the non-positive sepal widths that need correcting
which(dirty_iris[["Sepal.Width"]] <= 0)
## [1]  16 130
# Work on a copy so that dirty_iris itself is left as it was.
dirty_iris_fixed <- dirty_iris
# Locate both kinds of error up front: negative widths and exact zeros.
neg_idx <- which(dirty_iris_fixed[["Sepal.Width"]] < 0)
zero_idx <- which(dirty_iris_fixed[["Sepal.Width"]] == 0)
# Negative widths: flip the sign to get the absolute value.
dirty_iris_fixed[["Sepal.Width"]][neg_idx] <-
  -dirty_iris_fixed[["Sepal.Width"]][neg_idx]
# Zero widths: mark as missing.
dirty_iris_fixed[["Sepal.Width"]][zero_idx] <- NA
# Show the corrected entries, then confirm no non-positive width is left.
dirty_iris_fixed[["Sepal.Width"]][c(neg_idx, zero_idx)]
## [1]  3 NA
which(dirty_iris_fixed[["Sepal.Width"]] <= 0)
## integer(0)

question 9: Write the R code to do the imputation as specified above. Mark each imputation that your attached R code achieves.

# install.packages("VIM")
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# k-nearest-neighbour imputation (k = 5) restricted to Petal.Width;
# the result is stored separately from dirty_iris_fixed.
dirty_iris_imputed <- kNN(
  data = dirty_iris_fixed,
  variable = "Petal.Width",
  k = 5
)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
# Count the Petal.Width values that are still missing after imputation.
sum(is.na(dirty_iris_imputed[["Petal.Width"]]))
## [1] 0
# Sepal.Width: fill the gaps with the mean of the observed values.
mean_val <- mean(dirty_iris_fixed[["Sepal.Width"]], na.rm = TRUE)
dirty_iris_fixed[["Sepal.Width"]] <- replace(
  dirty_iris_fixed[["Sepal.Width"]],
  is.na(dirty_iris_fixed[["Sepal.Width"]]),
  mean_val
)

# Petal.Length: fill the gaps with the median of the observed values.
median_val <- median(dirty_iris_fixed[["Petal.Length"]], na.rm = TRUE)
dirty_iris_fixed[["Petal.Length"]] <- replace(
  dirty_iris_fixed[["Petal.Length"]],
  is.na(dirty_iris_fixed[["Petal.Length"]]),
  median_val
)

# Sepal.Length: fit a linear model on the other three measurements and use
# its predictions for the rows where Sepal.Length is missing.
reg_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris_fixed,
  na.action = na.exclude
)
missing_idx <- which(is.na(dirty_iris_fixed[["Sepal.Length"]]))
dirty_iris_fixed[["Sepal.Length"]][missing_idx] <- predict(
  reg_model,
  newdata = dirty_iris_fixed[missing_idx, ]
)

# Petal.Width: k-nearest-neighbour imputation (k = 5) from the VIM package.
library(VIM)
dirty_iris_imputed <- kNN(
  data = dirty_iris_fixed,
  variable = "Petal.Width",
  k = 5
)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
# Per-column count of values still missing once every imputation has run.
colSums(is.na(dirty_iris_imputed))
##    Sepal.Length     Sepal.Width    Petal.Length     Petal.Width         Species 
##               0               0               0               0               0 
## Petal.Width_imp 
##               0