Question 3

# Load the raw "dirty iris" CSV from its GitHub URL, then count how many
# Petal.Length entries are missing.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

# Total number of rows, and the number of rows with no NA in any column.
total_obs <- nrow(dirty_iris)
complete_obs <- sum(complete.cases(dirty_iris))

complete_obs
## [1] 96
# Share of complete rows, expressed as a percentage.
100 * (complete_obs / total_obs)
## [1] 64

Question 5

# Per-column tally of infinite entries (Inf / -Inf). vapply() pins the
# result to one integer per column.
vapply(
  dirty_iris,
  function(column) sum(is.infinite(column)),
  FUN.VALUE = integer(1)
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            1            0

Question 6

# Turn every infinite entry into NA, one column at a time. Assigning into
# dirty_iris[] keeps the object a data frame with the same columns.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  column[is.infinite(column)] <- NA
  column
})

Question 7

# Count rule violations: non-positive sepal widths, sepal lengths above 30,
# and rows breaking either rule. NA results are excluded by na.rm = TRUE.
with(dirty_iris, sum(Sepal.Width <= 0, na.rm = TRUE))
## [1] 2
with(dirty_iris, sum(Sepal.Length > 30, na.rm = TRUE))
## [1] 2
with(dirty_iris, sum(Sepal.Width <= 0 | Sepal.Length > 30, na.rm = TRUE))
## [1] 4

Question 8

# List the rows whose Sepal.Width is non-positive. which() discards the NA
# comparisons; indexing with the raw logical vector would instead print one
# all-NA row for every missing Sepal.Width.
dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
# Negative widths are replaced by their absolute value.
# NOTE(review): this assumes a negative width is a sign error - confirm.
neg_index <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_index] <-
  abs(dirty_iris$Sepal.Width[neg_index])
# Widths of exactly zero are marked as missing.
zero_index <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_index] <- NA
# Sanity check: no non-positive widths should remain.
sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
## [1] 0

Question 9

Mean

# Mean imputation: fill missing Sepal.Width values with the mean of the
# observed ones.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

Median

# Median imputation: fill missing Petal.Length values with the median of the
# observed ones.
dirty_iris[is.na(dirty_iris[["Petal.Length"]]), "Petal.Length"] <-
  median(dirty_iris[["Petal.Length"]], na.rm = TRUE)

Linear Regression

# Regression imputation: fit Sepal.Length on the other three measurements,
# then fill the missing Sepal.Length values with the model's predictions.
# lm() drops rows containing NA by default, so only complete rows are used
# for the fit.
# NOTE(review): rows flagged earlier with Sepal.Length > 30 are still in the
# training data at this point - confirm that is intended.
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)

# Positions of the rows that need a predicted Sepal.Length.
missing_index <- which(is.na(dirty_iris[["Sepal.Length"]]))

# NOTE(review): predict() returns NA for a row whose predictors are NA;
# Petal.Width may still contain NA here - verify no such row is affected.
dirty_iris$Sepal.Length[missing_index] <- predict(
  model,
  newdata = dirty_iris[missing_index, ]
)

KNN

# Install VIM only when it is not already available. The package name must be
# in plain ASCII double quotes; typographic quotes are a parse error in R.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}

library(VIM)
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# k-nearest-neighbour imputation of Petal.Width using the 5 nearest rows.
# imp_var = FALSE stops kNN() from appending the Petal.Width_imp indicator
# column, so no clean-up step is needed afterwards.
dirty_iris <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)