Question 1

No Code Required

Question 2

No Code Required

Question 3

# Download the "dirty" iris data set straight from its GitHub location.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of Petal.Length entries flagged as missing by is.na().
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

# Row-wise mask: TRUE where a row has no missing value in any column.
is_complete <- complete.cases(dirty_iris)

# Total number of rows and the number of fully observed rows.
total_obs <- nrow(dirty_iris)
complete_obs <- sum(is_complete)

complete_obs
## [1] 96
# Share of rows that are fully observed.
complete_obs / total_obs
## [1] 0.64

Question 5

# Count NaN entries per column. vapply() pins the result to exactly one
# integer per column, so the return type cannot silently change with the
# input the way sapply()'s can.
vapply(dirty_iris, function(x) sum(is.nan(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0
# Count infinite (Inf / -Inf) entries per column, again one integer each.
vapply(dirty_iris, function(x) sum(is.infinite(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            1            0

Question 6

# Locate Inf values: build one logical mask per column and bind them into a
# matrix with one column per variable. Unlike sapply(), cbind() over the
# lapply() result stays a matrix even for a one-row data frame, so the
# assignment below always addresses individual cells rather than columns.
is_inf <- do.call(cbind, lapply(dirty_iris, is.infinite))

# Replace Inf with NA (cell-wise, via the logical matrix)
dirty_iris[is_inf] <- NA

# Verify: every column should now report zero infinite values
vapply(dirty_iris, function(x) sum(is.infinite(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Question 7

# A row is flagged when its sepal width is not strictly positive or its
# sepal length is greater than 30.
breaks_rule <- with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)

# which() discards rows whose check evaluates to NA, keeping only rows
# where the condition is definitely TRUE.
violations <- dirty_iris[which(breaks_rule), ]

nrow(violations)
## [1] 4

Question 8

# Work on a copy of the column so each mask is computed only once.
sepal_width <- dirty_iris[["Sepal.Width"]]

# Negative widths: flip the sign. The !is.na() guard keeps NA entries out
# of the mask so they are left untouched.
is_negative <- !is.na(sepal_width) & sepal_width < 0
sepal_width[is_negative] <- abs(sepal_width[is_negative])

# Widths of exactly zero: mark them as missing.
is_zero <- !is.na(sepal_width) & sepal_width == 0
sepal_width[is_zero] <- NA

# Write the repaired column back into the data frame.
dirty_iris[["Sepal.Width"]] <- sepal_width

Question 9

# Sepal.Width: replace missing values with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: replace missing values with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Sepal.Length: fit a linear regression on the three other measurements and
# use its predictions for the rows where Sepal.Length is missing.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
            data = dirty_iris)

missing_SL <- is.na(dirty_iris$Sepal.Length)

# predict() yields NA for any row that still has a missing predictor
# (Petal.Width is only imputed further down), so such rows stay NA here and
# are handled by the second pass at the end of this block.
dirty_iris$Sepal.Length[missing_SL] <-
  predict(model, newdata = dirty_iris[missing_SL, ])


library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Petal.Width: k-nearest-neighbour imputation with k = 5.
# imp_var = FALSE stops kNN() from appending the Petal.Width_imp indicator
# column, so nothing has to be removed afterwards.
dirty_iris <- kNN(dirty_iris,
                  variable = "Petal.Width",
                  k = 5,
                  imp_var = FALSE)

# Second pass for Sepal.Length: rows that could not be predicted earlier
# because a predictor was missing are filled now that Petal.Width has been
# imputed. Values set by the first pass are left untouched.
still_missing_SL <- is.na(dirty_iris$Sepal.Length)
if (any(still_missing_SL)) {
  dirty_iris$Sepal.Length[still_missing_SL] <-
    predict(model, newdata = dirty_iris[still_missing_SL, ])
}

Question 10

No Code Required