Question 3

Amount of missing values

# Read the raw data set directly from its GitHub URL
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing (NA) Petal.Length measurements
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

The number and percentage of observations that are complete

# Number of observations with no NA in any column
sum(complete.cases(dirty_iris))
## [1] 96
# Share of complete observations in percent, derived from the data rather
# than typed in by hand (mean of a logical vector = proportion of TRUE)
100 * mean(complete.cases(dirty_iris))
## [1] 64

Percent complete = 96/150 = 0.64 -> 64%

So the answer is 96 observations and 64%.

Question 5

Another type of special value contained in the numeric columns

Checking for NA

# Count the NA entries in each numeric column, one column at a time
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
sum(is.na(dirty_iris[["Sepal.Length"]]))
## [1] 10
sum(is.na(dirty_iris[["Sepal.Width"]]))
## [1] 17
sum(is.na(dirty_iris[["Petal.Width"]]))
## [1] 12

Checking for NaN

# NaN count per numeric column (none are present in this data set)
sum(is.nan(dirty_iris$Petal.Length))
## [1] 0
sum(is.nan(dirty_iris$Sepal.Length))
## [1] 0
sum(is.nan(dirty_iris$Sepal.Width))
## [1] 0
sum(is.nan(dirty_iris$Petal.Width))
## [1] 0

# Checking for Inf: neither is.na() nor is.nan() flags infinite values, so
# they have to be counted separately. vapply() pins the result to one
# integer per column.
vapply(
  dirty_iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  function(x) sum(is.infinite(x)),
  integer(1)
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1

Question 6

Locate the special values identified above and replace them with a missing-value placeholder.

# Locate the special (invalid) value: negative Sepal.Width entries.
# na.rm = TRUE is required because the column contains NA; without it the
# count itself evaluates to NA and says nothing about the data.
sum(dirty_iris$Sepal.Width < 0, na.rm = TRUE)
## [1] 1
# Replace the negative value with NA. which() drops the NA comparisons, so
# only genuinely negative entries are indexed.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)] <- NA

# Verify replacement: no negative values remain, and the NA count has grown
# by one (from 17 to 18)
sum(dirty_iris$Sepal.Width < 0, na.rm = TRUE)
## [1] 0
sum(is.na(dirty_iris$Sepal.Width))
## [1] 18
# NOTE(review): this step turns the negative Sepal.Width into NA. The Inf in
# Petal.Width is only replaced later, in the regression section. Confirm
# which special value this question is meant to address.

Question 7

Find out the observations that violate these rules. How many observations violate the rules?

# Count observations that break either rule; comparisons that evaluate to NA
# are left out of the total
with(
  dirty_iris,
  sum(Sepal.Width <= 0 | Sepal.Length > 30, na.rm = TRUE)
)
## [1] 3

There are 3 violating observations in total: 1 violates the sepal-width rule and 2 violate the sepal-length rule.

Question 8

Locate the observation that violates the rule “Sepal.Width > 0” and make a reasonable correction.

# How many observations break the Sepal.Width > 0 rule (NA comparisons ignored)
with(dirty_iris, sum(Sepal.Width <= 0, na.rm = TRUE))
## [1] 1
# Flip the sign of any negative width; every other entry is kept as is.
# NOTE(review): negative widths were already set to NA in the Question 6
# step above, so this branch appears to have nothing left to act on. Confirm
# the intended order of the two corrections.
dirty_iris[["Sepal.Width"]] <- ifelse(
  test = dirty_iris[["Sepal.Width"]] < 0,
  yes = abs(dirty_iris[["Sepal.Width"]]),
  no = dirty_iris[["Sepal.Width"]]
)

# A width of exactly 0 cannot be repaired by a sign flip, so mark it missing
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA

# Check that no non-positive widths remain
with(dirty_iris, sum(Sepal.Width <= 0, na.rm = TRUE))
## [1] 0

Question 9

Dealing with missing values

Replace missing values with the column mean

# Sepal.Width: every NA is replaced by the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

Replace missing values with the column median

# Petal.Length: every NA is replaced by the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

Replace missing values using kNN imputation

# Install VIM only when it is not already available, then attach it.
# The package name must be in plain ASCII quotes; typographic quotes are a
# syntax error in R.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Impute the remaining NA values from the 5 nearest neighbours.
# imp_var = FALSE stops kNN() from appending its "<column>_imp" indicator
# columns, so no pattern-based column removal is needed afterwards (an
# unanchored "_imp" pattern would also drop any real column containing it).
dirty_iris <- kNN(dirty_iris, k = 5, imp_var = FALSE)

# Confirm that no column has missing values left
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Replace missing values using linear regression

# Numeric measurement columns inspected before fitting the regression
numeric_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

# Remaining NA count per numeric column
colSums(is.na(dirty_iris[, numeric_cols]))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0
# Infinite values per column. vapply() is used instead of sapply() so the
# result is always one integer per column.
vapply(
  dirty_iris[, numeric_cols],
  function(x) sum(is.infinite(x)),
  integer(1)
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1
# Class of each column, to confirm they are all numeric
vapply(dirty_iris[, numeric_cols], class, character(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    "numeric"    "numeric"    "numeric"    "numeric"
# Treat the infinite Petal.Width as missing, then fill it with the column
# median of the remaining values
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA
dirty_iris$Petal.Width[is.na(dirty_iris$Petal.Width)] <-
  median(dirty_iris$Petal.Width, na.rm = TRUE)

# Confirm that no infinite values remain
vapply(
  dirty_iris[, numeric_cols],
  function(x) sum(is.infinite(x)),
  integer(1)
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0
# Rows in which all four measurements are present; only these are used to
# fit the model
complete_rows <- complete.cases(
  dirty_iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
)

# Linear model predicting Sepal.Length from the other three measurements
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris[complete_rows, ]
)

# Fill each missing Sepal.Length with the model's prediction for that row.
# NOTE(review): the NA counts printed for these columns earlier in this
# section are all 0, so missing_SL appears to be all FALSE and nothing is
# imputed here. Confirm whether the regression was meant to run on the data
# as it was before the kNN imputation.
missing_SL <- is.na(dirty_iris[["Sepal.Length"]])
predicted_SL <- predict(lm_model, newdata = dirty_iris[missing_SL, ])
dirty_iris$Sepal.Length[missing_SL] <- predicted_SL