# Read the dirty iris data set directly from its GitHub location.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

Q3 Find how many missing values are in the Petal.Length variable.

# Count the NA entries in Petal.Length; `[[` extracts the column by its exact
# name.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Q4 Calculate the number and the percentage of observations that are complete.

# Count the rows that have no missing value in any column, then express that
# count as a share of all rows. Both figures are derived from the data itself
# rather than typed in as literals, so they stay correct if the data changes.
n_complete <- sum(complete.cases(dirty_iris))
n_complete
## [1] 96
# Proportion of complete rows (multiply by 100 for a percentage).
n_complete / nrow(dirty_iris)
## [1] 0.64

Q5 Besides the missing values, what other type of special value is contained in the numeric columns?

# Count the infinite entries in every column. vapply() fixes the result to
# exactly one integer per column, so the return type cannot change with the
# shape of the input the way sapply()'s can.
vapply(dirty_iris, function(x) sum(is.infinite(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            1            0

Q6 Write R code to locate the special values identified above and replace them with a missing-value placeholder (NA).

# Overwrite every infinite Petal.Width entry with NA, then confirm that no
# infinite value is left in the column.
dirty_iris$Petal.Width <- replace(
  dirty_iris$Petal.Width,
  is.infinite(dirty_iris$Petal.Width),
  NA
)
sum(is.infinite(dirty_iris[["Petal.Width"]]))
## [1] 0

Q7 Write R code to find out the observations that violate the following rules. How many observations are there?

The sepal width should be a positive value.

The sepal length of an iris cannot exceed 30 cm.

# Collect every row that breaks at least one rule: a non-positive sepal width
# or a sepal length above 30. which() keeps only the positions where the test
# is TRUE, so rows whose comparison evaluates to NA are left out.
violations <- dirty_iris[
  with(dirty_iris, which(Sepal.Width <= 0 | Sepal.Length > 30)),
]

print(violations)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
nrow(violations)
## [1] 4

Q8 Locate the observations that violate the rule “Sepal.Width > 0” and make reasonable corrections.

# A negative width is presumably a sign error, so keep its magnitude.
neg_indices <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_indices] <- abs(dirty_iris$Sepal.Width[neg_indices])

# A width of exactly zero cannot be repaired from the value itself, so mark it
# as missing and leave it to the imputation step.
zero_indices <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_indices] <- NA

# Show the corrected rows using the positions detected above instead of
# hard-coded row numbers, so the check follows the data.
dirty_iris[c(neg_indices, zero_indices), ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa
# Confirm that no non-positive width remains (NA entries are ignored).
any(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
## [1] FALSE

Q9 Write the R code to do the imputation as specified below.

Sepal.Width: mean; Petal.Length: median; Sepal.Length: linear regression; Petal.Width: kNN

# VIM provides kNN(). requireNamespace() only checks whether the package is
# installed (without attaching it), so the install runs only when needed and
# the single library() call below fails loudly if the package is unavailable.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)

# Sepal.Width: fill missing values with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: fill missing values with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Sepal.Length: fill missing values with predictions from a linear regression
# on the other three measurements.
# NOTE(review): Petal.Width has not been imputed yet at this point, so a row
# missing both Sepal.Length and Petal.Width would receive an NA prediction.
# The final NA count below shows no such row was left in this data set.
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
na_rows <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[na_rows] <- predict(
  model,
  newdata = dirty_iris[na_rows, ]
)

# Petal.Width: k-nearest-neighbour imputation with k = 5; imp_var = FALSE
# keeps kNN() from appending an imputation-indicator column.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5, imp_var = FALSE)

# Verify that no missing values remain in any column.
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0