Question 1

# Pull the deliberately messy iris data set straight from GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# How many Petal.Length measurements are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 2

# Number of rows that have no missing value in any column.
num_complete <- nrow(na.omit(dirty_iris))

# Share of such rows, expressed as a percentage of all observations.
percent_complete <- 100 * mean(complete.cases(dirty_iris))

num_complete
## [1] 96
percent_complete
## [1] 64

Question 3

# Keep only the numeric columns. vapply() guarantees a logical vector here,
# whereas sapply() changes its return type depending on the input.
numeric_columns <- dirty_iris[vapply(dirty_iris, is.numeric, logical(1))]

# For every numeric column, list the row positions holding Inf / -Inf.
# lapply() always yields a named list; sapply() would collapse the result to
# a vector or matrix whenever every column had the same number of hits.
inf_rows <- lapply(numeric_columns, function(x) which(is.infinite(x)))
inf_rows 
## $Sepal.Length
## integer(0)
## 
## $Sepal.Width
## integer(0)
## 
## $Petal.Length
## integer(0)
## 
## $Petal.Width
## [1] 86

Question 4

# Replace infinite values with NA in every numeric column.
# Work column by column with lapply(): apply() would first coerce the data
# frame to a matrix, which discards the individual column types.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], function(x) {
  x[is.infinite(x)] <- NA
  x
})

# Row 86 is the one that held the infinite Petal.Width.
dirty_iris[86, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 86          5.1         3.8          1.9          NA  setosa

Question 5

# Rows breaking each rule; which() silently skips rows where the value is NA.
violating_sepal_width <- with(dirty_iris, which(Sepal.Width <= 0))
violating_sepal_length <- with(dirty_iris, which(Sepal.Length > 30))

# A row that breaks both rules must be counted only once.
violating_indices <- union(violating_sepal_width, violating_sepal_length)

violating_rows <- dirty_iris[violating_indices, ]

n_violations <- nrow(violating_rows)

n_violations
## [1] 4

Question 6

# Non-positive Sepal.Width values; which() drops NA comparisons by itself.
violating_indices <- which(dirty_iris$Sepal.Width <= 0)

print("Violating Observations Before Correction:")
## [1] "Violating Observations Before Correction:"
print(dirty_iris[violating_indices, ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa

# Negative widths are treated as sign errors: keep the magnitude.
negative_width <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_width] <- abs(dirty_iris$Sepal.Width[negative_width])

# A width of exactly zero carries no information: mark it as missing.
zero_width <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width] <- NA

print("Corrected Observations:")
## [1] "Corrected Observations:"
print(dirty_iris[violating_indices, ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa

Question 7

pacman::p_load(dplyr, Hmisc)

# Sepal.Width: fill missing values with the column mean.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: fill missing values with the column median.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- median(dirty_iris$Petal.Length, na.rm = TRUE)

# Petal.Width: k-nearest-neighbour imputation.
# Hmisc::impute() has no `method` argument: `method = "knn"` is absorbed by
# `...`, the default `fun = median` is applied, and the column is turned into
# class "impute". The helper below performs an actual kNN imputation with
# base R only and returns a plain numeric vector.

#' Impute missing values of one column from its k nearest neighbours.
#'
#' @param data A data frame.
#' @param target Name of the column whose NAs are to be filled.
#' @param predictors Names of the numeric columns used to measure distance.
#' @param k Number of neighbours whose target values are aggregated.
#' @return The target column with each NA replaced by the median target
#'   value of the k closest rows that have an observed target value.
knn_impute <- function(data, target, predictors, k = 5L) {
  missing_rows <- which(is.na(data[[target]]))
  donor_rows <- which(!is.na(data[[target]]))
  filled <- data[[target]]
  if (length(missing_rows) == 0L || length(donor_rows) == 0L) {
    return(filled)
  }

  # Standardise the predictors so that no single column dominates distance.
  scaled <- scale(as.matrix(data[predictors]))
  n_neighbours <- min(k, length(donor_rows))

  for (i in missing_rows) {
    squared_diff <- sweep(scaled[donor_rows, , drop = FALSE], 2, scaled[i, ])^2
    # Average over the predictors observed in both rows, so that a missing
    # predictor value does not exclude a row from the comparison.
    distances <- sqrt(rowMeans(squared_diff, na.rm = TRUE))
    nearest <- donor_rows[order(distances)[seq_len(n_neighbours)]]
    filled[i] <- median(data[[target]][nearest])
  }
  filled
}

dirty_iris$Petal.Width <- knn_impute(
  dirty_iris,
  target = "Petal.Width",
  predictors = c("Sepal.Length", "Sepal.Width", "Petal.Length")
)

# Sepal.Length: linear-regression imputation.
# This runs after all three predictors have been completed; predict() returns
# NA for any row with a missing predictor, which would leave Sepal.Length
# unfilled for such rows.
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris)
missing_sepal_length <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[missing_sepal_length] <- predict(lm_model, newdata = dirty_iris[missing_sepal_length, ])

head(dirty_iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4    3.200000          4.5         1.5 versicolor
## 2          6.3    3.300000          6.0         2.5  virginica
## 3          6.2    3.462121          5.4         2.3  virginica
## 4          5.0    3.400000          1.6         0.4     setosa
## 5          5.7    2.600000          3.5         1.0 versicolor
## 6          5.3    3.462121          4.5         0.2     setosa