Q1:

# Read the dirty_iris CSV from its GitHub URL into a data frame.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of NA entries in the Petal.Length column.
petal_length_is_missing <- is.na(dirty_iris[["Petal.Length"]])
missing_values <- sum(petal_length_is_missing)

missing_values
[1] 19

Q2:

# Total number of rows, and how many of them have no NA in any column.
total_obs <- nrow(dirty_iris)
row_is_complete <- complete.cases(dirty_iris)
num_complete <- sum(row_is_complete)

# Complete rows as a percentage of all rows.
percentage_complete <- num_complete / total_obs * 100

num_complete
[1] 96
percentage_complete
[1] 64

Q3:

# Count the special values in the data set.
# The numeric columns are extracted once as a matrix and reused, because
# is.nan() and is.infinite() are only meaningful for numeric data.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
numeric_values <- as.matrix(dirty_iris[, numeric_cols, drop = FALSE])

# is.na() is applied to the whole data frame (all columns); it is also TRUE
# for NaN entries.
is_na <- sum(is.na(dirty_iris))
is_nan <- sum(is.nan(numeric_values))
# is.infinite() is TRUE for both Inf and -Inf.
is_inf <- sum(is.infinite(numeric_values))
# Comparing NA with -Inf yields NA, so without na.rm = TRUE this sum is NA
# whenever any numeric cell is missing.
is_neg_inf <- sum(numeric_values == -Inf, na.rm = TRUE)

is_na
[1] 58
is_nan
[1] 0
is_inf
[1] 1
is_neg_inf
[1] NA

Q4:

# Replace every NaN, Inf and -Inf in the numeric columns with NA.
# The column mask is computed once with vapply() (type-stable, unlike
# sapply()) and both replacements happen in a single pass per column.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))

dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], function(x) {
  # is.nan() and is.infinite() never return NA, so the mask is a safe index.
  x[is.nan(x) | is.infinite(x)] <- NA
  x
})

summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length     Petal.Width      Species         
 Min.   : 0.000   Min.   : 2.200   Min.   : 0.000   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300   Class :character  
 Median : 5.800   Median : 3.100   Median : 4.500   Median :1.300   Mode  :character  
 Mean   : 6.545   Mean   : 3.462   Mean   : 4.456   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500                     
                                                    NA's   :13                        

Q5:

# Rows that break either rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30.
rule_broken <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30

# A comparison involving NA yields NA, and indexing rows with an NA logical
# returns an all-NA row for each one, which would inflate the count below.
# which() keeps only the positions that are definitely TRUE.
violations <- dirty_iris[which(rule_broken), ]

violations

num_violations <- nrow(violations)

num_violations
[1] 2

Q6:

# Repair Sepal.Width on a working copy, then write it back to the data frame.
sepal_width <- dirty_iris$Sepal.Width

# Negative widths are replaced by their absolute value (NA entries are skipped).
width_is_negative <- !is.na(sepal_width) & sepal_width < 0
sepal_width[width_is_negative] <- abs(sepal_width[width_is_negative])

# Widths that are exactly zero are set to NA.
width_is_zero <- !is.na(sepal_width) & sepal_width == 0
sepal_width[width_is_zero] <- NA

dirty_iris$Sepal.Width <- sepal_width

summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width      Species         
 Min.   : 0.000   Min.   : 2.200   Min.   : 0.00   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300   Class :character  
 Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300   Mode  :character  
 Mean   : 6.559   Mean   : 3.462   Mean   : 4.45   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500                     
 NA's   :10       NA's   :18       NA's   :19      NA's   :13                        

Q7:

# Install the required packages only when they are missing, so re-running
# the script does not download and reinstall them every time.
# mice provides mice()/complete(); VIM provides kNN().
for (pkg in c("mice", "VIM")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(mice)
library(VIM)

# Imputation steps
# Sepal.Width: fill NAs with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
# Petal.Length: fill NAs with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Use mice for imputation: one imputed data set (m = 1), five iterations,
# with the "norm.predict" method requested for the columns.
imputed_data <- mice(dirty_iris, method = "norm.predict", m = 1, maxit = 5)

 iter imp variable
  1   1  Sepal.Length  Petal.Width
  2   1  Sepal.Length  Petal.Width
  3   1  Sepal.Length  Petal.Width
  4   1  Sepal.Length  Petal.Width
  5   1  Sepal.Length  Petal.Width
Warning: Number of logged events: 1
# Pull the completed data out of the mice result and copy its Sepal.Length
# column back into the working data frame.
completed_iris <- complete(imputed_data)
dirty_iris$Sepal.Length <- completed_iris$Sepal.Length

# Impute Petal.Width with kNN (k = 5); the result is stored in a separate
# data frame rather than overwriting dirty_iris.
dirty_iris2 <- kNN(dirty_iris, variable = "Petal.Width", k = 5)

# Summarise the data set returned by kNN()
summary(dirty_iris2)
  Sepal.Length     Sepal.Width      Petal.Length     Petal.Width      Species          Petal.Width_imp
 Min.   : 0.000   Min.   : 2.200   Min.   : 0.000   Min.   :0.100   Length:150         Mode :logical  
 1st Qu.: 5.100   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300   Class :character   FALSE:137      
 Median : 5.800   Median : 3.100   Median : 4.500   Median :1.300   Mode  :character   TRUE :13       
 Mean   : 6.545   Mean   : 3.462   Mean   : 4.456   Mean   :1.209                                     
 3rd Qu.: 6.400   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800                                     
 Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500                                     
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpRMToKYGBge3J9CmRpcnR5X2lyaXMgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9lZHdpbmRqL2RhdGFjbGVhbmluZy9tYXN0ZXIvZGF0YS9kaXJ0eV9pcmlzLmNzdiIpCgptaXNzaW5nX3ZhbHVlcyA8LSBzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQoKbWlzc2luZ192YWx1ZXMKYGBgCgpRMjoKYGBge3J9Cm51bV9jb21wbGV0ZSA8LSBzdW0oY29tcGxldGUuY2FzZXMoZGlydHlfaXJpcykpCgp0b3RhbF9vYnMgPC0gbnJvdyhkaXJ0eV9pcmlzKQoKcGVyY2VudGFnZV9jb21wbGV0ZSA8LSAobnVtX2NvbXBsZXRlIC8gdG90YWxfb2JzKSAqIDEwMAoKbnVtX2NvbXBsZXRlCmBgYAoKYGBge3J9CnBlcmNlbnRhZ2VfY29tcGxldGUKYGBgCgoKClEzOgpgYGB7cn0KaXNfbmEgPC0gc3VtKGlzLm5hKGRpcnR5X2lyaXMpKQppc19uYW4gPC0gc3VtKGlzLm5hbihhcy5tYXRyaXgoZGlydHlfaXJpc1ssIHNhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0pKSkKaXNfaW5mIDwtIHN1bShpcy5pbmZpbml0ZShhcy5tYXRyaXgoZGlydHlfaXJpc1ssIHNhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0pKSkKaXNfbmVnX2luZiA8LSBzdW0oYXMubWF0cml4KGRpcnR5X2lyaXNbLCBzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldKSA9PSAtSW5mKQoKaXNfbmEKYGBgCgpgYGB7cn0KaXNfbmFuCmBgYAoKYGBge3J9CmlzX2luZgpgYGAKCmBgYHtyfQppc19uZWdfaW5mCmBgYAoKClE0OgpgYGB7cn0KZGlydHlfaXJpc1tzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldIDwtIGxhcHBseShkaXJ0eV9pcmlzW3NhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0sIGZ1bmN0aW9uKHgpIHsKICB4W2lzLm5hbih4KV0gPC0gTkEKICByZXR1cm4oeCkKfSkKCmRpcnR5X2lyaXNbc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSA8LSBsYXBwbHkoZGlydHlfaXJpc1tzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldLCBmdW5jdGlvbih4KSB7CiAgeFtpcy5pbmZpbml0ZSh4KV0gPC0gTkEKICByZXR1cm4oeCkKfSkKCnN1bW1hcnkoZGlydHlfaXJpcykKYGBgCgoKUTU6CmBgYHtyfQp2aW9sYXRpb25zIDwtIGRpcnR5X2lyaXNbZGlydHlfaXJpcyRTZXBhbC5XaWR0aCA8PSAwIHwgZGlydHlfaXJpcyRTZXBhbC5MZW5ndGggPiAzMCwgXQoKdmlvbGF0aW9ucwoKbnVtX3Zpb2xhdGlvbnMgPC0gbnJvdyh2aW9sYXRpb25zKQoKbnVtX3Zpb2xhdGlvbnMKYGBgCgoKUTY6CmBgYHtyfQpkaXJ0eV9pcmlzJFNlcGFsLldpZHRoWyFpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLldpZHRoKSAmIGRpcnR5X2lyaXMkU2VwYWwuV2lkdGggPCAwXSA8LSBhYnMoZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDwgMF0p
CgoKZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoID09IDBdIDwtIE5BCgoKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKCgpRNzoKYGBge3J9CiMgSW5zdGFsbCBhbmQgbG9hZCB0aGUgcmVxdWlyZWQgcGFja2FnZXMKaW5zdGFsbC5wYWNrYWdlcygibWljZSIpCmluc3RhbGwucGFja2FnZXMoIlZJTSIpICMgZm9yIGtOTiBmdW5jdGlvbgpsaWJyYXJ5KG1pY2UpCmxpYnJhcnkoVklNKQpgYGAKCgpgYGB7cn0KIyBJbXB1dGF0aW9uIHN0ZXBzCmRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpcyRTZXBhbC5XaWR0aCwgbmEucm0gPSBUUlVFKQpkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aFtpcy5uYShkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCwgbmEucm0gPSBUUlVFKQoKIyBVc2UgbWljZSBmb3IgaW1wdXRhdGlvbgppbXB1dGVkX2RhdGEgPC0gbWljZShkaXJ0eV9pcmlzLCBtZXRob2QgPSAnbm9ybS5wcmVkaWN0JywgbSA9IDEsIG1heGl0ID0gNSkKYGBgCgoKYGBge3J9CmRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoIDwtIGNvbXBsZXRlKGltcHV0ZWRfZGF0YSkkU2VwYWwuTGVuZ3RoCgojIFVzZSBrTk4gZm9yIGltcHV0aW5nIFBldGFsLldpZHRoCmRpcnR5X2lyaXMyIDwtIGtOTihkaXJ0eV9pcmlzLCB2YXJpYWJsZSA9ICJQZXRhbC5XaWR0aCIsIGsgPSA1KQoKIyBTdW1tYXJ5IG9mIHRoZSByZXN1bHRpbmcgZGF0YXNldApzdW1tYXJ5KGRpcnR5X2lyaXMyKQpgYGAKCg==