Question 1

# Load the deliberately messy iris data straight from its GitHub source.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Count the petal lengths that are flagged as missing by is.na().
missing_values <- sum(is.na(dirty_iris[["Petal.Length"]]))

missing_values
[1] 19

Question 2

# Rows that survive na.omit() are exactly the rows with no missing cell.
num_complete <- nrow(na.omit(dirty_iris))

total_obs <- dim(dirty_iris)[1L]

# Share of fully observed rows, expressed as a percentage.
percentage_complete <- (num_complete / total_obs) * 100

num_complete
[1] 96
percentage_complete
[1] 64

Question 3

# Count each kind of special value in the data set.
# NaN/Inf only exist in numeric data, so those checks run on a matrix built
# from the numeric columns alone (built once and reused below).
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
numeric_values <- as.matrix(dirty_iris[, numeric_cols, drop = FALSE])

# is.na() is also TRUE for NaN, so this total includes any NaN cells.
is_na <- sum(is.na(dirty_iris))
is_nan <- sum(is.nan(numeric_values))
# is.infinite() is TRUE for both Inf and -Inf.
is_inf <- sum(is.infinite(numeric_values))
# Comparing NA with -Inf gives NA, and a single NA makes sum() return NA;
# na.rm = TRUE drops those comparisons so the count is an actual number.
is_neg_inf <- sum(numeric_values == -Inf, na.rm = TRUE)

is_na
[1] 58
is_nan
[1] 0
is_inf
[1] 1
is_neg_inf
[1] NA

Question 4


# Turn every NaN and +/-Inf in the numeric columns into NA in one pass;
# non-numeric columns are handed back untouched.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (is.numeric(column)) {
    column[is.nan(column) | is.infinite(column)] <- NA
  }
  column
})

summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width      Species         
 Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300   Class :character  
 Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300   Mode  :character  
 Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500                     
 NA's   :10       NA's   :17       NA's   :19      NA's   :13                        

Question 5


# Select the rows where Sepal.Width is zero or negative, or Sepal.Length
# is above 30.
# The comparison is NA wherever the value it needs is missing, and indexing a
# data frame with a logical NA returns an all-NA row for each one, which
# would inflate the violation count. which() keeps only the positions that
# are definitely TRUE.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]

violations

num_violations <- nrow(violations)

num_violations

Question 6


# Repair Sepal.Width on a local copy, then write the column back:
#   * negative widths are replaced by their absolute value
#   * widths of exactly zero become NA
# which() skips the missing entries, so existing NAs stay as they are.
dirty_iris$Sepal.Width <- local({
  width <- dirty_iris$Sepal.Width
  negative <- which(width < 0)
  width[negative] <- -width[negative]
  width[which(width == 0)] <- NA
  width
})

summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width      Species         
 Min.   : 0.000   Min.   : 2.200   Min.   : 0.00   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300   Class :character  
 Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300   Mode  :character  
 Mean   : 6.559   Mean   : 3.462   Mean   : 4.45   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500                     
 NA's   :10       NA's   :18       NA's   :19      NA's   :13                        

Question 7


# Install and load the required packages
# Only install mice when it is not already available, so re-running the
# script does not download the package again.
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.3/mice_3.16.0.tgz'
Content type 'application/x-gzip' length 1860396 bytes (1.8 MB)
==================================================
downloaded 1.8 MB

The downloaded binary packages are in
    /var/folders/86/4k09mpj104zgtpw3g56cwtvr0000gn/T//Rtmp5Z1nw7/downloaded_packages
# VIM provides the kNN function; install it only when it is missing so
# re-running the script does not download the package again.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.3/VIM_6.2.2.tgz'
Content type 'application/x-gzip' length 2949644 bytes (2.8 MB)
==================================================
downloaded 2.8 MB

The downloaded binary packages are in
    /var/folders/86/4k09mpj104zgtpw3g56cwtvr0000gn/T//Rtmp5Z1nw7/downloaded_packages
library(mice)
Warning in check_dep_version() :
  ABI version mismatch: 
lme4 was built with Matrix ABI version 1
Current Matrix ABI version is 0
Please re-install lme4 from source or restore original ‘Matrix’ package

Attaching package: ‘mice’

The following object is masked from ‘package:stats’:

    filter

The following objects are masked from ‘package:base’:

    cbind, rbind
library(VIM)
Loading required package: colorspace
Warning: package ‘colorspace’ was built under R version 4.3.3
Loading required package: grid
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
VIM is ready to use.

Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

Attaching package: ‘VIM’

The following object is masked from ‘package:datasets’:

    sleep
# Simple imputation: fill missing Sepal.Width with the column mean and
# missing Petal.Length with the column median (both computed without NAs).
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Model-based imputation with mice: one imputed data set, five iterations.
imputed_data <- mice(
  dirty_iris,
  method = "norm.predict",
  m = 1,
  maxit = 5
)

 iter imp variable
  1   1  Sepal.Length  Petal.Width
  2   1  Sepal.Length  Petal.Width
  3   1  Sepal.Length  Petal.Width
  4   1  Sepal.Length  Petal.Width
  5   1  Sepal.Length  Petal.Width
Warning: Number of logged events: 1
# Keep only the Sepal.Length column from the mice-completed data.
dirty_iris$Sepal.Length <- mice::complete(imputed_data)[["Sepal.Length"]]

# Impute Petal.Width from its 5 nearest neighbours.
dirty_iris2 <- VIM::kNN(dirty_iris, variable = "Petal.Width", k = 5)

# Summarise the fully imputed data set.
summary(dirty_iris2)
  Sepal.Length     Sepal.Width      Petal.Length     Petal.Width      Species          Petal.Width_imp
 Min.   : 0.000   Min.   : 2.200   Min.   : 0.000   Min.   :0.100   Length:150         Mode :logical  
 1st Qu.: 5.100   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300   Class :character   FALSE:137      
 Median : 5.800   Median : 3.100   Median : 4.500   Median :1.300   Mode  :character   TRUE :13       
 Mean   : 6.545   Mean   : 3.462   Mean   : 4.456   Mean   :1.209                                     
 3rd Qu.: 6.400   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800                                     
 Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500                                     
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA1IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIFF1ZXN0aW9uIDEKYGBge3J9CmRpcnR5X2lyaXMgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9lZHdpbmRqL2RhdGFjbGVhbmluZy9tYXN0ZXIvZGF0YS9kaXJ0eV9pcmlzLmNzdiIpCgptaXNzaW5nX3ZhbHVlcyA8LSBzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQoKbWlzc2luZ192YWx1ZXMKYGBgCgojIFF1ZXN0aW9uIDIKYGBge3J9Cm51bV9jb21wbGV0ZSA8LSBzdW0oY29tcGxldGUuY2FzZXMoZGlydHlfaXJpcykpCgp0b3RhbF9vYnMgPC0gbnJvdyhkaXJ0eV9pcmlzKQoKcGVyY2VudGFnZV9jb21wbGV0ZSA8LSAobnVtX2NvbXBsZXRlIC8gdG90YWxfb2JzKSAqIDEwMAoKbnVtX2NvbXBsZXRlCnBlcmNlbnRhZ2VfY29tcGxldGUKYGBgCgojIFF1ZXN0aW9uIDMKYGBge3J9CmlzX25hIDwtIHN1bShpcy5uYShkaXJ0eV9pcmlzKSkKaXNfbmFuIDwtIHN1bShpcy5uYW4oYXMubWF0cml4KGRpcnR5X2lyaXNbLCBzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldKSkpCmlzX2luZiA8LSBzdW0oaXMuaW5maW5pdGUoYXMubWF0cml4KGRpcnR5X2lyaXNbLCBzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldKSkpCmlzX25lZ19pbmYgPC0gc3VtKGFzLm1hdHJpeChkaXJ0eV9pcmlzWywgc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSkgPT0gLUluZikKCmlzX25hCmlzX25hbgppc19pbmYKaXNfbmVnX2luZgpgYGAKCiMgUXVlc3Rpb24gNApgYGB7cn0KCmRpcnR5X2lyaXNbc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSA8LSBsYXBwbHkoZGlydHlfaXJpc1tzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldLCBmdW5jdGlvbih4KSB7CiAgeFtpcy5uYW4oeCldIDwtIE5BCiAgcmV0dXJuKHgpCn0pCgpkaXJ0eV9pcmlzW3NhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0gPC0gbGFwcGx5KGRpcnR5X2lyaXNbc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSwgZnVuY3Rpb24oeCkgewogIHhbaXMuaW5maW5pdGUoeCldIDwtIE5BCiAgcmV0dXJuKHgpCn0pCgpzdW1tYXJ5KGRpcnR5X2lyaXMpCgpgYGAKCiMgUXVlc3Rpb24gNQpgYGB7cn0KCnZpb2xhdGlvbnMgPC0gZGlydHlfaXJpc1tkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDw9IDAgfCBkaXJ0eV9pcmlzJFNlcGFsLkxlbmd0aCA+IDMwLCBdCgp2aW9sYXRpb25zCgpudW1fdmlvbGF0aW9ucyA8LSBucm93KHZpb2xhdGlvbnMpCgpudW1fdmlvbGF0aW9ucwoKYGBgCgojIFF1ZXN0aW9uIDYKYGBge3J9CgpkaXJ0eV9pcmlzJFNlcGFsLldpZHRoWyFpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLldpZHRoKSAmIGRpcnR5X2lyaXMkU2VwYWwuV2lkdGggPCAwXSA8LSBhYnMoZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldp
ZHRoIDwgMF0pCgoKZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoID09IDBdIDwtIE5BCgoKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKCiMgUXVlc3Rpb24gNwpgYGB7cn0KCiNpbnN0YWxsLnBhY2thZ2VzKCJtaWNlIikKI2luc3RhbGwucGFja2FnZXMoIlZJTSIpIAoKbGlicmFyeShtaWNlKQpsaWJyYXJ5KFZJTSkKCmRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpcyRTZXBhbC5XaWR0aCwgbmEucm0gPSBUUlVFKQpkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aFtpcy5uYShkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCwgbmEucm0gPSBUUlVFKQoKaW1wdXRlZF9kYXRhIDwtIG1pY2UoZGlydHlfaXJpcywgbWV0aG9kID0gJ25vcm0ucHJlZGljdCcsIG0gPSAxLCBtYXhpdCA9IDUpCmRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoIDwtIGNvbXBsZXRlKGltcHV0ZWRfZGF0YSkkU2VwYWwuTGVuZ3RoCgpkaXJ0eV9pcmlzMiA8LSBrTk4oZGlydHlfaXJpcywgdmFyaWFibGUgPSAiUGV0YWwuV2lkdGgiLCBrID0gNSkKCnN1bW1hcnkoZGlydHlfaXJpczIpCgpgYGAKCg==