Question 1
# Download the dirty iris data and report how many Petal.Length entries
# are flagged by is.na().
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
missing_values <- length(which(is.na(dirty_iris[["Petal.Length"]])))
missing_values
[1] 19
Question 2
# Number and percentage of rows for which complete.cases() is TRUE,
# i.e. rows without any missing field.
total_obs <- NROW(dirty_iris)
num_complete <- length(which(complete.cases(dirty_iris)))
percentage_complete <- (num_complete / total_obs) * 100
num_complete
[1] 96
percentage_complete
[1] 64
Question 3
# Count the special values in the data set.
# is.nan() / is.infinite() are only applied to the numeric columns, so
# build that numeric matrix once and reuse it for every count.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
numeric_values <- as.matrix(dirty_iris[, numeric_cols, drop = FALSE])

is_na <- sum(is.na(dirty_iris))
is_nan <- sum(is.nan(numeric_values))
is_inf <- sum(is.infinite(numeric_values))
# `NA == -Inf` evaluates to NA, so the comparison must be summed with
# na.rm = TRUE; without it the whole count collapses to NA as soon as the
# data contain a single missing value (the `[1] NA` recorded for is_neg_inf
# in the output was produced by the version without na.rm).
is_neg_inf <- sum(numeric_values == -Inf, na.rm = TRUE)
is_na
[1] 58
is_nan
[1] 0
is_inf
[1] 1
is_neg_inf
[1] NA
Question 4
# Replace every NaN, Inf and -Inf in the numeric columns with NA, then
# show the per-column summary of the cleaned data.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], function(values) {
  values[is.nan(values) | is.infinite(values)] <- NA
  values
})
summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300 Mode :character
Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
NA's :10 NA's :17 NA's :19 NA's :13
Question 5
# Rows that break at least one rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30.
rule_broken <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
# The comparison is NA wherever a needed value is missing and the other
# condition is not already TRUE. Indexing a data frame with NA yields
# all-NA phantom rows, which would inflate the violation count, so keep
# only the rows where the condition is definitely TRUE.
violations <- dirty_iris[which(rule_broken), ]
violations
num_violations <- nrow(violations)
num_violations
Question 6
# Repair Sepal.Width: negative widths become their absolute value (abs()
# leaves non-negative and NA entries untouched), and widths of exactly 0
# are then marked as missing.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)
summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. : 2.200 Min. : 0.00 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300 Mode :character
Mean : 6.559 Mean : 3.462 Mean : 4.45 Mean :1.207
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
NA's :10 NA's :18 NA's :19 NA's :13
Question 7
# Install and load the required packages
install.packages("mice")
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.3/mice_3.16.0.tgz'
Content type 'application/x-gzip' length 1860396 bytes (1.8 MB)
==================================================
downloaded 1.8 MB
The downloaded binary packages are in
/var/folders/86/4k09mpj104zgtpw3g56cwtvr0000gn/T//Rtmp5Z1nw7/downloaded_packages
install.packages("VIM") # for kNN function
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.3/VIM_6.2.2.tgz'
Content type 'application/x-gzip' length 2949644 bytes (2.8 MB)
==================================================
downloaded 2.8 MB
The downloaded binary packages are in
/var/folders/86/4k09mpj104zgtpw3g56cwtvr0000gn/T//Rtmp5Z1nw7/downloaded_packages
library(mice)
Warning in check_dep_version() :
ABI version mismatch:
lme4 was built with Matrix ABI version 1
Current Matrix ABI version is 0
Please re-install lme4 from source or restore original ‘Matrix’ package
Attaching package: ‘mice’
The following object is masked from ‘package:stats’:
filter
The following objects are masked from ‘package:base’:
cbind, rbind
library(VIM)
Loading required package: colorspace
Warning: package ‘colorspace’ was built under R version 4.3.3
Loading required package: grid
Registered S3 method overwritten by 'data.table':
method from
print.data.table
VIM is ready to use.
Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
Attaching package: ‘VIM’
The following object is masked from ‘package:datasets’:
sleep
# Imputation steps
# Sepal.Width: fill the gaps with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Petal.Length: fill the gaps with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Use mice for imputation: a single imputed data set (m = 1) built with the
# "norm.predict" method over 5 iterations.
imputed_data <- mice(
  data = dirty_iris,
  method = "norm.predict",
  m = 1,
  maxit = 5
)
iter imp variable
1 1 Sepal.Length Petal.Width
2 1 Sepal.Length Petal.Width
3 1 Sepal.Length Petal.Width
4 1 Sepal.Length Petal.Width
5 1 Sepal.Length Petal.Width
Warning: Number of logged events: 1
# Take only Sepal.Length from the completed mice data set; the other
# columns of dirty_iris stay as they are.
dirty_iris[["Sepal.Length"]] <- complete(imputed_data)[["Sepal.Length"]]
# Use kNN (5 neighbours) for imputing Petal.Width; the result is kept in a
# separate data frame.
dirty_iris2 <- kNN(data = dirty_iris, variable = "Petal.Width", k = 5)
# Summary of the resulting dataset
summary(dirty_iris2)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Petal.Width_imp
Min. : 0.000 Min. : 2.200 Min. : 0.000 Min. :0.100 Length:150 Mode :logical
1st Qu.: 5.100 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.:0.300 Class :character FALSE:137
Median : 5.800 Median : 3.100 Median : 4.500 Median :1.300 Mode :character TRUE :13
Mean : 6.545 Mean : 3.462 Mean : 4.456 Mean :1.209
3rd Qu.: 6.400 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.000 Max. :2.500
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA1IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIFF1ZXN0aW9uIDEKYGBge3J9CmRpcnR5X2lyaXMgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9lZHdpbmRqL2RhdGFjbGVhbmluZy9tYXN0ZXIvZGF0YS9kaXJ0eV9pcmlzLmNzdiIpCgptaXNzaW5nX3ZhbHVlcyA8LSBzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQoKbWlzc2luZ192YWx1ZXMKYGBgCgojIFF1ZXN0aW9uIDIKYGBge3J9Cm51bV9jb21wbGV0ZSA8LSBzdW0oY29tcGxldGUuY2FzZXMoZGlydHlfaXJpcykpCgp0b3RhbF9vYnMgPC0gbnJvdyhkaXJ0eV9pcmlzKQoKcGVyY2VudGFnZV9jb21wbGV0ZSA8LSAobnVtX2NvbXBsZXRlIC8gdG90YWxfb2JzKSAqIDEwMAoKbnVtX2NvbXBsZXRlCnBlcmNlbnRhZ2VfY29tcGxldGUKYGBgCgojIFF1ZXN0aW9uIDMKYGBge3J9CmlzX25hIDwtIHN1bShpcy5uYShkaXJ0eV9pcmlzKSkKaXNfbmFuIDwtIHN1bShpcy5uYW4oYXMubWF0cml4KGRpcnR5X2lyaXNbLCBzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldKSkpCmlzX2luZiA8LSBzdW0oaXMuaW5maW5pdGUoYXMubWF0cml4KGRpcnR5X2lyaXNbLCBzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldKSkpCmlzX25lZ19pbmYgPC0gc3VtKGFzLm1hdHJpeChkaXJ0eV9pcmlzWywgc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSkgPT0gLUluZikKCmlzX25hCmlzX25hbgppc19pbmYKaXNfbmVnX2luZgpgYGAKCiMgUXVlc3Rpb24gNApgYGB7cn0KCmRpcnR5X2lyaXNbc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSA8LSBsYXBwbHkoZGlydHlfaXJpc1tzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldLCBmdW5jdGlvbih4KSB7CiAgeFtpcy5uYW4oeCldIDwtIE5BCiAgcmV0dXJuKHgpCn0pCgpkaXJ0eV9pcmlzW3NhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0gPC0gbGFwcGx5KGRpcnR5X2lyaXNbc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSwgZnVuY3Rpb24oeCkgewogIHhbaXMuaW5maW5pdGUoeCldIDwtIE5BCiAgcmV0dXJuKHgpCn0pCgpzdW1tYXJ5KGRpcnR5X2lyaXMpCgpgYGAKCiMgUXVlc3Rpb24gNQpgYGB7cn0KCnZpb2xhdGlvbnMgPC0gZGlydHlfaXJpc1tkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDw9IDAgfCBkaXJ0eV9pcmlzJFNlcGFsLkxlbmd0aCA+IDMwLCBdCgp2aW9sYXRpb25zCgpudW1fdmlvbGF0aW9ucyA8LSBucm93KHZpb2xhdGlvbnMpCgpudW1fdmlvbGF0aW9ucwoKYGBgCgojIFF1ZXN0aW9uIDYKYGBge3J9CgpkaXJ0eV9pcmlzJFNlcGFsLldpZHRoWyFpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLldpZHRoKSAmIGRpcnR5X2lyaXMkU2VwYWwuV2lkdGggPCAwXSA8LSBhYnMoZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldp
ZHRoIDwgMF0pCgoKZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoID09IDBdIDwtIE5BCgoKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKCiMgUXVlc3Rpb24gNwpgYGB7cn0KCiNpbnN0YWxsLnBhY2thZ2VzKCJtaWNlIikKI2luc3RhbGwucGFja2FnZXMoIlZJTSIpIAoKbGlicmFyeShtaWNlKQpsaWJyYXJ5KFZJTSkKCmRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpcyRTZXBhbC5XaWR0aCwgbmEucm0gPSBUUlVFKQpkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aFtpcy5uYShkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCwgbmEucm0gPSBUUlVFKQoKaW1wdXRlZF9kYXRhIDwtIG1pY2UoZGlydHlfaXJpcywgbWV0aG9kID0gJ25vcm0ucHJlZGljdCcsIG0gPSAxLCBtYXhpdCA9IDUpCmRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoIDwtIGNvbXBsZXRlKGltcHV0ZWRfZGF0YSkkU2VwYWwuTGVuZ3RoCgpkaXJ0eV9pcmlzMiA8LSBrTk4oZGlydHlfaXJpcywgdmFyaWFibGUgPSAiUGV0YWwuV2lkdGgiLCBrID0gNSkKCnN1bW1hcnkoZGlydHlfaXJpczIpCgpgYGAKCg==