library(Hmisc)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘Hmisc’
The following objects are masked from ‘package:base’:
format.pval, units
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:Hmisc’:
src, summarize
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
pacman::p_load(VIM)
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
Question 1.
for (i in 1:ncol(dirty_iris)){
iris[sample(1:nrow(dirty_iris), 10, replace = FALSE), i] <- NA
}
summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3 Mode :character
Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
NA's :10 NA's :17 NA's :19 NA's :12
sum(is.na(dirty_iris$Petal.Length))
[1] 19
Question 2.
# Count total rows
total_rows <- nrow(dirty_iris)
# Count complete cases (rows with no missing values)
complete_rows <- sum(complete.cases(dirty_iris))
# Calculate percentage of complete observations
percentage_complete <- (complete_rows / total_rows) * 100
# Print results
cat("Number of complete cases:", complete_rows, "\n")
Number of complete cases: 96
cat("Percentage of complete cases:", percentage_complete, "%\n")
Percentage of complete cases: 64 %
Question 3.
sapply(dirty_iris, function(x) {
if (is.numeric(x)) {
c(NA_values = sum(is.na(x)),
NaN_values = sum(is.nan(x)),
Inf_values = sum(x == Inf, na.rm = TRUE),
Neg_Inf_values = sum(x == -Inf, na.rm = TRUE))
}
})
$Sepal.Length
NA_values NaN_values Inf_values Neg_Inf_values
10 0 0 0
$Sepal.Width
NA_values NaN_values Inf_values Neg_Inf_values
17 0 0 0
$Petal.Length
NA_values NaN_values Inf_values Neg_Inf_values
19 0 0 0
$Petal.Width
NA_values NaN_values Inf_values Neg_Inf_values
12 0 1 0
$Species
NULL
Question 4.
dirty_iris[dirty_iris == Inf] <- NA
summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300 Mode :character
Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
NA's :10 NA's :17 NA's :19 NA's :13
sapply(dirty_iris, function(x) {
if (is.numeric(x)) {
c(NA_values = sum(is.na(x)),
NaN_values = sum(is.nan(x)),
Inf_values = sum(x == Inf, na.rm = TRUE),
Neg_Inf_values = sum(x == -Inf, na.rm = TRUE))
}
})
$Sepal.Length
NA_values NaN_values Inf_values Neg_Inf_values
10 0 0 0
$Sepal.Width
NA_values NaN_values Inf_values Neg_Inf_values
17 0 0 0
$Petal.Length
NA_values NaN_values Inf_values Neg_Inf_values
19 0 0 0
$Petal.Width
NA_values NaN_values Inf_values Neg_Inf_values
13 0 0 0
$Species
NULL
Question 5.
num_negative_sepal_width <- sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
cat("Number of observations with negative Sepal.Width:", num_negative_sepal_width, "\n")
Number of observations with negative Sepal.Width: 2
num_large_sepal_length <- sum(dirty_iris$Sepal.Length > 30, na.rm = TRUE)
cat("Number of observations with Sepal.Length > 30:", num_large_sepal_length, "\n")
Number of observations with Sepal.Length > 30: 2
Question 6.
# Ensure no Inf or NaN values before imputation
dirty_iris[!is.finite(as.matrix(dirty_iris))] <- NA
### 1. Mean Imputation for Sepal.Width
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
### 2. Median Imputation for Petal.Length
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- median(dirty_iris$Petal.Length, na.rm = TRUE)
### 3. Linear Regression Imputation for Sepal.Length
# Check if there are enough complete cases for regression
complete_data <- dirty_iris %>% filter(!is.na(Sepal.Width) & !is.na(Petal.Length) & !is.na(Petal.Width))
if (nrow(complete_data) > 1) { # Ensure enough data points for regression
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = complete_data)
missing_sepal_length_rows <- which(is.na(dirty_iris$Sepal.Length))
if (length(missing_sepal_length_rows) > 0) {
dirty_iris$Sepal.Length[missing_sepal_length_rows] <- predict(lm_model, newdata = dirty_iris[missing_sepal_length_rows, ])
}
} else {
cat("Not enough complete cases for regression. Sepal.Length not imputed.\n")
}
Not enough complete cases for regression. Sepal.Length not imputed.
### 4. kNN Imputation for Petal.Width
# Select only numeric columns for kNN
numeric_cols <- dirty_iris %>% select(where(is.numeric))
# Perform kNN imputation on numeric columns
imputed_numeric <- kNN(numeric_cols, variable = "Petal.Width", k = 5)
Warning: All observations of Petal.Width are missing, therefore the variable will not be imputed!
Warning: Nothing is imputed, because all variables to be imputed only contains missings.
# Replace the original column with the imputed version
dirty_iris$Petal.Width <- imputed_numeric$Petal.Width
# Remove extra "_imp" columns added by kNN
dirty_iris <- dirty_iris %>% select(-contains("_imp"))
# Final Check: Count remaining missing values
cat("Remaining missing values after imputation:\n")
Remaining missing values after imputation:
print(colSums(is.na(dirty_iris)))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
150 150 150 150 150
abs(dirty_iris$Sepal.Width[dirty_iris$Sepal.Width < 0])
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[38] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[75] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[112] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[149] NA NA
Question 7.
dirty_iris[16,2] <- abs(dirty_iris[16, 2])
dirty_iris$Sepal.Width[dirty_iris$Sepal.Width == 0] <- NA
mean(dirty_iris$Sepal.Width, na.rm = TRUE)
median(dirty_iris$Petal.Length, na.rm = TRUE)
median(dirty_iris$Petal.Length, na.rm = TRUE)
kNN(dirty_iris, variable = "Petal.Width")
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA1IH4gS2F0aWUgUXVpbm4iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KEhtaXNjKQpsaWJyYXJ5KGRwbHlyKQpwYWNtYW46OnBfbG9hZChWSU0pCmBgYAoKCmBgYHtyfQpkaXJ0eV9pcmlzIDwtIHJlYWQuY3N2KCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vZWR3aW5kai9kYXRhY2xlYW5pbmcvbWFzdGVyL2RhdGEvZGlydHlfaXJpcy5jc3YiKQpgYGAKCiMjIyBRdWVzdGlvbiAxLiAKYGBge3J9CmZvciAoaSBpbiAxOm5jb2woZGlydHlfaXJpcykpewogIGlyaXNbc2FtcGxlKDE6bnJvdyhkaXJ0eV9pcmlzKSwgMTAsIHJlcGxhY2UgPSBGQUxTRSksIGldIDwtIE5BCn0KCnN1bW1hcnkoZGlydHlfaXJpcykKYGBgCmBgYHtyfQpzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQpgYGAKCgojIyMgUXVlc3Rpb24gMi4KYGBge3J9CgojIENvdW50IHRvdGFsIHJvd3MKdG90YWxfcm93cyA8LSBucm93KGRpcnR5X2lyaXMpCgojIENvdW50IGNvbXBsZXRlIGNhc2VzIChyb3dzIHdpdGggbm8gbWlzc2luZyB2YWx1ZXMpCmNvbXBsZXRlX3Jvd3MgPC0gc3VtKGNvbXBsZXRlLmNhc2VzKGRpcnR5X2lyaXMpKQoKIyBDYWxjdWxhdGUgcGVyY2VudGFnZSBvZiBjb21wbGV0ZSBvYnNlcnZhdGlvbnMKcGVyY2VudGFnZV9jb21wbGV0ZSA8LSAoY29tcGxldGVfcm93cyAvIHRvdGFsX3Jvd3MpICogMTAwCgojIFByaW50IHJlc3VsdHMKY2F0KCJOdW1iZXIgb2YgY29tcGxldGUgY2FzZXM6IiwgY29tcGxldGVfcm93cywgIlxuIikKY2F0KCJQZXJjZW50YWdlIG9mIGNvbXBsZXRlIGNhc2VzOiIsIHBlcmNlbnRhZ2VfY29tcGxldGUsICIlXG4iKQpgYGAKCgoKIyMjIFF1ZXN0aW9uIDMuCmBgYHtyfQpzYXBwbHkoZGlydHlfaXJpcywgZnVuY3Rpb24oeCkgewogIGlmIChpcy5udW1lcmljKHgpKSB7CiAgICBjKE5BX3ZhbHVlcyA9IHN1bShpcy5uYSh4KSksCiAgICAgIE5hTl92YWx1ZXMgPSBzdW0oaXMubmFuKHgpKSwKICAgICAgSW5mX3ZhbHVlcyA9IHN1bSh4ID09IEluZiwgbmEucm0gPSBUUlVFKSwKICAgICAgTmVnX0luZl92YWx1ZXMgPSBzdW0oeCA9PSAtSW5mLCBuYS5ybSA9IFRSVUUpKQogIH0KfSkKYGBgCgoKIyMjIFF1ZXN0aW9uIDQuIApgYGB7cn0KZGlydHlfaXJpc1tkaXJ0eV9pcmlzID09IEluZl0gPC0gTkEKCnN1bW1hcnkoZGlydHlfaXJpcykKCmBgYAoKCgpgYGB7cn0KCnNhcHBseShkaXJ0eV9pcmlzLCBmdW5jdGlvbih4KSB7CiAgaWYgKGlzLm51bWVyaWMoeCkpIHsKICAgIGMoTkFfdmFsdWVzID0gc3VtKGlzLm5hKHgpKSwKICAgICAgTmFOX3ZhbHVlcyA9IHN1bShpcy5uYW4oeCkpLAogICAgICBJbmZfdmFsdWVzID0gc3VtKHggPT0gSW5mLCBuYS5ybSA9IFRSVUUpLAogICAgICBOZWdfSW5mX3ZhbHVlcyA9IHN1bSh4ID09IC1JbmYsIG5hLnJtID0gVFJVRSkpCiAgfQp9KQpgYGAKCiMjIyBRdWVzdGlvbiA1LiAKCmBgYHtyfQpudW1fbmVnYXRpdmVfc2VwYWxfd2lkdGggPC0gc3VtKGRpcnR5X2lyaXMkU2VwYWwuV2lkdGggPD0gMCwgbmEucm0gPSBUUlVFKQpjYXQoIk51bWJlciBvZiBvYnNlcnZhdGlvbnMgd2l0aCBuZWdhdGl2ZSBTZXBhbC5XaWR0aDoiLCBudW1fbmVnYXRpdmVfc2VwYWxfd2lkdGgsICJcbiIpCgpudW1fbGFyZ2Vfc2VwYWxfbGVuZ3RoIDwtIHN1bShkaXJ0eV9pcmlzJFNlcGFsLkxlbmd0aCA+IDMwLCBuYS5ybSA9IFRSVUUpCmNhdCgiTnVtYmVyIG9mIG9ic2VydmF0aW9ucyB3aXRoIFNlcGFsLkxlbmd0aCA+IDMwOiIsIG51bV9sYXJnZV9zZXBhbF9sZW5ndGgsICJcbiIpCmBgYAoKIyMjIFF1ZXN0aW9uIDYuIAoKCmBgYHtyfQojIEVuc3VyZSBubyBJbmYgb3IgTmFOIHZhbHVlcyBiZWZvcmUgaW1wdXRhdGlvbgpkaXJ0eV9pcmlzWyFpcy5maW5pdGUoYXMubWF0cml4KGRpcnR5X2lyaXMpKV0gPC0gTkEgIAoKIyMjIDEuIE1lYW4gSW1wdXRhdGlvbiBmb3IgU2VwYWwuV2lkdGgKZGlydHlfaXJpcyRTZXBhbC5XaWR0aFtpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLldpZHRoKV0gPC0gbWVhbihkaXJ0eV9pcmlzJFNlcGFsLldpZHRoLCBuYS5ybSA9IFRSVUUpCgojIyMgMi4gTWVkaWFuIEltcHV0YXRpb24gZm9yIFBldGFsLkxlbmd0aApkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aFtpcy5uYShkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCwgbmEucm0gPSBUUlVFKQoKIyMjIDMuIExpbmVhciBSZWdyZXNzaW9uIEltcHV0YXRpb24gZm9yIFNlcGFsLkxlbmd0aAojIENoZWNrIGlmIHRoZXJlIGFyZSBlbm91Z2ggY29tcGxldGUgY2FzZXMgZm9yIHJlZ3Jlc3Npb24KY29tcGxldGVfZGF0YSA8LSBkaXJ0eV9pcmlzICU+JSBmaWx0ZXIoIWlzLm5hKFNlcGFsLldpZHRoKSAmICFpcy5uYShQZXRhbC5MZW5ndGgpICYgIWlzLm5hKFBldGFsLldpZHRoKSkKCmlmIChucm93KGNvbXBsZXRlX2RhdGEpID4gMSkgeyAgIyBFbnN1cmUgZW5vdWdoIGRhdGEgcG9pbnRzIGZvciByZWdyZXNzaW9uCiAgICBsbV9tb2RlbCA8LSBsbShTZXBhbC5MZW5ndGggfiBTZXBhbC5XaWR0aCArIFBldGFsLkxlbmd0aCArIFBldGFsLldpZHRoLCBkYXRhID0gY29tcGxldGVfZGF0YSkKICAgIG1pc3Npbmdfc2VwYWxfbGVuZ3RoX3Jvd3MgPC0gd2hpY2goaXMubmEoZGlydHlfaXJpcyRTZXBhbC5MZW5ndGgpKQogICAgCiAgICBpZiAobGVuZ3RoKG1pc3Npbmdfc2VwYWxfbGVuZ3RoX3Jvd3MpID4gMCkgewogICAgICAgIGRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoW21pc3Npbmdfc2VwYWxfbGVuZ3RoX3Jvd3NdIDwtIHByZWRpY3QobG1fbW9kZWwsIG5ld2RhdGEgPSBkaXJ0eV9pcmlzW21pc3Npbmdfc2VwYWxfbGVuZ3RoX3Jvd3MsIF0pCiAgICB9Cn0gZWxzZSB7CiAgICBjYXQoIk5vdCBlbm91Z2ggY29tcGxldGUgY2FzZXMgZm9yIHJlZ3Jlc3Npb24uIFNlcGFsLkxlbmd0aCBub3QgaW1wdXRlZC5cbiIpCn0KYGBgCgpgYGB7cn0KIyMjIDQuIGtOTiBJbXB1dGF0aW9uIGZvciBQZXRhbC5XaWR0aAojIFNlbGVjdCBvbmx5IG51bWVyaWMgY29sdW1ucyBmb3Iga05OCm51bWVyaWNfY29scyA8LSBkaXJ0eV9pcmlzICU+JSBzZWxlY3Qod2hlcmUoaXMubnVtZXJpYykpCgojIFBlcmZvcm0ga05OIGltcHV0YXRpb24gb24gbnVtZXJpYyBjb2x1bW5zCmltcHV0ZWRfbnVtZXJpYyA8LSBrTk4obnVtZXJpY19jb2xzLCB2YXJpYWJsZSA9ICJQZXRhbC5XaWR0aCIsIGsgPSA1KQpgYGAKCmBgYHtyfQojIFJlcGxhY2UgdGhlIG9yaWdpbmFsIGNvbHVtbiB3aXRoIHRoZSBpbXB1dGVkIHZlcnNpb24KZGlydHlfaXJpcyRQZXRhbC5XaWR0aCA8LSBpbXB1dGVkX251bWVyaWMkUGV0YWwuV2lkdGgKCiMgUmVtb3ZlIGV4dHJhICJfaW1wIiBjb2x1bW5zIGFkZGVkIGJ5IGtOTgpkaXJ0eV9pcmlzIDwtIGRpcnR5X2lyaXMgJT4lIHNlbGVjdCgtY29udGFpbnMoIl9pbXAiKSkKCiMgRmluYWwgQ2hlY2s6IENvdW50IHJlbWFpbmluZyBtaXNzaW5nIHZhbHVlcwpjYXQoIlJlbWFpbmluZyBtaXNzaW5nIHZhbHVlcyBhZnRlciBpbXB1dGF0aW9uOlxuIikKYGBgCgpgYGB7cn0KcHJpbnQoY29sU3Vtcyhpcy5uYShkaXJ0eV9pcmlzKSkpCmBgYAoKCmBgYHtyfQpkaXJ0eV9pcmlzX2NvbXBsZXRlIDwtIG5hLm9taXQoZGlydHlfaXJpcykKCnNlcGFsX3dpZHRoX2Vycm9ycyA8LSBkaXJ0eV9pcmlzX2NvbXBsZXRlW2RpcnR5X2lyaXNfY29tcGxldGUkU2VwYWwuV2lkdGggPD0gMCxdCnNlcGFsX3dpZHRoX2Vycm9ycwpgYGAKCmBgYHtyfQphYnMoZGlydHlfaXJpcyRTZXBhbC5XaWR0aFtkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDwgMF0pCmBgYAojIyMgUXVlc3Rpb24gNy4gCmBgYHtyfQpkaXJ0eV9pcmlzWzE2LDJdIDwtIGFicyhkaXJ0eV9pcmlzWzE2LCAyXSkKZGlydHlfaXJpcyRTZXBhbC5XaWR0aFtkaXJ0eV9pcmlzJFNlcGFsLldpZHRoID09IDBdIDwtIE5BCgoKbWVhbihkaXJ0eV9pcmlzJFNlcGFsLldpZHRoLCBuYS5ybSA9IFRSVUUpIAoKbWVkaWFuKGRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoLCBuYS5ybSA9IFRSVUUpCm1lZGlhbihkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCwgbmEucm0gPSBUUlVFKQoKa05OKGRpcnR5X2lyaXMsIHZhcmlhYmxlID0gIlBldGFsLldpZHRoIikKYGBgCgoK