# Read the raw "dirty iris" data directly from the datacleaning repository
iris_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(iris_url)

# First look at the data: leading rows and column types
head(dirty_iris)
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...

# Number of missing Petal.Length entries
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Number of rows without any missing value
n_complete_rows <- sum(complete.cases(dirty_iris))
n_complete_rows
## [1] 96

# Total number of rows
n_total_rows <- nrow(dirty_iris)
n_total_rows
## [1] 150

# Share of complete rows, expressed as a percentage
percent_of_complete_cases <- (n_complete_rows / n_total_rows) * 100
percent_of_complete_cases
## [1] 64
# Count special values per column.
#
# Args:
#   df: a data frame (any mix of column types).
#
# Returns:
#   An integer matrix with one column per column of `df` and four rows:
#     NA_count     - entries for which is.na() is TRUE (this includes NaN)
#     NaN_count    - NaN entries
#     Inf_count    - +Inf entries
#     NegInf_count - -Inf entries
#   NaN and infinite values only exist in numeric columns, so the last three
#   counts are 0 for every non-numeric column.
count_special_values <- function(df) {
  # Named template: fixes the row names and the result shape even when `df`
  # has no columns (sapply() would return an empty list in that case).
  template <- c(NA_count = 0L, NaN_count = 0L, Inf_count = 0L, NegInf_count = 0L)

  count_column <- function(col) {
    counts <- template
    counts[["NA_count"]] <- sum(is.na(col))
    # Only inspect the sign of numeric columns; comparing a factor with 0
    # warns and comparing character data with 0 is a lexical comparison.
    if (is.numeric(col)) {
      infinite <- is.infinite(col)
      counts[["NaN_count"]] <- sum(is.nan(col))
      counts[["Inf_count"]] <- sum(infinite & col > 0)
      counts[["NegInf_count"]] <- sum(infinite & col < 0)
    }
    counts
  }

  vapply(df, count_column, template)
}
# Summarise NA / NaN / Inf / -Inf counts for every column
special_values_summary <- count_special_values(dirty_iris)
special_values_summary
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## NA_count 10 17 19 12 0
## NaN_count 0 0 0 0 0
## Inf_count 0 0 0 1 0
## NegInf_count 0 0 0 0 0

# Replace infinite values with NA, one numeric column at a time.
# A whole-frame mask built with as.matrix() does not work here: the character
# column Species turns the matrix into character data, and is.infinite() is
# FALSE for every character value, so nothing would be replaced.
is_numeric_col <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[is_numeric_col] <- lapply(
  dirty_iris[is_numeric_col],
  function(col) {
    col[is.infinite(col)] <- NA
    col
  }
)

# Total missing values after the replacement. Given the summary above this
# should be the 58 original NAs plus the single Inf found in Petal.Width.
sum(is.na(dirty_iris))

# Rule checks: Sepal.Width must be positive, Sepal.Length must not exceed 30
invalid_sepal_width <- dirty_iris$Sepal.Width <= 0
invalid_sepal_length <- dirty_iris$Sepal.Length > 30

# which() silently drops rows where a rule cannot be evaluated (NA)
violating_rows <- which(invalid_sepal_length | invalid_sepal_width)
violating_observation <- dirty_iris[violating_rows, ]
violating_observation
nrow(violating_observation)
## [1] 4

# Correct the non-positive Sepal.Width values with the median of the valid
# (strictly positive) widths. Only Sepal.Width is corrected in this step;
# rows that violate the Sepal.Length rule are reported above but left as is.
invalid_sepal_width_rows <- which(invalid_sepal_width)
dirty_iris[invalid_sepal_width_rows, ]
median_sepal_width <- median(
  dirty_iris$Sepal.Width[dirty_iris$Sepal.Width > 0],
  na.rm = TRUE
)
median_sepal_width
## [1] 3
dirty_iris$Sepal.Width[invalid_sepal_width_rows] <- median_sepal_width
dirty_iris[invalid_sepal_width_rows, ]
library(VIM)
## Warning: package 'VIM' was built under R version 4.4.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Ensure no Inf or NaN values before imputation.
# Only numeric columns are checked. Applying !is.finite() to
# as.matrix(dirty_iris) would be wrong: Species makes that matrix character,
# is.finite() is FALSE for every character value, and the negated mask would
# therefore overwrite every cell of the data frame with NA.
numeric_col_names <- names(dirty_iris)[vapply(dirty_iris, is.numeric, logical(1))]
dirty_iris[numeric_col_names] <- lapply(
  dirty_iris[numeric_col_names],
  function(col) {
    col[!is.finite(col)] <- NA
    col
  }
)

### 1. Mean Imputation for Sepal.Width
sepal_width_missing <- is.na(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Width[sepal_width_missing] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

### 2. Median Imputation for Petal.Length
petal_length_missing <- is.na(dirty_iris$Petal.Length)
dirty_iris$Petal.Length[petal_length_missing] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

### 3. Linear Regression Imputation for Sepal.Length
# Rows usable for fitting: response and all three predictors must be present
complete_data <- dirty_iris %>%
  filter(
    !is.na(Sepal.Length) & !is.na(Sepal.Width) &
      !is.na(Petal.Length) & !is.na(Petal.Width)
  )

# The model has 4 coefficients (intercept + 3 predictors), so more than
# 4 rows are needed for a meaningful fit.
if (nrow(complete_data) > 4) {
  lm_model <- lm(
    Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
    data = complete_data
  )
  missing_sepal_length_rows <- which(is.na(dirty_iris$Sepal.Length))
  if (length(missing_sepal_length_rows) > 0) {
    # NOTE(review): Petal.Width is only imputed in step 4, so a row that is
    # missing both Sepal.Length and Petal.Width gets an NA prediction here
    # and stays missing. Confirm whether this ordering is intended.
    dirty_iris$Sepal.Length[missing_sepal_length_rows] <- predict(
      lm_model,
      newdata = dirty_iris[missing_sepal_length_rows, ]
    )
  }
} else {
  cat("Not enough complete cases for regression. Sepal.Length not imputed.\n")
}

### 4. kNN Imputation for Petal.Width
# Select only numeric columns for kNN
numeric_cols <- dirty_iris %>% select(where(is.numeric))

# Impute Petal.Width from its 5 nearest neighbours. imp_var = FALSE stops
# kNN() from appending the "Petal.Width_imp" indicator column, so no extra
# columns have to be removed afterwards.
imputed_numeric <- kNN(
  numeric_cols,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)

# Replace the original column with the imputed version
dirty_iris$Petal.Width <- imputed_numeric$Petal.Width

# Final Check: Count remaining missing values
cat("Remaining missing values after imputation:\n")
print(colSums(is.na(dirty_iris)))

# Save cleaned dataset
write.csv(dirty_iris, "cleaned_iris.csv", row.names = FALSE)