# ----------------------------------------------------------------------
# 1. SETUP, TYPE CONVERSION, AND Q6 CLEANUP (Inf -> NA)
# ----------------------------------------------------------------------
# Load the data (read directly from a remote URL, so network access is needed).
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
# Ensure columns 1 through 4 are numeric first.
# A factor has to be converted through its labels: as.numeric() applied
# directly to a factor returns the internal integer level codes rather than
# the parsed values. read.csv() builds factors from text columns by default
# before R 4.0, so guard against that case explicitly.
for (i in 1:4) {
  if (is.factor(dirty_iris[[i]])) {
    dirty_iris[[i]] <- as.character(dirty_iris[[i]])
  }
  dirty_iris[[i]] <- as.numeric(dirty_iris[[i]])
}
# Q6 FIX: Replace infinite values (Inf / -Inf) in columns 1-4 with NA.
# Each column is scanned on its own; entries that are already NA are untouched
# because is.infinite() is FALSE for them.
numeric_data <- dirty_iris[, 1:4]
numeric_data[] <- lapply(numeric_data, function(column) {
  replace(column, is.infinite(column), NA)
})
dirty_iris[, 1:4] <- numeric_data
# Q8 FIX: Enforce the rule Sepal.Width > 0.
# which() only returns positions where the comparison is TRUE, so rows with a
# missing Sepal.Width are never selected by either step.
# 1. Negative widths: flip the sign (same as the absolute value for x < 0).
neg_idx <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_idx] <- -dirty_iris$Sepal.Width[neg_idx]
# 2. Zero widths: mark as missing.
zero_idx <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_idx] <- NA
# Q7: Treat Sepal.Length values above 30 as invalid and mark them missing.
# Rows whose Sepal.Length is already NA are skipped by which().
large_sl_idx <- which(dirty_iris$Sepal.Length > 30)
is.na(dirty_iris$Sepal.Length) <- large_sl_idx
# D. Petal.Width: species-mean imputation.
# NOTE: this is a per-species mean fill, used here as a simple stand-in for
# kNN imputation; no nearest-neighbour search is performed.
for (species in unique(dirty_iris$Species)) {
  # Rows of the current species; rows with a missing Species never match.
  in_species <- !is.na(dirty_iris$Species) & dirty_iris$Species == species
  na_pw_idx <- which(in_species & is.na(dirty_iris$Petal.Width))
  observed_pw <- dirty_iris$Petal.Width[in_species & !is.na(dirty_iris$Petal.Width)]
  # Only fill when there is something to fill AND something to average.
  # The mean of an empty vector is NaN, which would otherwise be written into
  # the data; in that case the values are left as NA instead.
  if (length(na_pw_idx) > 0 && length(observed_pw) > 0) {
    species_mean_pw <- mean(observed_pw)
    dirty_iris$Petal.Width[na_pw_idx] <- species_mean_pw
  }
}
# A. Sepal.Width: fill missing entries with the mean of the observed values.
mean_sw <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean_sw
)
# B. Petal.Length: fill missing entries with the median of the observed values.
median_pl <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median_pl
)
# C. Sepal.Length: Linear Regression imputation
# Locate the rows to fill first; the model itself is fitted only on rows where
# both Sepal.Length and Petal.Width are present (na.omit drops the rest).
na_sl_idx <- which(is.na(dirty_iris$Sepal.Length))
sl_model <- lm(
  Sepal.Length ~ Petal.Width,
  data = dirty_iris,
  na.action = na.omit
)
# Petal.Width is the only predictor, so it is the only column predict() needs;
# drop = FALSE keeps it a data frame (and keeps the row names on the result).
predicted_sl <- predict(
  sl_model,
  newdata = dirty_iris[na_sl_idx, "Petal.Width", drop = FALSE]
)
dirty_iris$Sepal.Length[na_sl_idx] <- predicted_sl