# Data-cleaning walkthrough for the "dirty iris" data set.
#
# Steps:
#   1. Load the CSV and inspect it.
#   2. Turn "?" placeholders in the four measurement columns into NA and make
#      those columns numeric.
#   3. Report how many observations are complete.
#   4. Count observations that break the rules Sepal.Width > 0 and
#      Sepal.Length <= 30.
#   5. Repair Sepal.Width (0 -> NA, negative -> absolute value) and impute the
#      remaining missing widths with the column mean.

# Load and inspect ----

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

sum(is.na(dirty_iris$Petal.Length))
str(dirty_iris)
summary(dirty_iris$Petal.Length)

# Normalise missing-value markers ----

numeric_cols <- names(dirty_iris)[1:4] # First 4 columns are numeric

# Count the "?" placeholders BEFORE they are replaced; after the cleanup below
# this count would always be zero.
vapply(
  dirty_iris[numeric_cols],
  function(x) sum(x == "?", na.rm = TRUE),
  integer(1)
)

# Convert one column to numeric, treating "?" as missing.
# Columns that are already numeric are returned untouched. Anything else goes
# through as.character() first so that a factor column is converted by its
# labels rather than by its internal integer codes.
as_clean_numeric <- function(x) {
  if (is.numeric(x)) {
    return(x)
  }
  x <- as.character(x)
  x[x %in% "?"] <- NA # %in% never yields NA, unlike `==`
  as.numeric(x)
}

# One pass is enough: the conversion is idempotent, so it does not need to be
# repeated before every later step.
dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], as_clean_numeric)

# Missing values per measurement column after the cleanup.
vapply(dirty_iris[numeric_cols], function(x) sum(is.na(x)), integer(1))

# Completeness ----

# Computed after the cleanup so that rows which contained "?" are counted as
# incomplete.
num_complete <- sum(complete.cases(dirty_iris))
total_rows <- nrow(dirty_iris)
percent_complete <- (num_complete / total_rows) * 100

cat("Number of complete observations:", num_complete, "\n")
cat("Percentage of complete observations:", round(percent_complete, 2), "%\n")

table(dirty_iris$Sepal.Length)

# Rule violations ----

# which() drops NA entries of the mask; indexing a data frame with a logical
# vector that contains NA would add all-NA rows and inflate the count.
violation_mask <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
violations <- dirty_iris[which(violation_mask), ]
num_violations <- nrow(violations)
cat("Number of observations violating the rules:", num_violations, "\n")

# Correct Sepal.Width ----

# Remember the offending rows before they are modified so they can be shown
# afterwards.
violators <- which(dirty_iris$Sepal.Width <= 0)

# Take both index sets from a snapshot of the column. Integer indices from
# which() avoid the "NAs are not allowed in subscripted assignments" error
# that a logical mask containing NA triggers when the replacement has more
# than one element.
sepal_width <- dirty_iris$Sepal.Width
zero_idx <- which(sepal_width == 0)
negative_idx <- which(sepal_width < 0)

dirty_iris$Sepal.Width[zero_idx] <- NA
dirty_iris$Sepal.Width[negative_idx] <- abs(sepal_width[negative_idx])

# Show the corrected rows.
dirty_iris[violators, ]

# Impute missing Sepal.Width ----

# Replace every remaining NA (including the former zeros) with the mean of the
# observed widths.
mean_sepal_width <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- mean_sepal_width