# Read the dirty_iris CSV from the datacleaning GitHub repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
Question 1
# Number of missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
Question 2
# A row is "complete" when none of its columns is NA. Report the number of
# complete rows and the share of the data set they represent.
is_complete <- complete.cases(dirty_iris)
num_complete <- length(which(is_complete))
percent_complete <- num_complete / nrow(dirty_iris) * 100
print(num_complete)
## [1] 96
print(percent_complete)
## [1] 64
Question 3/4
# Total number of NA cells in the data frame.
sum(is.na(dirty_iris))
## [1] 58

# Special values (NaN, Inf, -Inf) can only occur in numeric columns.
# Calling as.matrix() on the whole data frame would coerce everything to
# character because of the text column, and is.nan() / is.infinite() are
# always FALSE for character data. Restrict the checks to numeric columns.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
numeric_values <- as.matrix(dirty_iris[numeric_cols])

# NOTE(review): the previously recorded output for the three counts below was
# 0, an artefact of the character coercion. Re-run to record fresh output.
# Count of NaN cells.
sum(is.nan(numeric_values))
# Count of infinite cells (Inf and -Inf together).
sum(is.infinite(numeric_values))
# Count of negative-infinity cells only; NA comparisons are ignored.
sum(numeric_values == -Inf, na.rm = TRUE)
Question 5
# Reload the raw data so this section starts from an unmodified copy.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Two rules are checked: Sepal.Width must be strictly positive and
# Sepal.Length must not exceed 30. which() skips rows where the comparison
# is NA, so missing measurements are not reported as violations.
violations_width <- with(dirty_iris, which(Sepal.Width <= 0))
violations_length <- with(dirty_iris, which(Sepal.Length > 30))

# Combine both index sets, listing each row once (width violations first).
violations <- union(violations_width, violations_length)
violating_observations <- dirty_iris[violations, , drop = FALSE]
print(violating_observations)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
length(violations)
## [1] 4
Question 6
# Reload the raw data so the corrections below start from an unmodified copy.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Rows breaking the "Sepal.Width must be positive" rule. which() drops NA
# comparisons, so no separate is.na() guard is needed.
violations <- which(dirty_iris$Sepal.Width <= 0)
print(dirty_iris[violations, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa

# Negative widths are replaced by their absolute value.
negative_width <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_width] <-
  abs(dirty_iris$Sepal.Width[negative_width])

# A width of exactly zero is replaced by NA.
zero_width <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width] <- NA

# Show the same rows again to confirm the corrections.
print(dirty_iris[violations, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa

# Save the corrected data to the working directory.
write.csv(dirty_iris, "cleaned_iris.csv", row.names = FALSE)
Question 7
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: ggplot2
## Loading required package: lattice
library(RANN)
## Warning: package 'RANN' was built under R version 4.4.2
# Impute every missing value, using a different technique per column:
# mean (Sepal.Width), median (Petal.Length), linear regression (Sepal.Length)
# and k-nearest neighbours for whatever is still missing afterwards.
# NOTE(review): the data are re-read from the raw file here, so the
# Sepal.Width corrections written to cleaned_iris.csv are not part of this
# input. Confirm that is intended.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Turn special values (Inf, -Inf, NaN) into NA. Only numeric columns can
# contain them; vapply() guarantees a logical vector whatever the input.
numeric_cols <- names(dirty_iris)[vapply(dirty_iris, is.numeric, logical(1))]
for (col in numeric_cols) {
  dirty_iris[[col]][!is.finite(dirty_iris[[col]])] <- NA
}
print(colSums(is.na(dirty_iris)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 17 19 13 0

# Mean imputation for Sepal.Width, median imputation for Petal.Length.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Regression imputation for Sepal.Length. The model is fitted on rows whose
# three predictors are all present.
clean_data <- dirty_iris[
  complete.cases(dirty_iris[, c("Petal.Length", "Petal.Width", "Sepal.Width")]),
]
if (nrow(clean_data) == 0) {
  stop("No complete cases available for linear regression.")
}
lm_model <- lm(
  Sepal.Length ~ Petal.Length + Petal.Width + Sepal.Width,
  data = clean_data
)
missing_indices <- which(is.na(dirty_iris$Sepal.Length))
# Rows that also lack a predictor get an NA prediction and stay missing;
# they are handled by the kNN step below.
dirty_iris$Sepal.Length[missing_indices] <-
  predict(lm_model, newdata = dirty_iris[missing_indices, ])

# kNN imputation for the remaining gaps. caret's "knnImpute" always centres
# and scales the numeric columns, so predict() returns standardised values.
# Convert the imputed cells back to the original units and fill only the
# missing cells, leaving observed values untouched.
preProc <- preProcess(dirty_iris, method = "knnImpute")
imputed_scaled <- predict(preProc, newdata = dirty_iris)
for (col in names(preProc$mean)) {
  gaps <- is.na(dirty_iris[[col]])
  dirty_iris[[col]][gaps] <-
    imputed_scaled[[col]][gaps] * preProc$std[[col]] + preProc$mean[[col]]
}
print(colSums(is.na(dirty_iris)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0