# Read the dirty_iris CSV from GitHub into a data frame.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# How many Petal.Length values are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Rows with no NA in any column, as a count and as a percentage of all rows.
total_obs <- nrow(dirty_iris)
num_complete <- nrow(na.omit(dirty_iris))
percent_complete <- (num_complete / total_obs) * 100
num_complete
## [1] 96
percent_complete
## [1] 64
# Count infinite values (Inf / -Inf) in each measurement column.
sum(is.infinite(dirty_iris$Sepal.Length))
## [1] 0
sum(is.infinite(dirty_iris$Sepal.Width))
## [1] 0
sum(is.infinite(dirty_iris$Petal.Length))
## [1] 0
sum(is.infinite(dirty_iris$Petal.Width))
## [1] 1

# Total number of infinite values in the whole data set.
# Only the numeric columns are inspected: unlist() on the full data frame
# coerces every value to character because of the non-numeric Species column,
# and is.infinite() is always FALSE for character input, so the total would
# silently come out as 0.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))
sum(is.infinite(unlist(dirty_iris[num_cols])))
## [1] 1
# Rows breaking either rule: Sepal.Width must be positive and Sepal.Length
# must not exceed 30. which() drops comparisons that evaluate to NA, so rows
# with missing values are not counted as violations.
rules_violate <- dirty_iris[
  which(with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)),
]
nrow(rules_violate)
## [1] 4
# Remember the rows whose Sepal.Width is zero or negative so the same rows can
# be displayed before and after the correction.
violation_indices <- which(dirty_iris$Sepal.Width <= 0)
cat("Observations violating the rule (Sepal.Width > 0):\n")
## Observations violating the rule (Sepal.Width > 0):
print(dirty_iris[violation_indices, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa

# Correct the column in one pass: negative widths are replaced by their
# absolute value, and widths of exactly 0 are then marked as missing.
# Existing NAs are left untouched.
dirty_iris$Sepal.Width <- local({
  width <- dirty_iris$Sepal.Width
  negative <- which(width < 0)
  width[negative] <- abs(width[negative])
  width[which(width == 0)] <- NA
  width
})
cat("\nObservations after correction (0s are now NA, negatives are now positive):\n")
##
## Observations after correction (0s are now NA, negatives are now positive):
print(dirty_iris[violation_indices, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
cat(
  "\nTotal NAs in Sepal.Width:",
  sum(is.na(dirty_iris$Sepal.Width))
)
##
## Total NAs in Sepal.Width: 18
## Question 9
# Impute missing Sepal.Width values with the mean of the observed values.
# The replacement must be assigned back into the column; merely indexing the
# NA entries only prints them and leaves the data unchanged.
w_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- w_mean

# Impute missing Petal.Length values with the median of the observed values.
pl_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- pl_median
# Fit a linear model for Sepal.Length using only the rows where it is
# observed, then use that model to fill in the rows where it is missing.
sl_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Species,
  data = subset(dirty_iris, !is.na(Sepal.Length))
)
sl_na_indices <- which(is.na(dirty_iris$Sepal.Length))

# Skip the prediction step when there is nothing to impute.
if (length(sl_na_indices) > 0) {
  dirty_iris$Sepal.Length[sl_na_indices] <- predict(
    sl_model,
    newdata = dirty_iris[sl_na_indices, ]
  )
}