R Markdown

Question 3

# Read the dirty_iris CSV from GitHub and count the missing Petal.Length
# entries.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

# Count the rows that have no missing value in any column and express that
# count as a percentage of all rows.
total_obs <- nrow(dirty_iris)
num_complete <- length(which(complete.cases(dirty_iris)))
percent_complete <- (num_complete / total_obs) * 100

num_complete
## [1] 96
percent_complete
## [1] 64

Question 5

# Count the infinite entries in each of the four measurement columns.
with(dirty_iris, sum(is.infinite(Sepal.Length)))
## [1] 0
with(dirty_iris, sum(is.infinite(Sepal.Width)))
## [1] 0
with(dirty_iris, sum(is.infinite(Petal.Length)))
## [1] 0
with(dirty_iris, sum(is.infinite(Petal.Width)))
## [1] 1

Question 6

# Count the infinite values across the whole data set in one pass.
# Only the numeric columns are inspected: unlist() on the full data frame
# coerces every value to character because of the Species column, and
# is.infinite() is always FALSE for character input, which hid the Inf that
# the per-column check found in Petal.Width.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))

sum(is.infinite(unlist(dirty_iris[num_cols])))
## [1] 1

Question 7

# Keep the rows that break either rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30. which() leaves out rows where the test
# evaluates to NA.
rules_violate <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]
nrow(rules_violate)
## [1] 4

Question 8

# Show the rows whose Sepal.Width is not strictly positive, then repair them:
# negative widths are flipped to positive and zero widths become NA.
violation_indices <- which(dirty_iris$Sepal.Width <= 0)
cat("Observations violating the rule (Sepal.Width > 0):\n")
## Observations violating the rule (Sepal.Width > 0):
print(dirty_iris[violation_indices, ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
# abs() leaves non-negative and missing values untouched, so only the
# negative widths change.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# Mark every zero width as missing.
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)
cat("\nObservations after correction (0s are now NA, negatives are now positive):\n")
## 
## Observations after correction (0s are now NA, negatives are now positive):
print(dirty_iris[violation_indices, ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa
cat("\nTotal NAs in Sepal.Width:", sum(is.na(dirty_iris$Sepal.Width)))
## 
## Total NAs in Sepal.Width: 18

Question 9

# Impute the remaining missing values:
#   * Sepal.Width  -> mean of the observed widths
#   * Petal.Length -> median of the observed lengths
#   * Sepal.Length -> prediction from a linear model on the other columns
w_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
# The assignment is essential: without it the NAs are only printed, w_mean
# goes unused, and the Sepal.Length model below cannot predict for any row
# whose width is still missing.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- w_mean

pl_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- pl_median

# Fit on the rows where Sepal.Length is observed.
# NOTE(review): rows with Sepal.Length > 30 (flagged as rule violations
# earlier) are still part of the training data -- confirm that is intended.
sl_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Species,
  data = dirty_iris[!is.na(dirty_iris$Sepal.Length), ]
)

# Fill in the missing Sepal.Length values with the model's predictions.
sl_na_indices <- which(is.na(dirty_iris$Sepal.Length))
if (length(sl_na_indices) > 0) {
  dirty_iris$Sepal.Length[sl_na_indices] <- predict(
    sl_model,
    newdata = dirty_iris[sl_na_indices, ]
  )
}