Charlie Stevens
##
## The downloaded binary packages are in
## /var/folders/9_/4c9yx6hj0tdfpf6r823cszk40000gn/T//RtmpBFq0AQ/downloaded_packages
##
## The downloaded binary packages are in
## /var/folders/9_/4c9yx6hj0tdfpf6r823cszk40000gn/T//RtmpBFq0AQ/downloaded_packages
Question 1:
# Load the dirty iris data set from the datacleaning repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Count the observations whose Petal.Length is missing.
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19
Question 2:
# Number of observations overall, and number with a value in every column.
total_obs <- nrow(dirty_iris)
num_complete <- length(which(complete.cases(dirty_iris)))
# Share of fully observed rows, expressed as a percentage.
percentage_complete <- num_complete / total_obs * 100
num_complete
## [1] 96
percentage_complete
## [1] 64
Question 3:
# Count the special values in each of the four measurement columns.
# The result is an integer matrix: one row per kind of special value, one
# column per variable.
# - vapply() (rather than sapply()) pins the result to four integers per
#   column, so the shape and type of the table cannot silently change.
# - is.na() is also TRUE for NaN, so NaN is excluded from the NA count to keep
#   the four categories mutually exclusive.
vapply(
  dirty_iris[, 1:4],
  function(x) {
    c(
      NA_values = sum(is.na(x) & !is.nan(x)),
      NaN_values = sum(is.nan(x)),
      # is.infinite() is FALSE for NA, so missing entries never count here.
      Inf_values = sum(is.infinite(x) & x > 0),
      NegInf_values = sum(is.infinite(x) & x < 0)
    )
  },
  FUN.VALUE = c(
    NA_values = 0L,
    NaN_values = 0L,
    Inf_values = 0L,
    NegInf_values = 0L
  )
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA_values 10 17 19 12
## NaN_values 0 0 0 0
## Inf_values 0 0 0 1
## NegInf_values 0 0 0 0
Question 4:
# Recode infinite (Inf / -Inf) Petal.Width entries as NA.
dirty_iris$Petal.Width <- replace(
  dirty_iris$Petal.Width,
  is.infinite(dirty_iris$Petal.Width),
  NA
)
# Tabulate infinite vs. finite entries after the recode.
table(is.infinite(dirty_iris[["Petal.Width"]]))
##
## FALSE
## 150
# Tabulate missing vs. present entries after the recode.
table(is.na(dirty_iris[["Petal.Width"]]))
##
## FALSE TRUE
## 137 13
Question 5:
# Rule 1: Sepal.Width must be strictly positive.
violating_sepal_width <- with(dirty_iris, Sepal.Width <= 0)
# Rule 2: Sepal.Length may not be larger than 30 cm.
violating_sepal_length <- with(dirty_iris, Sepal.Length > 30)
# Indices of rows breaking at least one rule; which() drops comparisons that
# evaluate to NA, so rows with missing measurements are not reported.
violating_rows <- which(violating_sepal_length | violating_sepal_width)
# Show the offending observations.
dirty_iris[violating_rows, ]
# How many observations violate a rule?
length(violating_rows)
## [1] 4
Question 6:
# Flip the sign of negative widths. The !is.na() guard makes the test FALSE
# for missing entries, so they pass through unchanged.
dirty_iris$Sepal.Width <- ifelse(
  !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width < 0,
  abs(dirty_iris$Sepal.Width),
  dirty_iris$Sepal.Width
)
# A width of exactly zero is recoded as missing.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA
# Check that no non-positive widths are left.
table(dirty_iris$Sepal.Width <= 0, useNA = "ifany")
##
## FALSE <NA>
## 132 18
dirty_iris[16, ]
Question 7:
# 1. Impute Sepal.Width with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
# 2. Impute Petal.Length with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)
# 3. Impute Sepal.Length using linear regression (mice, "norm.predict").
#    mice() draws on the random number generator, so a fixed seed is passed
#    to make the imputed values reproducible from run to run.
mice_model <- mice(
  dirty_iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  method = "norm.predict", m = 1, seed = 123
)
##
## iter imp variable
## 1 1 Sepal.Length Petal.Width
## 2 1 Sepal.Length Petal.Width
## 3 1 Sepal.Length Petal.Width
## 4 1 Sepal.Length Petal.Width
## 5 1 Sepal.Length Petal.Width
# Only the regression-imputed Sepal.Length is copied back; Petal.Width is
# handled by kNN in the next step.
dirty_iris$Sepal.Length <- complete(mice_model)$Sepal.Length
# 4. Impute Petal.Width using kNN (DMwR package)
#    vapply() guarantees a logical mask with one entry per column.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- knnImputation(dirty_iris[numeric_cols], k = 3)
# Verify missing values are imputed
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0