Charlie Stevens

## 
## The downloaded binary packages are in
##  /var/folders/9_/4c9yx6hj0tdfpf6r823cszk40000gn/T//RtmpBFq0AQ/downloaded_packages
## 
## The downloaded binary packages are in
##  /var/folders/9_/4c9yx6hj0tdfpf6r823cszk40000gn/T//RtmpBFq0AQ/downloaded_packages

Question 1:

# Read the dirty_iris CSV straight from its GitHub location into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Count the observations whose Petal.Length is missing.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 2:

# Number of rows that have no missing value in any column.
num_complete <- length(which(complete.cases(dirty_iris)))

# Complete rows as a percentage of all observations.
total_obs <- nrow(dirty_iris)
percentage_complete <- 100 * (num_complete / total_obs)

num_complete
## [1] 96
percentage_complete
## [1] 64

Question 3:

# Count the special values (NA, NaN, Inf, -Inf) in the first four columns.
#
# vapply() replaces sapply() so the result is guaranteed to be an integer
# matrix with one row per counter: if any column produced something other
# than four integer counts, vapply() would raise an error instead of silently
# returning a differently shaped object.
#
# Note: is.na() is also TRUE for NaN, so NA_values includes any NaN entries.
vapply(
  dirty_iris[, 1:4],
  function(x) {
    c(
      NA_values = sum(is.na(x)),
      NaN_values = sum(is.nan(x)),
      # is.infinite() is FALSE for NA, so the `&` never yields NA here.
      Inf_values = sum(is.infinite(x) & x > 0),
      NegInf_values = sum(is.infinite(x) & x < 0)
    )
  },
  FUN.VALUE = c(
    NA_values = 0L,
    NaN_values = 0L,
    Inf_values = 0L,
    NegInf_values = 0L
  )
)
##               Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA_values               10          17           19          12
## NaN_values               0           0            0           0
## Inf_values               0           0            0           1
## NegInf_values            0           0            0           0

Question 4:

# Treat infinite Petal.Width measurements as missing.
dirty_iris$Petal.Width <- replace(
  dirty_iris$Petal.Width,
  is.infinite(dirty_iris$Petal.Width),
  NA
)

# Confirm that no infinite value remains ...
table(is.infinite(dirty_iris$Petal.Width))
## 
## FALSE 
##   150
# ... and tabulate missing versus present values after the replacement.
table(is.na(dirty_iris$Petal.Width))
## 
## FALSE  TRUE 
##   137    13

Question 5:

# Rule 1: Sepal.Width must be strictly positive.
violating_sepal_width <- !(dirty_iris$Sepal.Width > 0)

# Rule 2: Sepal.Length may not exceed 30 cm.
violating_sepal_length <- 30 < dirty_iris$Sepal.Length

# Row indices that break at least one rule; which() discards rows where the
# combined condition is NA (i.e. the rule cannot be evaluated).
violating_rows <- which(violating_sepal_length | violating_sepal_width)

# Show the offending observations, then how many there are.
dirty_iris[violating_rows, ]
length(violating_rows)
## [1] 4

Question 6:

# Flip the sign of negative widths. which() skips comparisons that are NA,
# so missing values are left untouched.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)] <-
  abs(dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)])

# A width of exactly zero is treated as missing.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA

# Check the result: no non-positive widths should remain.
table(dirty_iris$Sepal.Width <= 0, useNA = "ifany")
## 
## FALSE  <NA> 
##   132    18
dirty_iris[16, ]

Question 7:

# 1. Impute Sepal.Width with the mean of its observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# 2. Impute Petal.Length with the median of its observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# 3. Impute Sepal.Length using linear regression (mice method "norm.predict").
# A fixed seed is passed because mice() uses the random number generator;
# without it the imputed values can differ from one run to the next.
# mice:: qualification keeps complete() from being masked by a same-named
# function in another attached package (for example tidyr::complete).
mice_model <- mice::mice(
  dirty_iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  method = "norm.predict",
  m = 1,
  seed = 123
)
## 
##  iter imp variable
##   1   1  Sepal.Length  Petal.Width
##   2   1  Sepal.Length  Petal.Width
##   3   1  Sepal.Length  Petal.Width
##   4   1  Sepal.Length  Petal.Width
##   5   1  Sepal.Length  Petal.Width
# mice also imputes Petal.Width (see the log above), but only the
# Sepal.Length result is kept; Petal.Width is handled by kNN below.
dirty_iris$Sepal.Length <- mice::complete(mice_model)$Sepal.Length

# 4. Impute Petal.Width using kNN (DMwR package)
# The call runs over every numeric column; after steps 1-3 Petal.Width is the
# only one expected to still contain missing values.
# NOTE(review): knnImputation is left unqualified; it is assumed to be
# provided by the DMwR package named above - confirm it is attached.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))

dirty_iris[numeric_cols] <- knnImputation(dirty_iris[numeric_cols], k = 3)


# Verify missing values are imputed
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0