Question 1

No R code required

Question 2

No R code required

Loading Data

# Download the deliberately corrupted copy of the iris data set.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Column types and the first few values of each column.
str(dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
# Per-column summary; the NA counts and Inf/negative extremes show what needs cleaning.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 3

# Number of missing Petal.Length observations.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

# Rows that survive dropping every observation containing at least one NA.
complete_cases <- nrow(na.omit(dirty_iris))
complete_cases
## [1] 96
# The same count expressed as a percentage of all rows.
percentage_complete <- 100 * (complete_cases / nrow(dirty_iris))
percentage_complete
## [1] 64

Question 5

# Start again from a fresh download of the raw data.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Logical flag per column: TRUE for the numeric measurements.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
# One logical column per numeric variable marking Inf/-Inf, then count per column.
colSums(
  vapply(dirty_iris[numeric_cols], is.infinite, logical(nrow(dirty_iris)))
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1

Question 6

# Replace Inf/-Inf with NA, one numeric column at a time.
# Do not route this through as.matrix(dirty_iris): the character Species
# column makes that a character matrix, and is.infinite() is always FALSE
# for character data, so nothing would be replaced and the check below
# would report zero no matter what the data contain.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(values) replace(values, is.infinite(values), NA)
)

# Verify: every numeric column should now contain zero infinite values.
colSums(
  vapply(dirty_iris[numeric_cols], is.infinite, logical(nrow(dirty_iris)))
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0

Question 7

# Rule: sepal width must be positive and sepal length must not exceed 30.
# Negate the rule to find violations; which() drops rows where it is NA.
violations <- which(
  !with(dirty_iris, Sepal.Width > 0 & Sepal.Length <= 30)
)
length(violations)
## [1] 4

Question 8

# Row positions of negative and of exactly-zero sepal widths (NAs are skipped)
neg_idx <- which(dirty_iris[["Sepal.Width"]] < 0)
zero_idx <- which(dirty_iris[["Sepal.Width"]] == 0)

# Print the offending rows as they are before any correction
if (length(neg_idx) + length(zero_idx) > 0) {
  print("Before correction:")
  print(
    dirty_iris[
      c(neg_idx, zero_idx),
      c("Sepal.Length", "Sepal.Width", "Species")
    ]
  )
}
## [1] "Before correction:"
##     Sepal.Length Sepal.Width    Species
## 16           5.0          -3 versicolor
## 130          5.7           0     setosa
# Negative widths become their absolute value; zero widths become missing
dirty_iris[["Sepal.Width"]][neg_idx] <- abs(dirty_iris[["Sepal.Width"]][neg_idx])
is.na(dirty_iris[["Sepal.Width"]]) <- zero_idx

# Print the same rows again to show the effect of the correction
if (length(neg_idx) + length(zero_idx) > 0) {
  print("After correction:")
  print(
    dirty_iris[
      c(neg_idx, zero_idx),
      c("Sepal.Length", "Sepal.Width", "Species")
    ]
  )
}
## [1] "After correction:"
##     Sepal.Length Sepal.Width    Species
## 16           5.0           3 versicolor
## 130          5.7          NA     setosa
# Count the non-positive sepal widths that remain (NA comparisons are dropped)
length(which(dirty_iris[["Sepal.Width"]] <= 0))
## [1] 0

Question 9

library(mice)
## Warning: package 'mice' was built under R version 4.5.2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Step 0: repair impossible sepal widths
# Negative widths become their absolute value; zero widths become NA
neg_idx <- which(dirty_iris[["Sepal.Width"]] < 0)
zero_idx <- which(dirty_iris[["Sepal.Width"]] == 0)
dirty_iris[["Sepal.Width"]][neg_idx] <- abs(dirty_iris[["Sepal.Width"]][neg_idx])
is.na(dirty_iris[["Sepal.Width"]]) <- zero_idx

# Step 1: turn Inf/-Inf into NA in each measurement column
numeric_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(values) replace(values, is.infinite(values), NA)
)

# Step 2: fill missing Sepal.Width with the mean of the observed values
dirty_iris$Sepal.Width <- with(
  dirty_iris,
  replace(Sepal.Width, is.na(Sepal.Width), mean(Sepal.Width, na.rm = TRUE))
)

# Step 3: fill missing Petal.Length with the median of the observed values
dirty_iris$Petal.Length <- with(
  dirty_iris,
  replace(Petal.Length, is.na(Petal.Length), median(Petal.Length, na.rm = TRUE))
)

# Step 4: fill missing Sepal.Length from a linear regression
### Fit only on rows whose three predictor columns are all observed
complete_rows <- complete.cases(
  dirty_iris[, c("Sepal.Width", "Petal.Length", "Petal.Width")]
)

lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris[complete_rows, ]
)

### Replace each missing Sepal.Length with the model's prediction for that row
missing_sl <- which(is.na(dirty_iris[["Sepal.Length"]]))
dirty_iris[["Sepal.Length"]][missing_sl] <- predict(
  lm_model,
  newdata = dirty_iris[missing_sl, ]
)

# Step 5: Impute Petal.Width -> kNN
# By default kNN() appends a logical Petal.Width_imp indicator column.
# imp_var = FALSE suppresses it so the cleaned data keeps exactly the five
# original columns; the imputed Petal.Width values themselves are unaffected.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5, imp_var = FALSE)

# Step 6: Verify all missing values are imputed
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0
# Additional: View the cleaned dataset
head(dirty_iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4    3.200000          4.5         1.5 versicolor
## 2          6.3    3.300000          6.0         2.5  virginica
## 3          6.2    3.462121          5.4         2.3  virginica
## 4          5.0    3.400000          1.6         0.4     setosa
## 5          5.7    2.600000          3.5         1.0 versicolor
## 6          5.3    3.462121          4.5         0.2     setosa

Question 10

No R code required