library(tidyverse); library(broom); library(readr)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Question 3

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
sum(is.na(dirty_iris$Petal.Length))
## [1] 19

Question 4

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")


n_total     <- nrow(dirty_iris)
n_complete  <- sum(complete.cases(dirty_iris))
pct_complete <- 100 * n_complete / n_total


cat("Total rows:     ", n_total, "\n")
## Total rows:      150
cat("Complete rows:  ", n_complete, "\n")
## Complete rows:   96
cat("Percent complete:", sprintf("%.0f%%", pct_complete), "\n")
## Percent complete: 64%

Question 6

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")


num_cols <- sapply(dirty_iris, is.numeric)


count_specials <- function(x) {
  vals <- c(
    sum(is.na(x)),
    sum(is.nan(x)),
    sum(is.infinite(x) & x > 0, na.rm = TRUE),
    sum(is.infinite(x) & x < 0, na.rm = TRUE)
  )
  setNames(vals, c("NA", "NaN", "Inf", "-Inf"))
}


before_tbl <- sapply(dirty_iris[, num_cols, drop = FALSE], count_specials)
cat("Special values BEFORE replacement (rows=types, cols=vars):\n")
## Special values BEFORE replacement (rows=types, cols=vars):
print(before_tbl)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA             10          17           19          12
## NaN             0           0            0           0
## Inf             0           0            0           1
## -Inf            0           0            0           0
dirty_iris[, num_cols] <- lapply(dirty_iris[, num_cols, drop = FALSE], function(col) {
  col[is.infinite(col) & col > 0] <- NA_real_
  col
})


after_tbl <- sapply(dirty_iris[, num_cols, drop = FALSE], count_specials)
cat("\nSpecial values AFTER replacement (Inf should be 0 everywhere):\n")
## 
## Special values AFTER replacement (Inf should be 0 everywhere):
print(after_tbl)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA             10          17           19          13
## NaN             0           0            0           0
## Inf             0           0            0           0
## -Inf            0           0            0           0
summary(dirty_iris[, num_cols, drop = FALSE])
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13

Question 7

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")


SW <- suppressWarnings(as.numeric(dirty_iris$Sepal.Width))
SL <- suppressWarnings(as.numeric(dirty_iris$Sepal.Length))


viol_sw <- !is.na(SW) & (SW <= 0)     # sepal width should be positive
viol_sl <- !is.na(SL) & (SL > 30)     # sepal length cannot exceed 30


viol_idx <- which(viol_sw | viol_sl)
viol_df  <- dirty_iris[viol_idx, ]


cat("Number of violating observations:", length(viol_idx), "\n")
## Number of violating observations: 4
cat("Row indices of violations:\n"); print(viol_idx)
## Row indices of violations:
## [1]  16  28 125 130
cat("\nViolating rows (key columns shown):\n")
## 
## Violating rows (key columns shown):
print(viol_df[, c("Sepal.Width","Sepal.Length","Species")])
##     Sepal.Width Sepal.Length    Species
## 16           -3          5.0 versicolor
## 28           29         73.0  virginica
## 125          30         49.0     setosa
## 130           0          5.7     setosa

Question 8

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")


sw_raw <- suppressWarnings(as.numeric(dirty_iris$Sepal.Width))


viol_idx <- which(!is.na(sw_raw) & sw_raw <= 0)
neg_idx  <- which(!is.na(sw_raw) & sw_raw <  0)
zero_idx <- which(!is.na(sw_raw) & sw_raw == 0)

cat("Total violations (<= 0):", length(viol_idx), "\n")
## Total violations (<= 0): 2
cat("  Negatives:", length(neg_idx), "\n")
##   Negatives: 1
cat("  Zeros:", length(zero_idx), "\n\n")
##   Zeros: 1
if (length(viol_idx)) {
  cat("Rows with violations and original values:\n")
  print(data.frame(row = viol_idx, Sepal.Width = sw_raw[viol_idx]))
}
## Rows with violations and original values:
##   row Sepal.Width
## 1  16          -3
## 2 130           0
sw_clean <- sw_raw
if (length(neg_idx))  sw_clean[neg_idx]  <- abs(sw_clean[neg_idx])
if (length(zero_idx)) sw_clean[zero_idx] <- NA_real_

dirty_iris$Sepal.Width <- sw_clean


cat("\nAfter correction:\n")
## 
## After correction:
cat("Any Sepal.Width <= 0? ", any(dirty_iris$Sepal.Width <= 0, na.rm = TRUE), "\n")
## Any Sepal.Width <= 0?  FALSE
cat("Count of NAs in Sepal.Width: ", sum(is.na(dirty_iris$Sepal.Width)), "\n\n")
## Count of NAs in Sepal.Width:  18
summary(dirty_iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.200   2.800   3.000   3.462   3.300  30.000      18

Question 9

set.seed(123)


dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

num_cols <- sapply(dirty_iris, is.numeric)
dirty_iris[, num_cols] <- lapply(dirty_iris[, num_cols, drop = FALSE],
                                 function(x) suppressWarnings(as.numeric(x)))


dirty_iris[, num_cols] <- lapply(dirty_iris[, num_cols, drop = FALSE], function(x) {
  x[is.infinite(x) & x > 0] <- NA_real_
  x
})


if ("Sepal.Width" %in% names(dirty_iris)) {
  sw <- dirty_iris$Sepal.Width
  sw[!is.na(sw) & sw < 0]  <- abs(sw[!is.na(sw) & sw < 0])
  sw[!is.na(sw) & sw == 0] <- NA_real_
  dirty_iris$Sepal.Width <- sw
}


na_before <- sapply(dirty_iris[, c("Sepal.Width","Petal.Length","Sepal.Length","Petal.Width")],
                    function(x) sum(is.na(x)))
cat("NA counts BEFORE:\n"); print(na_before)
## NA counts BEFORE:
##  Sepal.Width Petal.Length Sepal.Length  Petal.Width 
##           18           19           10           13
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)


dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)


fit_sl <- lm(Sepal.Length ~ Sepal.Width + Petal.Length,
             data = dirty_iris,
             subset = !is.na(Sepal.Length) & !is.na(Sepal.Width) & !is.na(Petal.Length))
miss_sl <- which(is.na(dirty_iris$Sepal.Length))
if (length(miss_sl)) {
  dirty_iris$Sepal.Length[miss_sl] <- predict(fit_sl, newdata = dirty_iris[miss_sl, ])
}


if (!requireNamespace("VIM", quietly = TRUE)) install.packages("VIM", quiet = TRUE)
library(VIM)
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
dirty_iris <- VIM::kNN(dirty_iris,
                       variable = "Petal.Width",
                       k = 5,
                       imp_var = FALSE)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
na_after <- sapply(dirty_iris[, c("Sepal.Width","Petal.Length","Sepal.Length","Petal.Width")],
                   function(x) sum(is.na(x)))
cat("\nNA counts AFTER:\n"); print(na_after)
## 
## NA counts AFTER:
##  Sepal.Width Petal.Length Sepal.Length  Petal.Width 
##            0            0            0            0
summary(dirty_iris[, c("Sepal.Width","Petal.Length","Sepal.Length","Petal.Width")])
##   Sepal.Width      Petal.Length     Sepal.Length     Petal.Width   
##  Min.   : 2.200   Min.   : 0.000   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.: 5.100   1st Qu.:0.300  
##  Median : 3.100   Median : 4.500   Median : 5.800   Median :1.300  
##  Mean   : 3.462   Mean   : 4.456   Mean   : 6.528   Mean   :1.209  
##  3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.: 6.400   3rd Qu.:1.800  
##  Max.   :30.000   Max.   :63.000   Max.   :73.000   Max.   :2.500