library(tidyverse); library(broom); library(readr)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Question 3
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
Question 4
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
n_total <- nrow(dirty_iris)
n_complete <- sum(complete.cases(dirty_iris))
pct_complete <- 100 * n_complete / n_total
cat("Total rows: ", n_total, "\n")
## Total rows: 150
cat("Complete rows: ", n_complete, "\n")
## Complete rows: 96
cat("Percent complete:", sprintf("%.0f%%", pct_complete), "\n")
## Percent complete: 64%
Question 6
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
num_cols <- sapply(dirty_iris, is.numeric)
count_specials <- function(x) {
vals <- c(
sum(is.na(x)),
sum(is.nan(x)),
sum(is.infinite(x) & x > 0, na.rm = TRUE),
sum(is.infinite(x) & x < 0, na.rm = TRUE)
)
setNames(vals, c("NA", "NaN", "Inf", "-Inf"))
}
before_tbl <- sapply(dirty_iris[, num_cols, drop = FALSE], count_specials)
cat("Special values BEFORE replacement (rows=types, cols=vars):\n")
## Special values BEFORE replacement (rows=types, cols=vars):
print(before_tbl)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA 10 17 19 12
## NaN 0 0 0 0
## Inf 0 0 0 1
## -Inf 0 0 0 0
dirty_iris[, num_cols] <- lapply(dirty_iris[, num_cols, drop = FALSE], function(col) {
col[is.infinite(col) & col > 0] <- NA_real_
col
})
after_tbl <- sapply(dirty_iris[, num_cols, drop = FALSE], count_specials)
cat("\nSpecial values AFTER replacement (Inf should be 0 everywhere):\n")
##
## Special values AFTER replacement (Inf should be 0 everywhere):
print(after_tbl)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA 10 17 19 13
## NaN 0 0 0 0
## Inf 0 0 0 0
## -Inf 0 0 0 0
summary(dirty_iris[, num_cols, drop = FALSE])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
Question 7
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
SW <- suppressWarnings(as.numeric(dirty_iris$Sepal.Width))
SL <- suppressWarnings(as.numeric(dirty_iris$Sepal.Length))
viol_sw <- !is.na(SW) & (SW <= 0) # sepal width should be positive
viol_sl <- !is.na(SL) & (SL > 30) # sepal length cannot exceed 30
viol_idx <- which(viol_sw | viol_sl)
viol_df <- dirty_iris[viol_idx, ]
cat("Number of violating observations:", length(viol_idx), "\n")
## Number of violating observations: 4
cat("Row indices of violations:\n"); print(viol_idx)
## Row indices of violations:
## [1] 16 28 125 130
cat("\nViolating rows (key columns shown):\n")
##
## Violating rows (key columns shown):
print(viol_df[, c("Sepal.Width","Sepal.Length","Species")])
## Sepal.Width Sepal.Length Species
## 16 -3 5.0 versicolor
## 28 29 73.0 virginica
## 125 30 49.0 setosa
## 130 0 5.7 setosa
Question 8
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
sw_raw <- suppressWarnings(as.numeric(dirty_iris$Sepal.Width))
viol_idx <- which(!is.na(sw_raw) & sw_raw <= 0)
neg_idx <- which(!is.na(sw_raw) & sw_raw < 0)
zero_idx <- which(!is.na(sw_raw) & sw_raw == 0)
cat("Total violations (<= 0):", length(viol_idx), "\n")
## Total violations (<= 0): 2
cat(" Negatives:", length(neg_idx), "\n")
## Negatives: 1
cat(" Zeros:", length(zero_idx), "\n\n")
## Zeros: 1
if (length(viol_idx)) {
cat("Rows with violations and original values:\n")
print(data.frame(row = viol_idx, Sepal.Width = sw_raw[viol_idx]))
}
## Rows with violations and original values:
## row Sepal.Width
## 1 16 -3
## 2 130 0
sw_clean <- sw_raw
if (length(neg_idx)) sw_clean[neg_idx] <- abs(sw_clean[neg_idx])
if (length(zero_idx)) sw_clean[zero_idx] <- NA_real_
dirty_iris$Sepal.Width <- sw_clean
cat("\nAfter correction:\n")
##
## After correction:
cat("Any Sepal.Width <= 0? ", any(dirty_iris$Sepal.Width <= 0, na.rm = TRUE), "\n")
## Any Sepal.Width <= 0? FALSE
cat("Count of NAs in Sepal.Width: ", sum(is.na(dirty_iris$Sepal.Width)), "\n\n")
## Count of NAs in Sepal.Width: 18
summary(dirty_iris$Sepal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.200 2.800 3.000 3.462 3.300 30.000 18
Question 9
set.seed(123)
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
num_cols <- sapply(dirty_iris, is.numeric)
dirty_iris[, num_cols] <- lapply(dirty_iris[, num_cols, drop = FALSE],
function(x) suppressWarnings(as.numeric(x)))
dirty_iris[, num_cols] <- lapply(dirty_iris[, num_cols, drop = FALSE], function(x) {
x[is.infinite(x) & x > 0] <- NA_real_
x
})
if ("Sepal.Width" %in% names(dirty_iris)) {
sw <- dirty_iris$Sepal.Width
sw[!is.na(sw) & sw < 0] <- abs(sw[!is.na(sw) & sw < 0])
sw[!is.na(sw) & sw == 0] <- NA_real_
dirty_iris$Sepal.Width <- sw
}
na_before <- sapply(dirty_iris[, c("Sepal.Width","Petal.Length","Sepal.Length","Petal.Width")],
function(x) sum(is.na(x)))
cat("NA counts BEFORE:\n"); print(na_before)
## NA counts BEFORE:
## Sepal.Width Petal.Length Sepal.Length Petal.Width
## 18 19 10 13
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
median(dirty_iris$Petal.Length, na.rm = TRUE)
fit_sl <- lm(Sepal.Length ~ Sepal.Width + Petal.Length,
data = dirty_iris,
subset = !is.na(Sepal.Length) & !is.na(Sepal.Width) & !is.na(Petal.Length))
miss_sl <- which(is.na(dirty_iris$Sepal.Length))
if (length(miss_sl)) {
dirty_iris$Sepal.Length[miss_sl] <- predict(fit_sl, newdata = dirty_iris[miss_sl, ])
}
if (!requireNamespace("VIM", quietly = TRUE)) install.packages("VIM", quiet = TRUE)
library(VIM)
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
dirty_iris <- VIM::kNN(dirty_iris,
variable = "Petal.Width",
k = 5,
imp_var = FALSE)
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0
na_after <- sapply(dirty_iris[, c("Sepal.Width","Petal.Length","Sepal.Length","Petal.Width")],
function(x) sum(is.na(x)))
cat("\nNA counts AFTER:\n"); print(na_after)
##
## NA counts AFTER:
## Sepal.Width Petal.Length Sepal.Length Petal.Width
## 0 0 0 0
summary(dirty_iris[, c("Sepal.Width","Petal.Length","Sepal.Length","Petal.Width")])
## Sepal.Width Petal.Length Sepal.Length Petal.Width
## Min. : 2.200 Min. : 0.000 Min. : 0.000 Min. :0.100
## 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.: 5.100 1st Qu.:0.300
## Median : 3.100 Median : 4.500 Median : 5.800 Median :1.300
## Mean : 3.462 Mean : 4.456 Mean : 6.528 Mean :1.209
## 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.: 6.400 3rd Qu.:1.800
## Max. :30.000 Max. :63.000 Max. :73.000 Max. :2.500