Assignment 4

# Load the raw data set straight from the datacleaning repository.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Overwrite every non-finite entry (Inf, -Inf, NaN, and already-missing NA)
# of each numeric column with NA; non-numeric columns pass through unchanged.
# NOTE(review): this runs before the Question 5 and Question 6 checks, so
# those checks can no longer observe any Inf/NaN values -- confirm that this
# ordering is intended.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(column)
  }
  replace(column, !is.finite(column), NA)
})

Question 3: Missing values in Petal.Length

# Number of missing entries in the Petal.Length column.
length(which(is.na(dirty_iris$Petal.Length)))

## [1] 19

Question 4: Number and % of complete observations

# Rows with a value in every column, as a count and as a share of all rows.
n_total <- nrow(dirty_iris)
n_complete <- length(which(complete.cases(dirty_iris)))
pct_complete <- (n_complete / n_total) * 100

# Count of complete rows.
print(n_complete)

## [1] 95

# Percentage of complete rows.
print(pct_complete)

## [1] 63.33333

Question 5: Special values besides NA (Inf/-Inf / NaN)

# Count entries equal to +Inf anywhere in the data frame (NA results ignored).
length(which(dirty_iris == Inf))

## [1] 0

# Count entries equal to -Inf (NA results ignored).
length(which(dirty_iris == -Inf))

## [1] 0

# Count NaN entries among the numeric columns.
sum(is.nan(as.matrix(Filter(is.numeric, dirty_iris))), na.rm = TRUE)

## [1] 0

Question 6: Locate that special value and replace with NA

# Flag which columns hold numeric data.
num_cols <- sapply(dirty_iris, is.numeric)

# Mark infinite entries of the numeric columns as NA, then write them back.
tmp <- dirty_iris[, num_cols]
is.na(tmp) <- is.infinite(as.matrix(tmp))
dirty_iris[, num_cols] <- tmp

# Check: is any infinite entry left in the numeric columns?
any(is.infinite(as.matrix(dirty_iris[, num_cols])))

## [1] FALSE

# Check: row/column positions of any remaining infinite entries.
which(is.infinite(as.matrix(dirty_iris[, num_cols])), arr.ind = TRUE)

##      row col

# Five-number summary (plus mean and NA count) of Petal.Width.
summary(dirty_iris[["Petal.Width"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.100   0.300   1.300   1.207   1.800   2.500      13

Question 7: Records violating the rules Sepal.Width > 0 and Sepal.Length <= 30

# A record breaks the rules when Sepal.Width is not positive or
# Sepal.Length exceeds 30.
violations <- with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)

# Number of violating records; NA results are not counted.
length(which(violations))

## [1] 4

# Row numbers of the violating records.
which(violations)

## [1]  16  28 125 130

# The violating records themselves.
dirty_iris[!is.na(violations) & violations, ]

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Question 8: Locate and correct non-positive Sepal.Width values

# --- Q8: find Sepal.Width values <= 0 and repair them ---

# 1) Positions of negative and of zero widths; which() skips NA comparisons,
#    so missing widths are never selected.
idx_neg  <- which(dirty_iris$Sepal.Width < 0)
idx_zero <- which(dirty_iris$Sepal.Width == 0)

# Offending rows before the repair.
dirty_iris[c(idx_neg, idx_zero), ]

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa

# 2) Repair:
# negative widths get their sign flipped (same result as abs() here)
dirty_iris$Sepal.Width[idx_neg] <- -dirty_iris$Sepal.Width[idx_neg]

# zero widths are marked as missing
is.na(dirty_iris$Sepal.Width) <- idx_zero

# The same rows after the repair.
dirty_iris[c(idx_neg, idx_zero), ]

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa

Question 9: Impute the missing values (mean, median, kNN, and linear regression)

# Start over from a fresh copy of the raw data.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")


# Non-finite entries (Inf, -Inf, NaN) in the numeric columns become NA.
num_cols <- sapply(dirty_iris, is.numeric)
tmp <- dirty_iris[, num_cols]
is.na(tmp) <- !is.finite(as.matrix(tmp))
dirty_iris[, num_cols] <- tmp


# Sepal.Width: negative -> absolute value, zero -> NA, then every NA is
# filled with the mean of the remaining values. local() keeps the working
# copies out of the global environment.
dirty_iris$Sepal.Width <- local({
  width <- dirty_iris$Sepal.Width
  negative <- which(width < 0)
  width[negative] <- abs(width[negative])
  width[which(width == 0)] <- NA
  replace(width, is.na(width), mean(width, na.rm = TRUE))
})


# Petal.Length: every NA is filled with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)


library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Missing Petal.Width values before imputation.
length(which(is.na(dirty_iris$Petal.Width)))

## [1] 13

# Impute Petal.Width with VIM's k-nearest-neighbour routine (k = 5).
dirty_iris <- VIM::kNN(
  data = dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)
# Missing Petal.Width values after imputation.
length(which(is.na(dirty_iris$Petal.Width)))

## [1] 0

# Rows whose Sepal.Length is missing.
I <- is.na(dirty_iris$Sepal.Length)
# Linear model of Sepal.Length on the other three measurements.
fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris)
# Fill the missing Sepal.Length values with the model's predictions.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  I,
  predict(fit, newdata = dirty_iris[I, , drop = FALSE])
)


# Remaining NA count per column.
colSums(is.na(dirty_iris))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Assignment 4

Hannah Specthrie

2026-02-26

Question 3: Missing values in Petal.Length

Question 4: Number and % of complete observations

Question 5: Special values besides NA (Inf/-Inf / NaN)

Question 6: Locate that special value and replace with NA

Question 7:

Question 8:

Question 9: