Question 3, missing values in petal length
# Load the raw "dirty" iris data; the questions below work on this data frame.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
# Count the missing petal lengths. Column names are case-sensitive: a misspelt
# `$Petal.length` evaluates to NULL, and sum(is.na(NULL)) is always 0.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
Question 4, number and percentage of complete observations
# Flag the rows that have no missing value in any column, then report how many
# there are and what share of all rows they make up.
row_is_complete <- complete.cases(dirty_iris)
num_complete <- sum(row_is_complete)
# Despite its name, `percent` holds a proportion between 0 and 1.
percent <- num_complete / length(row_is_complete)
num_complete
## [1] 96
percent
## [1] 0.64
Question 5, type of special values
# Count the infinite values in every numeric column; non-numeric columns are
# reported as NA. vapply() pins the result to exactly one integer per column,
# whereas sapply() picks its return type from whatever the input happens to be.
vapply(
  dirty_iris,
  function(column) {
    if (is.numeric(column)) sum(is.infinite(column)) else NA_integer_
  },
  integer(1)
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 1 NA
Question 6, Locate and Replace
# as.matrix() on the whole data frame yields a character matrix because of the
# Species column, and is.infinite() is FALSE for every character value. The
# search therefore has to be limited to the numeric columns, otherwise nothing
# is ever located or replaced.
is_num_col <- vapply(dirty_iris, is.numeric, logical(1))
which(is.infinite(as.matrix(dirty_iris[is_num_col])), arr.ind = TRUE)
# Expected: a single match in column 4 (Petal.Width); re-knit to regenerate
# the printed row/col table.
# Replace Inf with NA
dirty_iris[is_num_col] <- lapply(
  dirty_iris[is_num_col],
  function(column) replace(column, is.infinite(column), NA)
)
# Verify
sum(is.infinite(as.matrix(dirty_iris[is_num_col])))
## [1] 0
Question 7, observations violating the rules
# Rules checked here: a sepal width must be positive and a sepal length must
# not exceed 30. A comparison involving NA yields NA, and indexing a data frame
# with an NA logical inserts an all-NA filler row for each of them; which()
# keeps only the positions where the condition is actually TRUE.
violating_indices <- which(
  dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
)
violating_rows <- dirty_iris[violating_indices, ]
print(violating_rows)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
Question 8, correcting invalid Sepal.Width values
# Positions whose sepal width is zero or negative. which() skips comparisons
# that evaluate to NA, so no separate is.na() filter is needed.
invalid_width <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
print("Observations with Sepal.Width <= 0:")
## [1] "Observations with Sepal.Width <= 0:"
print(invalid_width)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
cat("\nRow indices with Sepal.Width <= 0:\n")
##
## Row indices with Sepal.Width <= 0:
print(which(dirty_iris$Sepal.Width <= 0))
## [1] 16 130
summary(dirty_iris$Sepal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.000 2.800 3.000 3.391 3.300 30.000 17
# Negative widths are replaced by their absolute value; every other entry,
# including NA, is left untouched.
negative_idx <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_idx] <- abs(dirty_iris$Sepal.Width[negative_idx])
# A width of exactly zero is set to missing.
zero_idx <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_idx] <- NA
cat("\nAfter correction:\n")
##
## After correction:
summary(dirty_iris$Sepal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.200 2.800 3.000 3.462 3.300 30.000 18
print("Any remaining Sepal.Width <= 0?")
## [1] "Any remaining Sepal.Width <= 0?"
# Expected to print FALSE once the correction above has been applied.
print(any(dirty_iris$Sepal.Width <= 0, na.rm = TRUE))
## [1] FALSE
Question 9, imputing missing values
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the raw file again, this time treating empty fields as NA as well.
# NOTE(review): re-reading replaces the data frame, so the Sepal.Width
# corrections made under Question 8 are gone (the summary below shows a
# minimum of -3 again) -- confirm that imputing on uncorrected data is intended.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  na.strings = c("", "NA")
)
# Turn Inf, -Inf and NaN into NA in every numeric column.
dirty_iris <- mutate(
  dirty_iris,
  across(
    where(is.numeric),
    function(column) replace(column, is.infinite(column) | is.nan(column), NA)
  )
)
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Sanity checks on the four measurement columns; both should print FALSE.
any(is.infinite(unlist(dirty_iris[1:4])))
## [1] FALSE
any(is.nan(unlist(dirty_iris[1:4])))
## [1] FALSE
# Fill missing sepal widths with the mean of the observed sepal widths.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Fill missing petal lengths with the median of the observed petal lengths.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Per-column flags: does any of the four measurement columns still contain an
# infinite or NaN value? vapply() guarantees one logical per column instead of
# letting sapply() choose the return type from the input.
any_inf <- vapply(
  dirty_iris[, 1:4],
  function(column) any(is.infinite(column), na.rm = TRUE),
  logical(1)
)
any_nan <- vapply(
  dirty_iris[, 1:4],
  function(column) any(is.nan(column), na.rm = TRUE),
  logical(1)
)
print("Columns with Inf:")
## [1] "Columns with Inf:"
print(any_inf)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## FALSE FALSE FALSE FALSE
print("Columns with NaN:")
## [1] "Columns with NaN:"
print(any_nan)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## FALSE FALSE FALSE FALSE
# Regression imputation: model Sepal.Length on the other three measurements and
# use the fitted model to fill in the rows where Sepal.Length is missing.
# NOTE(review): Petal.Width has not been imputed at this point (the summary
# above lists 13 NA's), so any row missing both Sepal.Length and Petal.Width
# will presumably receive an NA prediction -- verify after running.
missing_sl <- is.na(dirty_iris$Sepal.Length)
model_sl <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  missing_sl,
  predict(model_sl, newdata = dirty_iris[missing_sl, ])
)