Question 3, missing values in petal length

# Load the raw "dirty iris" data set from the datacleaning repository.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Count the missing petal lengths. The column is `Petal.Length` (capital L):
# `$` silently returns NULL for a misspelled name such as `Petal.length`, and
# sum(is.na(NULL)) is 0, which hides every missing value. Fail loudly instead
# if the expected column is absent.
stopifnot("Petal.Length" %in% names(dirty_iris))
sum(is.na(dirty_iris$Petal.Length))
## [1] 19

Question 4, Number of complete observations

# A row is "complete" when none of its fields is NA.
is_complete_row <- complete.cases(dirty_iris)
num_complete <- length(which(is_complete_row))

# Share of complete rows, expressed as a proportion between 0 and 1.
percent <- num_complete / NROW(dirty_iris)

num_complete
## [1] 96
percent
## [1] 0.64

Question 5, Type of special values

# Tally the infinite entries of one column; non-numeric columns yield NA.
count_infinite <- function(column) {
  if (!is.numeric(column)) {
    return(NA_integer_)
  }
  sum(is.infinite(column))
}

# One count per column, returned as a named integer vector.
vapply(dirty_iris, count_infinite, integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            1           NA

Question 6, Locate and Replace

# Search the numeric columns only. as.matrix() on the whole data frame gives a
# character matrix (Species is text), and is.infinite() is FALSE for every
# non-numeric value, so the Inf would never be found or replaced.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
inf_mask <- is.infinite(as.matrix(dirty_iris[numeric_cols]))

# Row/column positions of the infinite values; `col` counts within the
# numeric columns.
which(inf_mask, arr.ind = TRUE)
## (recorded output omitted: re-run to list the position; Question 5 reported
##  a single Inf, in Petal.Width)

# Replace Inf with NA
dirty_iris[numeric_cols][inf_mask] <- NA

# Verify
sum(is.infinite(as.matrix(dirty_iris[numeric_cols])))
## [1] 0

Question 7, observations violating the rules

# Rules: sepal width must be positive and sepal length must not exceed 30.
# A comparison against NA is NA, and indexing a data frame with an NA logical
# inserts an all-NA filler row for each one. which() keeps only the positions
# that are definitely TRUE, so the result holds real violations only.
violating_indices <- which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30)

# Select by row number so that no NA filler rows appear.
violating_rows <- dirty_iris[violating_indices, ]

print(violating_rows)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Question 8, Correcting non-positive sepal widths

# Flag rows whose sepal width is recorded as zero or negative; NAs are
# excluded so the mask is strictly TRUE/FALSE.
nonpositive_width <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0
invalid_width <- dirty_iris[nonpositive_width, ]
print("Observations with Sepal.Width <= 0:")
## [1] "Observations with Sepal.Width <= 0:"
print(invalid_width)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
cat("\nRow indices with Sepal.Width <= 0:\n")
## 
## Row indices with Sepal.Width <= 0:
print(which(nonpositive_width))
## [1]  16 130
summary(dirty_iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -3.000   2.800   3.000   3.391   3.300  30.000      17
# Negative widths are replaced by their absolute value.
negative_rows <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_rows] <- abs(dirty_iris$Sepal.Width[negative_rows])

# A width of exactly zero is replaced by NA.
zero_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_rows] <- NA

cat("\nAfter correction:\n")
## 
## After correction:
summary(dirty_iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.200   2.800   3.000   3.462   3.300  30.000      18
print("Any remaining Sepal.Width <= 0?")
## [1] "Any remaining Sepal.Width <= 0?"
# Expected to print FALSE once the corrections above have been applied.
print(length(which(dirty_iris$Sepal.Width <= 0)) > 0)
## [1] FALSE

Question 9, Imputing missing values

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the raw file again, treating empty fields as well as "NA" as missing.
# NOTE(review): this reload discards the corrections made to `dirty_iris` in
# the earlier questions (the summary below again shows a minimum Sepal.Width
# of -3) -- confirm that starting from the raw data is intended.
raw_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(raw_url, na.strings = c("", "NA"))

# Turn every Inf, -Inf and NaN in a column into NA.
special_to_na <- function(column) {
  ifelse(is.infinite(column) | is.nan(column), NA, column)
}
dirty_iris <- mutate(dirty_iris, across(where(is.numeric), special_to_na))

summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Confirm that the four measurement columns no longer hold Inf or NaN; both
# checks are expected to print FALSE.
measurements <- as.matrix(dirty_iris[1:4])
any(is.infinite(measurements))
## [1] FALSE
any(is.nan(measurements))
## [1] FALSE
# Sepal.Width: fill the gaps with the mean of the observed values.
width_missing <- is.na(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Width[width_missing] <-
  mean(dirty_iris$Sepal.Width[!width_missing])

# Petal.Length: fill the gaps with the median of the observed values.
petal_missing <- is.na(dirty_iris$Petal.Length)
dirty_iris$Petal.Length[petal_missing] <-
  median(dirty_iris$Petal.Length[!petal_missing])

# Per-column flags: does any measurement column still contain Inf or NaN?
any_inf <- vapply(dirty_iris[1:4], function(x) any(is.infinite(x), na.rm = TRUE), logical(1))
any_nan <- vapply(dirty_iris[1:4], function(x) any(is.nan(x), na.rm = TRUE), logical(1))
print("Columns with Inf:")
## [1] "Columns with Inf:"
print(any_inf)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE        FALSE
print("Columns with NaN:")
## [1] "Columns with NaN:"
print(any_nan)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE        FALSE
# Sepal.Length: regress on the other three measurements and use the fitted
# model to predict the missing entries.
# NOTE(review): Petal.Width is not imputed in the code shown here, so rows
# that also lack Petal.Width will presumably receive an NA prediction --
# verify that no Sepal.Length gaps remain afterwards.
model_sl <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)

missing_sl <- is.na(dirty_iris$Sepal.Length)
rows_to_predict <- dirty_iris[missing_sl, ]
dirty_iris$Sepal.Length[missing_sl] <- predict(model_sl, newdata = rows_to_predict)