# Download the deliberately corrupted iris data set used throughout this
# exercise; the first line of the CSV holds the column names.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  header = TRUE
)

Question 3

# Number of missing (NA) entries in the Petal.Length column
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

# Count the rows that contain no NA in any column and express that count
# as a percentage of all rows in the data set.
total <- nrow(dirty_iris)
complete <- nrow(na.omit(dirty_iris))
percentage <- 100 * (complete / total)

complete
## [1] 96
percentage
## [1] 64

Question 5

# Per-column overview (quartiles, mean, NA counts) to spot implausible values
summary(object = dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 6

# Locate the infinite value(s) in Petal.Width.
# is.infinite() returns FALSE for NA, so no separate is.na() guard is
# required, and unlike `== Inf` it also catches -Inf.
which(is.infinite(dirty_iris$Petal.Width))
## [1] 86
# Replace every infinite entry with NA so it is handled as a missing value.
# The logical index produced by is.infinite() never contains NA.
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA

Question 7

# Validity rules checked here:
#   1. Sepal.Width must be strictly positive
#   2. Sepal.Length must not be larger than 30 cm
# A row is reported when it breaks at least one rule. Rows for which the
# combined test evaluates to NA are not reported, because which() only
# returns the positions that are TRUE.
violations <- dirty_iris[
  which(
    dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
  ),
]
violations
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Question 8

# Pick the rows to repair by condition instead of by hard-coded row number,
# so the fix keeps targeting the right observations if the data changes.
# which() ignores NA, so rows with a missing Sepal.Width are left untouched.
negative_sw <- which(dirty_iris$Sepal.Width < 0)
zero_sw <- which(dirty_iris$Sepal.Width == 0)

# Negative widths: replace with their absolute value
dirty_iris$Sepal.Width[negative_sw] <- abs(dirty_iris$Sepal.Width[negative_sw])

# Zero widths: replace with NA so they are imputed later like other gaps
dirty_iris$Sepal.Width[zero_sw] <- NA

# Check the corrections
dirty_iris[c(negative_sw, zero_sw), ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa

Question 9

library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Sepal.Width: fill every gap with the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill every gap with the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Sepal.Length: predict the gaps from a linear model on the other three
# measurements.
# NOTE: Petal.Width has not been imputed yet at this point, so a row that
# lacks both Sepal.Length and Petal.Width would receive an NA prediction.
missing_sl <- is.na(dirty_iris$Sepal.Length)
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
dirty_iris$Sepal.Length[missing_sl] <- predict(
  model,
  newdata = dirty_iris[missing_sl, ]
)

# Petal.Width: k-nearest-neighbour imputation (5 neighbours) from VIM
dirty_iris <- kNN(data = dirty_iris, variable = "Petal.Width", k = 5)

summary(object = dirty_iris)
##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.00   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.10   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.80   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.54   Mean   : 3.462   Mean   : 4.456   Mean   :1.209  
##  3rd Qu.: 6.40   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.00   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species          Petal.Width_imp
##  Length:150         Mode :logical  
##  Class :character   FALSE:137      
##  Mode  :character   TRUE :13       
##                                    
##                                    
##