# Read the dirty iris CSV directly from its GitHub raw URL.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Q3
# Number of missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
#Q4
# Share of observations that have no missing value in any column.
# Rows that survive na.omit() are exactly the complete cases.
complete_rows <- nrow(na.omit(dirty_iris))
complete_rows
## [1] 96
# Size of the full data set
total_rows <- nrow(dirty_iris)
total_rows
## [1] 150
# Complete rows expressed as a percentage of all rows
percentage_complete <- complete_rows / total_rows * 100
percentage_complete
## [1] 64
#Q5
# Count infinite values per column. Each column is coerced with as.numeric()
# first; the coercion warnings raised by non-numeric text are silenced.
suppressWarnings(
  vapply(
    dirty_iris,
    function(column) sum(is.infinite(as.numeric(column)), na.rm = TRUE),
    integer(1)
  )
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 1 0
#Q6
# Coerce every column that parses as numeric and turn its Inf entries into NA;
# columns where nothing parses as a number are left exactly as they were.
dirty_iris[] <- lapply(dirty_iris, function(col) {
  # Coercion warnings from non-numeric text are expected here, so mute them.
  as_num <- suppressWarnings(as.numeric(col))
  # No value converted: hand the original column back untouched.
  if (all(is.na(as_num))) {
    return(col)
  }
  # At least one numeric value: keep the numeric version, with Inf -> NA.
  replace(as_num, is.infinite(as_num), NA)
})
# Confirm that no infinite values remain in any column
vapply(
  dirty_iris,
  function(column) sum(is.infinite(column), na.rm = TRUE),
  integer(1)
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
#Q7
# Find observations that break either rule:
#   * Sepal.Width must be positive (violation: Sepal.Width <= 0)
#   * Sepal.Length must not exceed 30 (violation: Sepal.Length > 30)
# Convert columns to numeric (to ensure proper comparison)
dirty_iris$Sepal.Width <- as.numeric(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Length <- as.numeric(dirty_iris$Sepal.Length)
# Logical conditions for violations. Comparing NA yields NA, so a row with a
# missing measurement is NA here unless the other condition is TRUE.
violations <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
# Subset rows that violate rules. which() keeps only the TRUE positions;
# indexing with the raw logical vector would return an all-NA row for every
# NA in `violations` and inflate the count with rows that are not violations.
invalid_rows <- dirty_iris[which(violations), ]
# Count violations
nrow(invalid_rows)
## [1] 4
# View invalid observations
invalid_rows
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
#Q8
# Repair non-positive Sepal.Width values: a negative value is replaced by its
# absolute value, and a value of exactly zero is replaced by NA.
# Convert to numeric
dirty_iris$Sepal.Width <- as.numeric(dirty_iris$Sepal.Width)
# Locate problematic values (which() ignores NA comparisons)
which(dirty_iris$Sepal.Width <= 0)
## [1] 16 130
# Apply corrections
# Negative widths -> absolute value
neg_index <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_index] <- abs(dirty_iris$Sepal.Width[neg_index])
# Zero widths -> NA. Indexing through which() keeps NA out of the subscript,
# so the assignment does not depend on NA-index handling.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA
# Verify: no position may still hold a non-positive width. A logical mask
# containing NA would print one NA per missing value instead of an empty
# result, so the check uses which() as in the "locate" step above.
which(dirty_iris$Sepal.Width <= 0)
## integer(0)
#Q9
# Load required packages
library(VIM) # for kNN imputation
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice) # for regression imputation
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Make sure the four measurement columns are numeric before imputing
dirty_iris[["Sepal.Width"]] <- as.numeric(dirty_iris[["Sepal.Width"]])
dirty_iris[["Petal.Length"]] <- as.numeric(dirty_iris[["Petal.Length"]])
dirty_iris[["Sepal.Length"]] <- as.numeric(dirty_iris[["Sepal.Length"]])
dirty_iris[["Petal.Width"]] <- as.numeric(dirty_iris[["Petal.Width"]])
### 1. Sepal.Width: fill missing values with the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
### 2. Petal.Length: fill missing values with the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
### 3. Sepal.Length: fill missing values with linear-regression predictions
# Rows whose Sepal.Length has to be predicted
missing_SL <- is.na(dirty_iris$Sepal.Length)
# Model Sepal.Length on the other three measurements
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
               data = dirty_iris)
# NOTE(review): Petal.Width is only imputed in step 4, so predict() returns NA
# for any row that lacks both Sepal.Length and Petal.Width - confirm whether
# such rows exist in the data.
dirty_iris[missing_SL, "Sepal.Length"] <-
  predict(lm_model, newdata = dirty_iris[missing_SL, ])
### 4. Petal.Width: fill missing values with k-nearest-neighbour imputation
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)
# Drop the Petal.Width_imp indicator column that kNN() appends
dirty_iris[["Petal.Width_imp"]] <- NULL