# Load the dirty iris data set directly from the datacleaning GitHub repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Q3
# Number of missing (NA) entries in the Petal.Length column.
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19
# Q4
# How many observations have no missing value in any column?
complete_rows <- length(which(complete.cases(dirty_iris)))
print(complete_rows)
## [1] 96
# Size of the whole data set (number of observations).
total_rows <- dim(dirty_iris)[1]
print(total_rows)
## [1] 150
# Share of fully observed rows, expressed as a percentage.
percentage_complete <- complete_rows / total_rows * 100
print(percentage_complete)
## [1] 64
# Q5
# Per-column count of infinite values. Every column is coerced with
# as.numeric() first; the coercion warnings raised by non-numeric columns
# are silenced on purpose.
suppressWarnings(
  sapply(dirty_iris, function(column) {
    length(which(is.infinite(as.numeric(column))))
  })
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            1            0
# Q6
# Replace infinite values with NA in every column that can be read as numbers.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  # Attempt a numeric reading of the column; coercion warnings are expected
  # for text columns and are silenced.
  as_number <- suppressWarnings(as.numeric(column))

  # Nothing could be read as a number: hand the column back untouched.
  if (all(is.na(as_number))) {
    return(column)
  }

  # Otherwise keep the numeric version with Inf / -Inf turned into NA.
  replace(as_number, is.infinite(as_number), NA)
})

# Check that no infinite values are left in any column.
sapply(dirty_iris, function(column) length(which(is.infinite(column))))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0
#Q7
# Find observations that break either rule:
#   * Sepal.Width must be strictly positive
#   * Sepal.Length must not exceed 30
# Convert columns to numeric (to ensure proper comparison)
dirty_iris$Sepal.Width  <- as.numeric(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Length <- as.numeric(dirty_iris$Sepal.Length)

# Logical conditions for violations. The result is NA wherever a rule cannot
# be evaluated because the value is missing and the other rule is not broken.
violations <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30

# Subset rows that violate rules. which() keeps only the positions that are
# TRUE; indexing with the raw logical vector would return one all-NA phantom
# row for every NA in `violations` and inflate the count.
invalid_rows <- dirty_iris[which(violations), ]

# Count violations
nrow(invalid_rows)
## [1] 4
# View invalid observations
invalid_rows
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
#Q8
# Correct impossible Sepal.Width values: negative widths are treated as sign
# errors and replaced by their absolute value; zero widths are set to NA.
# Convert to numeric
dirty_iris$Sepal.Width <- as.numeric(dirty_iris$Sepal.Width)

# Locate problematic values
which(dirty_iris$Sepal.Width <= 0)
## [1]  16 130
# Apply corrections
# which() drops NA comparisons, so only real negative values are selected.
neg_index <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_index] <- abs(dirty_iris$Sepal.Width[neg_index])

# Index by position so that existing NAs never take part in the assignment.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA

# Verify: an empty result means no non-positive widths remain. Subsetting
# with the raw logical vector would instead print one NA per missing value.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width <= 0)]
## numeric(0)
#Q9
# Load required packages
library(VIM)       # for kNN imputation
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(mice)      # for regression imputation
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Make sure all four measurement columns are numeric before imputing.
dirty_iris[["Sepal.Width"]]  <- as.numeric(dirty_iris[["Sepal.Width"]])
dirty_iris[["Petal.Length"]] <- as.numeric(dirty_iris[["Petal.Length"]])
dirty_iris[["Sepal.Length"]] <- as.numeric(dirty_iris[["Sepal.Length"]])
dirty_iris[["Petal.Width"]]  <- as.numeric(dirty_iris[["Petal.Width"]])

### 1. Sepal.Width: fill missing values with the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

### 2. Petal.Length: fill missing values with the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

### 3. Sepal.Length: predict missing values from a linear regression on the
###    other three measurements
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)

# Flag the rows whose Sepal.Length has to be filled in.
missing_SL <- is.na(dirty_iris$Sepal.Length)

# NOTE(review): Petal.Width is only imputed in step 4 below, so a row that is
# missing both Sepal.Length and Petal.Width presumably gets an NA prediction
# here and stays missing -- confirm no NAs remain in Sepal.Length afterwards.
dirty_iris$Sepal.Length[missing_SL] <- predict(
  lm_model,
  newdata = dirty_iris[which(missing_SL), , drop = FALSE]
)

### 4. Petal.Width: k-nearest-neighbour imputation (k = 5) via VIM::kNN
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)

# kNN() appends an indicator column for the imputed variable; drop it again.
dirty_iris[["Petal.Width_imp"]] <- NULL