R Notebook

# Read the "dirty iris" CSV directly from its GitHub location
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Show the first rows of the data frame
head(x = dirty_iris)

# Show the column types and a sample of values for each column
str(object = dirty_iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...

# Number of missing values in Petal.Length
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

# Number of rows in which no column is NA
sum(complete.cases(dirty_iris))

## [1] 96

# Total number of rows
nrow(dirty_iris)

## [1] 150

# Share of rows without any NA, expressed as a percentage
percent_of_complete_cases <-
  100 * (sum(complete.cases(dirty_iris)) / nrow(dirty_iris))
percent_of_complete_cases

## [1] 64

# Count special values (NA, NaN, Inf, -Inf) in every column of a data frame.
#
# Args:
#   df: A data frame (or a list of vectors); each element is one column.
#
# Returns:
#   An integer matrix with one column per element of `df` and four rows named
#   NA_count, NaN_count, Inf_count and NegInf_count. NaN values are included
#   in NA_count as well, because is.na() is TRUE for NaN. Non-numeric columns
#   can only contribute to NA_count.
count_special_values <- function(df) {
  count_template <- c(
    NA_count = 0L, NaN_count = 0L, Inf_count = 0L, NegInf_count = 0L
  )
  # vapply() (unlike sapply()) always yields a 4-row integer matrix, even
  # when `df` has no columns.
  vapply(df, function(col) {
    counts <- count_template
    counts[["NA_count"]] <- sum(is.na(col))
    # NaN and +/-Inf only exist in numeric data; skipping other types avoids
    # meaningless comparisons such as `factor > 0`, which raise warnings.
    if (is.numeric(col)) {
      counts[["NaN_count"]] <- sum(is.nan(col))
      counts[["Inf_count"]] <- sum(is.infinite(col) & col > 0)
      counts[["NegInf_count"]] <- sum(is.infinite(col) & col < 0)
    }
    counts
  }, FUN.VALUE = count_template)
}

# Tabulate NA / NaN / Inf / -Inf counts per column, then display the table
special_values_summary <- count_special_values(df = dirty_iris)
special_values_summary

##              Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## NA_count               10          17           19          12       0
## NaN_count               0           0            0           0       0
## Inf_count               0           0            0           1       0
## NegInf_count            0           0            0           0       0

# Replace infinite values with NA, one numeric column at a time.
# Testing as.matrix(dirty_iris) does not work: the character Species column
# turns the whole matrix into character, and is.infinite() is FALSE for every
# character element, so the Inf in Petal.Width would never be replaced.
dirty_iris[] <- lapply(dirty_iris, function(col) {
  if (is.numeric(col)) {
    col[is.infinite(col)] <- NA
  }
  col
})
# Total NA count: the 58 original NAs plus the one Inf converted above
sum(is.na(dirty_iris))

## [1] 59

# Rule checks: Sepal.Width must be strictly positive and Sepal.Length must
# not exceed 30. Each mask is NA wherever the underlying value is NA.
invalid_sepal_width <- with(dirty_iris, Sepal.Width <= 0)
invalid_sepal_length <- with(dirty_iris, Sepal.Length > 30)

# which() keeps only positions that are TRUE, so NA comparisons are ignored
violating_rows <- which(invalid_sepal_width | invalid_sepal_length)
violating_observation <- dirty_iris[violating_rows, , drop = FALSE]
violating_observation

# Number of observations that break at least one rule
nrow(violating_observation)

## [1] 4

# Locate the rows whose Sepal.Width is zero or negative and show them
invalid_sepal_width_rows <- which(dirty_iris[["Sepal.Width"]] <= 0)
dirty_iris[invalid_sepal_width_rows, ]

# Median of the strictly positive Sepal.Width values; missing values are
# excluded through na.rm
median_sepal_width <- median(
  dirty_iris[["Sepal.Width"]][dirty_iris[["Sepal.Width"]] > 0],
  na.rm = TRUE
)
median_sepal_width

## [1] 3

# Overwrite the invalid widths with that median and show the corrected rows
dirty_iris[["Sepal.Width"]][invalid_sepal_width_rows] <- median_sepal_width
dirty_iris[invalid_sepal_width_rows, ]

library(VIM)

## Warning: package 'VIM' was built under R version 4.4.2

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Ensure no Inf or NaN values before imputation.
# Only numeric columns are checked, column by column. Testing
# as.matrix(dirty_iris) instead would be wrong: the character Species column
# makes the matrix character, is.finite() is FALSE for every character
# element, and the negated mask would then overwrite the entire data frame
# (Species included) with NA.
dirty_iris[] <- lapply(dirty_iris, function(col) {
  if (is.numeric(col)) {
    col[!is.finite(col)] <- NA
  }
  col
})

### 1. Mean Imputation for Sepal.Width
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

### 2. Median Imputation for Petal.Length
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

### 3. Linear Regression Imputation for Sepal.Length
# Keep only rows where the response and all three predictors are observed,
# so that the row count below reflects what lm() can actually use.
complete_data <- dirty_iris %>%
  filter(
    !is.na(Sepal.Length) & !is.na(Sepal.Width) &
      !is.na(Petal.Length) & !is.na(Petal.Width)
  )

# The model estimates four coefficients (intercept plus three slopes), so
# more than four complete rows are required for a usable fit.
if (nrow(complete_data) > 4) {
  lm_model <- lm(
    Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
    data = complete_data
  )
  missing_sepal_length_rows <- which(is.na(dirty_iris$Sepal.Length))

  if (length(missing_sepal_length_rows) > 0) {
    # Rows whose Petal.Width is still missing at this point get an NA
    # prediction and therefore remain missing in Sepal.Length.
    dirty_iris$Sepal.Length[missing_sepal_length_rows] <- predict(
      lm_model,
      newdata = dirty_iris[missing_sepal_length_rows, ]
    )
  }
} else {
  cat("Not enough complete cases for regression. Sepal.Length not imputed.\n")
}

### 4. kNN Imputation for Petal.Width
# Select only numeric columns for kNN
numeric_cols <- dirty_iris %>% select(where(is.numeric))

# Perform kNN imputation on numeric columns. imp_var = FALSE stops kNN() from
# appending the logical "Petal.Width_imp" indicator column, so nothing has to
# be removed afterwards.
imputed_numeric <- kNN(
  numeric_cols,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)

# Replace the original column with the imputed version
dirty_iris$Petal.Width <- imputed_numeric$Petal.Width

# Final Check: Count remaining missing values
cat("Remaining missing values after imputation:\n")

## Remaining missing values after imputation:

print(colSums(is.na(dirty_iris)))

# NOTE: the output previously recorded for this section (the "not enough
# complete cases" message, the kNN "all observations are missing" warnings
# and 150 NAs in every column) was a symptom of the as.matrix() bug described
# at the top of this section. Re-run the notebook to regenerate the output.

# Write the cleaned data frame to a CSV file in the working directory,
# without a row-name column
write.csv(
  x = dirty_iris,
  file = "cleaned_iris.csv",
  row.names = FALSE
)