Assignment 4 Willem Donnelly

Question 3

# Read the dirty_iris CSV from the datacleaning GitHub repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Show the column types and the first few values of every variable.
str(object = dirty_iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...

# Tally the NA entries in Petal.Length and report the total.
na_count <- sum(is.na(dirty_iris[["Petal.Length"]]))
print(paste("Number of missing values in Petal.Length:", na_count))

## [1] "Number of missing values in Petal.Length: 19"

Question 4

# A row is "complete" when none of its columns is NA.
total_obs <- nrow(dirty_iris)
complete_obs <- sum(complete.cases(dirty_iris))

# Share of complete rows, expressed as a whole-number percentage.
percentage <- round((complete_obs / total_obs) * 100, digits = 0)

print(paste("Complete observations:", complete_obs))

## [1] "Complete observations: 96"

print(paste("Total observations:", total_obs))

## [1] "Total observations: 150"

print(paste("Percentage complete:", percentage, "%"))

## [1] "Percentage complete: 64 %"

Question 5

# Per-column summary statistics (quartiles, mean, NA counts) of the raw data;
# the Inf mean/max reported for Petal.Width is what prompts the checks below.
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Scan the four measurement columns (columns 1-4) for special values.
# vapply() with a logical(1) template is used instead of sapply() so the
# result is always a named logical vector, even if the column selection
# were empty or a check returned something unexpected.

# Any infinite value (Inf or -Inf) per column.
has_inf <- vapply(
  dirty_iris[, 1:4],
  function(x) any(is.infinite(x)),
  logical(1)
)
print("Columns with Inf:")

## [1] "Columns with Inf:"

print(has_inf)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE         TRUE

# Any NaN per column.
has_nan <- vapply(
  dirty_iris[, 1:4],
  function(x) any(is.nan(x)),
  logical(1)
)
print("Columns with NaN:")

## [1] "Columns with NaN:"

print(has_nan)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE        FALSE

# Negative infinity specifically; na.rm = TRUE keeps NA comparisons from
# turning the answer into NA.
has_neg_inf <- vapply(
  dirty_iris[, 1:4],
  function(x) any(x == -Inf, na.rm = TRUE),
  logical(1)
)
print("Columns with -Inf:")

## [1] "Columns with -Inf:"

print(has_neg_inf)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE        FALSE

Question 6

# Visit each of the four measurement columns; columns without any infinite
# value are skipped, the others are reported and their Inf entries set to NA.
for (col in names(dirty_iris)[1:4]) {
  if (!any(is.infinite(dirty_iris[[col]]))) {
    next
  }

  print(paste("Found Inf in column:", col))

  # Row positions of the infinite entries in this column.
  inf_rows <- which(is.infinite(dirty_iris[[col]]))
  print(paste("Rows with Inf:", paste(inf_rows, collapse = ", ")))

  # Overwrite exactly those positions with NA.
  dirty_iris[[col]][inf_rows] <- NA
}

## [1] "Found Inf in column: Petal.Width"
## [1] "Rows with Inf: 86"

print("After replacement:")

## [1] "After replacement:"

# Petal.Width should now show a finite mean/max and one additional NA.
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Sanity check: count the infinite values left in columns 1-4.
# vapply() with an integer(1) template keeps the per-column counts type-stable
# (sapply() would hand sum() a list if the column selection were empty).
print(paste(
  "Total Inf values remaining:",
  sum(vapply(dirty_iris[, 1:4], function(x) sum(is.infinite(x)), integer(1)))
))

## [1] "Total Inf values remaining: 0"

Question 7

# Rule 1: Sepal.Width must be strictly positive.
violation_1 <- dirty_iris[["Sepal.Width"]] <= 0
# Rule 2: Sepal.Length must not exceed 30.
violation_2 <- dirty_iris[["Sepal.Length"]] > 30

# A row is flagged when it breaks at least one rule; rows where the
# comparison is undecidable because of NA stay NA here.
violations <- violation_1 | violation_2

# Count only the rows that are definitely in violation (NA is not counted).
num_violations <- length(which(violations))

print(paste("Number of observations violating rules:", num_violations))

## [1] "Number of observations violating rules: 4"

# Keep the flagged rows, again leaving out rows whose flag is NA.
violating_obs <- dirty_iris[!is.na(violations) & violations, ]
print("Violating observations:")

## [1] "Violating observations:"

print(violating_obs)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Question 8

# Row positions whose Sepal.Width is zero or negative (which() drops NA).
sw_errors <- which(dirty_iris[["Sepal.Width"]] <= 0)

print(paste("Rows with Sepal.Width <= 0:", paste(sw_errors, collapse = ", ")))

## [1] "Rows with Sepal.Width <= 0: 16, 130"

print("Before correction:")

## [1] "Before correction:"

print(dirty_iris$Sepal.Width[sw_errors])

## [1] -3  0

# Vectorised correction of the flagged rows:
#   negative width -> its absolute value (treated as a sign error)
#   zero width     -> NA (no usable value)
# Every flagged value is <= 0 and non-NA, so "not negative" means exactly 0.
dirty_iris$Sepal.Width[sw_errors] <- ifelse(
  dirty_iris$Sepal.Width[sw_errors] < 0,
  abs(dirty_iris$Sepal.Width[sw_errors]),
  NA_real_
)

print("After correction:")

## [1] "After correction:"

print(dirty_iris$Sepal.Width[sw_errors])

## [1]  3 NA

Question 9, Method 1

# Mean imputation: the mean is computed over the observed (non-NA) values.
sw_mean <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
print(paste("Mean of Sepal.Width:", round(sw_mean, 2)))

## [1] "Mean of Sepal.Width: 3.46"

# Fill every missing Sepal.Width with that mean.
dirty_iris$Sepal.Width[which(is.na(dirty_iris$Sepal.Width))] <- sw_mean

# Confirm nothing is left to impute in this column.
print(paste(
  "Remaining NA in Sepal.Width:",
  sum(is.na(dirty_iris[["Sepal.Width"]]))
))

## [1] "Remaining NA in Sepal.Width: 0"

Question 9, Method 2

# Median imputation: the median is computed over the observed values only.
pl_median <- median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
print(paste("Median of Petal.Length:", round(pl_median, 2)))

## [1] "Median of Petal.Length: 4.5"

# Fill every missing Petal.Length with that median.
dirty_iris$Petal.Length[which(is.na(dirty_iris$Petal.Length))] <- pl_median

# Confirm nothing is left to impute in this column.
print(paste(
  "Remaining NA in Petal.Length:",
  sum(is.na(dirty_iris[["Petal.Length"]]))
))

## [1] "Remaining NA in Petal.Length: 0"

Question 9, Method 3

# Flag the rows whose Sepal.Length has to be imputed.
sl_missing <- is.na(dirty_iris[["Sepal.Length"]])

# Training data: every row with an observed Sepal.Length.
complete_data <- dirty_iris[which(!sl_missing), ]

# Regress Sepal.Length on the other three measurements. Rows that still have
# a missing predictor are dropped by lm() (see the "deleted due to
# missingness" line in the model summary).
model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = complete_data
)

print("Linear Regression Model:")

## [1] "Linear Regression Model:"

print(summary(model))

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
##     data = complete_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2193 -0.3566  0.0278  0.5260  1.7541 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.53182    0.16554  -3.213  0.00168 ** 
## Sepal.Width   1.50146    0.03180  47.219  < 2e-16 ***
## Petal.Length  0.12287    0.03771   3.259  0.00145 ** 
## Petal.Width   0.98295    0.12023   8.176 3.05e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8092 on 123 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.9592, Adjusted R-squared:  0.9582 
## F-statistic: 963.5 on 3 and 123 DF,  p-value: < 2.2e-16

# Only predict when at least one Sepal.Length is actually missing.
if (any(sl_missing)) {
  # Predict from the fitted model for exactly the rows flagged as missing.
  # NOTE(review): a row with a missing predictor would presumably receive an
  # NA prediction; the count printed below shows whether any NA is left.
  predictions <- predict(model, newdata = dirty_iris[which(sl_missing), ])
  dirty_iris$Sepal.Length[which(sl_missing)] <- predictions
}

print(paste(
  "Remaining NA in Sepal.Length:",
  sum(is.na(dirty_iris[["Sepal.Length"]]))
))

## [1] "Remaining NA in Sepal.Length: 0"

Question 9, Method 4

# VIM supplies kNN(), used below for nearest-neighbour imputation.
library(VIM)

# Copy of the data in its current state (earlier imputations already applied).
iris_for_knn <- dirty_iris

# Impute only Petal.Width, using k = 5 neighbours.
# NOTE(review): kNN() appears to return the data plus extra indicator
# columns; only Petal.Width is taken from the result below - confirm.
iris_imputed <- kNN(iris_for_knn, variable = "Petal.Width", k = 5)

# Write the imputed column back into the working data frame.
dirty_iris$Petal.Width <- iris_imputed$Petal.Width

# Confirm no NA is left in Petal.Width.
print(paste("Remaining NA in Petal.Width:", sum(is.na(dirty_iris$Petal.Width))))

## [1] "Remaining NA in Petal.Width: 0"

# Summary statistics after all four imputation steps.
summary(dirty_iris)

##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.00   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.10   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.80   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.54   Mean   : 3.462   Mean   : 4.456   Mean   :1.209  
##  3rd Qu.: 6.40   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.00   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Summary

# Heading for the per-column NA tally.
cat("Final Missing Value Count:\n")

## Final Missing Value Count:

# Number of NA values left in each column (all zero after imputation).
print(colSums(is.na(dirty_iris)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

# Heading for the final summary statistics.
cat("\nFinal Dataset Summary:\n")

## 
## Final Dataset Summary:

# Summary statistics of the cleaned and imputed data.
print(summary(dirty_iris))

##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.00   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.10   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.80   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.54   Mean   : 3.462   Mean   : 4.456   Mean   :1.209  
##  3rd Qu.: 6.40   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.00   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Recap of the cleaning steps.
# The counts are taken from the values computed earlier in the analysis
# (na_count, complete_obs, percentage, num_violations) rather than typed in
# by hand, so the recap cannot drift from the results: the data has 96
# complete observations (64%) and 4 rule-violating observations.
cat("\nData Cleaning Steps Completed:\n")

## 
## Data Cleaning Steps Completed:

cat("1. Identified ", na_count, " missing values in Petal.Length\n", sep = "")

## 1. Identified 19 missing values in Petal.Length

cat(
  "2. Found ", complete_obs, " complete observations (", percentage, "%)\n",
  sep = ""
)

## 2. Found 96 complete observations (64%)

cat("3. Located Inf special values\n")

## 3. Located Inf special values

cat("4. Replaced special values with NA\n")

## 4. Replaced special values with NA

cat(
  "5. Found ", num_violations, " observations violating business rules\n",
  sep = ""
)

## 5. Found 4 observations violating business rules

cat("6. Corrected Sepal.Width errors\n")

## 6. Corrected Sepal.Width errors

cat("7. Imputed missing values using 4 methods:\n")

## 7. Imputed missing values using 4 methods:

cat("   - Sepal.Width: Mean\n")

##    - Sepal.Width: Mean

cat("   - Petal.Length: Median\n")

##    - Petal.Length: Median

cat("   - Sepal.Length: Linear Regression\n")

##    - Sepal.Length: Linear Regression

cat("   - Petal.Width: kNN\n")

##    - Petal.Width: kNN

# Print the R version, platform, and attached/loaded packages of this session.
sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] VIM_7.0.0        colorspace_2.1-1
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9           future_1.69.0        class_7.3-22        
##  [4] robustbase_0.99-7    lattice_0.22-6       listenv_0.10.0      
##  [7] digest_0.6.37        evaluate_1.0.5       fastmap_1.2.0       
## [10] Matrix_1.7-1         jsonlite_1.8.9       vcd_1.4-13          
## [13] e1071_1.7-17         nnet_7.3-19          backports_1.5.0     
## [16] mlr3learners_0.14.0  Formula_1.2-5        laeken_0.5.3        
## [19] mlr3tuning_1.5.1     mlr3_1.4.0           codetools_0.2-20    
## [22] palmerpenguins_0.1.1 jquerylib_0.1.4      abind_1.4-8         
## [25] cli_3.6.5            rlang_1.1.7          crayon_1.5.3        
## [28] parallelly_1.46.1    withr_3.0.2          cachem_1.1.0        
## [31] yaml_2.3.10          mlr3pipelines_0.9.0  tools_4.4.2         
## [34] parallel_4.4.2       uuid_1.2-1           checkmate_2.3.4     
## [37] ranger_0.18.0        boot_1.3-31          globals_0.18.0      
## [40] bbotk_1.8.1          R6_2.5.1             zoo_1.8-15          
## [43] proxy_0.4-29         lifecycle_1.0.5      car_3.1-5           
## [46] MASS_7.3-61          mlr3misc_0.21.0      bslib_0.8.0         
## [49] data.table_1.16.4    Rcpp_1.0.14          lgr_0.5.2           
## [52] paradox_1.0.1        lmtest_0.9-40        DEoptimR_1.1-4      
## [55] xfun_0.56            rstudioapi_0.17.1    knitr_1.51          
## [58] htmltools_0.5.8.1    rmarkdown_2.30       carData_3.0-6       
## [61] compiler_4.4.2       sp_2.2-0

Assignment 4 Willem Donnelly - Data Imputation

Willem Donnelly

February 2026

Question 3

Question 4

Question 5

Question 6

Question 7

Question 8

Question 9, Method 1

Question 9, Method 2

Question 9, Method 3

Question 9, Method 4

Summary