Question 3

# Load the dirty_iris CSV from the edwindj/datacleaning GitHub repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Show the column types and the first few values of every variable.
str(dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
# Count the missing entries in Petal.Length.
na_count <- sum(is.na(dirty_iris[["Petal.Length"]]))
print(paste0("Number of missing values in Petal.Length: ", na_count))
## [1] "Number of missing values in Petal.Length: 19"

Question 4

# Number of rows overall, and number of rows without any NA.
total_obs <- nrow(dirty_iris)
complete_obs <- sum(complete.cases(dirty_iris))
# Share of complete rows, as a whole-number percentage.
percentage <- round(100 * (complete_obs / total_obs))

print(paste0("Complete observations: ", complete_obs))
## [1] "Complete observations: 96"
print(paste0("Total observations: ", total_obs))
## [1] "Total observations: 150"
print(paste0("Percentage complete: ", percentage, " %"))
## [1] "Percentage complete: 64 %"

Question 5

# Descriptive statistics per column. An Inf in the Mean/Max rows signals that
# the column holds a special value.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Flag which of the four measurement columns contain special values.
# vapply() (rather than sapply()) guarantees one logical per column, so the
# result type cannot silently change with the input.
has_inf <- vapply(
  dirty_iris[, 1:4],
  function(x) any(is.infinite(x)),
  logical(1)
)
print("Columns with Inf:")
## [1] "Columns with Inf:"
print(has_inf)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE         TRUE
has_nan <- vapply(
  dirty_iris[, 1:4],
  function(x) any(is.nan(x)),
  logical(1)
)
print("Columns with NaN:")
## [1] "Columns with NaN:"
print(has_nan)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE        FALSE
# na.rm = TRUE is required here: comparing NA with -Inf yields NA.
has_neg_inf <- vapply(
  dirty_iris[, 1:4],
  function(x) any(x == -Inf, na.rm = TRUE),
  logical(1)
)
print("Columns with -Inf:")
## [1] "Columns with -Inf:"
print(has_neg_inf)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        FALSE        FALSE        FALSE        FALSE

Question 6

# Walk over the four measurement columns and turn every Inf/-Inf into NA,
# reporting the affected column and row numbers along the way.
for (col in names(dirty_iris)[1:4]) {
  # Nothing to do for columns without infinite values.
  if (!any(is.infinite(dirty_iris[[col]]))) {
    next
  }

  print(paste("Found Inf in column:", col))

  inf_rows <- which(is.infinite(dirty_iris[[col]]))
  print(paste("Rows with Inf:", toString(inf_rows)))

  dirty_iris[[col]][inf_rows] <- NA
}
## [1] "Found Inf in column: Petal.Width"
## [1] "Rows with Inf: 86"
print("After replacement:")
## [1] "After replacement:"
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Sanity check: add up the per-column counts of infinite values still present.
print(paste(
  "Total Inf values remaining:",
  sum(unlist(lapply(dirty_iris[, 1:4], function(x) sum(is.infinite(x)))))
))
## [1] "Total Inf values remaining: 0"

Question 7

# Rule checks: flag a non-positive Sepal.Width and a Sepal.Length above 30.
violation_1 <- with(dirty_iris, Sepal.Width <= 0)
violation_2 <- with(dirty_iris, Sepal.Length > 30)

# A row is flagged as soon as it breaks either rule.
violations <- violation_1 | violation_2

# which() skips NA flags, so only definite violations are counted.
num_violations <- length(which(violations))

print(paste0("Number of observations violating rules: ", num_violations))
## [1] "Number of observations violating rules: 4"
# Keep the rows whose flag is TRUE (NA flags are excluded).
violating_obs <- dirty_iris[!is.na(violations) & violations, ]
print("Violating observations:")
## [1] "Violating observations:"
print(violating_obs)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Question 8

# Rows whose Sepal.Width is zero or negative. which() already drops NA, so
# every index in sw_errors refers to a real, non-missing value.
sw_errors <- which(dirty_iris$Sepal.Width <= 0)

print(paste("Rows with Sepal.Width <= 0:", paste(sw_errors, collapse = ", ")))
## [1] "Rows with Sepal.Width <= 0: 16, 130"
print("Before correction:")
## [1] "Before correction:"
print(dirty_iris[sw_errors, "Sepal.Width"])
## [1] -3  0
# Vectorised correction (replaces the element-by-element loop):
#   negative widths -> absolute value (treated as a sign error)
#   zero widths     -> NA (no usable measurement)
sw_flagged <- dirty_iris$Sepal.Width[sw_errors]
dirty_iris$Sepal.Width[sw_errors] <- ifelse(
  sw_flagged < 0,
  abs(sw_flagged),
  NA_real_
)

print("After correction:")
## [1] "After correction:"
print(dirty_iris[sw_errors, "Sepal.Width"])
## [1]  3 NA

Question 9, Method 1

# Method 1: fill the missing Sepal.Width values with the column mean.
sw_mean <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
print(paste0("Mean of Sepal.Width: ", round(sw_mean, 2)))
## [1] "Mean of Sepal.Width: 3.46"
dirty_iris[["Sepal.Width"]] <- replace(
  dirty_iris[["Sepal.Width"]],
  is.na(dirty_iris[["Sepal.Width"]]),
  sw_mean
)

# Confirm that no NA is left in the column.
print(paste0(
  "Remaining NA in Sepal.Width: ",
  sum(is.na(dirty_iris[["Sepal.Width"]]))
))
## [1] "Remaining NA in Sepal.Width: 0"

Question 9, Method 2

# Method 2: fill the missing Petal.Length values with the column median.
pl_median <- median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
print(paste0("Median of Petal.Length: ", round(pl_median, 2)))
## [1] "Median of Petal.Length: 4.5"
dirty_iris[is.na(dirty_iris[["Petal.Length"]]), "Petal.Length"] <- pl_median

# Confirm that no NA is left in the column.
print(paste0(
  "Remaining NA in Petal.Length: ",
  sum(is.na(dirty_iris[["Petal.Length"]]))
))
## [1] "Remaining NA in Petal.Length: 0"

Question 9, Method 3

# Method 3: regression imputation for Sepal.Length.
# Record which rows lack a Sepal.Length before anything is filled in.
sl_missing <- is.na(dirty_iris[["Sepal.Length"]])

# Fit on the rows with an observed response. lm() additionally drops rows
# whose predictors are NA (see "deleted due to missingness" in the output).
complete_data <- dirty_iris[which(!sl_missing), ]
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = complete_data
)

print("Linear Regression Model:")
## [1] "Linear Regression Model:"
print(summary(model))
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
##     data = complete_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2193 -0.3566  0.0278  0.5260  1.7541 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.53182    0.16554  -3.213  0.00168 ** 
## Sepal.Width   1.50146    0.03180  47.219  < 2e-16 ***
## Petal.Length  0.12287    0.03771   3.259  0.00145 ** 
## Petal.Width   0.98295    0.12023   8.176 3.05e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8092 on 123 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.9592, Adjusted R-squared:  0.9582 
## F-statistic: 963.5 on 3 and 123 DF,  p-value: < 2.2e-16
# Predict only for the rows that were missing and write the values back.
if (any(sl_missing)) {
  predictions <- predict(model, newdata = dirty_iris[sl_missing, ])
  dirty_iris[["Sepal.Length"]][sl_missing] <- predictions
}

print(paste0(
  "Remaining NA in Sepal.Length: ",
  sum(is.na(dirty_iris[["Sepal.Length"]]))
))
## [1] "Remaining NA in Sepal.Length: 0"

Question 9, Method 4

# Method 4: k-nearest-neighbour imputation for Petal.Width via VIM::kNN().
library(VIM)

# Work on a copy so that only the imputed column is written back below.
iris_for_knn <- dirty_iris

# Impute Petal.Width only, using the 5 nearest neighbours.
iris_imputed <- kNN(data = iris_for_knn, variable = "Petal.Width", k = 5)

dirty_iris[["Petal.Width"]] <- iris_imputed[["Petal.Width"]]

print(paste0(
  "Remaining NA in Petal.Width: ",
  sum(is.na(dirty_iris[["Petal.Width"]]))
))
## [1] "Remaining NA in Petal.Width: 0"
summary(dirty_iris)
##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.00   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.10   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.80   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.54   Mean   : 3.462   Mean   : 4.456   Mean   :1.209  
##  3rd Qu.: 6.40   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.00   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Summary

# Final report: remaining NA counts, descriptive statistics, and a recap of
# the cleaning steps. The recap prints the values computed earlier in the
# script (na_count, complete_obs, percentage, num_violations) instead of
# hand-typed numbers, so it cannot drift from the actual results. The
# hand-typed text used to claim 54 complete observations (36%) and 3 rule
# violations, while the computed values are 96 (64%) and 4.
cat("Final Missing Value Count:\n")
## Final Missing Value Count:
print(colSums(is.na(dirty_iris)))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0
cat("\nFinal Dataset Summary:\n")
## 
## Final Dataset Summary:
print(summary(dirty_iris))
##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.00   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.10   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.80   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.54   Mean   : 3.462   Mean   : 4.456   Mean   :1.209  
##  3rd Qu.: 6.40   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.00   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
cat("\nData Cleaning Steps Completed:\n")
## 
## Data Cleaning Steps Completed:
cat("1. Identified ", na_count, " missing values in Petal.Length\n", sep = "")
## 1. Identified 19 missing values in Petal.Length
cat(
  "2. Found ", complete_obs, " complete observations (", percentage, "%)\n",
  sep = ""
)
## 2. Found 96 complete observations (64%)
cat("3. Located Inf special values\n")
## 3. Located Inf special values
cat("4. Replaced special values with NA\n")
## 4. Replaced special values with NA
cat(
  "5. Found ", num_violations, " observations violating business rules\n",
  sep = ""
)
## 5. Found 4 observations violating business rules
cat("6. Corrected Sepal.Width errors\n")
## 6. Corrected Sepal.Width errors
cat("7. Imputed missing values using 4 methods:\n")
## 7. Imputed missing values using 4 methods:
cat("   - Sepal.Width: Mean\n")
##    - Sepal.Width: Mean
cat("   - Petal.Length: Median\n")
##    - Petal.Length: Median
cat("   - Sepal.Length: Linear Regression\n")
##    - Sepal.Length: Linear Regression
cat("   - Petal.Width: kNN\n")
##    - Petal.Width: kNN
# Print the R version, platform, and attached/loaded packages for this run.
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] VIM_7.0.0        colorspace_2.1-1
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9           future_1.69.0        class_7.3-22        
##  [4] robustbase_0.99-7    lattice_0.22-6       listenv_0.10.0      
##  [7] digest_0.6.37        evaluate_1.0.5       fastmap_1.2.0       
## [10] Matrix_1.7-1         jsonlite_1.8.9       vcd_1.4-13          
## [13] e1071_1.7-17         nnet_7.3-19          backports_1.5.0     
## [16] mlr3learners_0.14.0  Formula_1.2-5        laeken_0.5.3        
## [19] mlr3tuning_1.5.1     mlr3_1.4.0           codetools_0.2-20    
## [22] palmerpenguins_0.1.1 jquerylib_0.1.4      abind_1.4-8         
## [25] cli_3.6.5            rlang_1.1.7          crayon_1.5.3        
## [28] parallelly_1.46.1    withr_3.0.2          cachem_1.1.0        
## [31] yaml_2.3.10          mlr3pipelines_0.9.0  tools_4.4.2         
## [34] parallel_4.4.2       uuid_1.2-1           checkmate_2.3.4     
## [37] ranger_0.18.0        boot_1.3-31          globals_0.18.0      
## [40] bbotk_1.8.1          R6_2.5.1             zoo_1.8-15          
## [43] proxy_0.4-29         lifecycle_1.0.5      car_3.1-5           
## [46] MASS_7.3-61          mlr3misc_0.21.0      bslib_0.8.0         
## [49] data.table_1.16.4    Rcpp_1.0.14          lgr_0.5.2           
## [52] paradox_1.0.1        lmtest_0.9-40        DEoptimR_1.1-4      
## [55] xfun_0.56            rstudioapi_0.17.1    knitr_1.51          
## [58] htmltools_0.5.8.1    rmarkdown_2.30       carData_3.0-6       
## [61] compiler_4.4.2       sp_2.2-0