Question 3
# Question 3: load the raw data set and count the missing Petal.Length values.
data_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(data_url)
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
# is.na() flags every missing entry; summing the flags counts them.
petal_length_missing <- is.na(dirty_iris[["Petal.Length"]])
na_count <- sum(petal_length_missing)
print(paste("Number of missing values in Petal.Length:", na_count))
## [1] "Number of missing values in Petal.Length: 19"
Question 4
# Question 4: how many rows have no missing value in any column, and what
# share of the data set that is (rounded to a whole percent).
total_obs <- nrow(dirty_iris)
is_complete <- complete.cases(dirty_iris)
complete_obs <- sum(is_complete)
percentage <- round(complete_obs / total_obs * 100)
print(paste("Complete observations:", complete_obs))
## [1] "Complete observations: 96"
print(paste("Total observations:", total_obs))
## [1] "Total observations: 150"
print(paste("Percentage complete:", percentage, "%"))
## [1] "Percentage complete: 64 %"
Question 5
# Question 5: per-column summary of the raw data. In the output that follows,
# the Inf shown for the Mean and Max of Petal.Width is what reveals that the
# column contains special values.
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Run `test` on each of the four numeric measurement columns and return a
# named logical vector saying whether any value in that column passes it.
# NA results from the test are ignored.
any_in_numeric_cols <- function(test) {
  vapply(
    dirty_iris[, 1:4],
    function(x) any(test(x), na.rm = TRUE),
    logical(1)
  )
}

has_inf <- any_in_numeric_cols(is.infinite)
print("Columns with Inf:")
## [1] "Columns with Inf:"
print(has_inf)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## FALSE FALSE FALSE TRUE
has_nan <- any_in_numeric_cols(is.nan)
print("Columns with NaN:")
## [1] "Columns with NaN:"
print(has_nan)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## FALSE FALSE FALSE FALSE
# Comparing against -Inf yields NA for missing entries, hence the NA-tolerant
# any() inside the helper.
has_neg_inf <- any_in_numeric_cols(function(x) x == -Inf)
print("Columns with -Inf:")
## [1] "Columns with -Inf:"
print(has_neg_inf)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## FALSE FALSE FALSE FALSE
Question 6
# Question 6: in each of the four numeric columns, report where infinite
# values occur and overwrite them with NA.
for (col in names(dirty_iris)[1:4]) {
  inf_mask <- is.infinite(dirty_iris[[col]])
  # Nothing to report or replace in this column.
  if (!any(inf_mask)) {
    next
  }
  print(paste("Found Inf in column:", col))
  inf_rows <- which(inf_mask)
  print(paste("Rows with Inf:", paste(inf_rows, collapse = ", ")))
  dirty_iris[[col]][inf_mask] <- NA
}
## [1] "Found Inf in column: Petal.Width"
## [1] "Rows with Inf: 86"
print("After replacement:")
## [1] "After replacement:"
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Sanity check: count the infinite values left across the numeric columns.
remaining_inf <- sum(
  vapply(dirty_iris[, 1:4], function(x) sum(is.infinite(x)), integer(1))
)
print(paste("Total Inf values remaining:", remaining_inf))
## [1] "Total Inf values remaining: 0"
Question 7
# Question 7: flag rows that break either rule checked here:
#   - Sepal.Width must be positive
#   - Sepal.Length must not exceed 30
violation_1 <- with(dirty_iris, Sepal.Width <= 0)
violation_2 <- with(dirty_iris, Sepal.Length > 30)
violations <- violation_1 | violation_2
# which() keeps only the rows that are definitely TRUE, so rows whose check
# evaluates to NA are not counted as violations.
violation_rows <- which(violations)
num_violations <- length(violation_rows)
print(paste("Number of observations violating rules:", num_violations))
## [1] "Number of observations violating rules: 4"
violating_obs <- dirty_iris[violation_rows, ]
print("Violating observations:")
## [1] "Violating observations:"
print(violating_obs)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
Question 8
# Question 8: repair non-positive Sepal.Width values. Negative values are
# replaced by their absolute value; exact zeros become NA.
sw_errors <- which(dirty_iris$Sepal.Width <= 0)
print(paste("Rows with Sepal.Width <= 0:", paste(sw_errors, collapse = ", ")))
## [1] "Rows with Sepal.Width <= 0: 16, 130"
print("Before correction:")
## [1] "Before correction:"
print(dirty_iris$Sepal.Width[sw_errors])
## [1] -3 0
# Split the offending rows into the two cases. which() already dropped the
# NA entries, so every remaining row is either negative or exactly zero.
negative_rows <- sw_errors[dirty_iris$Sepal.Width[sw_errors] < 0]
zero_rows <- setdiff(sw_errors, negative_rows)
dirty_iris$Sepal.Width[negative_rows] <- abs(dirty_iris$Sepal.Width[negative_rows])
dirty_iris$Sepal.Width[zero_rows] <- NA
print("After correction:")
## [1] "After correction:"
print(dirty_iris$Sepal.Width[sw_errors])
## [1] 3 NA
Question 9, Method 1
# Method 1: fill missing Sepal.Width values with the mean of the observed ones.
sw_missing <- is.na(dirty_iris$Sepal.Width)
sw_mean <- mean(dirty_iris$Sepal.Width[!sw_missing])
print(paste("Mean of Sepal.Width:", round(sw_mean, 2)))
## [1] "Mean of Sepal.Width: 3.46"
dirty_iris$Sepal.Width[sw_missing] <- sw_mean
print(paste("Remaining NA in Sepal.Width:", sum(is.na(dirty_iris$Sepal.Width))))
## [1] "Remaining NA in Sepal.Width: 0"
Question 9, Method 2
# Method 2: fill missing Petal.Length values with the median of the observed
# ones.
pl_missing <- is.na(dirty_iris$Petal.Length)
pl_median <- median(dirty_iris$Petal.Length[!pl_missing])
print(paste("Median of Petal.Length:", round(pl_median, 2)))
## [1] "Median of Petal.Length: 4.5"
dirty_iris$Petal.Length <- replace(dirty_iris$Petal.Length, pl_missing, pl_median)
print(paste("Remaining NA in Petal.Length:", sum(is.na(dirty_iris$Petal.Length))))
## [1] "Remaining NA in Petal.Length: 0"
Question 9, Method 3
# Method 3 (fit): model Sepal.Length from the other three measurements so the
# missing Sepal.Length values can be predicted afterwards.
# sl_missing marks the rows whose Sepal.Length is NA.
sl_missing <- is.na(dirty_iris$Sepal.Length)
# Train only on rows where the response (Sepal.Length) is observed.
complete_data <- dirty_iris[!sl_missing, ]
# The printed summary reports how many of these rows lm() left out because of
# missing values.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
data = complete_data)
print("Linear Regression Model:")
## [1] "Linear Regression Model:"
print(summary(model))
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
## data = complete_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2193 -0.3566 0.0278 0.5260 1.7541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.53182 0.16554 -3.213 0.00168 **
## Sepal.Width 1.50146 0.03180 47.219 < 2e-16 ***
## Petal.Length 0.12287 0.03771 3.259 0.00145 **
## Petal.Width 0.98295 0.12023 8.176 3.05e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8092 on 123 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.9592, Adjusted R-squared: 0.9582
## F-statistic: 963.5 on 3 and 123 DF, p-value: < 2.2e-16
# Method 3 (predict): fill the missing Sepal.Length values with the fitted
# model's predictions for those rows.
if (any(sl_missing)) {
  rows_to_fill <- dirty_iris[sl_missing, ]
  predictions <- predict(model, newdata = rows_to_fill)
  dirty_iris$Sepal.Length[sl_missing] <- predictions
}
print(paste("Remaining NA in Sepal.Length:", sum(is.na(dirty_iris$Sepal.Length))))
## [1] "Remaining NA in Sepal.Length: 0"
Question 9, Method 4
# Method 4: impute Petal.Width with VIM's k-nearest-neighbour imputation,
# using k = 5.
library(VIM)
# Impute on a copy and copy back only the target column, so nothing else in
# dirty_iris is touched by the result of kNN().
iris_for_knn <- dirty_iris
iris_imputed <- kNN(data = iris_for_knn, variable = "Petal.Width", k = 5)
dirty_iris[["Petal.Width"]] <- iris_imputed[["Petal.Width"]]
print(paste("Remaining NA in Petal.Width:", sum(is.na(dirty_iris[["Petal.Width"]]))))
## [1] "Remaining NA in Petal.Width: 0"
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.00 Min. : 2.200 Min. : 0.000 Min. :0.100
## 1st Qu.: 5.10 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.:0.300
## Median : 5.80 Median : 3.100 Median : 4.500 Median :1.300
## Mean : 6.54 Mean : 3.462 Mean : 4.456 Mean :1.209
## 3rd Qu.: 6.40 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.:1.800
## Max. :73.00 Max. :30.000 Max. :63.000 Max. :2.500
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
Summary
# Final check: missing values per column, then an overall summary of the
# cleaned data.
cat("Final Missing Value Count:\n")
## Final Missing Value Count:
missing_by_column <- colSums(is.na(dirty_iris))
print(missing_by_column)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
cat("\nFinal Dataset Summary:\n")
##
## Final Dataset Summary:
final_summary <- summary(dirty_iris)
print(final_summary)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.00 Min. : 2.200 Min. : 0.000 Min. :0.100
## 1st Qu.: 5.10 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.:0.300
## Median : 5.80 Median : 3.100 Median : 4.500 Median :1.300
## Mean : 6.54 Mean : 3.462 Mean : 4.456 Mean :1.209
## 3rd Qu.: 6.40 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.:1.800
## Max. :73.00 Max. :30.000 Max. :63.000 Max. :2.500
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
# Recap of the cleaning steps. The counts are taken from the values computed
# earlier in the script (na_count, complete_obs, percentage, num_violations)
# instead of being typed in by hand: the hand-typed recap reported 54 complete
# observations (36%) and 3 rule violations, contradicting the computed results
# of 96 (64%) and 4.
cat("\nData Cleaning Steps Completed:\n")
##
## Data Cleaning Steps Completed:
cat(sprintf("1. Identified %d missing values in Petal.Length\n", na_count))
## 1. Identified 19 missing values in Petal.Length
cat(sprintf(
  "2. Found %d complete observations (%.0f%%)\n",
  complete_obs, percentage
))
## 2. Found 96 complete observations (64%)
cat("3. Located Inf special values\n")
## 3. Located Inf special values
cat("4. Replaced special values with NA\n")
## 4. Replaced special values with NA
cat(sprintf(
  "5. Found %d observations violating business rules\n",
  num_violations
))
## 5. Found 4 observations violating business rules
cat("6. Corrected Sepal.Width errors\n")
## 6. Corrected Sepal.Width errors
cat("7. Imputed missing values using 4 methods:\n")
## 7. Imputed missing values using 4 methods:
cat(" - Sepal.Width: Mean\n")
## - Sepal.Width: Mean
cat(" - Petal.Length: Median\n")
## - Petal.Length: Median
cat(" - Sepal.Length: Linear Regression\n")
## - Sepal.Length: Linear Regression
cat(" - Petal.Width: kNN\n")
## - Petal.Width: kNN
# Record the R version, platform and package versions used for this run, so
# the results can be reproduced.
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] VIM_7.0.0 colorspace_2.1-1
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.9 future_1.69.0 class_7.3-22
## [4] robustbase_0.99-7 lattice_0.22-6 listenv_0.10.0
## [7] digest_0.6.37 evaluate_1.0.5 fastmap_1.2.0
## [10] Matrix_1.7-1 jsonlite_1.8.9 vcd_1.4-13
## [13] e1071_1.7-17 nnet_7.3-19 backports_1.5.0
## [16] mlr3learners_0.14.0 Formula_1.2-5 laeken_0.5.3
## [19] mlr3tuning_1.5.1 mlr3_1.4.0 codetools_0.2-20
## [22] palmerpenguins_0.1.1 jquerylib_0.1.4 abind_1.4-8
## [25] cli_3.6.5 rlang_1.1.7 crayon_1.5.3
## [28] parallelly_1.46.1 withr_3.0.2 cachem_1.1.0
## [31] yaml_2.3.10 mlr3pipelines_0.9.0 tools_4.4.2
## [34] parallel_4.4.2 uuid_1.2-1 checkmate_2.3.4
## [37] ranger_0.18.0 boot_1.3-31 globals_0.18.0
## [40] bbotk_1.8.1 R6_2.5.1 zoo_1.8-15
## [43] proxy_0.4-29 lifecycle_1.0.5 car_3.1-5
## [46] MASS_7.3-61 mlr3misc_0.21.0 bslib_0.8.0
## [49] data.table_1.16.4 Rcpp_1.0.14 lgr_0.5.2
## [52] paradox_1.0.1 lmtest_0.9-40 DEoptimR_1.1-4
## [55] xfun_0.56 rstudioapi_0.17.1 knitr_1.51
## [58] htmltools_0.5.8.1 rmarkdown_2.30 carData_3.0-6
## [61] compiler_4.4.2 sp_2.2-0