# Exam marks; three entries are missing (NA).
marks <- c(89, 95, 85, NA, 76, 69, 78, NA, 73, NA, 90, 69)

# Mean imputation: every NA is replaced by the mean of the observed marks.
marks1 <- replace(marks, is.na(marks), mean(marks, na.rm = TRUE))

# Median imputation: every NA is replaced by the median of the observed marks.
marks2 <- replace(marks, is.na(marks), median(marks, na.rm = TRUE))

print(marks1)
## [1] 89.00000 95.00000 85.00000 80.44444 76.00000 69.00000 78.00000 80.44444
## [9] 73.00000 80.44444 90.00000 69.00000
print(marks2)
## [1] 89 95 85 78 76 69 78 78 73 78 90 69
# Load Hmisc and VIM through pacman, then repeat the mean/median imputation
# with impute(). The printed result flags imputed entries with a trailing
# `*`, and is.imputed() reports them as TRUE.
pacman::p_load(Hmisc, VIM)
marks1 <- impute(marks, fun = mean)
marks2 <- impute(marks, fun = median)
print(marks1)
## 1 2 3 4 5 6 7 8
## 89.00000 95.00000 85.00000 80.44444* 76.00000 69.00000 78.00000 80.44444*
## 9 10 11 12
## 73.00000 80.44444* 90.00000 69.00000
print(is.imputed(marks1))
## [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
# Ratio imputation: use an auxiliary set of marks (old_marks) that is fully
# observed to fill the gaps in the current marks.
marks <- c(89, 95, 85, NA, 76, 69, 78, NA, 73, NA, 90, 69)
old_marks <- c(90, 100, 75, 89, 80, 77, 88, 69, 75, 88, 95, 70)

# Totals over the positions where the current mark is observed.
print(sum(old_marks[!is.na(marks)]))
## [1] 750
print(sum(marks, na.rm = TRUE))
## [1] 724

# R is the old-to-current ratio of those totals.
R <- sum(old_marks[!is.na(marks)]) / sum(marks, na.rm = TRUE)
print(R)
## [1] 1.035912

# A missing current mark is estimated as its old mark scaled down by R.
marks <- ifelse(is.na(marks), old_marks / R, marks)
print(marks)
## [1] 89.00000 95.00000 85.00000 85.91467 76.00000 69.00000 78.00000 66.60800
## [9] 73.00000 84.94933 90.00000 69.00000
# Regression imputation on iris: blank out the first ten Sepal.Length values,
# fit a linear model on the remaining rows, and fill the gaps with predictions.
data("iris")
print(summary(iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
iris[1:10, "Sepal.Length"] <- NA
print(head(iris))

# lm() drops the rows with a missing response, so the fit uses rows 11-150.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
print(model)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
##
## Coefficients:
## (Intercept) Sepal.Width Petal.Length
## 2.2716 0.5931 0.4689

# I marks the rows whose Sepal.Length is missing; only those are predicted.
I <- is.na(iris[["Sepal.Length"]])
iris[I, "Sepal.Length"] <- predict(model, newdata = iris[I, ])
print(head(iris, n = 10))
# Inject missing values: in every column of iris, set 10 randomly chosen
# rows to NA. seq_along()/seq_len() are used instead of 1:n so the loop is
# safe for zero-length inputs.
# NOTE: the draw is not seeded, so which rows are hit (and therefore the
# summary recorded below) changes from run to run; only the count of 10 NAs
# per column is stable.
for (i in seq_along(iris)) {
  iris[sample(seq_len(nrow(iris)), 10, replace = FALSE), i] <- NA
}
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :4.30 Min. :2.000 Min. :1.000 Min. :0.100 setosa :46
## 1st Qu.:5.10 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:48
## Median :5.70 Median :3.000 Median :4.300 Median :1.300 virginica :46
## Mean :5.82 Mean :3.059 Mean :3.744 Mean :1.181 NA's :10
## 3rd Qu.:6.40 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.90 Max. :4.400 Max. :6.900 Max. :2.500
## NA's :10 NA's :10 NA's :10 NA's :10
# k-nearest-neighbour imputation of every column of iris. As the summary
# below shows, the result carries one extra logical `<column>_imp` flag per
# variable marking which entries were imputed.
iris2 <- kNN(iris)
print(summary(iris2))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :4.30 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
## 1st Qu.:5.10 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:49
## Median :5.80 Median :3.000 Median :4.300 Median :1.300 virginica :51
## Mean :5.84 Mean :3.057 Mean :3.757 Mean :1.207
## 3rd Qu.:6.40 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.90 Max. :4.400 Max. :6.900 Max. :2.500
## Sepal.Length_imp Sepal.Width_imp Petal.Length_imp Petal.Width_imp
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:140 FALSE:140 FALSE:140 FALSE:140
## TRUE :10 TRUE :10 TRUE :10 TRUE :10
##
##
##
## Species_imp
## Mode :logical
## FALSE:140
## TRUE :10
##
##
##

# Drop the *_imp flag columns by keeping the first five columns ...
iris2 <- iris2[, seq_len(5)]
# ... then keep only the four numeric measurements (Species is dropped).
iris2 <- iris2[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
print(summary(iris2))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.30 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.10 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.80 Median :3.000 Median :4.300 Median :1.300
## Mean :5.84 Mean :3.057 Mean :3.757 Mean :1.207
## 3rd Qu.:6.40 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.90 Max. :4.400 Max. :6.900 Max. :2.500
# Load the deliberately corrupted iris data and report how complete it is.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Missing entries in Petal.Length.
missing_petal_length <- length(which(is.na(dirty_iris[["Petal.Length"]])))
cat("Number of missing values in Petal.Length:", missing_petal_length, "\n")
## Number of missing values in Petal.Length: 19

# Rows with no missing value in any column.
complete_obs <- length(which(complete.cases(dirty_iris)))
cat("Number of complete observations:", complete_obs, "\n")
## Number of complete observations: 96

# Share of complete rows, as a percentage.
total_obs <- nrow(dirty_iris)
percent_complete <- 100 * (complete_obs / total_obs)
cat("Percentage of complete observations:", percent_complete, "%\n")
## Percentage of complete observations: 64 %
# Numeric measurement columns screened for special values.
numeric_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

#' Report NA, Inf and NaN counts for one vector
#'
#' Prints a four-line report with `cat()`.
#'
#' @param column Vector to inspect.
#' @param label Text shown after "Column:". Defaults to the deparsed
#'   expression passed as `column`, which is what was always printed before
#'   this argument existed. Pass the column name explicitly when the vector
#'   is extracted dynamically (e.g. `df[[name]]`); otherwise the header only
#'   shows the extraction expression, not the column.
#' @return Called for its printed output; the value is that of the last
#'   `cat()` call.
check_special_values <- function(column, label = deparse(substitute(column))) {
  # Resolve the default label up front.
  force(label)
  # is.na() is TRUE for NaN as well, so NaN entries are included here.
  na_count <- sum(is.na(column))
  # is.infinite()/is.nan() never return NA, so no na.rm is needed.
  inf_count <- sum(is.infinite(column))
  nan_count <- sum(is.nan(column))
  cat("Column:", label, "\n")
  cat(" NA count:", na_count, "\n")
  cat(" Inf count:", inf_count, "\n")
  cat(" NaN count:", nan_count, "\n\n")
}

# Pass the column name as the label so each report names the column it
# describes instead of repeating the literal text `dirty_iris[[col]]`.
for (col in numeric_cols) {
  check_special_values(dirty_iris[[col]], label = col)
}
## Column: Sepal.Length
## NA count: 10
## Inf count: 0
## NaN count: 0
##
## Column: Sepal.Width
## NA count: 17
## Inf count: 0
## NaN count: 0
##
## Column: Petal.Length
## NA count: 19
## Inf count: 0
## NaN count: 0
##
## Column: Petal.Width
## NA count: 12
## Inf count: 1
## NaN count: 0
# Row-level screening: flag every row that has at least one Inf (then NaN)
# among the numeric columns. The columns are coerced to a matrix once and
# the per-row hits are counted with rowSums().
cat("Total rows with any Inf in numeric columns:\n")
## Total rows with any Inf in numeric columns:
inf_rows <- rowSums(is.infinite(as.matrix(dirty_iris[numeric_cols]))) > 0
cat(sum(inf_rows, na.rm = TRUE), "rows with Inf\n")
## 1 rows with Inf
cat("Total rows with any NaN in numeric columns:\n")
## Total rows with any NaN in numeric columns:
nan_rows <- rowSums(is.nan(as.matrix(dirty_iris[numeric_cols]))) > 0
cat(sum(nan_rows, na.rm = TRUE), "rows with NaN\n")
## 0 rows with NaN
# Petal.Width: report the Inf and NA counts, recode Inf as NA, report again.
cat("Initial Inf count in Petal.Width:", sum(is.infinite(dirty_iris[["Petal.Width"]])), "\n")
## Initial Inf count in Petal.Width: 1
cat("Initial NA count in Petal.Width:", sum(is.na(dirty_iris[["Petal.Width"]])), "\n")
## Initial NA count in Petal.Width: 12

# Remember where the Inf values sit so the same positions can be shown
# again after the replacement.
inf_row <- which(is.infinite(dirty_iris[["Petal.Width"]]))
if (length(inf_row) == 0) {
  cat("No Inf found in Petal.Width\n")
} else {
  cat("Row with Inf in Petal.Width:", inf_row, "\n")
  cat("Original value at row", inf_row, ":", dirty_iris[["Petal.Width"]][inf_row], "\n")
}
## Row with Inf in Petal.Width: 86
## Original value at row 86 : Inf

# Recode: an infinite width is treated as missing.
dirty_iris$Petal.Width[inf_row] <- NA
cat("\nAfter replacement:\n")
##
## After replacement:
cat("Inf count in Petal.Width:", sum(is.infinite(dirty_iris[["Petal.Width"]])), "\n")
## Inf count in Petal.Width: 0
cat("NA count in Petal.Width:", sum(is.na(dirty_iris[["Petal.Width"]])), "\n")
## NA count in Petal.Width: 13
if (length(inf_row) != 0) {
  cat("Value at row", inf_row, "is now:", dirty_iris[["Petal.Width"]][inf_row], "\n")
}
## Value at row 86 is now: NA
# Rule check: Sepal.Width must be positive and Sepal.Length at most 30.
# `violations` is TRUE where a rule is broken, FALSE where both hold, and NA
# where a missing measurement leaves the outcome undetermined.
violations <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30

# Index with which() so only definite violations are selected. Indexing a
# data frame with a logical vector that contains NA yields one all-NA
# phantom row per NA, which buried the four real offenders.
violating_rows <- dirty_iris[which(violations), ]
num_violations <- sum(violations, na.rm = TRUE)
cat("Number of observations violating the rules:", num_violations, "\n\n")
## Number of observations violating the rules: 4
cat("Observations violating the rules:\n")
## Observations violating the rules:
print(violating_rows)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Find the rows whose Sepal.Width is known and not positive.
violations_width <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0
violating_rows <- dirty_iris[violations_width, ]
num_violations <- sum(violations_width)
cat("Number of observations with Sepal.Width <= 0 (excluding NA):", num_violations, "\n")
## Number of observations with Sepal.Width <= 0 (excluding NA): 2
cat("Observations violating Sepal.Width > 0 rule:\n")
## Observations violating Sepal.Width > 0 rule:
print(violating_rows)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa

# Repair step 1: a negative width becomes its absolute value. abs() leaves
# non-negative and missing entries as they are, so it is applied to the
# whole column.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# Repair step 2: a width of exactly zero is recoded as missing.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA

cat("\nAfter corrections:\n")
##
## After corrections:
cat("Number of Sepal.Width <= 0 (excluding NA):", length(which(dirty_iris$Sepal.Width <= 0)), "\n")
## Number of Sepal.Width <= 0 (excluding NA): 0
cat("Updated rows that originally violated the rule:\n")
## Updated rows that originally violated the rule:
print(dirty_iris[violations_width, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
cat("Updated NA count in Sepal.Width:", length(which(is.na(dirty_iris$Sepal.Width))), "\n")
## Updated NA count in Sepal.Width: 18
# Missing-value tally per column before any imputation.
cat("Before imputation - NA counts:\n")
## Before imputation - NA counts:
print(vapply(dirty_iris, function(x) sum(is.na(x)), integer(1)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 18 19 13 0

# Sepal.Width: fill the gaps with the mean of the observed values.
sepal_width_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  sepal_width_mean
)
cat("\nSepal.Width imputed with mean:", sepal_width_mean, "\n")
##
## Sepal.Width imputed with mean: 3.462121

# Petal.Length: fill the gaps with the median of the observed values.
petal_length_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  petal_length_median
)
cat("Petal.Length imputed with median:", petal_length_median, "\n")
## Petal.Length imputed with median: 4.5
# Sepal.Length: regress it on the other three measurements using the rows
# where it is observed, then predict it for every row.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  subset = !is.na(Sepal.Length)
)
predicted_sepal_length <- predict(lm_model, newdata = dirty_iris)

# Only the missing entries are overwritten; observed values stay untouched.
# NOTE(review): Petal.Width has not been imputed at this point, so a row
# missing both Sepal.Length and Petal.Width would receive an NA prediction.
# The NA counts printed after the kNN step show none remain for this data.
dirty_iris$Sepal.Length <- with(
  dirty_iris,
  replace(
    Sepal.Length,
    is.na(Sepal.Length),
    predicted_sepal_length[is.na(Sepal.Length)]
  )
)
cat("Sepal.Length imputed with linear regression\n")
## Sepal.Length imputed with linear regression
# Petal.Width: k-nearest-neighbour imputation with k = 5, aggregating
# numeric neighbours by median and categorical ones with maxCat.
dirty_iris <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  numFun = median,
  catFun = maxCat
)
# Keep only the five original columns.
dirty_iris <- dirty_iris[, seq_len(5)]
cat("Petal.Width imputed with kNN (k=5)\n")
## Petal.Width imputed with kNN (k=5)

# Confirm that no missing values are left in any column.
cat("\nAfter imputation - NA counts:\n")
##
## After imputation - NA counts:
print(vapply(dirty_iris, function(x) sum(is.na(x)), integer(1)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
cat("\nSample of imputed dataset:\n")
##
## Sample of imputed dataset:
print(head(dirty_iris, n = 6))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.4 3.200000 4.5 1.5 versicolor
## 2 6.3 3.300000 6.0 2.5 virginica
## 3 6.2 3.462121 5.4 2.3 virginica
## 4 5.0 3.400000 1.6 0.4 setosa
## 5 5.7 2.600000 3.5 1.0 versicolor
## 6 5.3 3.462121 4.5 0.2 setosa