# Load the raw data set and count the missing Petal.Length measurements.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
petal_length_missing <- is.na(dirty_iris[["Petal.Length"]])
sum(petal_length_missing)
## [1] 19
# Report how many rows have no missing value in any column, both as an
# absolute count and as a percentage of all rows.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
row_is_complete <- complete.cases(dirty_iris)
complete_obs <- sum(row_is_complete)
total_obs <- length(row_is_complete)
percentage_complete <- (complete_obs / total_obs) * 100
cat("Number of complete observations:", complete_obs, "\n")
## Number of complete observations: 96
cat("Percentage of complete observations:", round(percentage_complete, 2), "%\n")
## Percentage of complete observations: 64 %
# Count the special values (NA, NaN, Inf, -Inf) in every numeric column.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# vapply() pins the result type, so an unexpected column cannot silently turn
# the mask into a list the way sapply() would.
numeric_columns <- vapply(dirty_iris, is.numeric, logical(1))

# drop = FALSE keeps a data frame even when only one column is numeric;
# without it the subset collapses to a bare vector and the per-column counts
# below would become per-element counts.
numeric_data <- dirty_iris[, numeric_columns, drop = FALSE]

# Count, per column of `data`, the elements for which `predicate` is TRUE.
# `predicate` must return a logical vector without NA for the values it is
# meant to flag; the result is a named integer vector.
count_matching <- function(data, predicate) {
  vapply(data, function(x) sum(predicate(x)), integer(1))
}

# Note: is.na() is also TRUE for NaN, so na_count includes any NaN entries.
na_count <- count_matching(numeric_data, is.na)
nan_count <- count_matching(numeric_data, is.nan)
# is.infinite() is FALSE for NA/NaN, so the sign test never yields NA here.
inf_count <- count_matching(numeric_data, function(x) is.infinite(x) & x > 0)
neg_inf_count <- count_matching(numeric_data, function(x) is.infinite(x) & x < 0)

cat("NA values:\n")
## NA values:
print(na_count)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 10 17 19 12
cat("NaN values:\n")
## NaN values:
print(nan_count)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
cat("\nInf values:\n")
##
## Inf values:
print(inf_count)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 1
cat("\n-Inf values:\n")
##
## -Inf values:
print(neg_inf_count)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
# Turn every special numeric value (NaN, Inf, -Inf) into a plain NA and
# summarise the cleaned data.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
numeric_columns <- sapply(dirty_iris, is.numeric)

# Replace NaN and +/-Inf entries of one column by NA in a single pass.
special_to_na <- function(values) {
  values[is.nan(values) | is.infinite(values)] <- NA
  values
}

dirty_iris[numeric_columns] <- lapply(dirty_iris[numeric_columns], special_to_na)
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# List the observations that break either rule:
#   * Sepal.Width must be strictly positive
#   * Sepal.Length must not exceed 30
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sepal_cols <- c("Sepal.Length", "Sepal.Width")
dirty_iris[sepal_cols] <- lapply(dirty_iris[sepal_cols], as.numeric)

width_not_positive <- dirty_iris$Sepal.Width <= 0
length_too_large <- dirty_iris$Sepal.Length > 30

# which() keeps only the rows where the combined test is TRUE, so rows whose
# test is NA (value missing, other rule not broken) are left out.
violations <- dirty_iris[which(width_not_positive | length_too_large), ]
print(violations)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
cat("Number of observations violating the rules:", nrow(violations))
## Number of observations violating the rules: 4
# Repair Sepal.Width: a negative value is taken to be a sign error and is
# replaced by its absolute value; an exact zero is treated as missing.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sepal_width <- as.numeric(dirty_iris$Sepal.Width)

# Build both masks from the uncorrected values; NA entries match neither mask
# and therefore stay NA.
is_negative <- !is.na(sepal_width) & sepal_width < 0
is_zero <- !is.na(sepal_width) & sepal_width == 0
sepal_width[is_negative] <- abs(sepal_width[is_negative])
sepal_width[is_zero] <- NA

dirty_iris$Sepal.Width <- sepal_width
print(dirty_iris$Sepal.Width)
## [1] 3.2 3.3 NA 3.4 2.6 NA 2.7 3.0 2.7 3.1 3.5 2.7 3.0 2.8 3.9
## [16] 3.0 NA 3.2 4.0 NA 3.6 NA 2.8 3.3 3.0 3.2 3.1 29.0 3.2 2.8
## [31] 3.2 3.2 2.8 2.9 2.9 3.0 3.0 2.2 2.5 3.0 NA 2.7 NA 2.7 4.2
## [46] 2.8 NA 3.2 3.0 3.4 2.6 3.1 2.7 3.4 3.3 3.8 3.8 2.9 2.8 2.8
## [61] 2.3 2.8 3.0 3.3 3.0 2.5 2.5 3.2 3.5 3.5 3.0 3.1 3.5 NA 2.8
## [76] 2.5 3.5 3.0 3.8 3.8 2.6 3.4 2.9 3.7 3.0 3.8 2.9 2.9 2.9 2.5
## [91] 3.2 NA 3.4 2.7 2.2 3.1 2.3 NA 3.0 2.8 3.4 3.6 2.7 3.0 3.7
## [106] NA 3.0 3.0 2.8 3.4 3.4 3.4 3.4 3.3 3.1 2.6 NA 3.1 3.0 2.8
## [121] 3.0 2.3 3.2 4.1 30.0 2.9 3.2 NA 3.6 NA 2.5 3.1 NA 3.3 3.0
## [136] 3.0 3.2 3.0 3.1 2.2 NA NA 3.0 2.9 2.5 3.1 3.0 3.5 3.1 2.6
cat("Number of NA values in Sepal.Width after correction:", sum(is.na(dirty_iris$Sepal.Width)))
## Number of NA values in Sepal.Width after correction: 18
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(mice)
## Warning: package 'mice' was built under R version 4.3.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Impute each numeric column with a different technique:
#   Sepal.Width  -> column mean
#   Petal.Length -> column median
#   Sepal.Length -> linear-regression prediction (mice, "norm.predict")
#   Petal.Width  -> k-nearest-neighbours (k = 5), column mean as fallback
#
# NOTE: the console output previously recorded below this code came from a
# version in which the regression step targeted the wrong column and the
# Petal.Width fallback was discarded (summary showed NA's in Sepal.Length and
# Petal.Width and Mean :Inf). Re-knit the document to regenerate the output.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)
numeric_cols <- c("Sepal.Width", "Sepal.Length", "Petal.Length", "Petal.Width")
dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], function(x) {
  x <- as.numeric(as.character(x))
  # Inf / -Inf / NaN are not measurements; left in place they turn the column
  # mean into Inf and break the distance computation of the kNN step.
  x[!is.finite(x)] <- NA
  x
})

dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# The model-based steps run on a working copy in which Species is a factor:
# knnImputation() converts non-numeric columns with as.integer(), which turns
# a character column into all-NA and leaves no complete case to learn from.
# dirty_iris itself keeps Species as character.
model_data <- dirty_iris
model_data$Species <- as.factor(model_data$Species)

# Address the method vector by column name; a positional vector silently
# targets whichever column happens to sit in that slot.
imp_method <- setNames(rep("", ncol(model_data)), names(model_data))
imp_method["Sepal.Length"] <- "norm.predict"

# Petal.Width is still incomplete at this point, so it must not be used as a
# predictor: its NAs could otherwise carry over into the predictions.
pred_matrix <- make.predictorMatrix(model_data)
pred_matrix["Sepal.Length", "Petal.Width"] <- 0

mice_data <- mice(
  model_data,
  method = imp_method,
  predictorMatrix = pred_matrix,
  m = 1,
  maxit = 5,
  seed = 123
)
model_data <- complete(mice_data)
dirty_iris$Sepal.Length <- model_data$Sepal.Length

# tryCatch() returns the value of whichever branch ran. The fallback has to be
# returned and assigned here: an assignment to dirty_iris made inside the
# handler function would only change a local copy.
dirty_iris$Petal.Width <- tryCatch(
  knnImputation(model_data, k = 5)$Petal.Width,
  error = function(e) {
    cat("kNN failed: ", conditionMessage(e), "\nApplying mean imputation for Petal.Width.\n")
    petal_width <- dirty_iris$Petal.Width
    petal_width[is.na(petal_width)] <- mean(petal_width, na.rm = TRUE)
    petal_width
  }
)

summary(dirty_iris)