# Load the "dirty" iris data set directly from the datacleaning repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Structure overview: column names, types and the first few values.
str(object = dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...

# Per-column summary statistics. The recorded output already exposes the
# problems handled later: NA's in every numeric column, an infinite
# Petal.Width, a negative Sepal.Width and implausibly large maxima.
summary(object = dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Total number of missing cells across the whole data frame.
sum(is.na(dirty_iris))
## [1] 58

# Missing cells per column (column sums of the logical NA matrix).
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 17 19 12 0

# Number of rows without any missing value ...
sum(complete.cases(dirty_iris))
## [1] 96

# ... and the same figure as a share of all rows (mean of a logical vector
# is the proportion of TRUE values).
mean(complete.cases(dirty_iris))
## [1] 0.64
# Keep only the numeric columns. vapply() guarantees a logical mask of the
# expected shape, and drop = FALSE keeps the result a data frame even when a
# single column qualifies (otherwise the scan below would iterate over the
# individual values of a bare vector instead of over columns).
numeric_columns <- dirty_iris[
  , vapply(dirty_iris, is.numeric, logical(1)), drop = FALSE
]

# Count the special values in each numeric column. The result is an integer
# matrix with one row per kind of special value and one column per variable;
# vapply() enforces that every column yields exactly four integer counts.
special_values <- vapply(
  numeric_columns,
  function(col) {
    c(
      # is.na() is TRUE for NaN as well, so NA_count includes any NaN values.
      NA_count = sum(is.na(col)),
      NaN_count = sum(is.nan(col)),
      # is.infinite() is FALSE for NA, so the sign comparison never
      # contributes an NA to the sum.
      Inf_count = sum(is.infinite(col) & col > 0),
      NegInf_count = sum(is.infinite(col) & col < 0)
    )
  },
  FUN.VALUE = integer(4)
)
special_values
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA_count 10 17 19 12
## NaN_count 0 0 0 0
## Inf_count 0 0 0 1
## NegInf_count 0 0 0 0
# Work on a copy so that dirty_iris keeps the raw values.
na_iris <- dirty_iris

# Treat infinite Petal.Width readings (either sign) as missing.
na_iris$Petal.Width[is.infinite(na_iris$Petal.Width)] <- NA

# Verify the replacement: count the remaining infinite values per column.
# Both +Inf and -Inf are counted so the check matches the replacement above;
# is.infinite() is FALSE for every element of a non-numeric column, so the
# character column Species simply reports 0.
special_values2 <- vapply(
  na_iris,
  function(col) sum(is.infinite(col)),
  integer(1)
)
print(special_values2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
# Row indices that break the plausibility rules: a sepal width that is not
# strictly positive, or a sepal length above 30. which() drops rows where the
# condition evaluates to NA.
violations <- with(na_iris, which(Sepal.Width <= 0 | Sepal.Length > 30))

# Show the offending observations.
na_iris[violations, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Copy the data and blank out the values that violate the rules, leaving
# them as NA for the imputation steps.
acc_iris <- na_iris
is.na(acc_iris$Sepal.Width) <- which(acc_iris$Sepal.Width <= 0)
is.na(acc_iris$Sepal.Length) <- which(acc_iris$Sepal.Length > 30)

# Re-run the rule check; an empty result means no violations remain.
with(acc_iris, which(Sepal.Width <= 0 | Sepal.Length > 30))
## integer(0)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Mean imputation: fill every missing Sepal.Width with the mean of the
# observed Sepal.Width values.
imputed_iris <- acc_iris
imputed_iris$Sepal.Width <- replace(
  imputed_iris$Sepal.Width,
  is.na(imputed_iris$Sepal.Width),
  mean(imputed_iris$Sepal.Width, na.rm = TRUE)
)

# Confirm that no missing Sepal.Width values are left.
sum(is.na(imputed_iris$Sepal.Width))
## [1] 0
# Median imputation: fill every missing Petal.Length with the median of the
# observed Petal.Length values.
iris_2 <- imputed_iris
iris_2$Petal.Length <- replace(
  iris_2$Petal.Length,
  is.na(iris_2$Petal.Length),
  median(iris_2$Petal.Length, na.rm = TRUE)
)

# Confirm that no missing Petal.Length values are left.
sum(is.na(iris_2$Petal.Length))
## [1] 0
# Regression imputation of Sepal.Length.
iris_3 <- iris_2

# Rows whose Sepal.Length has to be filled in.
na_rows <- is.na(iris_3$Sepal.Length)

# Linear model of Sepal.Length on the two predictors imputed in the previous
# steps; lm() leaves out the rows with a missing response by default.
model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = iris_3
)

# Replace each missing Sepal.Length by the model's prediction for that row.
iris_3$Sepal.Length[na_rows] <- predict(
  model,
  newdata = iris_3[na_rows, , drop = FALSE]
)

# Confirm that no missing Sepal.Length values are left.
sum(is.na(iris_3$Sepal.Length))
## [1] 0
# k-nearest-neighbour imputation of Petal.Width using VIM::kNN(); only the
# column named in `variable` is imputed.
# NOTE(review): no `k` or `imp_var` is passed, so VIM's defaults apply
# (presumably k = 5 and an added Petal.Width_imp indicator column) - confirm
# against the installed VIM version.
iris_4 <- iris_3
iris_5 <- kNN(data = iris_4, variable = "Petal.Width")
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.00000 2.20000 0.00000 18.40128 30.00000 63.00000

# Confirm that no missing Petal.Width values are left.
sum(is.na(iris_5$Petal.Width))
## [1] 0
````