pacman:: p_load(lubridate, stringr, VIM)
# Read the dirty iris CSV straight from its GitHub URL into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Count the observations whose Petal.Length is missing.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# Number of rows that have no missing value in any column.
num_complete <- sum(complete.cases(dirty_iris))
# Share of complete rows, derived from the data instead of hand-typed counts
# so the ratio stays correct if the input file changes.
num_complete / nrow(dirty_iris)
## [1] 0.64
# Per-column overview; the Inf mean/max of Petal.Width below reveals
# infinite entries in that column.
summary(object = dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Flag every infinite Petal.Width as missing so later steps can impute it.
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)
# Rows that break the plausibility rules checked here:
#   - Sepal.Width must be strictly positive
#   - Sepal.Length must not exceed 30
# which() ignores NA comparisons, so missing values are not counted.
# Each index vector is named after the column it tests (sw = Sepal.Width,
# sl = Sepal.Length).
viol_sw <- which(dirty_iris$Sepal.Width <= 0)
viol_sl <- which(dirty_iris$Sepal.Length > 30)
# Distinct offending row numbers, in ascending order.
violations <- sort(unique(c(viol_sw, viol_sl)))
length(violations)
## [1] 4
# Rows whose Sepal.Width is zero or negative; which() leaves out NA comparisons.
viol_idx <- which(dirty_iris$Sepal.Width <= 0)
dirty_iris[viol_idx, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
# Negative widths: keep the magnitude, drop the sign.
neg_idx <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_idx] <- abs(dirty_iris$Sepal.Width[neg_idx])
# Widths of exactly zero: mark them as missing.
zero_idx <- which(dirty_iris$Sepal.Width == 0)
is.na(dirty_iris$Sepal.Width) <- zero_idx
# Show the same rows again to confirm the repair.
dirty_iris[viol_idx, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
# Fill missing Sepal.Width with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Fill missing Petal.Length with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Only Sepal.Length and Petal.Width still report NA's after these two steps.
summary(object = dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. : 2.200 Min. : 0.000 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.:0.300
## Median : 5.750 Median : 3.100 Median : 4.500 Median :1.300
## Mean : 6.559 Mean : 3.462 Mean : 4.456 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.000 Max. :2.500
## NA's :10 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Impute the missing Petal.Width values with VIM's k-nearest-neighbour routine.
# Only Petal.Width is targeted; the returned data frame carries an extra
# logical column, Petal.Width_imp, as the str() output below shows.
dirty_iris <- kNN(data = dirty_iris, variable = "Petal.Width")
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0
# Inspect the structure of the imputed data frame.
str(object = dirty_iris)
## 'data.frame': 150 obs. of 6 variables:
## $ Sepal.Length : num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 3.46 3.4 2.6 ...
## $ Petal.Length : num 4.5 6 5.4 1.6 3.5 4.5 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 1.9 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
## $ Petal.Width_imp: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# Linear model of Sepal.Length on Sepal.Width and Petal.Length.
fit <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = dirty_iris
)
# Replace each missing Sepal.Length with the model's prediction for that row;
# observed values are left as they are.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predict(fit, newdata = dirty_iris)[is.na(dirty_iris$Sepal.Length)]
)
# Final overview: no column reports NA's any more.
summary(object = dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. : 2.200 Min. : 0.000 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.:0.300
## Median : 5.800 Median : 3.100 Median : 4.500 Median :1.300
## Mean : 6.528 Mean : 3.462 Mean : 4.456 Mean :1.208
## 3rd Qu.: 6.400 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.000 Max. :2.500
## Species Petal.Width_imp
## Length:150 Mode :logical
## Class :character FALSE:137
## Mode :character TRUE :13
##
##
##