# Install VIM only when it is not already available. The package name must be
# written with plain ASCII double quotes: typographic quotes are not valid
# string delimiters in R and make the script fail to parse.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}

# Read the dirty_iris CSV straight from GitHub (network access required).
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing Petal.Length entries.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# Percentage of rows that have no missing value in any column.
total_rows <- nrow(dirty_iris)
complete <- length(which(complete.cases(dirty_iris)))
percent_complete <- (complete / total_rows) * 100
# Per-column overview. The recorded output below shows NA counts for every
# numeric column, an infinite Petal.Width and implausible extremes
# (e.g. a negative Sepal.Width and maxima of 73, 30 and 63).
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Show which rows hold an infinite Petal.Width.
which(is.infinite(dirty_iris[["Petal.Width"]]))
## [1] 86
# Mark those infinite entries as missing so they are treated like the other
# NAs from here on.
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)
# Count implausible measurements before correcting anything.
# Non-positive sepal widths:
sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
## [1] 2
# Sepal lengths above 30.
# NOTE(review): these are only counted here; nothing in the visible script
# corrects them, so they still influence the later imputation steps - confirm
# whether that is intended.
sum(dirty_iris$Sepal.Length > 30, na.rm = TRUE)
## [1] 2
# Frequency table of Sepal.Width before the correction (note -3 and 0).
table(dirty_iris$Sepal.Width)
##
## -3 0 2.2 2.3 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4
## 1 1 3 3 7 5 8 12 9 23 11 12 6 10 6 3 2 5 1 1
## 4.1 4.2 29 30
## 1 1 1 1
which(dirty_iris$Sepal.Width <= 0)
## [1] 16 130
# Repair by value instead of by hard-coded row number, so the fix keeps
# working if the remote file gains, loses or reorders rows:
#   * an exact zero carries no usable information -> set to NA
#   * a negative value is treated as a sign error -> take the absolute value
# NOTE(review): this assumes row 16 held -3 and row 130 held 0, which is what
# the previously hard-coded replacements (3 and NA) suggest - confirm.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# Frequency table after the correction: -3 and 0 are gone, 3 gained one entry.
table(dirty_iris$Sepal.Width)
##
## 2.2 2.3 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1 4.2
## 3 3 7 5 8 12 9 24 11 12 6 10 6 3 2 5 1 1 1 1
## 29 30
## 1 1
# Fill missing Sepal.Width values with the mean of the observed values.
# NOTE(review): the frequency table printed earlier still lists widths of 29
# and 30, which pull this mean upwards - confirm that is acceptable.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Fill missing Petal.Length values with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Regression imputation for Sepal.Length: fit a linear model on the two
# predictors that were completed above, then predict the missing lengths.
# lm() drops the rows whose Sepal.Length is NA (10 rows in the recorded run).
# NOTE(review): the fit still includes the extreme values seen in the summary
# output (e.g. Sepal.Length 73, Sepal.Width 30), which may dominate the
# coefficients - confirm before relying on the predictions.
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = dirty_iris)
summary(lm_model)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = dirty_iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5549 -0.1682 0.0996 0.4901 2.0549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.61716 0.13268 -4.652 7.67e-06 ***
## Sepal.Width 1.47695 0.04000 36.923 < 2e-16 ***
## Petal.Length 0.45604 0.02279 20.012 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.054 on 137 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.9763, Adjusted R-squared: 0.976
## F-statistic: 2823 on 2 and 137 DF, p-value: < 2.2e-16
# Rows that still lack a Sepal.Length.
missing_idx <- which(is.na(dirty_iris[["Sepal.Length"]]))
# Model predictions for exactly those rows.
predicted_vals <- predict(
  lm_model,
  newdata = dirty_iris[missing_idx, , drop = FALSE]
)
# Write the predictions back into the gaps.
dirty_iris[["Sepal.Length"]][missing_idx] <- predicted_vals
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Impute the remaining missing Petal.Width values from the 5 nearest
# neighbours. Only Petal.Width is imputed; the other columns serve as distance
# variables (kNN() default).
# imp_var = FALSE stops kNN() from appending the `Petal.Width_imp` indicator
# column, so no name-based clean-up is needed afterwards. The earlier regex
# clean-up would also have removed any genuine column ending in "_imp".
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5, imp_var = FALSE)
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0