# Install VIM only when it is not already available. The typographic quotes in
# the earlier version of this line were a parse error, and an unconditional
# install would re-download the package on every run.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}

# Read the dirty_iris CSV from its GitHub raw URL into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  header = TRUE
)

Find the missing values in the Petal.Length variable

# Count the NA entries in the Petal.Length column.
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19

Calculate the number and the percentage of observations that are complete

# Number of rows with no NA in any column, the total number of rows, and the
# share of complete rows expressed as a percentage.
complete <- nrow(na.omit(dirty_iris))
total_rows <- dim(dirty_iris)[1]
percent_complete <- 100 * (complete / total_rows)

Check for other types of special values

# Per-column summary: the Min/Mean/Max rows expose out-of-range and
# non-finite values, and the NA's row gives the missing count per column.
summary(object = dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Row positions where Petal.Width is Inf or -Inf (NA entries are not reported).
which(is.infinite(dirty_iris[["Petal.Width"]]))
## [1] 86

Locate the special values identified above and replace them with the missing-value placeholder (NA)

# Overwrite every infinite Petal.Width with NA so that it is handled like any
# other missing value from here on.
dirty_iris$Petal.Width <- replace(
  dirty_iris$Petal.Width,
  is.infinite(dirty_iris$Petal.Width),
  NA
)

Write R code to find the observations that violate the following rules: the sepal width should be a positive value, and the sepal length of an iris cannot exceed 30 cm.

# Number of rows breaking "sepal width must be positive"; rows where the
# width is NA are not counted.
length(which(dirty_iris$Sepal.Width <= 0))
## [1] 2
# Number of rows breaking "sepal length must not exceed 30 cm"; rows where
# the length is NA are not counted.
length(which(dirty_iris$Sepal.Length > 30))
## [1] 2

Locate the observations that violate the rule "Sepal.Width > 0" and make reasonable corrections

# Frequency of each distinct Sepal.Width value before any correction.
table(dirty_iris[["Sepal.Width"]])
## 
##  -3   0 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 
##   1   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1 
## 4.1 4.2  29  30 
##   1   1   1   1
# List the rows that break the "Sepal.Width > 0" rule; which() skips NA rows.
which(dirty_iris$Sepal.Width <= 0)
## [1]  16 130
# Correct by rule instead of by hard-coded row number, so the fix still hits
# the right rows if the input is reordered or contains further violations:
#   * a negative width is taken to be a sign error and is replaced by its
#     absolute value (this is how the -3 becomes 3);
#   * a width of exactly 0 has no plausible correction and is set to NA so
#     that the later imputation step fills it in.
# NOTE(review): assumes row 16 held the -3 and row 130 the 0 — confirm
# against the raw data.
negative_width <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_width] <-
  abs(dirty_iris$Sepal.Width[negative_width])
zero_width <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width] <- NA
# Frequency of each distinct Sepal.Width value after the correction; table()
# leaves NA entries out by default.
table(dirty_iris[["Sepal.Width"]])
## 
## 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 4.1 4.2 
##   3   3   7   5   8  12   9  24  11  12   6  10   6   3   2   5   1   1   1   1 
##  29  30 
##   1   1

Use the four methods we learned to impute the missing values, applying a different method to each column

# Method 1 - mean imputation: every missing Sepal.Width receives the mean of
# the observed Sepal.Width values.
# NOTE(review): the widths 29 and 30 shown in the frequency table are still in
# the column at this point and pull the mean upward - confirm this is intended.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Method 2 - median imputation: every missing Petal.Length receives the median
# of the observed Petal.Length values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Method 3 - regression imputation, step 1: fit Sepal.Length on the two
# columns imputed just before this. lm() leaves out the rows whose
# Sepal.Length is still NA.
lm_model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = dirty_iris
)
summary(object = lm_model)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = dirty_iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5549 -0.1682  0.0996  0.4901  2.0549 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.61716    0.13268  -4.652 7.67e-06 ***
## Sepal.Width   1.47695    0.04000  36.923  < 2e-16 ***
## Petal.Length  0.45604    0.02279  20.012  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.054 on 137 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.9763, Adjusted R-squared:  0.976 
## F-statistic:  2823 on 2 and 137 DF,  p-value: < 2.2e-16
# Method 3 - regression imputation, step 2: predict Sepal.Length for the rows
# where it is missing and write the predictions back into those rows.
missing_idx <- seq_len(nrow(dirty_iris))[is.na(dirty_iris$Sepal.Length)]
predicted_vals <- predict(
  object = lm_model,
  newdata = dirty_iris[missing_idx, , drop = FALSE]
)
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  missing_idx,
  predicted_vals
)

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Method 4 - k-nearest-neighbour imputation: fill the missing Petal.Width
# values using the 5 nearest neighbours, via VIM's kNN().
dirty_iris <- VIM::kNN(
  data = dirty_iris,
  variable = "Petal.Width",
  k = 5
)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
# Remove every column whose name ends in "_imp". drop = FALSE keeps the result
# a data frame even if only one column survives the filter; without it the
# subset would collapse to a bare vector.
dirty_iris <- dirty_iris[, !grepl("_imp$", names(dirty_iris)), drop = FALSE]