Install Packages

# install.packages("VIM")

Question 3: How many missing values do you find in the Petal.Length variable?

# Load the "dirty" iris data set directly from its GitHub URL.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Count the missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4: Calculate the number and the percentage of observations that are complete.

# Number of rows that have no missing value in any column.
num_complete <- length(which(complete.cases(dirty_iris)))

# The same count expressed as a percentage of all rows.
percent_complete <- 100 * (num_complete / nrow(dirty_iris))

print(num_complete)
## [1] 96
print(percent_complete)
## [1] 64

Question 5: Is there another type of special value contained in the numeric columns?

# Print per-column summary statistics so the numeric columns can be inspected
# for special values other than NA (for example Inf in the Mean/Max rows).
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 6: Write R code to locate the above identified special value and replace them with a missing value placeholder.

# Logical mask of the infinite (Inf / -Inf) entries in Petal.Width.
inf_indicator <- is.infinite(dirty_iris[["Petal.Width"]])
# Overwrite the flagged entries with the missing-value placeholder NA.
dirty_iris[["Petal.Width"]] <- replace(
  dirty_iris[["Petal.Width"]],
  inf_indicator,
  NA
)
# Re-run the summary to confirm that no infinite values remain.
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 7: Write R code to find out the observations that violate these rules. How many observations violate the above rules?

# Collect the rows where Sepal.Width is not positive or Sepal.Length exceeds 30.
# which() keeps only the TRUE positions, so rows whose comparison is NA are
# left out.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]

print(violations)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
# Number of observations that break at least one of the two conditions.
nrow(violations)
## [1] 4

Question 8: Would you locate the observation that violates the rule of “Sepal.Width >0” and make reasonable corrections?

# Positions of the negative Sepal.Width values; which() skips NA comparisons.
neg_indicator <- which(dirty_iris$Sepal.Width < 0)
# Treat the minus sign as a data-entry error and store the absolute value.
# This has to be an assignment (`<-`): a bare comparison (`<`) would only
# print TRUE/FALSE and leave the data unchanged.
dirty_iris$Sepal.Width[neg_indicator] <- abs(dirty_iris$Sepal.Width[neg_indicator])
# Show the corrected row(s) to verify the fix.
dirty_iris[neg_indicator, ]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16            5           3          3.5           1 versicolor
# A width of exactly zero cannot be repaired by a sign flip, so it is marked
# as missing instead.
zero_indicator <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_indicator] <- NA

Question 9: Write the R code to do the imputation as specified above. Mark the tasks that your attached R code achieves.

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# 1. Sepal.Width: fill the missing entries with the mean of the observed ones.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width[!is.na(dirty_iris$Sepal.Width)])
)

# 2. Petal.Length: fill the missing entries with the median of the observed ones.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length[!is.na(dirty_iris$Petal.Length)])
)

# 3. Sepal.Length: predict the missing entries from a linear regression on
#    Sepal.Width and Petal.Width.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Width, data = dirty_iris)
sepal_length_missing <- is.na(dirty_iris$Sepal.Length)
rows_to_impute <- dirty_iris[sepal_length_missing, ]
dirty_iris$Sepal.Length[sepal_length_missing] <- predict(
  model,
  newdata = rows_to_impute
)

# 4. Petal.Width: k-nearest-neighbour imputation (k = 5) via VIM::kNN.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)

# Remove the Petal.Width_imp column that is present after the kNN() call.
dirty_iris[["Petal.Width_imp"]] <- NULL
# Verify that no column contains missing values any more.
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0