Question 3

Read the data and use sum(is.na()) to count the missing values in Petal.Length

# Read the raw iris data from the datacleaning repository, then count the
# missing entries in the Petal.Length column.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

Use complete.cases() with mean() and sum() to calculate the proportion and the number of complete observations

# Flag the rows that have no NA in any column once, then report the
# proportion and the count of such rows.
complete_rows <- complete.cases(dirty_iris)
mean(complete_rows)
## [1] 0.64
sum(complete_rows)
## [1] 96

Question 5

Use summary() to see which special values are in the numeric columns

# Per-column summary; the output below shows Inf as Mean/Max of Petal.Width,
# NA counts in every numeric column and a negative minimum in Sepal.Width.
summary(object = dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 6

Use kNN to impute NAs

library(VIM)
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Impute the missing values of every column with k-nearest-neighbour imputation.
# NOTE(review): Petal.Width still contains Inf at this point (see the summary
# output above); confirm that kNN() treats only NA as missing.
Perfect_iris <- kNN(data = dirty_iris)

Question 7

Use subset() to find the rows that violate the rules (Sepal.Width <= 0 or Sepal.Length > 30)

# Row-wise flag: TRUE where the sepal width is not positive or the sepal
# length exceeds 30.
with(Perfect_iris, Sepal.Width <= 0 | Sepal.Length > 30)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE
# Collect every row that breaks a rule: a non-positive sepal width or a sepal
# length above 30.
violations <- subset(
  Perfect_iris,
  subset = Sepal.Length > 30 | Sepal.Width <= 0
)
violations
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0         2.2  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
##     Sepal.Length_imp Sepal.Width_imp Petal.Length_imp Petal.Width_imp
## 16             FALSE           FALSE            FALSE           FALSE
## 28             FALSE           FALSE            FALSE            TRUE
## 125            FALSE           FALSE            FALSE           FALSE
## 130            FALSE           FALSE            FALSE           FALSE
##     Species_imp
## 16        FALSE
## 28        FALSE
## 125       FALSE
## 130       FALSE
# Number of rows that violate at least one rule.
dim(violations)[1L]
## [1] 4

Question 8

Use the absolute value to fix negative Sepal.Width values and set Sepal.Width values of 0 to NA

# Show the rows whose sepal width is zero or negative before repairing them.
Perfect_iris[Perfect_iris[["Sepal.Width"]] <= 0, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
##     Sepal.Length_imp Sepal.Width_imp Petal.Length_imp Petal.Width_imp
## 16             FALSE           FALSE            FALSE           FALSE
## 130            FALSE           FALSE            FALSE           FALSE
##     Species_imp
## 16        FALSE
## 130       FALSE
# Repair the Sepal.Width violations: a negative width is replaced by its
# absolute value and a width of exactly 0 is set to NA.
# which() keeps the index free of NA, so the assignments stay valid even when
# the column already holds missing values.
negative_width <- which(Perfect_iris$Sepal.Width < 0)
Perfect_iris$Sepal.Width[negative_width] <- abs(Perfect_iris$Sepal.Width[negative_width])
zero_width <- which(Perfect_iris$Sepal.Width == 0)
Perfect_iris$Sepal.Width[zero_width] <- NA
# Verify the repair. Indexing the data frame with `Sepal.Width <= 0` directly
# returns an all-NA row for the value that was just set to NA, so count the
# remaining violations with subset(), which drops rows whose condition is NA.
nrow(subset(Perfect_iris, Sepal.Width <= 0))
## [1] 0

Question 9

Impute the remaining NAs: mean for Sepal.Width, median for Petal.Length, linear regression for Sepal.Length, and kNN for Petal.Width

Mean Sepal.Width

# Replace the missing Sepal.Width entries with the mean of the observed ones.
sw_missing <- is.na(Perfect_iris$Sepal.Width)
mean_sw <- mean(Perfect_iris$Sepal.Width[!sw_missing])
Perfect_iris$Sepal.Width[sw_missing] <- mean_sw

Petal.Length

# Replace the missing Petal.Length entries with the median of the observed ones.
median_petal <- median(Perfect_iris$Petal.Length, na.rm = TRUE)
Perfect_iris$Petal.Length <- replace(
  Perfect_iris$Petal.Length,
  is.na(Perfect_iris$Petal.Length),
  median_petal
)

Sepal.Length

# Turn NaN and infinite measurements into NA in the four numeric columns.
numeric_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
for (col in numeric_cols) {
  values <- Perfect_iris[[col]]
  values[is.nan(values) | is.infinite(values)] <- NA
  Perfect_iris[[col]] <- values
}

# Regress Sepal.Length on the other three measurements and use the fitted
# model to fill in the rows where Sepal.Length is missing.
Linear_Sepal_Model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = Perfect_iris
)
Imputed_Sepal_Length <- is.na(Perfect_iris$Sepal.Length)
Perfect_iris$Sepal.Length[Imputed_Sepal_Length] <- predict(
  Linear_Sepal_Model,
  newdata = Perfect_iris[Imputed_Sepal_Length, ]
)

Petal.Width: an earlier question was done incorrectly, so the original data is reloaded before running kNN

# Reload the raw data and impute it with k-nearest-neighbour imputation.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
library(VIM)

# kNN() fills in NA only, so the Inf that summary() reports for Petal.Width
# would survive the imputation. Convert NaN/Inf to NA on a copy first, leaving
# dirty_iris itself as read from the file.
knn_input <- dirty_iris
is_measurement <- vapply(knn_input, is.numeric, logical(1))
knn_input[is_measurement] <- lapply(knn_input[is_measurement], function(x) {
  x[is.nan(x) | is.infinite(x)] <- NA
  x
})

Perfect_iris2 <- kNN(knn_input)