# Read the dirty_iris CSV straight from the datacleaning repository and
# inspect the resulting column types.
dirty_iris_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(file = dirty_iris_url)
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
# Per-column summary of the raw data. The recorded output shows NA counts for
# all four numeric columns, an infinite value in Petal.Width (mean and max are
# Inf) and implausible extremes such as a Sepal.Width of -3 and a Sepal.Length
# of 73.
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Question 3 ----
# Number of missing values in one column.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
# Number of missing values in every column. colSums() is the vectorised
# equivalent of apply(..., 2, sum) on the logical matrix from is.na().
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 17 19 12 0
library(stringr)
# Flag which columns contain at least one NA.
# The earlier str_detect(dirty_iris, "NA") coerced each column to a single
# string (raising a coercion warning) and searched it for the text "NA", which
# would also match any character value that merely contains those letters.
# Testing is.na() directly avoids both problems; unname() keeps the result a
# plain logical vector in column order.
unname(colSums(is.na(dirty_iris)) > 0)
## [1] TRUE TRUE TRUE TRUE FALSE
# Question 4 ----
# Count the rows with no missing value and express them as a percentage of
# all rows.
total_rows <- nrow(dirty_iris)
is_complete_row <- complete.cases(dirty_iris)
num_complete <- sum(is_complete_row)
percent_complete <- num_complete / total_rows * 100
num_complete
## [1] 96
percent_complete
## [1] 64
# Question 6 ----
# Keep only the rows whose Sepal.Length is observed.
# A logical mask replaces `-which(is.na(...))`: when no value is missing,
# which() returns integer(0) and negative indexing with an empty vector
# selects zero rows instead of all of them.
dirty_iris_complete <- dirty_iris[!is.na(dirty_iris$Sepal.Length), ]
str(dirty_iris_complete)
## 'data.frame': 140 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
# Keep only the rows that are observed in every column, using a logical row
# mask.
has_all_values <- complete.cases(dirty_iris)
dirty_iris_complete1 <- dirty_iris[has_all_values, ]
str(dirty_iris_complete1)
## 'data.frame': 96 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
## $ Sepal.Width : num 3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
## $ Petal.Length: num 4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
## $ Petal.Width : num 1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
## $ Species : chr "versicolor" "virginica" "setosa" "versicolor" ...
# Same row filter via na.omit(): every row with at least one NA is dropped.
# Unlike plain logical indexing, the result also carries an "na.action"
# attribute listing the removed rows, as the str() output shows.
dirty_iris_complete2 <- na.omit(dirty_iris)
str(dirty_iris_complete2)
## 'data.frame': 96 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
## $ Sepal.Width : num 3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
## $ Petal.Length: num 4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
## $ Petal.Width : num 1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
## $ Species : chr "versicolor" "virginica" "setosa" "versicolor" ...
## - attr(*, "na.action")= 'omit' Named int [1:54] 3 6 7 13 15 17 19 20 21 22 ...
## ..- attr(*, "names")= chr [1:54] "3" "6" "7" "13" ...
# Question 7 ----
# Frequency table of the Sepal.Width values, used to spot implausible entries
# (the recorded output contains -3, 0, 29 and 30).
table(dirty_iris[["Sepal.Width"]])
##
## -3 0 2.2 2.3 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4
## 1 1 3 3 7 5 8 12 9 23 11 12 6 10 6 3 2 5 1 1
## 4.1 4.2 29 30
## 1 1 1 1
# Frequency table of the Sepal.Length values, used to spot implausible entries
# (the recorded output contains 0, 49 and 73).
table(dirty_iris[["Sepal.Length"]])
##
## 0 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6 6.1
## 1 1 3 1 4 2 5 5 10 9 4 1 4 7 6 7 6 3 5 4
## 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9 7 7.2 7.4 7.6 7.7 7.9 49 73
## 4 9 7 5 2 7 2 3 1 3 1 1 4 1 1 1
# Collect the suspicious observations: rows with a non-positive sepal width
# first, followed by rows with a sepal length above 30. which() skips rows
# where the tested column is NA, so those rows are left out.
width_violation <- which(dirty_iris$Sepal.Width <= 0)
length_violation <- which(dirty_iris$Sepal.Length > 30)
bad_width <- dirty_iris[width_violation, ]
bad_length <- dirty_iris[length_violation, ]
bad_data <- rbind(bad_width, bad_length)
# Question 8 ----
# Recode sepal widths that are exactly 0 as missing, then re-tabulate the
# column to check the result.
# NOTE(review): only values equal to 0 are recoded; the negative width (-3)
# that the table still lists is left in place -- confirm this is intended.
zero_width_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width_rows] <- NA
table(dirty_iris[["Sepal.Width"]])
##
## -3 2.2 2.3 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1
## 1 3 3 7 5 8 12 9 23 11 12 6 10 6 3 2 5 1 1 1
## 4.2 29 30
## 1 1 1
# Print the whole Sepal.Width column after the recode.
dirty_iris[["Sepal.Width"]]
## [1] 3.2 3.3 NA 3.4 2.6 NA 2.7 3.0 2.7 3.1 3.5 2.7 3.0 2.8 3.9
## [16] -3.0 NA 3.2 4.0 NA 3.6 NA 2.8 3.3 3.0 3.2 3.1 29.0 3.2 2.8
## [31] 3.2 3.2 2.8 2.9 2.9 3.0 3.0 2.2 2.5 3.0 NA 2.7 NA 2.7 4.2
## [46] 2.8 NA 3.2 3.0 3.4 2.6 3.1 2.7 3.4 3.3 3.8 3.8 2.9 2.8 2.8
## [61] 2.3 2.8 3.0 3.3 3.0 2.5 2.5 3.2 3.5 3.5 3.0 3.1 3.5 NA 2.8
## [76] 2.5 3.5 3.0 3.8 3.8 2.6 3.4 2.9 3.7 3.0 3.8 2.9 2.9 2.9 2.5
## [91] 3.2 NA 3.4 2.7 2.2 3.1 2.3 NA 3.0 2.8 3.4 3.6 2.7 3.0 3.7
## [106] NA 3.0 3.0 2.8 3.4 3.4 3.4 3.4 3.3 3.1 2.6 NA 3.1 3.0 2.8
## [121] 3.0 2.3 3.2 4.1 30.0 2.9 3.2 NA 3.6 NA 2.5 3.1 NA 3.3 3.0
## [136] 3.0 3.2 3.0 3.1 2.2 NA NA 3.0 2.9 2.5 3.1 3.0 3.5 3.1 2.6
# Question 9 ----
# install.packages("Hmisc")
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# install.packages("VIM")
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Linear Regression Imputation ----
# `dirty_iris` is not a packaged data set, so `data(dirty_iris)` only raised a
# "data set 'dirty_iris' not found" warning and loaded nothing. The call is
# dropped and the data frame already in the workspace is inspected directly.
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : 'impute' num 3.2 3.3 3.42 3.4 2.6 ...
## ..- attr(*, "imputed")= int [1:18] 3 6 17 20 22 41 43 47 74 92 ...
## $ Petal.Length: 'impute' num 4.5 6 5.4 1.6 3.5 4.5 5.3 5.1 4.1 1.6 ...
## ..- attr(*, "imputed")= int [1:19] 6 19 21 23 31 34 47 52 67 69 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
# Preview the first six rows of the working data frame.
head(dirty_iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.4 3.200000 4.5 1.5 versicolor
## 2 6.3 3.300000 6.0 2.5 virginica
## 3 6.2 3.416667 5.4 2.3 virginica
## 4 5.0 3.400000 1.6 0.4 setosa
## 5 5.7 2.600000 3.5 1.0 versicolor
## 6 5.3 3.416667 4.5 0.2 setosa
# Set the first ten sepal lengths to NA; the regression step that follows
# fills every missing Sepal.Length.
# NOTE(review): this overwrites observed measurements in dirty_iris, not only
# values that were missing to begin with -- confirm this is intended.
dirty_iris$Sepal.Length[seq_len(10)] <- NA
# NOTE(review): this previews the built-in `iris` data set, not `dirty_iris`,
# so the NAs introduced above are not visible here -- confirm the intended
# object.
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Regression imputation: model Sepal.Length from Sepal.Width and Petal.Length,
# then replace each missing sepal length with the model's prediction for that
# row.
# `I` is the logical mask of rows to fill; the name shadows base::I as a
# variable but is kept because it is a top-level object.
I <- is.na(dirty_iris$Sepal.Length)
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length,
            data = dirty_iris)
predicted_length <- predict(model, newdata = dirty_iris[I, ])
dirty_iris$Sepal.Length[I] <- predicted_length
head(dirty_iris, n = 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.261069 3.200000 4.5 1.5 versicolor
## 2 7.125084 3.300000 6.0 2.5 virginica
## 3 7.000148 3.416667 5.4 2.3 virginica
## 4 5.144661 3.400000 1.6 0.4 setosa
## 5 4.933841 2.600000 3.5 1.0 versicolor
## 6 6.566251 3.416667 4.5 0.2 setosa
## 7 5.942489 2.700000 5.3 NA virginica
## 8 6.268627 3.000000 5.1 1.8 virginica
## 9 5.363959 2.700000 4.1 1.0 versicolor
## 10 4.722101 3.100000 1.6 0.2 setosa
# kNN Imputation ----
library(VIM)
# Summary before the kNN step. For the columns of class 'impute' the output
# also reports how many values were imputed and to which value; Petal.Width
# still has 12 NAs and an infinite maximum.
summary(dirty_iris)
##
## 18 values imputed to 3.416667
##
##
## 19 values imputed to 4.5
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.000 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.700 1st Qu.:0.3
## Median : 5.800 Median : 3.100 Median : 4.500 Median :1.3
## Mean : 6.545 Mean : 3.417 Mean : 4.456 Mean :Inf
## 3rd Qu.: 6.500 3rd Qu.: 3.417 3rd Qu.: 5.100 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.000 Max. :Inf
## NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Impute the missing Petal.Width values by k-nearest-neighbour imputation.
# The call used to receive the built-in `iris`, which contains no NAs, so
# kNN() only warned "Nothing to impute" and dirty_iris2 was a copy of iris;
# it now operates on dirty_iris.
# NOTE(review): Petal.Width also contains Inf (see the summary output); kNN()
# fills NA only, so consider recoding Inf to NA before imputing.
dirty_iris2 <- kNN(dirty_iris, variable = "Petal.Width")
# Keep only the five original columns, dropping whatever kNN() added
# (by default an imputation-indicator column for the imputed variable).
dirty_iris2 <- subset(dirty_iris2, select = Sepal.Length:Species)
head(dirty_iris2)
# NOTE: the previously recorded output was removed because it showed the
# unmodified iris rows; re-run the script to regenerate it.