Synopisis

This is my attempt at doing the exercises for section 3 of An Introduction To Data Cleaning With R article by Edwin de Jonge and Mark van der Loo. This is by no means the correct answers as they were not peer reviewed or presented to the authors of the document.

Loading Packages

library(editrules)
## Loading required package: igraph
## Loading required package: lpSolveAPI
## 
## Attaching package: 'editrules'
## 
## The following object is masked from 'package:igraph':
## 
##     blocks
library(deducorrect)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
set.seed(123)

Exercise 3.1 : Reading & Manually Checking

Data

fileurl <- "http://raw.github.com/edwindj/datacleaning/master/data/dirty_iris.csv"
download.file(fileurl, destfile = "dirty_iris.csv",method = "curl")
## Warning: running command 'curl
## "http://raw.github.com/edwindj/datacleaning/master/data/dirty_iris.csv" -o
## "dirty_iris.csv"' had status 127
## Warning in download.file(fileurl, destfile = "dirty_iris.csv", method =
## "curl"): download had nonzero exit status
df <- read.csv("dirty_iris.csv", stringsAsFactors = F)
sapply(df, class)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"  "character"
complete_case_percent <- 100*length(which(complete.cases(df)))/nrow(df)
paste("Percentage of complete observations is ", complete_case_percent, "%", sep = "")
## [1] "Percentage of complete observations is 64%"
#We mustn't worry about NaN
any(sapply(df, is.nan))
## [1] FALSE
is.special <- function(x){
  if (is.numeric(x)) !is.finite(x) else is.na(x)
}
Ind <- data.frame(sapply(df, is.special))
for(i in 1:ncol(Ind)){
    tmp <- rep(NA, length(df[Ind[[i]], i]))
    df[Ind[[i]], i] <- tmp
}

head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4         3.2          4.5         1.5 versicolor
## 2          6.3         3.3          6.0         2.5  virginica
## 3          6.2          NA          5.4         2.3  virginica
## 4          5.0         3.4          1.6         0.4     setosa
## 5          5.7         2.6          3.5         1.0 versicolor
## 6          5.3          NA           NA         0.2     setosa

Exercise 3.2 : Checking With Rules

fileurl <- "http://raw.githubusercontent.com/buckeye76guy/Data-Cleaning/master/exedits.txt"
download.file(fileurl, destfile = "exedits.txt", method = "curl")
## Warning: running command 'curl
## "http://raw.githubusercontent.com/buckeye76guy/Data-Cleaning/master/exedits.txt"
## -o "exedits.txt"' had status 127
## Warning in download.file(fileurl, destfile = "exedits.txt", method =
## "curl"): download had nonzero exit status
(E <- editfile("exedits.txt"))
## 
## Data model:
## dat1 : Species %in% c('setosa', 'versicolor', 'virginica') 
## 
## Edit set:
## num1 : 0 < Sepal.Length
## num2 : 0 < Sepal.Width
## num3 : 0 < Petal.Length
## num4 : 0 < Petal.Width
## num5 : 2*Petal.Width <= Petal.Length
## num6 : Sepal.Length <= 30
## num7 : Petal.Length < Sepal.Length
ve <- violatedEdits(E, df)
summary(ve)
## Edit violations, 150 observations, 0 completely missing (0%):
## 
##  editname freq  rel
##      num5    3   2%
##      num2    2 1.3%
##      num6    2 1.3%
##      num7    2 1.3%
##      num1    1 0.7%
##      num3    1 0.7%
## 
## Edit violations per record:
## 
##  errors freq   rel
##       0   90   60%
##       1   17 11.3%
##       2   13  8.7%
##       3   25 16.7%
##       4    4  2.7%
##       5    1  0.7%
plot(ve)

There were 90 records with no violations out of 150 observations

percent_correct <- (90/150)*100
paste("Percentage of correct records is ", percent_correct, "%", sep = "")
## [1] "Percentage of correct records is 60%"
OBS <- which(df$Petal.Length >= df$Sepal.Length)

The 35th, 43rd observations are The ones that have a too long Petal

boxplot(df$Sepal.Length, main = "Box plot for Sepal.Length")

outliers_SL <- boxplot.stats(df$Sepal.Length)$out

The outliers are 73, 0, 49. From looking at the box plot someone might have forgotten to put a decimal in some of the values. The outlier at 0 is a bit weird however. I have no idea what happened there.

new_df <- df
ind <- new_df$Sepal.Length %in% outliers_SL
new_df$Sepal.Length[ind] <- rep(NA, length(outliers_SL))
boxplot(new_df$Sepal.Length)

Exercise 3.3 : Correcting

fileurl <- "http://github.com/buckeye76guy/Data-Cleaning/blob/master/excorrections.txt"
download.file(fileurl, destfile = "excorrections.txt", method = "curl")
## Warning: running command 'curl
## "http://github.com/buckeye76guy/Data-Cleaning/blob/master/excorrections.txt"
## -o "excorrections.txt"' had status 127
## Warning in download.file(fileurl, destfile = "excorrections.txt", method =
## "curl"): download had nonzero exit status
(R <- correctionRules("excorrections.txt"))
## Object of class 'correctionRules'
## ##  1-------
##   if (!is.na(Petal.Width) & Petal.Width <= 0) {
##       Petal.Width <- NA
##   }
cor <- correctWithRules(R, new_df)
cor$corrections
## [1] row      variable old      new      how     
## <0 rows> (or 0-length row.names)
head(cor$corrected, 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           6.4         3.2          4.5         1.5 versicolor
## 2           6.3         3.3          6.0         2.5  virginica
## 3           6.2          NA          5.4         2.3  virginica
## 4           5.0         3.4          1.6         0.4     setosa
## 5           5.7         2.6          3.5         1.0 versicolor
## 6           5.3          NA           NA         0.2     setosa
## 7           6.4         2.7          5.3          NA  virginica
## 8           5.9         3.0          5.1         1.8  virginica
## 9           5.8         2.7          4.1         1.0 versicolor
## 10          4.8         3.1          1.6         0.2     setosa
new_df <- cor$corrected
head(le$adapt)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1        FALSE       FALSE        FALSE       FALSE   FALSE
## 2        FALSE       FALSE        FALSE       FALSE   FALSE
## 3        FALSE        TRUE        FALSE       FALSE   FALSE
## 4        FALSE       FALSE        FALSE       FALSE   FALSE
## 5        FALSE       FALSE        FALSE       FALSE   FALSE
## 6        FALSE        TRUE         TRUE       FALSE   FALSE
# First set the too long petal to NA
new_df[OBS, "Petal.Length"] <- NA
# Now attempt to NA most of the errors using le$adapt
for(i in 1:ncol(le$adapt)){
  new_df[le$adapt[[i]], i] <- NA # Indices with TRUE in le turn to NA
}
summary(violatedEdits(E, new_df))
## Edit violations, 150 observations, 0 completely missing (0%):
## 
##  editname freq  rel
##      num5    3   2%
##      num2    2 1.3%
##      num3    1 0.7%
## 
## Edit violations per record:
## 
##  errors freq   rel
##       0   90   60%
##       1   15   10%
##       2   13  8.7%
##       3   25 16.7%
##       4    4  2.7%
##       5    2  1.3%
##       6    1  0.7%
# I want to use editmatrix here but that won't introduce NAs
# num 3 rule
ind <- which(new_df$Petal.Length <= 0)
new_df[ind, "Petal.Length"] <- NA
# num 2 rule
ind <- which(new_df$Sepal.Width <= 0)
new_df[ind, "Sepal.Width"] <- NA
# num 5 rule
ind <- which(2*new_df$Petal.Width > new_df$Petal.Length)
new_df[ind, c("Petal.Width", "Petal.Length")] <- NA

# No violations of any rules we've set
summary(violatedEdits(E, new_df))
## No violations detected, 154 checks evaluated to NA
## NULL
#Data after NA'ing all errorenous datum
head(new_df, 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           6.4         3.2          4.5         1.5 versicolor
## 2           6.3         3.3          6.0         2.5  virginica
## 3           6.2          NA          5.4         2.3  virginica
## 4           5.0         3.4          1.6         0.4     setosa
## 5           5.7         2.6          3.5         1.0 versicolor
## 6           5.3          NA           NA         0.2     setosa
## 7           6.4         2.7          5.3          NA  virginica
## 8           5.9         3.0          5.1         1.8  virginica
## 9           5.8         2.7          4.1         1.0 versicolor
## 10          4.8         3.1          1.6         0.2     setosa

Exercise 3.4 : Imputing

new_df_imp <- kNN(new_df)
## Time difference of -0.03902698 secs
# Is there any number that is not finite?
ifelse((!any(sapply(new_df_imp, is.finite))), "Some +/- Inf remain", "All numeric values are finite")
## [1] "All numeric values are finite"
# Is there any NaN
if (!any(sapply(new_df_imp, is.nan))) "No NAN"
## [1] "No NAN"
# Is there any na?
if (!any(sapply(new_df_imp, is.na))) "As expected: No Missing values"
## [1] "As expected: No Missing values"
#Data after KNN imputation: Compare to previous data
head(new_df_imp[,names(new_df)], 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1           6.4         3.2          4.5         1.5 versicolor
## 2           6.3         3.3          6.0         2.5  virginica
## 3           6.2         3.0          5.4         2.3  virginica
## 4           5.0         3.4          1.6         0.4     setosa
## 5           5.7         2.6          3.5         1.0 versicolor
## 6           5.3         3.7          1.5         0.2     setosa
## 7           6.4         2.7          5.3         1.8  virginica
## 8           5.9         3.0          5.1         1.8  virginica
## 9           5.8         2.7          4.1         1.0 versicolor
## 10          4.8         3.1          1.6         0.2     setosa
  1. This is a function offered by the article
# x : vector to be imputed
# last : value to use if last value of x is empty
seqImpute <- function(x,last){
n <- length(x)
x <- c(x,last)
i <- is.na(x)
while(any(i)){
x[i] <- x[which(i) + 1]
i <- is.na(x)
}
x[1:n]
}
  1. Ordered Data Frame By Species and Imputation
new_df_spec <- new_df[order(new_df$Species),]
head(new_df_spec)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 4           5.0         3.4          1.6         0.4  setosa
## 6           5.3          NA           NA         0.2  setosa
## 10          4.8         3.1          1.6         0.2  setosa
## 11          5.0         3.5          1.6         0.6  setosa
## 15           NA         3.9          1.7         0.4  setosa
## 18          4.7         3.2          1.3         0.2  setosa
new_df_spec$Petal.Width <- seqImpute(new_df_spec$Petal.Width, median(new_df_spec$Petal.Width, na.rm = TRUE))
if (!any(is.na(new_df_spec$Petal.Width))) "No Missing Values in imputed Petal.Width as expected : Evidence presented below : com_df$order1"
## [1] "No Missing Values in imputed Petal.Width as expected : Evidence presented below : com_df$order1"
  1. Ordered Data Frame By Species & Sepal.Length
new_df_spec_SL <- new_df[order(new_df$Species, new_df$Sepal.Length),]
head(new_df_spec_SL)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 63           4.3         3.0          1.1         0.1  setosa
## 31           4.4         3.2           NA         0.2  setosa
## 89           4.4         2.9          1.4         0.2  setosa
## 135          4.4         3.0          1.3          NA  setosa
## 122          4.5         2.3          1.3         0.3  setosa
## 26           4.6         3.2          1.4         0.2  setosa
new_df_spec_SL$Petal.Width <- seqImpute(new_df_spec_SL$Petal.Width, median(new_df_spec_SL$Petal.Width, na.rm = TRUE))
if (!any(is.na(new_df_spec_SL$Petal.Width))) "No Missing values in imputed Petal.Width as expected : Evidence presented below : comp_df$order2"
## [1] "No Missing values in imputed Petal.Width as expected : Evidence presented below : comp_df$order2"
  1. Comparing The imputed vectors: Order1 is for the ordering by Species alone and Order2 is for the ordering by Species and Sepal.Length
comp_df <- data.frame(order1 = new_df_spec$Petal.Width, order2 = new_df_spec_SL$Petal.Width)
head(comp_df)
##   order1 order2
## 1    0.4    0.1
## 2    0.2    0.2
## 3    0.2    0.2
## 4    0.6    0.3
## 5    0.4    0.3
## 6    0.2    0.2

The ordering made a big difference here. But I suspect that we will have the exact same entries in this vector. They are just permuted in some different order. To confirm:

#Set difference
setdiff(comp_df$order1, comp_df$order2)
## numeric(0)
#Ordered imputed Petal.Width from first ordering
sort(unique(comp_df$order1))
##  [1] 0.1 0.2 0.3 0.4 0.5 0.6 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0
## [18] 2.1 2.2 2.3 2.4 2.5
#Ordered imputed Petal.Width from second ordering
sort(unique(comp_df$order2))
##  [1] 0.1 0.2 0.3 0.4 0.5 0.6 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0
## [18] 2.1 2.2 2.3 2.4 2.5