Load Data

# Read the dirty_iris CSV directly from its GitHub location.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Inspect the structure (column types) and the first six observations.
str(object = dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
head(dirty_iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4         3.2          4.5         1.5 versicolor
## 2          6.3         3.3          6.0         2.5  virginica
## 3          6.2          NA          5.4         2.3  virginica
## 4          5.0         3.4          1.6         0.4     setosa
## 5          5.7         2.6          3.5         1.0 versicolor
## 6          5.3          NA           NA         0.2     setosa

Question 3

# Number of missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 4

# Complete observations = rows without a missing value in any column.
# n_complete is their count, pct_complete their share of all rows in percent.
n_complete <- nrow(na.omit(dirty_iris))
pct_complete <- 100 * mean(complete.cases(dirty_iris))

n_complete
## [1] 96
pct_complete
## [1] 64

Question 5

# Flag the numeric columns. vapply() guarantees exactly one logical per
# column, whereas sapply() changes its return type on empty input.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))

# Per-column counts of NaN and of infinite (Inf / -Inf) values.
# List-style subsetting (dirty_iris[num_cols]) always returns a data frame;
# dirty_iris[, num_cols] would collapse to a bare vector if only one column
# were numeric, and the counts would then be computed per element instead of
# per column.
vapply(dirty_iris[num_cols], function(x) sum(is.nan(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0
vapply(dirty_iris[num_cols], function(x) sum(is.infinite(x)), integer(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1

Question 6

# Row position(s) of infinite (Inf / -Inf) values in Petal.Width.
which(is.infinite(dirty_iris[["Petal.Width"]]))
## [1] 86
# Mark every infinite Petal.Width entry as missing.
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)

# Verify: no infinite values remain; the NA count now includes the
# entries that were just converted.
sum(is.infinite(dirty_iris[["Petal.Width"]]))
## [1] 0
sum(is.na(dirty_iris[["Petal.Width"]]))
## [1] 13

Question 7

# Rule check: a row violates the rules when Sepal.Width is not strictly
# positive or Sepal.Length exceeds 30.
# Comparisons involving NA evaluate to NA; those are set to FALSE so that
# `violations` is a pure TRUE/FALSE mask. This keeps the result of which()
# unchanged (which() ignores NA anyway) and makes the mask safe for direct
# logical indexing, where an NA index would otherwise yield all-NA rows.
violations <- with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
violations[is.na(violations)] <- FALSE
violating_rows <- which(violations)

length(violating_rows)
## [1] 4
violating_rows
## [1]  16  28 125 130
dirty_iris[violating_rows, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Question 8

# Row positions whose Sepal.Width is zero or negative.
# which() skips the NA results produced by comparing missing values.
bad_sw <- which(dirty_iris$Sepal.Width <= 0)

bad_sw
## [1]  16 130
dirty_iris[bad_sw, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
# Negative widths are replaced by their absolute value.
# The mask is FALSE (never NA) for missing widths.
neg_sw <- with(dirty_iris, Sepal.Width < 0 & !is.na(Sepal.Width))
dirty_iris$Sepal.Width[neg_sw] <- abs(dirty_iris$Sepal.Width[neg_sw])

# Widths of exactly zero are marked as missing.
zero_sw <- with(dirty_iris, Sepal.Width == 0 & !is.na(Sepal.Width))
is.na(dirty_iris$Sepal.Width) <- zero_sw

# Verify: the summary shows a positive minimum, and no non-positive
# widths remain.
summary(dirty_iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.200   2.800   3.000   3.462   3.300  30.000      18
which(dirty_iris$Sepal.Width <= 0)
## integer(0)

Question 9

# Mean imputation for Sepal.Width: every NA becomes the mean of the
# observed (non-missing) values.
# NOTE(review): that mean is taken over all observed values, including the
# 29 and 30 entries still present in the column (Max. 30 in the summary of
# Sepal.Width) -- confirm this is intended.
sw_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width, is.na(dirty_iris$Sepal.Width), sw_mean
)

sum(is.na(dirty_iris[["Sepal.Width"]]))
## [1] 0
sw_mean
## [1] 3.462121
# Median imputation for Petal.Length: every NA becomes the median of the
# observed (non-missing) values.
pl_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length, is.na(dirty_iris$Petal.Length), pl_median
)

sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 0
pl_median
## [1] 4.5