# Chunk defaults for the knitted report: echo the code, but keep messages and
# warnings out of the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
# Read the dirty_iris CSV directly from its raw GitHub URL.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Structure overview: column types plus the first few values of each column.
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
# Peek at the first six rows.
head(dirty_iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.4 3.2 4.5 1.5 versicolor
## 2 6.3 3.3 6.0 2.5 virginica
## 3 6.2 NA 5.4 2.3 virginica
## 4 5.0 3.4 1.6 0.4 setosa
## 5 5.7 2.6 3.5 1.0 versicolor
## 6 5.3 NA NA 0.2 setosa
Question 3
# Number of missing (NA) entries in the Petal.Length column.
petal_length_missing <- is.na(dirty_iris[["Petal.Length"]])
sum(petal_length_missing)
## [1] 19
Question 4
# One logical flag per row: TRUE when the row has no missing value in any
# column. Computed once and reused for both the count and the percentage.
is_complete <- complete.cases(dirty_iris)
n_complete <- sum(is_complete)
pct_complete <- mean(is_complete) * 100
# Number of complete observations.
n_complete
## [1] 96
# Share of complete observations, in percent.
pct_complete
## [1] 64
Question 5
# Flag the numeric columns. vapply() always returns a logical vector here,
# whereas sapply() would return an empty list for a zero-column data frame.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))
# Count NaN values per numeric column. List-style indexing (df[cols]) always
# yields a data frame; df[, cols] would collapse to a bare vector if only one
# column were numeric, and the per-column count would then run per element.
vapply(dirty_iris[num_cols], function(x) sum(is.nan(x)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
# Count infinite values (Inf / -Inf) per numeric column.
vapply(dirty_iris[num_cols], function(x) sum(is.infinite(x)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 1
Question 6
# Locate the row position(s) where Petal.Width is infinite.
pw_is_inf <- is.infinite(dirty_iris$Petal.Width)
which(pw_is_inf)
## [1] 86
# Turn those infinite Petal.Width entries into missing values.
dirty_iris$Petal.Width <- replace(dirty_iris$Petal.Width, pw_is_inf, NA)
# Verify: no infinite values remain, and the NA count is reported.
sum(is.infinite(dirty_iris$Petal.Width))
## [1] 0
sum(is.na(dirty_iris$Petal.Width))
## [1] 13
Question 7
# Flag rows where Sepal.Length exceeds 30 or Sepal.Width is not positive.
# Comparisons against NA give NA rather than TRUE/FALSE.
violations <- with(dirty_iris, Sepal.Length > 30 | Sepal.Width <= 0)
# which() returns only the TRUE positions, so NA results are left out.
violating_rows <- which(violations)
# How many rows break a rule, which ones, and what they contain.
length(violating_rows)
## [1] 4
violating_rows
## [1] 16 28 125 130
dirty_iris[violating_rows, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
Question 8
# Rows whose Sepal.Width is zero or negative; which() skips NA comparisons.
bad_sw <- which(dirty_iris$Sepal.Width <= 0)
bad_sw
## [1] 16 130
dirty_iris[bad_sw, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
# Negative widths: flip the sign. The mask is FALSE (not NA) for missing
# values, so it is safe to use in a subscripted assignment.
neg_sw <- with(dirty_iris, Sepal.Width < 0 & !is.na(Sepal.Width))
dirty_iris$Sepal.Width[neg_sw] <- -dirty_iris$Sepal.Width[neg_sw]
# Zero widths: set to NA. %in% never returns NA, so missing values map to
# FALSE here as well.
zero_sw <- dirty_iris$Sepal.Width %in% 0
dirty_iris$Sepal.Width[zero_sw] <- NA
# Verify: the minimum is now positive and no non-positive widths remain.
summary(dirty_iris$Sepal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.200 2.800 3.000 3.462 3.300 30.000 18
which(dirty_iris$Sepal.Width <= 0)
## integer(0)
Question 9
# Sepal.Width: fill missing values with the mean of the observed values.
sw_missing <- is.na(dirty_iris[["Sepal.Width"]])
sw_mean <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
dirty_iris[["Sepal.Width"]][sw_missing] <- sw_mean
# Check that no NA is left, then show the value that was filled in.
sum(is.na(dirty_iris$Sepal.Width))
## [1] 0
sw_mean
## [1] 3.462121
# Petal.Length: fill missing values with the median of the observed values.
pl_missing <- is.na(dirty_iris[["Petal.Length"]])
pl_median <- median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
dirty_iris[["Petal.Length"]][pl_missing] <- pl_median
# Check that no NA is left, then show the value that was filled in.
sum(is.na(dirty_iris$Petal.Length))
## [1] 0
pl_median
## [1] 4.5