# Download dirty_iris.csv from GitHub and read it into a data frame that the
# rest of the script works on.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
Question 3
# Number of missing entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
Question 4
# Count the rows that have no missing value in any column.
nrow(dirty_iris[complete.cases(dirty_iris), ])
## [1] 96
# The same quantity expressed as a percentage of all rows.
100 * mean(complete.cases(dirty_iris))
## [1] 64
Question 5
# Count NaN entries per column. vapply() pins the result to exactly one number
# per column, whereas sapply() can silently change the shape of what it returns.
vapply(dirty_iris, function(column) sum(is.nan(column)), numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
# Count infinite (Inf / -Inf) entries per column in the same way.
vapply(dirty_iris, function(column) sum(is.infinite(column)), numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 1 0
Question 6
# Frequency table of the distinct Petal.Width values (NA is excluded by
# table()'s default settings).
table(dirty_iris[["Petal.Width"]])
##
## 0.1 0.2 0.3 0.4 0.5 0.6 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3
## 5 26 6 6 1 1 6 3 4 13 8 12 4 1 8 4 7 6 2 8
## 2.4 2.5 Inf
## 3 3 1
# Locate the infinite Petal.Width entry. is.infinite() tests the numeric value
# directly instead of relying on a comparison with the string "Inf".
which(is.infinite(dirty_iris$Petal.Width))
## [1] 86
# Replace it with a real missing value. Assigning the string "NA" would turn
# the whole column into character and the entry would not be seen by is.na().
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA
# table() leaves NA out by default, so the Inf column simply disappears.
# NOTE: output pasted further down in this file was recorded while the column
# was character; with this fix Petal.Width prints as numbers there and its
# final NA count is 13 rather than 12.
table(dirty_iris$Petal.Width)
##
## 0.1 0.2 0.3 0.4 0.5 0.6 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3
## 5 26 6 6 1 1 6 3 4 13 8 12 4 1 8 4 7 6 2 8
## 2.4 2.5
## 3 3
Question 7
# Collect the rows that break the measurement rules: a sepal width that is not
# strictly positive, or a sepal length above 30. which() discards rows where
# the test evaluates to NA, which is also what subset() does.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]
violations
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1 versicolor
## 28 73.0 29 63.0 <NA> virginica
## 125 49.0 30 14.0 2 setosa
## 130 5.7 0 1.7 0.3 setosa
# Number of offending rows.
dim(violations)[1]
## [1] 4
Question 8
# Positions of the negative sepal widths. which() keeps only the positions
# where the comparison is TRUE, so missing widths are left out of the index.
neg_values <- which(dirty_iris[["Sepal.Width"]] < 0)
# Flip those widths to their absolute value.
dirty_iris[["Sepal.Width"]][neg_values] <-
  abs(dirty_iris[["Sepal.Width"]][neg_values])
print(dirty_iris[["Sepal.Width"]])
## [1] 3.2 3.3 NA 3.4 2.6 NA 2.7 3.0 2.7 3.1 3.5 2.7 3.0 2.8 3.9
## [16] 3.0 NA 3.2 4.0 NA 3.6 NA 2.8 3.3 3.0 3.2 3.1 29.0 3.2 2.8
## [31] 3.2 3.2 2.8 2.9 2.9 3.0 3.0 2.2 2.5 3.0 NA 2.7 NA 2.7 4.2
## [46] 2.8 NA 3.2 3.0 3.4 2.6 3.1 2.7 3.4 3.3 3.8 3.8 2.9 2.8 2.8
## [61] 2.3 2.8 3.0 3.3 3.0 2.5 2.5 3.2 3.5 3.5 3.0 3.1 3.5 NA 2.8
## [76] 2.5 3.5 3.0 3.8 3.8 2.6 3.4 2.9 3.7 3.0 3.8 2.9 2.9 2.9 2.5
## [91] 3.2 NA 3.4 2.7 2.2 3.1 2.3 NA 3.0 2.8 3.4 3.6 2.7 3.0 3.7
## [106] NA 3.0 3.0 2.8 3.4 3.4 3.4 3.4 3.3 3.1 2.6 NA 3.1 3.0 2.8
## [121] 3.0 2.3 3.2 4.1 30.0 2.9 3.2 NA 3.6 0.0 2.5 3.1 NA 3.3 3.0
## [136] 3.0 3.2 3.0 3.1 2.2 NA NA 3.0 2.9 2.5 3.1 3.0 3.5 3.1 2.6
# A sepal width of exactly zero is recorded as missing instead.
dirty_iris[["Sepal.Width"]][which(dirty_iris[["Sepal.Width"]] == 0)] <- NA
print(dirty_iris[["Sepal.Width"]])
## [1] 3.2 3.3 NA 3.4 2.6 NA 2.7 3.0 2.7 3.1 3.5 2.7 3.0 2.8 3.9
## [16] 3.0 NA 3.2 4.0 NA 3.6 NA 2.8 3.3 3.0 3.2 3.1 29.0 3.2 2.8
## [31] 3.2 3.2 2.8 2.9 2.9 3.0 3.0 2.2 2.5 3.0 NA 2.7 NA 2.7 4.2
## [46] 2.8 NA 3.2 3.0 3.4 2.6 3.1 2.7 3.4 3.3 3.8 3.8 2.9 2.8 2.8
## [61] 2.3 2.8 3.0 3.3 3.0 2.5 2.5 3.2 3.5 3.5 3.0 3.1 3.5 NA 2.8
## [76] 2.5 3.5 3.0 3.8 3.8 2.6 3.4 2.9 3.7 3.0 3.8 2.9 2.9 2.9 2.5
## [91] 3.2 NA 3.4 2.7 2.2 3.1 2.3 NA 3.0 2.8 3.4 3.6 2.7 3.0 3.7
## [106] NA 3.0 3.0 2.8 3.4 3.4 3.4 3.4 3.3 3.1 2.6 NA 3.1 3.0 2.8
## [121] 3.0 2.3 3.2 4.1 30.0 2.9 3.2 NA 3.6 NA 2.5 3.1 NA 3.3 3.0
## [136] 3.0 3.2 3.0 3.1 2.2 NA NA 3.0 2.9 2.5 3.1 3.0 3.5 3.1 2.6
Question 9
# Fill missing sepal widths with the mean of the observed widths.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Fill missing petal lengths with the median of the observed lengths.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Missing values still left in each column.
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 0 0 12 0