Question 1
No Code Required
Question 2
No Code Required
Question 3
# Load the deliberately corrupted iris data set straight from GitHub
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing Petal.Length measurements
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
Question 4
# Rows in total, and rows that have no missing value in any column
total_obs <- nrow(dirty_iris)
complete_obs <- nrow(na.omit(dirty_iris))
complete_obs
## [1] 96
# Proportion of fully observed rows
complete_obs / total_obs
## [1] 0.64
Question 5
# Per-column count of NaN entries
vapply(dirty_iris, function(col) sum(is.nan(col)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
# Per-column count of infinite entries
vapply(dirty_iris, function(col) sum(is.infinite(col)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 1 0
Question 6
# Logical matrix (rows x columns) flagging every infinite cell
is_inf <- vapply(dirty_iris, is.infinite, logical(nrow(dirty_iris)))
# Overwrite the flagged cells with NA so they are handled as missing values
dirty_iris[is_inf] <- NA
# Confirm that no infinite values are left in any column
vapply(dirty_iris, function(col) sum(is.infinite(col)), integer(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
Question 7
# Rows breaking either rule: non-positive sepal width, or sepal length
# above 30. which() discards comparisons that evaluate to NA, so rows where
# the rule cannot be decided are not counted as violations.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]
nrow(violations)
## [1] 4
Question 8
# Flip negative widths to their absolute value.
# abs() leaves non-negative and NA entries as they are, so applying it to the
# whole column only changes the negative values.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# Mark widths of exactly zero as missing.
# which() drops NA comparisons, so entries that are already NA are untouched.
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)
Question 9
# Sepal.Width: fill the gaps with the mean of the observed values
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Petal.Length: fill the gaps with the median of the observed values
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Sepal.Length: regress on the other three measurements and use the fitted
# model to predict the missing entries
model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
missing_SL <- is.na(dirty_iris$Sepal.Length)
# NOTE(review): Petal.Width is still un-imputed at this point, so predict()
# returns NA for any of these rows whose Petal.Width is missing - confirm
# that this is acceptable.
dirty_iris$Sepal.Length[missing_SL] <- predict(
  model,
  newdata = dirty_iris[missing_SL, ]
)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Impute Petal.Width from its 5 nearest neighbours.
# imp_var = FALSE stops kNN() from appending the Petal.Width_imp indicator
# column, so there is nothing to strip from the result afterwards.
dirty_iris <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)
Question 10
No Code Required