Question 1
# Load the raw data set straight from the course repository, then count how
# many Petal.Length measurements are missing.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19
Question 2
# A row is "complete" when none of its columns is NA. Report both the number
# of such rows and the share of the data set they make up (in percent).
complete_cases <- complete.cases(dirty_iris)
num_complete <- length(which(complete_cases))
percent_complete <- num_complete / nrow(dirty_iris) * 100
print(num_complete)
## [1] 96
print(percent_complete)
## [1] 64
Question 3
library(stringr)
# Look for special values other than NA. The earlier str_detect() call was
# handed the whole data frame, which is not an atomic vector: it only worked
# through a coercion (with a warning), produced a single flag per column, and
# searched for the text "NaN" only, so Inf / -Inf could never be reported.
# Instead, count the NaN and infinite entries in every numeric column; the
# result is a named integer vector with one count per column.
vapply(
  Filter(is.numeric, dirty_iris),
  function(column) sum(is.nan(column) | is.infinite(column)),
  integer(1)
)
# NOTE(review): the printed counts are not reproduced here; re-knit the
# document to regenerate them.
Question 4
# Start again from the raw file (keeping strings as plain character vectors)
# and turn every infinite or NaN entry of the numeric columns into NA.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)
# Named logical vector: TRUE for each numeric column.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
for (col in names(which(numeric_cols))) {
  column_values <- dirty_iris[[col]]
  column_values[is.infinite(column_values) | is.nan(column_values)] <- NA
  dirty_iris[[col]] <- column_values
}
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
Question 5
# Keep the rows that break at least one rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30. which() drops rows where the comparison is
# NA, so observations with missing measurements are not counted as violations.
violating_observations_subset <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]
num_violations_subset <- nrow(violating_observations_subset)
print(num_violations_subset)
## [1] 4
Question 6
# Rows whose Sepal.Width is present but not strictly positive.
violating_rows <- !(is.na(dirty_iris$Sepal.Width) | dirty_iris$Sepal.Width > 0)
# Repair the column: negative widths are replaced by their absolute value,
# exact zeros are turned into NA. The work happens on a local copy so no
# helper variables are left behind in the workspace.
dirty_iris$Sepal.Width <- local({
  width <- dirty_iris$Sepal.Width
  negative <- which(width < 0)
  width[negative] <- -width[negative]
  width[which(width == 0)] <- NA
  width
})
head(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.4 3.2 4.5 1.5 versicolor
## 2 6.3 3.3 6.0 2.5 virginica
## 3 6.2 NA 5.4 2.3 virginica
## 4 5.0 3.4 1.6 0.4 setosa
## 5 5.7 2.6 3.5 1.0 versicolor
## 6 5.3 NA NA 0.2 setosa
# Number of observations that needed a correction.
num_corrections <- length(which(violating_rows))
print(num_corrections)
## [1] 2
Question 7: Sepal.Width: mean
# Mean imputation: every missing Sepal.Width receives the average of the
# observed Sepal.Width values.
mean_sepal_width <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean_sepal_width
)
Question 7: Sepal.Length: linear regression
# Regression imputation: fit Sepal.Length on the three other measurements
# (rows with any NA are left out of the fit) and fill the missing
# Sepal.Length values with the model's predictions.
missing_sepal_length <- is.na(dirty_iris$Sepal.Length)
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris, na.action = na.omit)
# A row that also lacks one of the predictors gets an NA prediction and
# therefore stays missing.
dirty_iris[missing_sepal_length, "Sepal.Length"] <- predict(
  lm_model,
  newdata = dirty_iris[which(missing_sepal_length), , drop = FALSE]
)
Question 7: Petal.Width: kNN
# k-nearest-neighbour imputation of Petal.Width.
# `dirty_iris` is the data frame read from the CSV file earlier in this
# script, not a packaged data set, so no data() call is needed (it only
# produced a "data set not found" warning).
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# The variable to impute has to be part of the data handed to kNN(). Passing
# only the three predictor columns meant the result had no Petal.Width column,
# so `$Petal.Width` was NULL and the assignment removed Petal.Width from
# dirty_iris. Here the full data frame is passed, Petal.Width is the only
# imputed variable, and the other three measurements define the distance.
# imp_var = FALSE suppresses the extra "<name>_imp" indicator column.
dirty_iris$Petal.Width <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  dist_var = c("Sepal.Length", "Sepal.Width", "Petal.Length"),
  imp_var = FALSE
)$Petal.Width
# NOTE(review): re-knit the document to regenerate any output of this call.