pacman::p_load(Hmisc, VIM)
# Read the dirty_iris CSV from the datacleaning repository on GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Question 1 ----
# Count the missing entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# Question 2 ----
# Complete observations are the rows that survive na.omit(), i.e. rows with
# no missing field at all.
num_complete <- nrow(na.omit(dirty_iris))
# Share of complete rows among all rows, expressed in percent.
percent_complete <- num_complete / nrow(dirty_iris) * 100
cat("Number of complete observations:", num_complete, "\n")
## Number of complete observations: 96
cat("Percentage of complete observations", percent_complete, "%\n")
## Percentage of complete observations 64 %
# Question 3 ----
# For every numeric column, tally missing (NA), not-a-number (NaN), +Inf and
# -Inf entries. Non-numeric columns yield NULL, so sapply() returns a list.
# Note that is.na() is also TRUE for NaN, so NA_values includes any NaN.
sapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  c(
    NA_values = sum(is.na(column)),
    NaN_values = sum(is.nan(column)),
    Inf_values = sum(column == Inf, na.rm = TRUE),
    Neg_Inf_values = sum(column == -Inf, na.rm = TRUE)
  )
})
## $Sepal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 10 0 0 0
##
## $Sepal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 17 0 0 0
##
## $Petal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 19 0 0 0
##
## $Petal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 12 0 1 0
##
## $Species
## NULL
# Question 4 ----
# Replace infinite measurements with NA, column by column.
# is.infinite() catches both +Inf and -Inf, and restricting the replacement to
# numeric columns avoids comparing the character Species column against Inf.
# Assigning into dirty_iris[] keeps the object a data frame.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (is.numeric(column)) {
    column[is.infinite(column)] <- NA
  }
  column
})
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Repeat the per-column tally of NA, NaN, +Inf and -Inf entries on the current
# state of dirty_iris. Non-numeric columns yield NULL, so the result is a list.
sapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  c(
    NA_values = sum(is.na(column)),
    NaN_values = sum(is.nan(column)),
    Inf_values = sum(column == Inf, na.rm = TRUE),
    Neg_Inf_values = sum(column == -Inf, na.rm = TRUE)
  )
})
## $Sepal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 10 0 0 0
##
## $Sepal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 17 0 0 0
##
## $Petal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 19 0 0 0
##
## $Petal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 13 0 0 0
##
## $Species
## NULL
# Question 5 ----
# Count the widths that are zero or negative; na.rm = TRUE drops the NA
# comparisons produced by missing widths. The condition is `<= 0`, so the
# printed label states exactly that instead of "negative". The variable keeps
# its existing name so later code that refers to it still works.
num_negative_sepal_width <- sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
cat(
  "Number of observations with Sepal.Width <= 0:",
  num_negative_sepal_width,
  "\n"
)
## Number of observations with Sepal.Width <= 0: 2
# Count the lengths above 30, again ignoring missing values.
num_large_sepal_length <- sum(dirty_iris$Sepal.Length > 30, na.rm = TRUE)
cat(
  "Number of observations with Sepal.Length > 30:",
  num_large_sepal_length,
  "\n"
)
## Number of observations with Sepal.Length > 30: 2
# Question 6 ----
# Rows whose Sepal.Width is zero or negative. A bare logical index would turn
# every NA comparison (missing width) into an all-NA row; which() keeps only
# the positions where the condition is actually TRUE.
sepal_width_errors <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
sepal_width_errors
# Absolute value of the negative widths only; missing widths are skipped
# instead of showing up as NA in the result.
abs(dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)])
## [1] 3
# Question 7 ----
# Flip the sign of every negative Sepal.Width. Selecting by condition rather
# than by the hard-coded position [16, 2] keeps the correction valid if rows
# or columns are ever reordered.
negative_rows <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_rows] <-
  abs(dirty_iris$Sepal.Width[negative_rows])
# A width of exactly zero is recorded as missing; which() keeps the NA
# comparisons out of the index.
zero_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_rows] <- NA
mean(dirty_iris$Sepal.Width, na.rm = TRUE)
## [1] 3.462121
median(dirty_iris$Petal.Length, na.rm = TRUE)
## [1] 4.5
# NOTE(review): the result of kNN() is only printed here, not stored, so
# dirty_iris itself is left unchanged by this call.
kNN(dirty_iris, variable = "Petal.Width")