Load necessary libraries and datasets
# Attach the packages first, then download the dirty_iris CSV from GitHub.
library(dplyr)
library(DMwR2) # For kNN imputation
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
Question 1
Count the missing values in Petal.Length
# Tally the NA entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
Question 2
Number and percentage of complete observations (rows with no missing values)
# Count the rows that have no missing value in any column, and express that
# count as a share of all rows.
num_complete <- sum(complete.cases(dirty_iris))
percentage_complete <- (num_complete / nrow(dirty_iris)) * 100
# Report each figure on its own labelled line so the row count cannot be
# mistaken for a percentage.
cat("Complete observations:", num_complete, "\n")
cat("Percentage complete:", percentage_complete, "%\n")
## Complete observations: 96
## Percentage complete: 64 %
Question 3
Check the numeric columns for special values (NA, NaN, Inf)
# Flag the numeric columns. vapply() always yields a logical vector, whereas
# sapply() would return an empty list for a data frame without columns.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))
# Convert the numeric columns to a matrix once and reuse it for every check.
# drop = FALSE keeps a two-dimensional object even if only one column is
# numeric.
numeric_values <- as.matrix(dirty_iris[, num_cols, drop = FALSE])
# Any missing values (is.na() is also TRUE for NaN)?
has_NA <- any(is.na(numeric_values))
print(has_NA)
## [1] TRUE
# Any NaN values?
has_NAn <- any(is.nan(numeric_values))
print(has_NAn)
## [1] FALSE
# Any infinite values (Inf or -Inf)?
has_INF <- any(is.infinite(numeric_values))
print(has_INF)
## [1] TRUE
Question 4
Locate positions of infinite values and replace infinite values with
NA
# Locate every infinite entry among the numeric columns as (row, col) pairs.
inf_positions <- which(
  is.infinite(as.matrix(dirty_iris[, num_cols, drop = FALSE])),
  arr.ind = TRUE
)
# which() numbers the columns within the numeric-only matrix; translate them
# to column positions of dirty_iris so the pairs stay valid even when a
# non-numeric column precedes a numeric one.
inf_positions[, "col"] <- which(num_cols)[inf_positions[, "col"]]
# Overwrite infinite values with NA column by column; finite values and
# non-numeric columns are left untouched.
dirty_iris[num_cols] <- lapply(
  dirty_iris[num_cols],
  function(values) replace(values, is.infinite(values), NA)
)
Question 5
Identify rows where Sepal.Width is not positive or Sepal.Length
exceeds 30 cm, count the observations.
# Row indices that break either rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30 cm. which() already discards comparisons
# that evaluate to NA, so no explicit is.na() guards are needed; requiring
# both measurements to be present would hide a row whose one recorded value
# violates its rule while the other value is missing.
errors <- which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30)
# Subset the dataset using the found indices
violations <- dirty_iris[errors, ]
print(violations)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
# Number of violating observations.
print(length(errors))
## [1] 4
Question 6
Locate observations where Sepal.Width is ≤ 0 and repair them: negative values are replaced by their absolute value, zeros by NA
# Indices of recorded (non-NA) Sepal.Width values that are zero or negative.
invalid_width <- which(
  !is.na(dirty_iris[["Sepal.Width"]]) & dirty_iris[["Sepal.Width"]] <= 0
)
# Negative widths are replaced by their absolute value; exact zeros become NA.
flagged_widths <- dirty_iris[["Sepal.Width"]][invalid_width]
dirty_iris[["Sepal.Width"]][invalid_width] <- ifelse(
  flagged_widths < 0,
  abs(flagged_widths),
  NA
)
# Show the rows that were touched, after the repair.
print(dirty_iris[invalid_width, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
Question 7
Impute Sepal.Width using mean
# Fill every missing Sepal.Width with the mean of the observed widths.
missing_width <- is.na(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Width[missing_width] <- mean(dirty_iris$Sepal.Width[!missing_width])
Summary of dirty_iris
# Display per-column summary statistics for dirty_iris (the result is
# auto-printed because the call is a top-level expression).
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. : 2.200 Min. : 0.000 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.825 1st Qu.: 1.700 1st Qu.:0.300
## Median : 5.750 Median : 3.100 Median : 4.500 Median :1.300
## Mean : 6.559 Mean : 3.462 Mean : 4.456 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.462 3rd Qu.: 5.100 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.000 Max. :2.500
## NA's :10 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##