# Load the messy iris data directly from its GitHub location.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# How many Petal.Length measurements are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Number of rows without any NA, and that number as a share of all rows.
num_complete <- sum(complete.cases(dirty_iris))
percent_complete <- num_complete / nrow(dirty_iris) * 100
print(num_complete)
## [1] 96
print(percent_complete)
## [1] 64
# Overview before cleaning: the Petal.Width mean and max are Inf.
print(summary(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##

# Flag the infinite Petal.Width entries and turn them into missing values.
inf_indicator <- is.infinite(dirty_iris[["Petal.Width"]])
is.na(dirty_iris$Petal.Width) <- inf_indicator

# Overview after the replacement: Petal.Width statistics are finite again.
print(summary(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Reload the raw file so the rule checks run against unmodified values.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Rule 1: Sepal.Width must be strictly positive.
# Rule 2: Sepal.Length must not exceed 30.
# which() discards NA comparisons, so missing values are never flagged.
violations_width <- which(dirty_iris[["Sepal.Width"]] <= 0)
violations_length <- which(dirty_iris[["Sepal.Length"]] > 30)

# Combine both index sets, listing each offending row once.
violations <- union(violations_width, violations_length)
violating_observations <- dirty_iris[violations, ]
print(violating_observations)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
length(violations)
## [1] 4

# Same check as a single filter; subset() also treats NA conditions as FALSE.
rules_violate <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
nrow(rules_violate)
## [1] 4
# Reload the raw file so the correction starts from unmodified values.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Rows that break the "Sepal.Width must be positive" rule. which() drops NA
# comparisons, so no separate NA handling is needed beforehand (the former
# ifelse(is.na(x), NA, x) step returned the column unchanged and was removed).
violations <- which(dirty_iris$Sepal.Width <= 0)
print(dirty_iris[violations, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa

# Locate both kinds of violation once, up front, instead of rebuilding the
# same logical mask on each side of the assignment.
neg_indicator <- which(dirty_iris$Sepal.Width < 0)
zero_indicator <- which(dirty_iris$Sepal.Width == 0)

# Negative widths are replaced by their absolute value; a width of exactly
# zero is replaced by NA.
dirty_iris$Sepal.Width[neg_indicator] <- abs(dirty_iris$Sepal.Width[neg_indicator])
dirty_iris$Sepal.Width[zero_indicator] <- NA
print(dirty_iris[violations, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa

# Persist the corrected data (written to the current working directory).
write.csv(dirty_iris, "cleaned_iris.csv", row.names = FALSE)
# Replace negative Sepal.Width values by their absolute value, then show the
# rows that were touched.
neg_indicator <- which(dirty_iris[["Sepal.Width"]] < 0)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  neg_indicator,
  abs(dirty_iris$Sepal.Width[neg_indicator])
)
print(dirty_iris[neg_indicator, ])

# Replace Sepal.Width values of exactly zero with NA.
zero_indicator <- which(dirty_iris[["Sepal.Width"]] == 0)
dirty_iris$Sepal.Width <- replace(dirty_iris$Sepal.Width, zero_indicator, NA)
##### Sepal.Width: mean
##### Petal.Length: median
##### Sepal.Length: linear regression
##### Petal.Width: kNN
### Write the R code to perform the imputation specified above. Mark each item that your attached R code accomplishes.
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: ggplot2
## Loading required package: lattice
library(RANN)
## Warning: package 'RANN' was built under R version 4.4.2
# Impute every column with its assigned method, starting from the raw file:
#   Sepal.Width  -> mean
#   Petal.Length -> median
#   Petal.Width  -> kNN (caret)
#   Sepal.Length -> linear regression
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Treat non-finite numbers (Inf, -Inf, NaN) as missing. Only numeric columns
# are touched; vapply() keeps the column test type-stable.
numeric_cols <- names(dirty_iris)[vapply(dirty_iris, is.numeric, logical(1))]
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(x) replace(x, !is.finite(x), NA)
)
print(colSums(is.na(dirty_iris)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 17 19 13 0

# Sepal.Width: mean imputation.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: median imputation.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Petal.Width: kNN imputation. This runs before the regression because
# Petal.Width is one of the regression predictors; predicting while it still
# contains NA yields NA predictions.
# caret's "knnImpute" always centers and scales the numeric columns, so the
# result of predict() is in standardized units. Only the previously missing
# Petal.Width cells are copied back, after undoing the scaling with the mean
# and standard deviation stored in the preProcess object. All other columns
# keep their original units and their own imputation method.
preProc <- preProcess(dirty_iris, method = "knnImpute")
knn_scaled <- predict(preProc, newdata = dirty_iris)
missing_petal_width <- is.na(dirty_iris$Petal.Width)
dirty_iris$Petal.Width[missing_petal_width] <-
  knn_scaled$Petal.Width[missing_petal_width] * preProc$std[["Petal.Width"]] +
  preProc$mean[["Petal.Width"]]

# Sepal.Length: linear regression on the three (now complete) predictors.
# lm() drops the rows whose Sepal.Length is NA while fitting.
clean_data <- dirty_iris[
  complete.cases(dirty_iris[, c("Petal.Length", "Petal.Width", "Sepal.Width")]),
]
if (nrow(clean_data) > 0) {
  lm_model <- lm(
    Sepal.Length ~ Petal.Length + Petal.Width + Sepal.Width,
    data = clean_data
  )
  missing_indices <- which(is.na(dirty_iris$Sepal.Length))
  dirty_iris$Sepal.Length[missing_indices] <-
    predict(lm_model, newdata = dirty_iris[missing_indices, ])
} else {
  stop("No complete cases available for linear regression.")
}

print(colSums(is.na(dirty_iris)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
library(VIM)
## Warning: package 'VIM' was built under R version 4.4.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep

# Start from the raw file again. Without this reload the steps below operate
# on data that has already been fully imputed, and every one of them is a
# no-op (kNN() then warns "Nothing to impute").
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Treat non-finite numbers (Inf, -Inf, NaN) as missing in the numeric columns.
numeric_cols <- names(dirty_iris)[vapply(dirty_iris, is.numeric, logical(1))]
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(x) replace(x, !is.finite(x), NA)
)

# Sepal.Width: mean
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: median
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Petal.Width: kNN
# Restricted to Petal.Width so kNN does not fill columns assigned to another
# method; imp_var = FALSE keeps the original column layout (no *_imp flags).
# Done before the regression because Petal.Width is one of its predictors.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", imp_var = FALSE)

# Sepal.Length: linear regression
# lm() drops the rows with missing Sepal.Length while fitting; the fitted
# model then predicts exactly those rows.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Width, data = dirty_iris)
I <- is.na(dirty_iris$Sepal.Length)
to_be_imputed <- dirty_iris[I, ]
dirty_iris$Sepal.Length[I] <- predict(model, newdata = to_be_imputed)

# Keep the fully imputed result available under its original name.
dirty_iris1 <- dirty_iris