Assignment 5

# Load the raw "dirty" iris data set from the datacleaning repository.
dirty_iris_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(file = dirty_iris_url)

Question 1: How many missing values do you find in the Petal.Length variable?

# Count the Petal.Length entries that are recorded as NA.
length(which(is.na(dirty_iris[["Petal.Length"]])))

## [1] 19

Question 2: Still using the dirty_iris data read in above, calculate the number and the percentage of observations that are complete. Which of the following results are correct?

# A row is "complete" when none of its columns is NA. Count those rows and
# express the count as a percentage of all rows.
is_complete_row <- complete.cases(dirty_iris)
num_complete <- sum(is_complete_row)
total_rows <- nrow(dirty_iris)
percent_complete <- (num_complete / total_rows) * 100
num_complete

## [1] 96

percent_complete

## [1] 64

Question 3: Still based on the dirty_iris data, besides missing values, are there any other types of special values contained in the numeric columns? Choose the one(s) you found.

# Per-column summary: besides the NA counts, the Mean/Max rows reveal any
# non-finite values in the numeric columns.
print(summary(dirty_iris))

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Question 4: Write R code to locate the special value you found in Question 3 and replace it with NA.

# Flag the infinite Petal.Width entries, mark exactly those entries as NA,
# then re-check the column summaries.
inf_indicator <- is.infinite(dirty_iris[["Petal.Width"]])
is.na(dirty_iris$Petal.Width) <- inf_indicator
print(summary(dirty_iris))

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Question 5: This dirty_iris dataset also contains errors. We have the following background knowledge:

The sepal width should be a positive value.

The sepal length of an iris cannot exceed 30 cm.

Write R code to find the observations that violate the above rules.

How many observations violate the above rules?

# Start again from the raw file so the checks run on unmodified data.
dirty_iris <- read.csv(file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Row numbers breaking each rule. which() keeps only TRUE comparisons, so
# rows where the measurement is NA are not reported.
violations_width <- which(dirty_iris[["Sepal.Width"]] <= 0)

violations_length <- which(dirty_iris[["Sepal.Length"]] > 30)

# Combine both sets of row numbers without duplicates (width rule first).
violations <- union(violations_width, violations_length)

violating_observations <- dirty_iris[violations, , drop = FALSE]
print(violating_observations)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa

length(violations)

## [1] 4

Question 5 (from professor)

# Keep the rows that break either rule. subset() drops rows whose condition
# evaluates to NA, so only definite violations are counted.
rules_violate <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
nrow(rules_violate)

## [1] 4

Question 6: Can you locate the observations that violate the rule “Sepal.Width > 0” and make reasonable corrections?

Write R code to achieve the error correction task. For example, you may assign an absolute value to an original negative value, and assign NA to an original 0 value (we can impute all NAs together later).

# Reload the raw data, then correct the Sepal.Width values that violate the
# "Sepal.Width > 0" rule: negative widths become their absolute value and
# zero widths become NA (to be imputed later with the other missing values).
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Remember which rows violate the rule so the same rows can be shown before
# and after the correction. which() ignores comparisons that are NA.
violations <- which(dirty_iris$Sepal.Width <= 0)
print(dirty_iris[violations, ])

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa

# Build each mask once on a working copy. The !is.na() guard keeps NA out of
# the logical index, which R would otherwise reject in the assignment.
sepal_width <- dirty_iris$Sepal.Width
is_negative <- !is.na(sepal_width) & sepal_width < 0
is_zero <- !is.na(sepal_width) & sepal_width == 0

sepal_width[is_negative] <- abs(sepal_width[is_negative])
sepal_width[is_zero] <- NA
dirty_iris$Sepal.Width <- sepal_width

print(dirty_iris[violations, ])

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa

# Save the corrected data; row names are omitted from the file.
write.csv(dirty_iris, "cleaned_iris.csv", row.names = FALSE)

Question 6 (from professor)

# Negative widths: swap each one for its absolute value, then display the
# rows that were touched.
neg_indicator <- which(dirty_iris$Sepal.Width < 0)

dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  neg_indicator,
  abs(dirty_iris$Sepal.Width[neg_indicator])
)

dirty_iris[neg_indicator, ]

# Zero widths: mark them as missing.
zero_indicator <- which(dirty_iris$Sepal.Width == 0)

is.na(dirty_iris$Sepal.Width) <- zero_indicator

Question 7: Let us deal with the missing values now. You are going to use four methods we learned to impute the missing values for each column, respectively:

#####Sepal.Width: mean

#####Petal.Length: median

#####Sepal.Length: linear regression

#####Petal.Width: kNN

###Write the R code to do the imputation as specified above. Mark the ones that your attached R code achieves.

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: ggplot2

## Loading required package: lattice

library(RANN)

## Warning: package 'RANN' was built under R version 4.4.2

# Reload the raw data and turn every infinite or NaN entry in the numeric
# columns into NA, so that all special values are handled as missing values.
dirty_iris <- read.csv(file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], function(column) {
  column[is.infinite(column) | is.nan(column)] <- NA
  column
})

# NA count per column before any imputation.
print(colSums(is.na(dirty_iris)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           17           19           13            0

# Impute each column with the method the assignment asks for, keeping every
# column on its original measurement scale.

# Sepal.Width: mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- median(dirty_iris$Petal.Length, na.rm = TRUE)

# Petal.Width: kNN via caret. preProcess(method = "knnImpute") also centres
# and scales every numeric column, so assigning its output back to the whole
# data frame would replace all measurements with standardized values.
# Instead, take only the imputed Petal.Width entries and undo the scaling
# with the mean and standard deviation stored in the preProcess object.
preProc <- preProcess(dirty_iris, method = "knnImpute")
knn_scaled <- predict(preProc, newdata = dirty_iris)
petal_width_missing <- is.na(dirty_iris$Petal.Width)
dirty_iris$Petal.Width[petal_width_missing] <-
  knn_scaled$Petal.Width[petal_width_missing] * preProc$std[["Petal.Width"]] +
  preProc$mean[["Petal.Width"]]

# Sepal.Length: linear regression. This runs after the kNN step so that all
# three predictors are complete and every missing Sepal.Length receives a
# prediction (a row with an NA predictor would otherwise be predicted as NA).
clean_data <- dirty_iris[complete.cases(dirty_iris[, c("Petal.Length", "Petal.Width", "Sepal.Width")]), ]

if (nrow(clean_data) > 0) {
  lm_model <- lm(Sepal.Length ~ Petal.Length + Petal.Width + Sepal.Width, data = clean_data)
  missing_indices <- which(is.na(dirty_iris$Sepal.Length))
  dirty_iris$Sepal.Length[missing_indices] <- predict(lm_model, newdata = dirty_iris[missing_indices, ])
} else {
  stop("No complete cases available for linear regression.")
}

# NA count per column after imputation.
print(colSums(is.na(dirty_iris)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Question 7 (from professor)

# Sepal.Width: replace each NA with the mean of the observed widths

dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: replace each NA with the median of the observed lengths

dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Sepal.Length: predict each NA from a linear model on Sepal.Width and
# Petal.Width

model <- lm(Sepal.Length ~ Sepal.Width + Petal.Width, data = dirty_iris)

I <- is.na(dirty_iris[["Sepal.Length"]])

to_be_imputed <- dirty_iris[I, , drop = FALSE]

dirty_iris[["Sepal.Length"]][I] <- predict(model, newdata = to_be_imputed)

# Petal.Width: kNN

library(VIM)

## Warning: package 'VIM' was built under R version 4.4.2

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Run kNN() (from VIM, attached just above) on the whole data frame and keep
# the result in dirty_iris1. The warning echoed below shows that no NA values
# were left in dirty_iris when this call ran.
dirty_iris1 <- kNN(dirty_iris)

## Warning in kNN(dirty_iris): Nothing to impute, because no NA are present (also
## after using makeNA)