Assignment 5

# Load the raw data set and count the missing Petal.Length measurements.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

# Re-read the raw CSV, then report how many rows have no missing value at all.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# complete.cases() is TRUE for rows in which no column is NA.
total_obs <- nrow(dirty_iris)
complete_obs <- sum(complete.cases(dirty_iris))
percentage_complete <- (complete_obs / total_obs) * 100

cat("Number of complete observations:", complete_obs, "\n")

## Number of complete observations: 96

cat(
  "Percentage of complete observations:",
  round(percentage_complete, 2),
  "%\n"
)

## Percentage of complete observations: 64 %

# Re-read the raw CSV so the counts below describe the untouched data.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Logical mask with one TRUE/FALSE per column; vapply() fixes the result type.
numeric_columns <- vapply(dirty_iris, is.numeric, logical(1))

# Count, for every numeric column, the values flagged by `is_special`.
#
# `is_special` is a function mapping a numeric vector to a logical vector of
# the same length. The result is a named integer vector (one entry per numeric
# column). List-style indexing (`dirty_iris[numeric_columns]`) keeps a data
# frame even if only one column is numeric, whereas `dirty_iris[, mask]` would
# collapse to a bare vector and be iterated element by element.
count_special <- function(is_special) {
  vapply(
    dirty_iris[numeric_columns],
    function(x) sum(is_special(x)),
    integer(1)
  )
}

# Note: is.na() is also TRUE for NaN, so na_count includes any NaN values.
na_count <- count_special(is.na)

nan_count <- count_special(is.nan)

# is.infinite() is FALSE for NA, so `FALSE & NA` stays FALSE and sum() never
# returns NA here.
inf_count <- count_special(function(x) is.infinite(x) & x > 0)

neg_inf_count <- count_special(function(x) is.infinite(x) & x < 0)

# Print the per-column counts of NA, NaN, Inf and -Inf values, each preceded
# by a short heading. The `##` lines are the rendered console output.
cat("NA values:\n")

## NA values:

print(na_count)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##           10           17           19           12

cat("NaN values:\n")

## NaN values:

print(nan_count)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0

cat("\nInf values:\n")

## 
## Inf values:

print(inf_count)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1

cat("\n-Inf values:\n")

## 
## -Inf values:

print(neg_inf_count)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0

# Re-read the raw CSV and turn every special numeric value into NA.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

numeric_columns <- vapply(dirty_iris, is.numeric, logical(1))

# NaN, Inf and -Inf are all treated as missing, so they are overwritten with
# NA in a single pass over each numeric column.
special_to_na <- function(x) {
  x[is.nan(x) | is.infinite(x)] <- NA
  x
}
dirty_iris[numeric_columns] <- lapply(dirty_iris[numeric_columns], special_to_na)

# Inspect the cleaned columns; the special values now show up in the NA counts.
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Re-read the raw CSV and list the rows that break the sepal rules.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Make sure both sepal measurements are numeric before comparing them.
dirty_iris[c("Sepal.Length", "Sepal.Width")] <- lapply(
  dirty_iris[c("Sepal.Length", "Sepal.Width")],
  as.numeric
)

# Rules: Sepal.Width must be positive and Sepal.Length must not exceed 30.
# which() drops comparisons that evaluate to NA, so a row with a missing
# measurement is only selected when the other rule is definitely broken.
violations <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]

print(violations)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

cat(
  "Number of observations violating the rules:",
  nrow(violations)
)

## Number of observations violating the rules: 4

# Re-read the raw CSV and repair the invalid Sepal.Width values.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

dirty_iris$Sepal.Width <- as.numeric(dirty_iris$Sepal.Width)

# Negative widths are assumed to be sign errors and become their absolute
# value; a width of exactly 0 is treated as missing. abs() leaves positive
# values unchanged, so one ifelse() covers all three cases (NA stays NA).
dirty_iris$Sepal.Width <- with(
  dirty_iris,
  ifelse(Sepal.Width == 0, NA, abs(Sepal.Width))
)

print(dirty_iris$Sepal.Width)

##   [1]  3.2  3.3   NA  3.4  2.6   NA  2.7  3.0  2.7  3.1  3.5  2.7  3.0  2.8  3.9
##  [16]  3.0   NA  3.2  4.0   NA  3.6   NA  2.8  3.3  3.0  3.2  3.1 29.0  3.2  2.8
##  [31]  3.2  3.2  2.8  2.9  2.9  3.0  3.0  2.2  2.5  3.0   NA  2.7   NA  2.7  4.2
##  [46]  2.8   NA  3.2  3.0  3.4  2.6  3.1  2.7  3.4  3.3  3.8  3.8  2.9  2.8  2.8
##  [61]  2.3  2.8  3.0  3.3  3.0  2.5  2.5  3.2  3.5  3.5  3.0  3.1  3.5   NA  2.8
##  [76]  2.5  3.5  3.0  3.8  3.8  2.6  3.4  2.9  3.7  3.0  3.8  2.9  2.9  2.9  2.5
##  [91]  3.2   NA  3.4  2.7  2.2  3.1  2.3   NA  3.0  2.8  3.4  3.6  2.7  3.0  3.7
## [106]   NA  3.0  3.0  2.8  3.4  3.4  3.4  3.4  3.3  3.1  2.6   NA  3.1  3.0  2.8
## [121]  3.0  2.3  3.2  4.1 30.0  2.9  3.2   NA  3.6   NA  2.5  3.1   NA  3.3  3.0
## [136]  3.0  3.2  3.0  3.1  2.2   NA   NA  3.0  2.9  2.5  3.1  3.0  3.5  3.1  2.6

# Report how many Sepal.Width values are missing after the correction.
cat(
  "Number of NA values in Sepal.Width after correction:",
  sum(is.na(dirty_iris[["Sepal.Width"]]))
)

## Number of NA values in Sepal.Width after correction: 18

library(DMwR2)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(mice)

## Warning: package 'mice' was built under R version 4.3.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Re-read the raw CSV, keeping text columns as character vectors.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)

# Force the four measurement columns to numeric (via character, so the
# conversion is also correct should a column arrive as a factor).
numeric_cols <- c("Sepal.Width", "Sepal.Length", "Petal.Length", "Petal.Width")
dirty_iris[numeric_cols] <- Map(
  function(column) as.numeric(as.character(column)),
  dirty_iris[numeric_cols]
)

# Sepal.Width: fill missing values with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill missing values with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Impute Sepal.Length by linear regression (mice method "norm.predict").
#
# mice() matches `method` to the columns by position. A positional vector with
# "norm.predict" in slot 3 targets Petal.Length, which is already complete
# after the median imputation above, so nothing would be imputed and
# Sepal.Length would keep its NAs. Building the vector by column name avoids
# that mistake.
mice_methods <- setNames(rep("", ncol(dirty_iris)), names(dirty_iris))
mice_methods["Sepal.Length"] <- "norm.predict"

# Rows of the predictor matrix are imputation targets, columns are predictors.
# Only Sepal.Width and Petal.Length (both filled in above) are used: a
# predictor that still contains NA would leave NA in the imputed column, and
# Petal.Width / Species are not ready to serve as predictors at this point.
mice_predictors <- matrix(
  0,
  nrow = ncol(dirty_iris),
  ncol = ncol(dirty_iris),
  dimnames = list(names(dirty_iris), names(dirty_iris))
)
mice_predictors["Sepal.Length", c("Sepal.Width", "Petal.Length")] <- 1

mice_data <- mice(
  dirty_iris,
  method = mice_methods,
  predictorMatrix = mice_predictors,
  m = 1,
  maxit = 5,
  seed = 123
)

## 
##  iter imp variable
##   1   1
##   2   1
##   3   1
##   4   1
##   5   1

## Warning: Number of logged events: 2

# Take the (single) completed data set produced by mice().
dirty_iris <- complete(mice_data)

# Prepare a copy for kNN imputation:
# * knnImputation() converts nominal columns with as.integer(); a character
#   Species column turns into all-NA there ("NAs introduced by coercion"),
#   leaving no complete rows, so it has to be a factor.
# * Infinite values break the scaling and distance computation and would make
#   the mean infinite, so they are treated as missing.
knn_input <- dirty_iris
knn_input$Species <- factor(knn_input$Species)
knn_input$Petal.Width[is.infinite(knn_input$Petal.Width)] <- NA

# Fill the remaining missing values from the 5 nearest neighbours. If kNN
# fails, fall back to mean imputation of Petal.Width. The handler must RETURN
# the repaired data frame: an assignment to `dirty_iris` inside the handler
# would only modify a copy local to that function and be lost.
dirty_iris <- tryCatch(
  knnImputation(knn_input, k = 5),
  error = function(e) {
    cat("kNN failed: ", conditionMessage(e), "\nApplying mean imputation for Petal.Width.\n")
    knn_input$Petal.Width[is.na(knn_input$Petal.Width)] <-
      mean(knn_input$Petal.Width, na.rm = TRUE)
    knn_input
  }
)

# Restore the original column type so later code still sees character Species.
dirty_iris$Species <- as.character(dirty_iris$Species)

## Warning in knnImputation(dirty_iris, k = 5): NAs introduced by coercion

## kNN failed:  Not sufficient complete cases for computing neighbors. 
## Applying mean imputation for Petal.Width.

# Summarise every column to check the result of the imputation steps
# (remaining NA counts appear in the "NA's" row).
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.000   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.700   1st Qu.:0.3  
##  Median : 5.750   Median : 3.100   Median : 4.500   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.456   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.391   3rd Qu.: 5.100   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :Inf  
##  NA's   :10                                         NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Assignment 5

Melissa Conti

2025-02-25