Assignment

# Download the deliberately corrupted iris data set used throughout this
# assignment.
dirty_iris <- utils::read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length measurements.
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

# Completeness of the data set: how many observations have a value in every
# column, and what percentage of all observations that represents.
num_complete <- length(which(complete.cases(dirty_iris)))
perc_complete <- 100 * mean(complete.cases(dirty_iris))
c(Number = num_complete, Percentage = perc_complete)

##     Number Percentage 
##         96         64

# Flag the numeric columns. vapply() always yields a logical vector, whereas
# sapply() falls back to an empty list when there is nothing to iterate over.
num_cols <- vapply(dirty_iris, is.numeric, logical(1))

# Count the infinite entries (Inf / -Inf) in each numeric column; the result
# is a named integer vector with one element per numeric column.
inf_counts <- vapply(
  dirty_iris[, num_cols, drop = FALSE],
  function(x) sum(is.infinite(x)),
  integer(1)
)
inf_counts

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1

# Mark the infinite cells of the numeric columns. vapply() checks that every
# column contributes a logical vector with one entry per observation, so the
# columns of inf_mat line up with the numeric columns of dirty_iris.
inf_mat <- vapply(
  dirty_iris[, num_cols, drop = FALSE],
  is.infinite,
  logical(nrow(dirty_iris))
)

# Row/column positions of the infinite cells, computed once and reused below.
inf_pos <- which(inf_mat, arr.ind = TRUE)
inf_pos

##      row col
## [1,]  86   4

# Same positions, with the column index translated into the variable name.
# inf_mat carries the numeric column names, so no re-subsetting is needed.
cbind(row = inf_pos[, 1], col = colnames(inf_mat)[inf_pos[, 2]])

##     row  col          
## row "86" "Petal.Width"

# Replace every infinite value with NA, one column at a time. Working per
# column avoids depending on sapply() simplifying its result to a logical
# matrix. is.infinite() is FALSE for character data, so Species is returned
# unchanged.
dirty_iris[] <- lapply(dirty_iris, function(x) {
  x[is.infinite(x)] <- NA
  x
})

# Confirm that no column contains an infinite value any more.
vapply(dirty_iris, function(x) sum(is.infinite(x)), integer(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

# Rule check with subset(): an observation is valid when Sepal.Width > 0 and
# Sepal.Length <= 30, so the violations are the negation of that conjunction.
# subset() treats a missing condition as FALSE, so rows that cannot be
# evaluated are not reported.
violations_subset <- subset(
  dirty_iris,
  subset = !(Sepal.Width > 0 & Sepal.Length <= 30)
)
violations_subset

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

# Number of observations that break at least one rule.
nrow(violations_subset)

## [1] 4

# Same rule check, this time by row position: which() keeps only the indices
# where the condition is TRUE, so rows with a missing result are skipped.
violations_which <- dirty_iris[
  with(dirty_iris, which(Sepal.Length > 30 | Sepal.Width <= 0)),
]
violations_which

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

# Number of violating observations found via which().
nrow(violations_which)

## [1] 4

# Locate the invalid sepal widths before any value is modified.
idx_neg <- which(dirty_iris[["Sepal.Width"]] < 0)    # negative values
idx_zero <- which(dirty_iris[["Sepal.Width"]] == 0)  # zero values

# Negative widths are replaced by their absolute value; zero widths are
# marked as missing.
dirty_iris[["Sepal.Width"]][idx_neg] <-
  abs(dirty_iris[["Sepal.Width"]][idx_neg])
is.na(dirty_iris[["Sepal.Width"]]) <- idx_zero

# Check: TRUE would mean a non-positive width is still present (missing
# widths are ignored).
any(dirty_iris[["Sepal.Width"]] <= 0, na.rm = TRUE)

## [1] FALSE

# Mean imputation: missing sepal widths receive the mean of the observed ones.
dirty_iris[["Sepal.Width"]] <- replace(
  dirty_iris[["Sepal.Width"]],
  is.na(dirty_iris[["Sepal.Width"]]),
  mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
)

# Median imputation: missing petal lengths receive the median of the observed
# ones.
dirty_iris[["Petal.Length"]] <- replace(
  dirty_iris[["Petal.Length"]],
  is.na(dirty_iris[["Petal.Length"]]),
  median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
)


# Regression imputation for Sepal.Length: build the formula
# Sepal.Length ~ Sepal.Width + Petal.Length from the predictor names.
preds <- c("Sepal.Width", "Petal.Length")
fml <- reformulate(preds, response = "Sepal.Length")

# Fit the linear model (lm() omits rows with a missing response by default),
# then fill exactly those rows with the model's predictions.
model <- lm(formula = fml, data = dirty_iris)
I <- is.na(dirty_iris$Sepal.Length)
dirty_iris[I, "Sepal.Length"] <- predict(
  model,
  newdata = dirty_iris[I, , drop = FALSE]
)

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# k-nearest-neighbour imputation of Petal.Width using 5 neighbours.
# imp_var = FALSE: per the VIM documentation this suppresses the extra
# imputation-status column, so the data frame keeps its original columns.
dirty_iris <- VIM::kNN(
  data = dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)

## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0

# Final overview of the data set after all imputation steps.
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width   
##  Min.   : 0.000   Min.   : 2.200   Min.   : 0.000   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.825   1st Qu.: 1.700   1st Qu.:0.300  
##  Median : 5.800   Median : 3.100   Median : 4.500   Median :1.300  
##  Mean   : 6.528   Mean   : 3.462   Mean   : 4.456   Mean   :1.209  
##  3rd Qu.: 6.400   3rd Qu.: 3.462   3rd Qu.: 5.100   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500  
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Assignment_4

2025-10-02