# Install VIM only when it is not already available. The package name must be
# wrapped in straight quotes: typographic quotes are not valid R string
# delimiters and make this line a parse error.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}

# Load the deliberately corrupted iris data set straight from GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

Find missing values in the Petal.Length variable

# Number of NA entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

Calculate the number and the percentage of observations that are complete

# Rows with no NA in any column, as a count and as a share of all rows.
complete <- sum(complete.cases(dirty_iris))
total_rows <- nrow(dirty_iris)
percent_complete <- complete / total_rows * 100
# Display both figures; previously they were computed but never shown.
c(complete = complete, percent_complete = percent_complete)

Check for other types of special values

# Per-column summary: exposes NA counts and non-finite values such as Inf.
print(summary(dirty_iris))

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Row positions where Petal.Width is Inf or -Inf.
which(is.infinite(dirty_iris[["Petal.Width"]]))

## [1] 86

Locate the special values identified above and replace them with a missing-value placeholder (NA)

# Turn every infinite Petal.Width into NA so it is handled as a missing value.
is.na(dirty_iris$Petal.Width) <- is.infinite(dirty_iris$Petal.Width)

Write R code to find the observations that violate the following rules: the sepal width should be a positive value, and the sepal length of an iris cannot exceed 30 cm

# Count rows breaking the "Sepal.Width must be positive" rule.
# which() drops NA comparisons, so missing widths are not counted.
length(which(dirty_iris$Sepal.Width <= 0))

## [1] 2

# Count rows breaking the "Sepal.Length cannot exceed 30 cm" rule.
# which() drops NA comparisons, so missing lengths are not counted.
length(which(dirty_iris$Sepal.Length > 30))

## [1] 2

Locate the observations that violate the rule “Sepal.Width > 0” and make reasonable corrections

# Frequency of each distinct Sepal.Width value (NA excluded by default).
table(dirty_iris[["Sepal.Width"]])

## 
##  -3   0 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 
##   1   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1 
## 4.1 4.2  29  30 
##   1   1   1   1

# Row positions whose Sepal.Width is zero or negative.
which(dirty_iris[["Sepal.Width"]] <= 0)

## [1]  16 130

# Correct the rule violations by condition rather than by hard-coded row
# number, so the fix still hits the right rows if the data are reordered or
# refreshed.
# A negative width is treated as a sign error: keep the magnitude.
neg_width <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_width] <- abs(dirty_iris$Sepal.Width[neg_width])
# Anything still non-positive (i.e. zero) cannot be repaired: mark as missing.
zero_width <- which(dirty_iris$Sepal.Width <= 0)
dirty_iris$Sepal.Width[zero_width] <- NA
# Confirm that no non-positive widths remain.
table(dirty_iris$Sepal.Width)

## 
## 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 4.1 4.2 
##   3   3   7   5   8  12   9  24  11  12   6  10   6   3   2   5   1   1   1   1 
##  29  30 
##   1   1

Use four methods we learned to impute the missing values for each column, respectively

# Method 1 - mean imputation: fill missing Sepal.Width with the column mean
# of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Method 2 - median imputation: fill missing Petal.Length with the column
# median of the observed values.
dirty_iris[["Petal.Length"]][is.na(dirty_iris[["Petal.Length"]])] <-
  median(dirty_iris[["Petal.Length"]], na.rm = TRUE)

# Method 3 - regression imputation, step 1: model Sepal.Length from the two
# columns that were imputed above. Rows with missing Sepal.Length are dropped
# from the fit by lm()'s default NA handling.
lm_model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = dirty_iris
)
print(summary(lm_model))

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = dirty_iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5549 -0.1682  0.0996  0.4901  2.0549 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.61716    0.13268  -4.652 7.67e-06 ***
## Sepal.Width   1.47695    0.04000  36.923  < 2e-16 ***
## Petal.Length  0.45604    0.02279  20.012  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.054 on 137 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.9763, Adjusted R-squared:  0.976 
## F-statistic:  2823 on 2 and 137 DF,  p-value: < 2.2e-16

# Method 3 - regression imputation, step 2: predict Sepal.Length for the rows
# where it is missing and write the predictions back into the data frame.
missing_idx <- which(is.na(dirty_iris[["Sepal.Length"]]))
predicted_vals <- predict(
  object = lm_model,
  newdata = dirty_iris[missing_idx, ]
)
dirty_iris[["Sepal.Length"]][missing_idx] <- predicted_vals

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Method 4 - k-nearest-neighbour imputation: fill missing Petal.Width from the
# 5 most similar rows. Only Petal.Width is imputed; the other columns are
# used to measure similarity.
dirty_iris <- VIM::kNN(data = dirty_iris, variable = "Petal.Width", k = 5)

## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0

# Drop the "<variable>_imp" indicator columns appended by kNN().
# drop = FALSE keeps the result a data frame even if only one column remains.
dirty_iris <- dirty_iris[, !grepl("_imp$", names(dirty_iris)), drop = FALSE]

Assignment_4

Austin McClintic

2025-10-03

Find missing values in the Petal.Length variable

Calculate the number and the percentage of observations that are complete

Check for other types of special values

Locate the special values identified above and replace them with a missing-value placeholder (NA)

Write R code to find the observations that violate the following rules: the sepal width should be a positive value, and the sepal length of an iris cannot exceed 30 cm

Locate the observations that violate the rule “Sepal.Width > 0” and make reasonable corrections

Use four methods we learned to impute the missing values for each column, respectively