assignment4

# Read the dirty_iris CSV directly from its GitHub location.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length values.
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

# Count the rows without any missing value and express them as a
# percentage of all observations.
total_obs <- nrow(dirty_iris)
num_complete <- length(which(complete.cases(dirty_iris)))
percent_complete <- (num_complete / total_obs) * 100

print(num_complete)

## [1] 96

print(percent_complete)

## [1] 64

library(stringr)

# Special values (NA, NaN, Inf, -Inf) ----
# Total number of missing cells; is.na() is also TRUE for NaN.
sum(is.na(dirty_iris))

## [1] 58

# `x == NaN` always evaluates to NA, so NaN cannot be found by comparison;
# is.nan() is required. The special-value checks only make sense for numeric
# columns, so they run on a numeric matrix built from those columns.
is_numeric_col <- vapply(dirty_iris, is.numeric, logical(1))
numeric_values <- as.matrix(dirty_iris[is_numeric_col])

# NOTE(review): the echoed 0 below was produced by the old `== NaN`
# comparison, which can never detect NaN -- re-run to confirm the count.
sum(is.nan(numeric_values))

## [1] 0

sum(numeric_values == Inf, na.rm = TRUE)

## [1] 1

sum(numeric_values == -Inf, na.rm = TRUE)

## [1] 0

# Replace every non-finite entry (NaN, Inf, -Inf) with NA. is.finite() is
# FALSE for NA as well, so existing NAs are simply rewritten as NA.
dirty_iris[is_numeric_col] <- lapply(
  dirty_iris[is_numeric_col],
  function(column) replace(column, !is.finite(column), NA)
)

# Rows that break the plausibility rules checked here: a Sepal.Width that is
# zero or negative, or a Sepal.Length above 30.
bad_rows <- with(dirty_iris, which(Sepal.Width <= 0 | Sepal.Length > 30))
print(dirty_iris[bad_rows, ])

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

# How many rows were flagged.
print(length(bad_rows))

## [1] 4

# Frequency of each distinct Sepal.Width value.
table(dirty_iris[["Sepal.Width"]])

## 
##  -3   0 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 
##   1   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1 
## 4.1 4.2  29  30 
##   1   1   1   1

# Sepal.Width entries whose text form contains a letter or a space; kept for
# inspection only, it is not used by the statements below.
bad_data <- str_subset(dirty_iris$Sepal.Width, "[a-z A-Z]")

# Treat a Sepal.Width of exactly zero as missing. The column is numeric, so it
# is compared with the number 0 rather than the string "0" (which only worked
# through implicit character coercion). which() drops the NA comparisons.
zero_width_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width_rows] <- NA
table(dirty_iris$Sepal.Width)

## 
##  -3 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 4.1 
##   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1   1 
## 4.2  29  30 
##   1   1   1

# Missing values per column after the replacement.
colSums(is.na(dirty_iris))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           18           19           13            0

library(robotstxt)
library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Mean imputation: every missing Sepal.Width is filled with the mean of the
# observed Sepal.Width values.
dirty_iris$Sepal.Width <- with(
  dirty_iris,
  replace(Sepal.Width, is.na(Sepal.Width), mean(Sepal.Width, na.rm = TRUE))
)

# Remaining missing values per column.
print(colSums(is.na(dirty_iris)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10            0           19           13            0

# NOTE(review): the echoed outputs in this section were updated by hand to
# match the corrected steps -- re-knit to confirm them.

# Median imputation for Petal.Length. The original statement was split in two
# without an assignment, so the median was only printed and the 19 missing
# values were never filled; the replacement is now actually assigned.
petal_length_missing <- is.na(dirty_iris$Petal.Length)
dirty_iris$Petal.Length[petal_length_missing] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

colSums(is.na(dirty_iris))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10            0            0           13            0

# k-nearest-neighbour imputation (k = 5) for Petal.Width. This runs before the
# regression step because predict() returns NA for a row with a missing
# predictor, which previously left one Sepal.Length value unfilled. kNN() also
# adds the Petal.Width_imp column that shows up in the final summary.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)

## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##            0           -3            0           73           30           63

# Regression imputation for Sepal.Length: fit a linear model on the rows where
# all variables are present, then predict the missing Sepal.Length values from
# the (now complete) predictors.
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
               data = dirty_iris, na.action = na.omit)

sepal_length_missing <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[sepal_length_missing] <-
  predict(lm_model, newdata = dirty_iris[sepal_length_missing, ])

colSums(is.na(dirty_iris))

##    Sepal.Length     Sepal.Width    Petal.Length     Petal.Width         Species 
##               0               0               0               0               0 
## Petal.Width_imp 
##               0

assignment4

sam gondelman

2025-09-26