Question 1

# Download the raw (uncleaned) iris data set.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing values in the Petal.Length column.
length(which(is.na(dirty_iris[["Petal.Length"]])))

## [1] 19

Question 2

# Logical flag per row: TRUE when the row has no missing value in any column.
complete_cases <- stats::complete.cases(dirty_iris)
# Count of fully observed rows and their share of all rows, in percent.
num_complete <- length(which(complete_cases))
percent_complete <- 100 * (num_complete / nrow(dirty_iris))

num_complete

## [1] 96

percent_complete

## [1] 64

Question 3

library(stringr)
# Count the special values (Inf, -Inf, NaN) in every numeric column.
# The earlier str_detect(dirty_iris, "NaN") call was given the whole data
# frame rather than a character vector, so it was coerced with a warning, and
# it only searched for the text "NaN", so infinite values were never checked.
numeric_columns <- vapply(dirty_iris, is.numeric, logical(1))
special_value_counts <- vapply(
  dirty_iris[numeric_columns],
  function(values) sum(is.infinite(values) | is.nan(values)),
  integer(1)
)
special_value_counts
# NOTE(review): re-knit the document to record the output of this chunk.

Question 4

# Re-read the raw data so the cleaning steps start from an unmodified copy.
# stringsAsFactors = FALSE keeps Species as a character column.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)

# vapply() always returns a logical vector with one entry per column, whereas
# sapply() can change its return type depending on the input.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))

# Replace the special values Inf, -Inf and NaN with NA in every numeric
# column in one pass; non-numeric columns are left untouched.
dirty_iris[numeric_cols] <- lapply(dirty_iris[numeric_cols], function(values) {
  values[is.infinite(values) | is.nan(values)] <- NA
  values
})
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Question 5

# Rows that break at least one rule: Sepal.Width must be positive and
# Sepal.Length must not exceed 30. which() discards rows where the comparison
# is NA, so only definite violations are kept.
violating_observations_subset <- dirty_iris[
  which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30),
]
num_violations_subset <- nrow(violating_observations_subset)
num_violations_subset

## [1] 4

Question 6

# Rows whose Sepal.Width is observed but not strictly positive.
violating_rows <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0

# Negative widths are replaced by their absolute value.
negative_idx <- which(violating_rows & dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_idx] <- abs(dirty_iris$Sepal.Width[negative_idx])

# Widths that are exactly zero are set to missing.
zero_idx <- which(violating_rows & dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_idx] <- NA

head(dirty_iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4         3.2          4.5         1.5 versicolor
## 2          6.3         3.3          6.0         2.5  virginica
## 3          6.2          NA          5.4         2.3  virginica
## 4          5.0         3.4          1.6         0.4     setosa
## 5          5.7         2.6          3.5         1.0 versicolor
## 6          5.3          NA           NA         0.2     setosa

# Number of Sepal.Width values flagged as violations (and therefore corrected).
num_corrections <- length(which(violating_rows))
num_corrections

## [1] 2

Question 7: Sepal.Width: mean

# Mean imputation: fill missing Sepal.Width values with the mean of the
# observed ones.
mean_sepal_width <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean_sepal_width
)

Question 7: Petal.Length: median

# Median imputation: fill missing Petal.Length values with the median of the
# observed ones.
median_petal_length <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median_petal_length
)

Question 7: Sepal.Length: linear regression

# Regression imputation: model Sepal.Length on the three other measurements,
# fitting on the rows that na.omit keeps.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  na.action = na.omit
)

# Fill the missing Sepal.Length values with the model's predictions for
# those rows.
missing_sepal_length <- is.na(dirty_iris$Sepal.Length)
rows_to_predict <- dirty_iris[missing_sepal_length, ]
dirty_iris$Sepal.Length[missing_sepal_length] <- predict(
  lm_model,
  newdata = rows_to_predict
)

Question 7: Petal.Width: kNN

# dirty_iris was read from the CSV file earlier and is already in the
# workspace. It is not a packaged data set, so data(dirty_iris) only produced
# a "data set not found" warning and has been removed.

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# kNN imputation of Petal.Width, with neighbours found on the three other
# measurements. The data handed to kNN() must contain Petal.Width itself:
# the earlier call passed only the three predictor columns, so there was
# nothing to impute and `$Petal.Width` on the result was NULL, which removed
# the column from dirty_iris.
predictor_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length")
dirty_iris$Petal.Width <- kNN(
  dirty_iris[, c(predictor_cols, "Petal.Width")],
  variable = "Petal.Width",  # impute this column only
  dist_var = predictor_cols, # columns used for the distance calculation
  imp_var = FALSE            # do not append the *_imp indicator columns
)$Petal.Width

Assignment 5

Laura Schollmeyer

2024-10-17

Question 1

Question 2

Question 3

Question 4

Question 5

Question 6

Question 7: Sepal.Width: mean

Question 7: Petal.Length: median

Question 7: Sepal.Length: linear regression

Question 7: Petal.Width: kNN