Assignment 4

Question 3

How many missing values do you find in the Petal.Length variable?

# Load the raw "dirty iris" data set directly from its GitHub URL.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Count the NA entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

Question 4

Calculate the number and the percentage of observations that are complete.

# Number of observations with no missing value in any column.
complete_obs <- length(which(complete.cases(dirty_iris)))
print(complete_obs)

## [1] 96

# Complete observations as a percentage of all rows.
percent_complete <- (complete_obs / nrow(dirty_iris)) * 100
print(percent_complete)

## [1] 64

# Final answer: the count again, followed by the rounded percentage.
print(complete_obs)

## [1] 96

print(round(percent_complete, digits = 0))

## [1] 64

Question 5

Is there another type of special value contained in the numeric columns?

# Total number of NaN entries over all columns.
sum(vapply(
  dirty_iris,
  function(col) sum(is.nan(col)),
  integer(1)
))

## [1] 0

# Total number of positive-infinity entries over all columns.
sum(vapply(
  dirty_iris,
  function(col) sum(is.infinite(col) & col > 0),
  integer(1)
))

## [1] 1

# Total number of negative-infinity entries over all columns.
sum(vapply(
  dirty_iris,
  function(col) sum(is.infinite(col) & col < 0),
  integer(1)
))

## [1] 0

Question 6

Write R code to locate the special values identified above and replace them with a missing value placeholder.

# Per-column count of infinite values, to see which column holds them.
vapply(dirty_iris, function(col) sum(is.infinite(col)), integer(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            1            0

# Replace Inf / -Inf with NA in every numeric column.
# is.infinite(as.matrix(dirty_iris)) cannot be used for this: the character
# Species column makes as.matrix() return a character matrix, and
# is.infinite() is FALSE for every character element, so nothing would be
# replaced. Working column by column keeps the numeric type intact.
dirty_iris[] <- lapply(dirty_iris, function(col) {
  if (is.numeric(col)) {
    col[is.infinite(col)] <- NA
  }
  col
})

# Verify: no column should contain an infinite value any more.
vapply(dirty_iris, function(col) sum(is.infinite(col)), integer(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Question 7

Write R code to find out the observations that violate these rules. How many observations violate the above rules?

# Flag each observation with a non-positive Sepal.Width or a Sepal.Length
# above 30. The flag is NA where the comparison cannot be decided.
violations <- with(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)

# Count the flagged observations; which() skips the undecided (NA) ones.
length(which(violations))

## [1] 4

Question 8

Locate the observations that violate the rule “Sepal.Width > 0” and make reasonable corrections.

Write R code to achieve the error correction task.

# Row numbers of the observations that break the rule Sepal.Width > 0.
# which() returns only the positions where the comparison is TRUE.
which(dirty_iris$Sepal.Width <= 0)

## [1]  16 130

# Show the offending rows. The comparison is NA wherever Sepal.Width is
# missing, and indexing a data frame with an NA logical produces an all-NA
# row for each of them. Wrapping the condition in which() keeps only the
# rows that really violate the rule.
dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa

# Reload the raw CSV. This overwrites dirty_iris, so any earlier in-memory
# modification of the data frame is discarded.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)

# Number of observations with a non-positive Sepal.Width (NAs not counted).
length(which(dirty_iris$Sepal.Width <= 0))

## [1] 2

# List every row whose Sepal.Width is either missing or non-positive.
subset(dirty_iris, is.na(Sepal.Width) | Sepal.Width <= 0)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 3            6.2          NA        5.400         2.3  virginica
## 6            5.3          NA           NA         0.2     setosa
## 16           5.0          -3        3.500         1.0 versicolor
## 17           5.5          NA        4.000         1.3 versicolor
## 20           5.6          NA        4.200         1.3 versicolor
## 22           5.4          NA        4.500         1.5 versicolor
## 41           5.0          NA        1.200         0.2     setosa
## 43           0.0          NA        1.300         0.4     setosa
## 47           5.7          NA           NA         0.4     setosa
## 74           6.5          NA        4.600         1.5 versicolor
## 92           4.9          NA        3.300         1.0 versicolor
## 98           6.7          NA        5.000         1.7 versicolor
## 106          5.5          NA        0.925         1.0 versicolor
## 117          6.3          NA        4.400         1.3 versicolor
## 128          5.8          NA        5.100         2.4  virginica
## 130          5.7           0        1.700         0.3     setosa
## 133          4.8          NA        1.900         0.2     setosa
## 141          5.0          NA        1.400         0.2     setosa
## 142          5.5          NA        3.800         1.1 versicolor

# Correct every Sepal.Width value that breaks the rule Sepal.Width > 0.
# Replacing only the exact zeros would leave the negative width in the data,
# and the verification below would still report one violation.

# A negative width is treated as a sign error: keep its magnitude.
neg_sw <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[neg_sw] <- abs(dirty_iris$Sepal.Width[neg_sw])

# A zero width carries no usable measurement: mark it as missing.
# which() keeps the NA comparisons out of the subscript.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width <= 0)] <- NA

# Verify: no violation of Sepal.Width > 0 should remain.
sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)

## [1] 0

Question 9

You are going to use the four methods we learned to impute the missing values for each column, respectively:

Sepal.Width: mean
Petal.Length: median
Sepal.Length: linear regression
Petal.Width: kNN

Write the R code to do the imputation as specified above. Mark the ones if your attached R code could achieve the task.

# Start the imputation exercise from the raw CSV again.
# NOTE(review): this reload discards the Sepal.Width correction made for
# Question 8; confirm that imputing on the uncorrected values is intended.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv",
  stringsAsFactors = FALSE
)

# The four measurement columns that are imputed below.
num_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

# One pass per measurement column: coerce to numeric (conversion warnings
# are silenced), then turn infinite values into NA so they are imputed too.
dirty_iris[num_cols] <- lapply(dirty_iris[num_cols], function(col) {
  col <- suppressWarnings(as.numeric(col))
  col[is.infinite(col)] <- NA
  col
})

# Species is stored as a factor for the regression and kNN steps.
dirty_iris$Species <- as.factor(dirty_iris$Species)

Sepal.Width: MEAN impute

# Mean imputation: every missing Sepal.Width takes the mean of the
# observed (non-NA) Sepal.Width values.
sw_mean <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
dirty_iris[["Sepal.Width"]] <- replace(
  dirty_iris[["Sepal.Width"]],
  is.na(dirty_iris[["Sepal.Width"]]),
  sw_mean
)

Petal.Length: MEDIAN impute

# Median imputation: every missing Petal.Length takes the median of the
# observed (non-NA) Petal.Length values.
pl_median <- median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
dirty_iris[is.na(dirty_iris[["Petal.Length"]]), "Petal.Length"] <- pl_median

Sepal.Length: LINEAR REGRESSION impute

# Regression imputation for Sepal.Length.
# Fit a linear model on the rows where Sepal.Length is observed, using the
# other three measurements and Species as predictors.
# NOTE(review): Petal.Width is imputed only in the later kNN step, so it can
# still be NA here; a row missing both values would presumably get an NA
# prediction. Confirm the order of the imputation steps.
train_sl <- dirty_iris[!is.na(dirty_iris$Sepal.Length), , drop = FALSE]
fit_sl <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = train_sl
)

# Predict Sepal.Length for the rows where it is missing and write the
# predictions back into those positions.
miss_sl_idx <- which(is.na(dirty_iris$Sepal.Length))
dirty_iris$Sepal.Length[miss_sl_idx] <- predict(
  fit_sl,
  newdata = dirty_iris[miss_sl_idx, , drop = FALSE]
)

Petal.Width: kNN impute

library(VIM)

## Warning: package 'VIM' was built under R version 4.4.3

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# kNN imputation of Petal.Width with k = 5, using the other three
# measurements and Species as the distance variables.
tmp <- VIM::kNN(
  dirty_iris,
  variable = "Petal.Width",
  dist_var = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Species"),
  k = 5
)

# Keep only the original columns, in their original order, from the result.
dirty_iris <- tmp[, names(dirty_iris), drop = FALSE]

# Final check: remaining NA count in each measurement column.
colSums(is.na(dirty_iris[num_cols]))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0