Assignment 5

pacman::p_load(Hmisc, VIM)

# Read the dirty_iris CSV from its GitHub URL into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

Question 1

# Count the missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))

## [1] 19

Question 2

# Rows without a missing value in any column: first as a count, then as a
# percentage of all rows.
num_complete <- nrow(na.omit(dirty_iris))
percent_complete <- 100 * (num_complete / nrow(dirty_iris))
cat(
  "Number of complete observations:",
  num_complete,
  "\n"
)

## Number of complete observations: 96

# Print the percentage of complete rows held in percent_complete.
cat(
  "Percentage of complete observations",
  percent_complete,
  "%\n",
  sep = " "
)

## Percentage of complete observations 64 %

Question 3

# Tally the special values (NA, NaN, Inf, -Inf) in every numeric column.
# Non-numeric columns such as Species are filtered out up front, so the result
# is a single integer matrix (one column per variable) rather than a list that
# carries a NULL entry for each non-numeric column.
# Note: is.na() is also TRUE for NaN, so NA_values includes any NaN entries.
vapply(
  Filter(is.numeric, dirty_iris),
  function(x) {
    c(
      NA_values = sum(is.na(x)),
      NaN_values = sum(is.nan(x)),
      Inf_values = sum(x == Inf, na.rm = TRUE),
      Neg_Inf_values = sum(x == -Inf, na.rm = TRUE)
    )
  },
  integer(4)
)

##                Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA_values                10          17           19          12
## NaN_values                0           0            0           0
## Inf_values                0           0            0           1
## Neg_Inf_values            0           0            0           0

Question 4

# Replace infinite measurements with NA so they are treated as missing.
# Only numeric columns are touched: comparing the whole data frame to Inf also
# string-compares the character Species column, and `== Inf` misses -Inf,
# whereas is.infinite() catches both signs.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(x) replace(x, is.infinite(x), NA)
)
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Re-tally the special values (NA, NaN, Inf, -Inf) per numeric column to confirm
# that no infinite values remain. Non-numeric columns such as Species are
# filtered out up front, so the result is a single integer matrix rather than a
# list that carries a NULL entry for each non-numeric column.
# Note: is.na() is also TRUE for NaN, so NA_values includes any NaN entries.
vapply(
  Filter(is.numeric, dirty_iris),
  function(x) {
    c(
      NA_values = sum(is.na(x)),
      NaN_values = sum(is.nan(x)),
      Inf_values = sum(x == Inf, na.rm = TRUE),
      Neg_Inf_values = sum(x == -Inf, na.rm = TRUE)
    )
  },
  integer(4)
)

##                Sepal.Length Sepal.Width Petal.Length Petal.Width
## NA_values                10          17           19          13
## NaN_values                0           0            0           0
## Inf_values                0           0            0           0
## Neg_Inf_values            0           0            0           0

Question 5

# Count observations whose Sepal.Width is zero or negative. The test is `<= 0`,
# so the printed label says "non-positive" rather than "negative" (a zero width
# is included in the count).
num_negative_sepal_width <- sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
cat(
  "Number of observations with non-positive Sepal.Width:",
  num_negative_sepal_width,
  "\n"
)

## Number of observations with non-positive Sepal.Width: 2

# Count observations whose Sepal.Length exceeds 30; which() discards the NA
# comparisons produced by missing lengths.
num_large_sepal_length <- length(which(dirty_iris[["Sepal.Length"]] > 30))
cat(
  "Number of observations with Sepal.Length > 30:",
  num_large_sepal_length,
  "\n"
)

## Number of observations with Sepal.Length > 30: 2

Question 6

# Rows whose Sepal.Width is zero or negative. which() drops the NA comparisons;
# indexing with the raw logical vector would add an all-NA row for every
# observation whose Sepal.Width is missing.
sepal_width_errors <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
sepal_width_errors

# Absolute value of the negative Sepal.Width entries only. which() keeps the
# missing widths from showing up as NA elements in the result.
abs(dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)])

## [1] 3

Question 7

# Repair Sepal.Width: flip negative widths to positive and treat zero widths as
# missing. Rows are located by value rather than by a hard-coded position, so
# the fix still applies if the row or column order of the data changes.
negative_width <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_width] <-
  abs(dirty_iris$Sepal.Width[negative_width])
zero_width <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width] <- NA
# Mean of the repaired column, ignoring missing values.
mean(dirty_iris$Sepal.Width, na.rm = TRUE)

## [1] 3.462121

# Median of Petal.Length, ignoring missing values.
with(dirty_iris, median(Petal.Length, na.rm = TRUE))

## [1] 4.5

# k-nearest-neighbour imputation (VIM) restricted to the Petal.Width column.
# NOTE: the result is only printed, not assigned, so dirty_iris itself is left
# unchanged by this call.
kNN(data = dirty_iris, variable = "Petal.Width")