library(readr)
library(stringr)
# Install VIM only when it is not already present: p_install() defaults to
# force = TRUE, which would re-download and reinstall the package on every run.
pacman::p_install(VIM, force = FALSE)
## 
## The downloaded binary packages are in
##  /var/folders/r1/q7btql916p792lphs8rfpvy40000gn/T//Rtmpr1ZLbs/downloaded_packages
## 
## VIM installed
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

Question 1

# Load the deliberately corrupted iris data set straight from GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Count the missing Petal.Length measurements.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

Question 2

# Number of fully observed rows (no NA in any column).
sum(complete.cases(dirty_iris))
## [1] 96
# Total number of observations.
nrow(dirty_iris)
## [1] 150
# Percentage of complete rows, derived from the data itself instead of
# hand-copied counts so it stays correct if the data set changes.
mean(complete.cases(dirty_iris)) * 100
## [1] 64

Question 3

# For every numeric column, tally the special values (NA, NaN, Inf, -Inf).
# Non-numeric columns yield NULL, so the overall result is a list.
sapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  c(
    NA_values = sum(is.na(column)),
    NaN_values = sum(is.nan(column)),
    # which() drops NA comparisons, matching sum(..., na.rm = TRUE).
    Inf_values = length(which(column == Inf)),
    Neg_Inf_values = length(which(column == -Inf))
  )
})
## $Sepal.Length
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             10              0              0              0 
## 
## $Sepal.Width
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             17              0              0              0 
## 
## $Petal.Length
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             19              0              0              0 
## 
## $Petal.Width
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             12              0              1              0 
## 
## $Species
## NULL

Question 4

# Replace infinite measurements with NA. is.infinite() catches both Inf and
# -Inf, and restricting the replacement to numeric columns avoids comparing
# the character Species column against the string "Inf".
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (is.numeric(column)) {
    column[is.infinite(column)] <- NA
  }
  column
})
summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Re-run the special-value tally to check the result of the Inf replacement.
# Non-numeric columns yield NULL, so the overall result is a list.
sapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  c(
    NA_values = sum(is.na(column)),
    NaN_values = sum(is.nan(column)),
    # which() drops NA comparisons, matching sum(..., na.rm = TRUE).
    Inf_values = length(which(column == Inf)),
    Neg_Inf_values = length(which(column == -Inf))
  )
})
## $Sepal.Length
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             10              0              0              0 
## 
## $Sepal.Width
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             17              0              0              0 
## 
## $Petal.Length
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             19              0              0              0 
## 
## $Petal.Width
##      NA_values     NaN_values     Inf_values Neg_Inf_values 
##             13              0              0              0 
## 
## $Species
## NULL

Question 5

# Rule 1: Sepal.Width must be strictly positive. The test is `<= 0`, so zero
# values are counted as well; the label therefore says "non-positive" rather
# than "negative". NA comparisons are dropped via na.rm = TRUE.
num_negative_sepal_width <- sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
cat(
  "Number of observations with non-positive Sepal.Width:",
  num_negative_sepal_width, "\n"
)
## Number of observations with non-positive Sepal.Width: 2
# Rule 2: Sepal.Length must not exceed 30.
num_large_sepal_length <- sum(dirty_iris$Sepal.Length > 30, na.rm = TRUE)
cat("Number of observations with Sepal.Length > 30:", num_large_sepal_length, "\n")
## Number of observations with Sepal.Length > 30: 2

Question 6

# Rows violating the Sepal.Width > 0 rule. which() discards the NA results of
# the comparison; plain logical indexing would otherwise return an all-NA row
# (or an NA element) for every missing Sepal.Width.
sepal_width_errors <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
sepal_width_errors
# Absolute value of the strictly negative widths only.
abs(dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)])
## [1] 3

Question 7

# Flip negative Sepal.Width values to their absolute value. Applying abs() to
# the whole column leaves positive values and NAs untouched and avoids relying
# on a hard-coded row/column position that breaks if the data is reordered.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# A width of exactly zero is not a valid measurement: treat it as missing.
# which() keeps existing NAs out of the index.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA


# Mean of the observed Sepal.Width values (missing values ignored).
mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
## [1] 3.462121
# Median of the observed Petal.Length values (missing values ignored).
median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
## [1] 4.5
# Linear model of Sepal.Length on the three remaining measurements.
lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
##     data = dirty_iris)
## 
## Coefficients:
##  (Intercept)   Sepal.Width  Petal.Length   Petal.Width  
##      -0.3282        1.5127        0.1048        0.9860
# k-nearest-neighbour imputation of Petal.Width; the result is printed, not
# stored.
VIM::kNN(data = dirty_iris, variable = "Petal.Width")