Question 1

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

Question 2

sum(is.na(dirty_iris$Petal.Length))
## [1] 19

Question 3

sum(complete.cases(dirty_iris))
## [1] 96

Question 4

mean(complete.cases(dirty_iris)) * 100
## [1] 64

Question 5

summary(dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Question 6

dirty_iris[dirty_iris == Inf] <- NA

Question 7

violations <- which(dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30)
dirty_iris[violations, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
length(violations)
## [1] 4

Question 8

dirty_iris$Sepal.Width <- as.numeric(dirty_iris$Sepal.Width)

dirty_iris$Sepal.Width[dirty_iris$Sepal.Width < 0 & !is.na(dirty_iris$Sepal.Width)] <- 
  abs(dirty_iris$Sepal.Width[dirty_iris$Sepal.Width < 0 & !is.na(dirty_iris$Sepal.Width)])

dirty_iris$Sepal.Width[dirty_iris$Sepal.Width == 0 & !is.na(dirty_iris$Sepal.Width)] <- NA

summary(dirty_iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.200   2.800   3.000   3.462   3.300  30.000      18

Question 9

mean_sw <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- mean_sw
median_pl <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- median_pl
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
               data = dirty_iris, na.action = na.omit)
library("VIM")
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
dirty_iris <- kNN(dirty_iris)