#Q1

# Load the raw "dirty iris" measurements directly from the datacleaning
# repository; this needs network access each time the script runs.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

#Q2

# Count the observations whose Petal.Length is missing.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

#Q3

# A row is "complete" when none of its columns is NA.
num_complete <- length(which(complete.cases(dirty_iris)))

# Share of complete rows, expressed as a percentage of all rows.
percent_complete <- num_complete / nrow(dirty_iris) * 100

print(num_complete)
## [1] 96
print(percent_complete)
## [1] 64

#Q4

# Replace infinite measurements with NA.
# `is.infinite()` flags both Inf and -Inf (a plain `== Inf` comparison would
# leave -Inf in place), and restricting the check to numeric columns avoids
# comparing the text column against a number.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(column) replace(column, is.infinite(column), NA)
)

#Q5

# Rows whose Sepal.Width is negative or greater than 30.
# `which()` keeps only the positions where the test is TRUE, so rows with a
# missing Sepal.Width are never reported as violations.
violations <- dirty_iris[
  which(with(dirty_iris, Sepal.Width < 0 | Sepal.Width > 30)),
]

print(violations)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16            5          -3          3.5           1 versicolor

#Q6

# Repair Sepal.Width in one pass:
#   1. a width of exactly 0 is treated as missing (set to NA);
#   2. negative widths are flipped to their absolute value.
# `%in%` never yields NA, so already-missing widths are simply left as NA.
dirty_iris$Sepal.Width <- abs(
  replace(dirty_iris$Sepal.Width, dirty_iris$Sepal.Width %in% 0, NA)
)

#Q7

# Point R at a CRAN mirror so package installation works non-interactively
# (e.g. when the script is knitted).
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install VIM only when it is not already available; an unconditional
# install.packages() call would re-download the package on every run.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}

# VIM provides kNN() for k-nearest-neighbour imputation.
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Impute the remaining missing values in `dirty_iris`.
#
# NOTE: an earlier version of this step began with `dirty_iris <- iris`, which
# replaced the loaded data with R's built-in iris data set. That data set has
# no missing values, so every imputation below was a no-op (kNN() even warned
# "Nothing to impute"). The reset has been removed so the imputations act on
# the dirty data.

# Sepal.Width: fill missing values with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: fill missing values with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Sepal.Length: fill missing values with predictions from a linear regression
# on the two columns imputed above. lm() drops the rows whose response is NA
# when fitting, and those are exactly the rows predicted afterwards.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = dirty_iris)
missing <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[missing] <- predict(model, newdata = dirty_iris[missing, ])

# Any NA still left (in any column) is filled by 5-nearest-neighbour imputation.
iris_knn <- kNN(dirty_iris, k = 5)

# Keep only the original columns, discarding any columns kNN() appended.
# seq_len() stays correct even for a zero-column data frame, unlike 1:ncol().
iris_knn <- iris_knn[, seq_len(ncol(dirty_iris))]

# Inspect the imputed data. The console output that used to be pasted below
# was produced from the built-in iris data and has been dropped; re-run the
# script to regenerate it.
summary(iris_knn)
head(iris_knn)