#Question 3

# Download the raw "dirty" iris CSV from GitHub into a data frame, then show
# its structure (dimensions, column types, first few values per column).
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
str(object = dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...

#Question 4

# Count the rows that have no missing value in any column.
sum(complete.cases(dirty_iris))
## [1] 96
# Percentage of complete rows. The mean of the logical vector is the share of
# TRUE values, so the figure is derived from the data itself instead of from
# hand-typed counts that would silently go stale if the input changed.
mean(complete.cases(dirty_iris)) * 100
## [1] 64

#Question 6

# Locate infinite entries in Petal.Width. is.infinite() tests the numeric
# value directly (catching both Inf and -Inf) instead of relying on an
# implicit numeric-to-character coercion against the string "Inf".
which(is.infinite(dirty_iris$Petal.Width))
## [1] 86
# Replace every infinite entry with NA. The logical mask keeps this step
# correct even if the offending row numbers change in a refreshed data file.
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA
dirty_iris$Petal.Width
##   [1] 1.5 2.5 2.3 0.4 1.0 0.2  NA 1.8 1.0 0.2 0.6 1.6  NA 1.4 0.4 1.0 1.3 0.2
##  [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1  NA 2.0 1.3 0.2  NA 1.3 1.3 1.3 0.1
##  [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
##  [55] 1.6 2.2 0.3 1.5  NA 2.1  NA 1.9 0.1 0.2 1.6 1.5 1.1 2.3  NA 0.3 1.4 2.3
##  [73] 0.3 1.5 2.0  NA 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3  NA 1.3 1.3 0.2 1.9
##  [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5  NA 1.3 0.4 1.0 2.0  NA
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5  NA 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8  NA

#Question 7

# Collect the row numbers that break each plausibility rule:
#   - Sepal.Width must be strictly positive
#   - Sepal.Length must not exceed 30
# which() drops comparisons that evaluate to NA, so rows with a missing
# measurement are not reported as violations.
sepal_width_vio <- with(dirty_iris, which(Sepal.Width <= 0))
sepal_length_vio <- with(dirty_iris, which(Sepal.Length > 30))
# Merge both index sets (width violations first) without duplicates.
all_violations <- unique(c(sepal_width_vio, sepal_length_vio))
all_violations
## [1]  16 130  28 125

#Question 8

# Manual corrections for the two rows flagged by the Sepal.Width <= 0 check
# (rows 16 and 130).
# NOTE(review): the original cell values are not shown here; presumably row 16
# held a sign error and row 130 an unusable zero -- confirm against the raw data.
dirty_iris[16, "Sepal.Width"] <- 3
dirty_iris[16, "Sepal.Width"]
## [1] 3
dirty_iris[130, "Sepal.Width"] <- NA
dirty_iris[130, "Sepal.Width"]
## [1] NA

#Question 9

# Mean imputation for Sepal.Width: compute the mean of the observed values and
# write it into every missing cell, then print the completed column.
# NOTE(review): the mean is taken over all non-missing values, including any
# that are implausibly large, so outliers pull the fill value upward.
mean_sepal_width <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
dirty_iris[["Sepal.Width"]] <- replace(
  dirty_iris[["Sepal.Width"]],
  is.na(dirty_iris[["Sepal.Width"]]),
  mean_sepal_width
)
dirty_iris[["Sepal.Width"]]
##   [1]  3.200000  3.300000  3.462121  3.400000  2.600000  3.462121  2.700000
##   [8]  3.000000  2.700000  3.100000  3.500000  2.700000  3.000000  2.800000
##  [15]  3.900000  3.000000  3.462121  3.200000  4.000000  3.462121  3.600000
##  [22]  3.462121  2.800000  3.300000  3.000000  3.200000  3.100000 29.000000
##  [29]  3.200000  2.800000  3.200000  3.200000  2.800000  2.900000  2.900000
##  [36]  3.000000  3.000000  2.200000  2.500000  3.000000  3.462121  2.700000
##  [43]  3.462121  2.700000  4.200000  2.800000  3.462121  3.200000  3.000000
##  [50]  3.400000  2.600000  3.100000  2.700000  3.400000  3.300000  3.800000
##  [57]  3.800000  2.900000  2.800000  2.800000  2.300000  2.800000  3.000000
##  [64]  3.300000  3.000000  2.500000  2.500000  3.200000  3.500000  3.500000
##  [71]  3.000000  3.100000  3.500000  3.462121  2.800000  2.500000  3.500000
##  [78]  3.000000  3.800000  3.800000  2.600000  3.400000  2.900000  3.700000
##  [85]  3.000000  3.800000  2.900000  2.900000  2.900000  2.500000  3.200000
##  [92]  3.462121  3.400000  2.700000  2.200000  3.100000  2.300000  3.462121
##  [99]  3.000000  2.800000  3.400000  3.600000  2.700000  3.000000  3.700000
## [106]  3.462121  3.000000  3.000000  2.800000  3.400000  3.400000  3.400000
## [113]  3.400000  3.300000  3.100000  2.600000  3.462121  3.100000  3.000000
## [120]  2.800000  3.000000  2.300000  3.200000  4.100000 30.000000  2.900000
## [127]  3.200000  3.462121  3.600000  3.462121  2.500000  3.100000  3.462121
## [134]  3.300000  3.000000  3.000000  3.200000  3.000000  3.100000  2.200000
## [141]  3.462121  3.462121  3.000000  2.900000  2.500000  3.100000  3.000000
## [148]  3.500000  3.100000  2.600000
# Median imputation for Petal.Length: fill every missing cell with the median
# of the observed values. The previous version computed mean() here while
# naming the result "median", so the column was mean-imputed instead.
median_petal_length <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- median_petal_length
# NOTE: the echoed output recorded after this line (and any later output that
# depends on Petal.Length) was produced while this step still used mean()
# (fill value 4.449962); re-run the script to refresh it.
dirty_iris$Petal.Length
##   [1]  4.500000  6.000000  5.400000  1.600000  3.500000  4.449962  5.300000
##   [8]  5.100000  4.100000  1.600000  1.600000  5.100000  4.800000  4.800000
##  [15]  1.700000  3.500000  4.000000  1.300000  4.449962  4.200000  4.449962
##  [22]  4.500000  4.449962  5.700000  5.900000  1.400000  1.500000 63.000000
##  [29]  5.100000  0.820000  4.449962  4.800000  4.500000  4.449962 23.000000
##  [36]  1.400000  5.500000  4.500000  5.800000  1.600000  1.200000  3.900000
##  [43]  1.300000  5.100000  1.400000  6.700000  4.449962  4.700000  5.800000
##  [50]  4.500000  4.400000  4.449962  3.900000  1.600000  4.700000  6.700000
##  [57]  1.500000  4.500000  5.600000  5.600000  3.300000  6.100000  1.100000
##  [64]  1.400000  5.800000  4.900000  4.449962  5.700000  4.449962  1.300000
##  [71]  4.600000  5.100000  1.400000  4.600000  4.900000  4.500000  1.300000
##  [78]  6.600000  0.000000  6.400000  5.600000  1.700000  4.700000  1.500000
##  [85]  5.200000  1.900000  4.300000  4.200000  1.400000  5.000000  6.000000
##  [92]  3.300000  1.400000  5.100000  5.000000  4.449962  4.000000  5.000000
##  [99]  4.200000  5.100000  1.500000  4.449962  4.900000  4.100000  4.449962
## [106]  0.925000  5.200000  1.400000  4.449962  1.400000  4.449962  1.500000
## [113]  1.500000  5.700000  4.700000  6.900000  4.400000  1.500000  5.500000
## [120]  4.700000  4.449962  1.300000  5.300000  1.500000 14.000000  3.600000
## [127]  5.900000  5.100000  4.449962  1.700000  3.900000  4.400000  1.900000
## [134]  1.700000  1.300000  4.449962  1.600000  4.900000  5.400000  4.000000
## [141]  1.400000  3.800000  4.400000  5.600000  5.000000  5.600000  4.500000
## [148]  1.500000  4.449962  4.000000
# Regression imputation for Sepal.Length: fit a linear model on the other
# three measurements, predict for every row, and use the prediction only
# where Sepal.Length is missing.
lm_model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  na.action = na.exclude
)
predicted_values <- predict(lm_model, newdata = dirty_iris)
# Observed values are kept as they are; only the NA positions are overwritten.
# NOTE(review): Petal.Width may still contain NA at this point, in which case
# the prediction for such a row is presumably NA as well -- verify.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predicted_values[is.na(dirty_iris$Sepal.Length)]
)
dirty_iris$Sepal.Length
##   [1]  6.400000  6.300000  6.200000  5.000000  5.700000  5.300000  6.400000
##   [8]  5.900000  5.800000  4.800000  5.000000  6.000000  6.000000  6.800000
##  [15]  5.926511  5.000000  5.500000  4.700000  6.219012  5.600000  4.900000
##  [22]  5.400000  6.200000  6.700000  6.762536  4.600000  4.900000 73.000000
##  [29]  6.500000  5.050606  4.400000  5.900000  5.700000  6.200000  6.600000
##  [36]  4.800000  6.500000  6.200000  6.700000  5.000000  5.000000  5.800000
##  [43]  0.000000  5.800000  5.500000  7.700000  5.700000  7.000000  6.500000
##  [50]  6.000000  5.500000  4.900000  5.200000  4.800000  6.300000  7.700000
##  [57]  5.100000  5.850581  6.400000  6.400000  5.000000  7.400000  4.300000
##  [64]  5.000000  7.200000  6.300000  5.100000  7.234609  5.100000  5.000000
##  [71]  6.100000  6.900000  5.100000  6.500000  5.600000  4.900000  5.500000
##  [78]  7.600000  5.100000  7.900000  6.100000  5.400000  6.100000  5.400000
##  [85]  6.700000  5.100000  6.400000  5.700000  4.400000  6.300000  7.200000
##  [92]  4.900000  5.200000  5.800000  6.000000  6.900000  5.500000  6.700000
##  [99]  5.700000  6.300000  5.400000  7.200000  6.300000  5.600000  5.100000
## [106]  5.500000  6.500000  4.800000  6.100000  4.600000  6.300000  5.000000
## [113]  5.100000  7.188300  6.700000  7.700000  6.300000  4.600000  6.713254
## [120]  5.430421  5.900000  4.500000  6.400000  5.200000 49.000000  5.600000
## [127]  6.800000  5.800000  4.600000  5.700000  5.600000  6.700000  4.800000
## [134]  5.100000  4.400000  7.700000  4.700000  6.344665  6.900000  6.000000
## [141]  5.000000  5.500000  6.600000  6.300000  5.700000  6.700000  5.600000
## [148]  5.200000  6.400000  5.800000
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# k-nearest-neighbour imputation (k = 5) for the missing Petal.Width values;
# the returned data frame replaces dirty_iris.
# NOTE(review): per the VIM documentation, kNN() by default also appends a
# logical indicator column (here presumably "Petal.Width_imp") -- confirm
# whether that extra column is wanted downstream.
dirty_iris <- kNN(data = dirty_iris, variable = "Petal.Width", k = 5)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
dirty_iris[["Petal.Width"]]
##   [1] 1.5 2.5 2.3 0.4 1.0 0.2 1.9 1.8 1.0 0.2 0.6 1.6 1.8 1.4 0.4 1.0 1.3 0.2
##  [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 2.0 2.0 1.3 0.2 1.5 1.3 1.3 1.3 0.1
##  [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
##  [55] 1.6 2.2 0.3 1.5 1.8 2.1 1.1 1.9 0.1 0.2 1.6 1.5 1.1 2.3 0.2 0.3 1.4 2.3
##  [73] 0.3 1.5 2.0 1.9 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 0.3 1.3 1.3 0.2 1.9
##  [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 1.9 1.3 0.4 1.0 2.0 0.2
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 0.2 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 1.1