# R Markdown

## Question 3

# Load the raw "dirty iris" CSV directly from GitHub (needs network access).
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Count the missing (NA) entries in the Petal.Length column.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

## Question 4

# Number of rows that have no missing value in any column.
sum(complete.cases(dirty_iris))
## [1] 96
# Percentage of complete rows. The mean of the logical vector is the share of
# TRUE values, so the figure is derived from the data itself instead of
# retyping the row counts (96 and 150) by hand.
mean(complete.cases(dirty_iris)) * 100
## [1] 64

## Question 5

# Show the type and the leading values of every column.
str(object = dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...
# Count the infinite entries in the Petal.Width column.
sum(is.infinite(dirty_iris[["Petal.Width"]]))
## [1] 1

## Question 6

# Tabulate Petal.Width to spot special values; the table lists one Inf entry.
table(dirty_iris$Petal.Width)
## 
## 0.1 0.2 0.3 0.4 0.5 0.6   1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9   2 2.1 2.2 2.3 
##   5  26   6   6   1   1   6   3   4  13   8  12   4   1   8   4   7   6   2   8 
## 2.4 2.5 Inf 
##   3   3   1
# Locate the infinite entries with is.infinite() instead of comparing the
# numeric column to the string 'Inf': no implicit number-to-text coercion,
# and -Inf is caught as well. NA entries are never reported as infinite.
inf_rows <- which(is.infinite(dirty_iris$Petal.Width))
inf_rows
## [1] 86
# Blank out exactly the rows found above rather than a hand-typed index, so
# the step stays correct if the input data changes.
dirty_iris$Petal.Width[inf_rows] <- NA
dirty_iris$Petal.Width
##   [1] 1.5 2.5 2.3 0.4 1.0 0.2  NA 1.8 1.0 0.2 0.6 1.6  NA 1.4 0.4 1.0 1.3 0.2
##  [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1  NA 2.0 1.3 0.2  NA 1.3 1.3 1.3 0.1
##  [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
##  [55] 1.6 2.2 0.3 1.5  NA 2.1  NA 1.9 0.1 0.2 1.6 1.5 1.1 2.3  NA 0.3 1.4 2.3
##  [73] 0.3 1.5 2.0  NA 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3  NA 1.3 1.3 0.2 1.9
##  [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5  NA 1.3 0.4 1.0 2.0  NA
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5  NA 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8  NA

## Question 7

# Rows whose Sepal.Width is zero or negative (NA rows are dropped by which()).
sepal_width_negative <- which(dirty_iris[["Sepal.Width"]] <= 0)
# Rows whose Sepal.Length exceeds 30.
sepal_length_greaterthanthirty <- which(dirty_iris[["Sepal.Length"]] > 30)
# Rows breaking at least one of the two rules: width violations first, then
# length violations, duplicates removed.
both_violations <- union(
  sepal_width_negative,
  sepal_length_greaterthanthirty
)
both_violations
## [1]  16 130  28 125

## Question 8

# Manually correct the two rows flagged by the Sepal.Width <= 0 check
# (16 and 130 are the first two indices printed for both_violations).
# NOTE(review): the replacement value 3 and the choice of NA for row 130 are
# hand-picked; confirm them against the original values of these rows.
dirty_iris[16, "Sepal.Width"] <- 3
dirty_iris[130, "Sepal.Width"] <- NA
# Echo both cells to verify the corrections.
dirty_iris[16, "Sepal.Width"]
## [1] 3
dirty_iris[130, "Sepal.Width"]
## [1] NA

## Question 9

# Mean imputation: fill every missing Sepal.Width with the mean of the
# observed (non-NA) values.
# NOTE(review): the printed column still contains 29 and 30, which pull this
# mean upwards; confirm that is intended.
mean_sepal_width <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
dirty_iris[is.na(dirty_iris[["Sepal.Width"]]), "Sepal.Width"] <- mean_sepal_width
dirty_iris[["Sepal.Width"]]
##   [1]  3.200000  3.300000  3.462121  3.400000  2.600000  3.462121  2.700000
##   [8]  3.000000  2.700000  3.100000  3.500000  2.700000  3.000000  2.800000
##  [15]  3.900000  3.000000  3.462121  3.200000  4.000000  3.462121  3.600000
##  [22]  3.462121  2.800000  3.300000  3.000000  3.200000  3.100000 29.000000
##  [29]  3.200000  2.800000  3.200000  3.200000  2.800000  2.900000  2.900000
##  [36]  3.000000  3.000000  2.200000  2.500000  3.000000  3.462121  2.700000
##  [43]  3.462121  2.700000  4.200000  2.800000  3.462121  3.200000  3.000000
##  [50]  3.400000  2.600000  3.100000  2.700000  3.400000  3.300000  3.800000
##  [57]  3.800000  2.900000  2.800000  2.800000  2.300000  2.800000  3.000000
##  [64]  3.300000  3.000000  2.500000  2.500000  3.200000  3.500000  3.500000
##  [71]  3.000000  3.100000  3.500000  3.462121  2.800000  2.500000  3.500000
##  [78]  3.000000  3.800000  3.800000  2.600000  3.400000  2.900000  3.700000
##  [85]  3.000000  3.800000  2.900000  2.900000  2.900000  2.500000  3.200000
##  [92]  3.462121  3.400000  2.700000  2.200000  3.100000  2.300000  3.462121
##  [99]  3.000000  2.800000  3.400000  3.600000  2.700000  3.000000  3.700000
## [106]  3.462121  3.000000  3.000000  2.800000  3.400000  3.400000  3.400000
## [113]  3.400000  3.300000  3.100000  2.600000  3.462121  3.100000  3.000000
## [120]  2.800000  3.000000  2.300000  3.200000  4.100000 30.000000  2.900000
## [127]  3.200000  3.462121  3.600000  3.462121  2.500000  3.100000  3.462121
## [134]  3.300000  3.000000  3.000000  3.200000  3.000000  3.100000  2.200000
## [141]  3.462121  3.462121  3.000000  2.900000  2.500000  3.100000  3.000000
## [148]  3.500000  3.100000  2.600000
# Median imputation: fill every missing Petal.Length with the median of the
# observed (non-NA) values.
median_petal_length <- median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
dirty_iris[["Petal.Length"]][is.na(dirty_iris[["Petal.Length"]])] <-
  median_petal_length
dirty_iris[["Petal.Length"]]
##   [1]  4.500  6.000  5.400  1.600  3.500  4.500  5.300  5.100  4.100  1.600
##  [11]  1.600  5.100  4.800  4.800  1.700  3.500  4.000  1.300  4.500  4.200
##  [21]  4.500  4.500  4.500  5.700  5.900  1.400  1.500 63.000  5.100  0.820
##  [31]  4.500  4.800  4.500  4.500 23.000  1.400  5.500  4.500  5.800  1.600
##  [41]  1.200  3.900  1.300  5.100  1.400  6.700  4.500  4.700  5.800  4.500
##  [51]  4.400  4.500  3.900  1.600  4.700  6.700  1.500  4.500  5.600  5.600
##  [61]  3.300  6.100  1.100  1.400  5.800  4.900  4.500  5.700  4.500  1.300
##  [71]  4.600  5.100  1.400  4.600  4.900  4.500  1.300  6.600  0.000  6.400
##  [81]  5.600  1.700  4.700  1.500  5.200  1.900  4.300  4.200  1.400  5.000
##  [91]  6.000  3.300  1.400  5.100  5.000  4.500  4.000  5.000  4.200  5.100
## [101]  1.500  4.500  4.900  4.100  4.500  0.925  5.200  1.400  4.500  1.400
## [111]  4.500  1.500  1.500  5.700  4.700  6.900  4.400  1.500  5.500  4.700
## [121]  4.500  1.300  5.300  1.500 14.000  3.600  5.900  5.100  4.500  1.700
## [131]  3.900  4.400  1.900  1.700  1.300  4.500  1.600  4.900  5.400  4.000
## [141]  1.400  3.800  4.400  5.600  5.000  5.600  4.500  1.500  4.500  4.000
# Regression imputation: model Sepal.Length on the three other measurements.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  na.action = na.exclude
)
# Predict for every row of the data frame, then use the predictions only
# where Sepal.Length is missing; observed values are left untouched.
# NOTE(review): Petal.Width may still contain NA at this point (it is imputed
# later), so a row missing both values would stay NA; none appear in the
# printed result.
predicted_values <- predict(lm_model, newdata = dirty_iris)
dirty_iris[["Sepal.Length"]][is.na(dirty_iris[["Sepal.Length"]])] <-
  predicted_values[is.na(dirty_iris[["Sepal.Length"]])]
dirty_iris[["Sepal.Length"]]
##   [1]  6.400000  6.300000  6.200000  5.000000  5.700000  5.300000  6.400000
##   [8]  5.900000  5.800000  4.800000  5.000000  6.000000  6.000000  6.800000
##  [15]  5.925959  5.000000  5.500000  4.700000  6.223562  5.600000  4.900000
##  [22]  5.400000  6.200000  6.700000  6.761731  4.600000  4.900000 73.000000
##  [29]  6.500000  5.050877  4.400000  5.900000  5.700000  6.200000  6.600000
##  [36]  4.800000  6.500000  6.200000  6.700000  5.000000  5.000000  5.800000
##  [43]  0.000000  5.800000  5.500000  7.700000  5.700000  7.000000  6.500000
##  [50]  6.000000  5.500000  4.900000  5.200000  4.800000  6.300000  7.700000
##  [57]  5.100000  5.849790  6.400000  6.400000  5.000000  7.400000  4.300000
##  [64]  5.000000  7.200000  6.300000  5.100000  7.234040  5.100000  5.000000
##  [71]  6.100000  6.900000  5.100000  6.500000  5.600000  4.900000  5.500000
##  [78]  7.600000  5.100000  7.900000  6.100000  5.400000  6.100000  5.400000
##  [85]  6.700000  5.100000  6.400000  5.700000  4.400000  6.300000  7.200000
##  [92]  4.900000  5.200000  5.800000  6.000000  6.900000  5.500000  6.700000
##  [99]  5.700000  6.300000  5.400000  7.200000  6.300000  5.600000  5.100000
## [106]  5.500000  6.500000  4.800000  6.100000  4.600000  6.300000  5.000000
## [113]  5.100000  7.187596  6.700000  7.700000  6.300000  4.600000  6.712582
## [120]  5.429333  5.900000  4.500000  6.400000  5.200000 49.000000  5.600000
## [127]  6.800000  5.800000  4.600000  5.700000  5.600000  6.700000  4.800000
## [134]  5.100000  4.400000  7.700000  4.700000  6.343972  6.900000  6.000000
## [141]  5.000000  5.500000  6.600000  6.300000  5.700000  6.700000  5.600000
## [148]  5.200000  6.400000  5.800000
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# k-nearest-neighbour imputation of the missing Petal.Width values using the
# 5 nearest rows; the result overwrites dirty_iris.
# NOTE(review): per the VIM documentation kNN() defaults to imp_var = TRUE,
# which would append a Petal.Width_imp indicator column to the returned data
# frame; confirm whether that extra column is wanted.
dirty_iris <- VIM::kNN(data = dirty_iris, variable = "Petal.Width", k = 5)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0
dirty_iris[["Petal.Width"]]
##   [1] 1.5 2.5 2.3 0.4 1.0 0.2 1.9 1.8 1.0 0.2 0.6 1.6 1.8 1.4 0.4 1.0 1.3 0.2
##  [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 2.0 2.0 1.3 0.2 1.5 1.3 1.3 1.3 0.1
##  [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
##  [55] 1.6 2.2 0.3 1.5 1.8 2.1 1.1 1.9 0.1 0.2 1.6 1.5 1.1 2.3 0.2 0.3 1.4 2.3
##  [73] 0.3 1.5 2.0 1.9 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 0.3 1.3 1.3 0.2 1.9
##  [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 1.9 1.3 0.4 1.0 2.0 0.2
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 0.2 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 1.1