##Question 3
# Load the deliberately corrupted iris data set straight from GitHub.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing (NA) values in Petal.Length.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
##Question 4
# Number of rows with no missing value in any column.
sum(complete.cases(dirty_iris))
## [1] 96
# Percentage of complete rows, derived from the data rather than from
# hand-copied counts so it stays correct if the file changes.
mean(complete.cases(dirty_iris)) * 100
## [1] 64
##Question 5
# Inspect column types before looking for special values.
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
# Count the infinite (Inf or -Inf) entries in Petal.Width.
sum(is.infinite(dirty_iris$Petal.Width))
## [1] 1
##Question 6
# Frequency table of Petal.Width; the Inf entry shows up as its own level.
table(dirty_iris$Petal.Width)
##
## 0.1 0.2 0.3 0.4 0.5 0.6 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3
## 5 26 6 6 1 1 6 3 4 13 8 12 4 1 8 4 7 6 2 8
## 2.4 2.5 Inf
## 3 3 1
# Locate the infinite entries with is.infinite() instead of comparing the
# numeric column against the string 'Inf' (which relies on implicit coercion
# and would miss -Inf).
which(is.infinite(dirty_iris$Petal.Width))
## [1] 86
# Replace every infinite value with NA using a logical mask, so the fix does
# not depend on a hard-coded row number.
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA
dirty_iris$Petal.Width
## [1] 1.5 2.5 2.3 0.4 1.0 0.2 NA 1.8 1.0 0.2 0.6 1.6 NA 1.4 0.4 1.0 1.3 0.2
## [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 NA 2.0 1.3 0.2 NA 1.3 1.3 1.3 0.1
## [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
## [55] 1.6 2.2 0.3 1.5 NA 2.1 NA 1.9 0.1 0.2 1.6 1.5 1.1 2.3 NA 0.3 1.4 2.3
## [73] 0.3 1.5 2.0 NA 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 NA 1.3 1.3 0.2 1.9
## [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 NA 1.3 0.4 1.0 2.0 NA
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 NA 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 NA
##Question 7
# Rows whose sepal width is zero or negative. which() drops rows where the
# comparison is NA, so missing widths are not reported as violations.
sepal_width_negative <- with(dirty_iris, which(Sepal.Width <= 0))
# Rows whose sepal length exceeds 30.
sepal_length_greaterthanthirty <- with(dirty_iris, which(Sepal.Length > 30))
# Row numbers breaking either rule: width violations first, then length
# violations, without duplicates.
both_violations <- union(
  x = sepal_width_negative,
  y = sepal_length_greaterthanthirty
)
print(both_violations)
## [1] 16 130 28 125
##Question 8
# Repair non-positive sepal widths by rule instead of by hard-coded row number:
#   * negative values are treated as sign errors and replaced by their
#     absolute value;
#   * exact zeros are treated as missing and set to NA.
# NOTE(review): this assumes the negative entry (row 16 in the current file)
# is a pure sign error, i.e. the raw value is -3 -- confirm against the data.
# Both index sets are computed before any assignment so the two fixes cannot
# interfere with each other; which() also skips NA comparisons.
negative_width_rows <- which(dirty_iris$Sepal.Width < 0)
zero_width_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[negative_width_rows] <-
  abs(dirty_iris$Sepal.Width[negative_width_rows])
dirty_iris$Sepal.Width[zero_width_rows] <- NA
# Spot-check the two rows flagged as violations in Question 7.
dirty_iris$Sepal.Width[16]
## [1] 3
dirty_iris$Sepal.Width[130]
## [1] NA
##Question 9
# Mean imputation: fill every missing Sepal.Width with the mean of the
# observed values.
# NOTE(review): the printed column still contains the values 29 and 30, so
# the mean used here is computed with those entries included -- confirm that
# is intended.
mean_sepal_width <- with(dirty_iris, mean(Sepal.Width, na.rm = TRUE))
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean_sepal_width
)
print(dirty_iris$Sepal.Width)
## [1] 3.200000 3.300000 3.462121 3.400000 2.600000 3.462121 2.700000
## [8] 3.000000 2.700000 3.100000 3.500000 2.700000 3.000000 2.800000
## [15] 3.900000 3.000000 3.462121 3.200000 4.000000 3.462121 3.600000
## [22] 3.462121 2.800000 3.300000 3.000000 3.200000 3.100000 29.000000
## [29] 3.200000 2.800000 3.200000 3.200000 2.800000 2.900000 2.900000
## [36] 3.000000 3.000000 2.200000 2.500000 3.000000 3.462121 2.700000
## [43] 3.462121 2.700000 4.200000 2.800000 3.462121 3.200000 3.000000
## [50] 3.400000 2.600000 3.100000 2.700000 3.400000 3.300000 3.800000
## [57] 3.800000 2.900000 2.800000 2.800000 2.300000 2.800000 3.000000
## [64] 3.300000 3.000000 2.500000 2.500000 3.200000 3.500000 3.500000
## [71] 3.000000 3.100000 3.500000 3.462121 2.800000 2.500000 3.500000
## [78] 3.000000 3.800000 3.800000 2.600000 3.400000 2.900000 3.700000
## [85] 3.000000 3.800000 2.900000 2.900000 2.900000 2.500000 3.200000
## [92] 3.462121 3.400000 2.700000 2.200000 3.100000 2.300000 3.462121
## [99] 3.000000 2.800000 3.400000 3.600000 2.700000 3.000000 3.700000
## [106] 3.462121 3.000000 3.000000 2.800000 3.400000 3.400000 3.400000
## [113] 3.400000 3.300000 3.100000 2.600000 3.462121 3.100000 3.000000
## [120] 2.800000 3.000000 2.300000 3.200000 4.100000 30.000000 2.900000
## [127] 3.200000 3.462121 3.600000 3.462121 2.500000 3.100000 3.462121
## [134] 3.300000 3.000000 3.000000 3.200000 3.000000 3.100000 2.200000
## [141] 3.462121 3.462121 3.000000 2.900000 2.500000 3.100000 3.000000
## [148] 3.500000 3.100000 2.600000
# Median imputation: fill every missing Petal.Length with the median of the
# observed values.
median_petal_length <- with(dirty_iris, median(Petal.Length, na.rm = TRUE))
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median_petal_length
)
print(dirty_iris$Petal.Length)
## [1] 4.500 6.000 5.400 1.600 3.500 4.500 5.300 5.100 4.100 1.600
## [11] 1.600 5.100 4.800 4.800 1.700 3.500 4.000 1.300 4.500 4.200
## [21] 4.500 4.500 4.500 5.700 5.900 1.400 1.500 63.000 5.100 0.820
## [31] 4.500 4.800 4.500 4.500 23.000 1.400 5.500 4.500 5.800 1.600
## [41] 1.200 3.900 1.300 5.100 1.400 6.700 4.500 4.700 5.800 4.500
## [51] 4.400 4.500 3.900 1.600 4.700 6.700 1.500 4.500 5.600 5.600
## [61] 3.300 6.100 1.100 1.400 5.800 4.900 4.500 5.700 4.500 1.300
## [71] 4.600 5.100 1.400 4.600 4.900 4.500 1.300 6.600 0.000 6.400
## [81] 5.600 1.700 4.700 1.500 5.200 1.900 4.300 4.200 1.400 5.000
## [91] 6.000 3.300 1.400 5.100 5.000 4.500 4.000 5.000 4.200 5.100
## [101] 1.500 4.500 4.900 4.100 4.500 0.925 5.200 1.400 4.500 1.400
## [111] 4.500 1.500 1.500 5.700 4.700 6.900 4.400 1.500 5.500 4.700
## [121] 4.500 1.300 5.300 1.500 14.000 3.600 5.900 5.100 4.500 1.700
## [131] 3.900 4.400 1.900 1.700 1.300 4.500 1.600 4.900 5.400 4.000
## [141] 1.400 3.800 4.400 5.600 5.000 5.600 4.500 1.500 4.500 4.000
# Regression imputation: model Sepal.Length from the other three numeric
# measurements. Rows with missing values are excluded from the fit.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  na.action = na.exclude
)
# Predict for every row of the data set, so positions line up with the
# data-frame rows.
predicted_values <- predict(lm_model, newdata = dirty_iris)
# Keep observed sepal lengths and use the model prediction only where the
# value is missing.
dirty_iris$Sepal.Length <- ifelse(
  is.na(dirty_iris$Sepal.Length),
  predicted_values,
  dirty_iris$Sepal.Length
)
print(dirty_iris$Sepal.Length)
## [1] 6.400000 6.300000 6.200000 5.000000 5.700000 5.300000 6.400000
## [8] 5.900000 5.800000 4.800000 5.000000 6.000000 6.000000 6.800000
## [15] 5.925959 5.000000 5.500000 4.700000 6.223562 5.600000 4.900000
## [22] 5.400000 6.200000 6.700000 6.761731 4.600000 4.900000 73.000000
## [29] 6.500000 5.050877 4.400000 5.900000 5.700000 6.200000 6.600000
## [36] 4.800000 6.500000 6.200000 6.700000 5.000000 5.000000 5.800000
## [43] 0.000000 5.800000 5.500000 7.700000 5.700000 7.000000 6.500000
## [50] 6.000000 5.500000 4.900000 5.200000 4.800000 6.300000 7.700000
## [57] 5.100000 5.849790 6.400000 6.400000 5.000000 7.400000 4.300000
## [64] 5.000000 7.200000 6.300000 5.100000 7.234040 5.100000 5.000000
## [71] 6.100000 6.900000 5.100000 6.500000 5.600000 4.900000 5.500000
## [78] 7.600000 5.100000 7.900000 6.100000 5.400000 6.100000 5.400000
## [85] 6.700000 5.100000 6.400000 5.700000 4.400000 6.300000 7.200000
## [92] 4.900000 5.200000 5.800000 6.000000 6.900000 5.500000 6.700000
## [99] 5.700000 6.300000 5.400000 7.200000 6.300000 5.600000 5.100000
## [106] 5.500000 6.500000 4.800000 6.100000 4.600000 6.300000 5.000000
## [113] 5.100000 7.187596 6.700000 7.700000 6.300000 4.600000 6.712582
## [120] 5.429333 5.900000 4.500000 6.400000 5.200000 49.000000 5.600000
## [127] 6.800000 5.800000 4.600000 5.700000 5.600000 6.700000 4.800000
## [134] 5.100000 4.400000 7.700000 4.700000 6.343972 6.900000 6.000000
## [141] 5.000000 5.500000 6.600000 6.300000 5.700000 6.700000 5.600000
## [148] 5.200000 6.400000 5.800000
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# k-nearest-neighbour imputation (VIM::kNN) of the missing Petal.Width
# values, using the 5 nearest rows. The result replaces dirty_iris.
# NOTE(review): with its defaults kNN presumably also appends a
# Petal.Width_imp indicator column -- confirm against the VIM documentation.
dirty_iris <- kNN(data = dirty_iris, variable = "Petal.Width", k = 5)
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0
print(dirty_iris[["Petal.Width"]])
## [1] 1.5 2.5 2.3 0.4 1.0 0.2 1.9 1.8 1.0 0.2 0.6 1.6 1.8 1.4 0.4 1.0 1.3 0.2
## [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 2.0 2.0 1.3 0.2 1.5 1.3 1.3 1.3 0.1
## [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
## [55] 1.6 2.2 0.3 1.5 1.8 2.1 1.1 1.9 0.1 0.2 1.6 1.5 1.1 2.3 0.2 0.3 1.4 2.3
## [73] 0.3 1.5 2.0 1.9 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 0.3 1.3 1.3 0.2 1.9
## [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 1.9 1.3 0.4 1.0 2.0 0.2
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 0.2 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 1.1