#Question 3
# Download the raw CSV from GitHub into a data frame and show its structure.
dirty_iris_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(file = dirty_iris_url)
str(object = dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
#Question 4
# Number of rows that have no NA in any column.
n_complete <- sum(complete.cases(dirty_iris))
n_complete
## [1] 96
# Share of complete rows, derived from the data rather than typed in by hand
# so the figure stays correct if the input file changes.
100 * n_complete / nrow(dirty_iris)
## [1] 64
#Question 6
# Find infinite Petal.Width entries with is.infinite(): it tests the numeric
# value directly and catches both Inf and -Inf, whereas comparing against the
# string "Inf" relies on number-to-character coercion and would miss -Inf.
inf_petal_width <- which(is.infinite(dirty_iris$Petal.Width))
inf_petal_width
## [1] 86
# Replace whatever positions were found (a no-op when there are none) instead
# of a hand-copied row number.
dirty_iris$Petal.Width[inf_petal_width] <- NA
dirty_iris$Petal.Width
## [1] 1.5 2.5 2.3 0.4 1.0 0.2 NA 1.8 1.0 0.2 0.6 1.6 NA 1.4 0.4 1.0 1.3 0.2
## [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 NA 2.0 1.3 0.2 NA 1.3 1.3 1.3 0.1
## [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
## [55] 1.6 2.2 0.3 1.5 NA 2.1 NA 1.9 0.1 0.2 1.6 1.5 1.1 2.3 NA 0.3 1.4 2.3
## [73] 0.3 1.5 2.0 NA 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 NA 1.3 1.3 0.2 1.9
## [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 NA 1.3 0.4 1.0 2.0 NA
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 NA 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 NA
#Question 7
# Row numbers breaking each rule; which() silently skips NA comparisons.
sepal_width_vio <- which(dirty_iris[["Sepal.Width"]] <= 0)   # width must be positive
sepal_length_vio <- which(dirty_iris[["Sepal.Length"]] > 30) # length must not exceed 30
# union() lists the width violations first, then any new length violations.
all_violations <- union(sepal_width_vio, sepal_length_vio)
print(all_violations)
## [1] 16 130 28 125
#Question 8
# Rows 16 and 130 are the non-positive Sepal.Width rows reported in Question 7.
# Row 16 is overwritten with 3.
dirty_iris[16, "Sepal.Width"] <- 3
dirty_iris[16, "Sepal.Width"]
## [1] 3
# Row 130 is marked as missing so a later imputation step can fill it.
dirty_iris[130, "Sepal.Width"] <- NA
dirty_iris[130, "Sepal.Width"]
## [1] NA
#Question 9
# Mean imputation: every missing Sepal.Width is filled with the mean of the
# observed values (NAs are excluded from the mean via na.rm = TRUE).
mean_sepal_width <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
dirty_iris[["Sepal.Width"]] <- replace(
  dirty_iris[["Sepal.Width"]],
  is.na(dirty_iris[["Sepal.Width"]]),
  mean_sepal_width
)
print(dirty_iris[["Sepal.Width"]])
## [1] 3.200000 3.300000 3.462121 3.400000 2.600000 3.462121 2.700000
## [8] 3.000000 2.700000 3.100000 3.500000 2.700000 3.000000 2.800000
## [15] 3.900000 3.000000 3.462121 3.200000 4.000000 3.462121 3.600000
## [22] 3.462121 2.800000 3.300000 3.000000 3.200000 3.100000 29.000000
## [29] 3.200000 2.800000 3.200000 3.200000 2.800000 2.900000 2.900000
## [36] 3.000000 3.000000 2.200000 2.500000 3.000000 3.462121 2.700000
## [43] 3.462121 2.700000 4.200000 2.800000 3.462121 3.200000 3.000000
## [50] 3.400000 2.600000 3.100000 2.700000 3.400000 3.300000 3.800000
## [57] 3.800000 2.900000 2.800000 2.800000 2.300000 2.800000 3.000000
## [64] 3.300000 3.000000 2.500000 2.500000 3.200000 3.500000 3.500000
## [71] 3.000000 3.100000 3.500000 3.462121 2.800000 2.500000 3.500000
## [78] 3.000000 3.800000 3.800000 2.600000 3.400000 2.900000 3.700000
## [85] 3.000000 3.800000 2.900000 2.900000 2.900000 2.500000 3.200000
## [92] 3.462121 3.400000 2.700000 2.200000 3.100000 2.300000 3.462121
## [99] 3.000000 2.800000 3.400000 3.600000 2.700000 3.000000 3.700000
## [106] 3.462121 3.000000 3.000000 2.800000 3.400000 3.400000 3.400000
## [113] 3.400000 3.300000 3.100000 2.600000 3.462121 3.100000 3.000000
## [120] 2.800000 3.000000 2.300000 3.200000 4.100000 30.000000 2.900000
## [127] 3.200000 3.462121 3.600000 3.462121 2.500000 3.100000 3.462121
## [134] 3.300000 3.000000 3.000000 3.200000 3.000000 3.100000 2.200000
## [141] 3.462121 3.462121 3.000000 2.900000 2.500000 3.100000 3.000000
## [148] 3.500000 3.100000 2.600000
# Median imputation: every missing Petal.Length is filled with the median of
# the observed values. The median is used (not the mean) because the column
# still holds extreme entries such as 63, 23 and 14, which would drag a mean
# upwards and bias every imputed value.
median_petal_length <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- median_petal_length
dirty_iris$Petal.Length
## NOTE: echoed output not recorded here. The printed vectors stored in this
## file were captured from a run that filled Petal.Length with the mean
## (4.449962), so this step and the regression / kNN echoes that follow it
## must be regenerated by re-running the script.
# Regression imputation: model Sepal.Length on the other three measurements,
# then fill only the missing Sepal.Length entries with the model's predictions.
lm_model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  na.action = na.exclude
)
# Predict for every row so positions line up with the data frame's rows.
predicted_values <- predict(lm_model, newdata = dirty_iris)
missing_sepal_length <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[missing_sepal_length] <-
  predicted_values[missing_sepal_length]
print(dirty_iris$Sepal.Length)
## [1] 6.400000 6.300000 6.200000 5.000000 5.700000 5.300000 6.400000
## [8] 5.900000 5.800000 4.800000 5.000000 6.000000 6.000000 6.800000
## [15] 5.926511 5.000000 5.500000 4.700000 6.219012 5.600000 4.900000
## [22] 5.400000 6.200000 6.700000 6.762536 4.600000 4.900000 73.000000
## [29] 6.500000 5.050606 4.400000 5.900000 5.700000 6.200000 6.600000
## [36] 4.800000 6.500000 6.200000 6.700000 5.000000 5.000000 5.800000
## [43] 0.000000 5.800000 5.500000 7.700000 5.700000 7.000000 6.500000
## [50] 6.000000 5.500000 4.900000 5.200000 4.800000 6.300000 7.700000
## [57] 5.100000 5.850581 6.400000 6.400000 5.000000 7.400000 4.300000
## [64] 5.000000 7.200000 6.300000 5.100000 7.234609 5.100000 5.000000
## [71] 6.100000 6.900000 5.100000 6.500000 5.600000 4.900000 5.500000
## [78] 7.600000 5.100000 7.900000 6.100000 5.400000 6.100000 5.400000
## [85] 6.700000 5.100000 6.400000 5.700000 4.400000 6.300000 7.200000
## [92] 4.900000 5.200000 5.800000 6.000000 6.900000 5.500000 6.700000
## [99] 5.700000 6.300000 5.400000 7.200000 6.300000 5.600000 5.100000
## [106] 5.500000 6.500000 4.800000 6.100000 4.600000 6.300000 5.000000
## [113] 5.100000 7.188300 6.700000 7.700000 6.300000 4.600000 6.713254
## [120] 5.430421 5.900000 4.500000 6.400000 5.200000 49.000000 5.600000
## [127] 6.800000 5.800000 4.600000 5.700000 5.600000 6.700000 4.800000
## [134] 5.100000 4.400000 7.700000 4.700000 6.344665 6.900000 6.000000
## [141] 5.000000 5.500000 6.600000 6.300000 5.700000 6.700000 5.600000
## [148] 5.200000 6.400000 5.800000
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# kNN imputation: fill missing Petal.Width from the 5 nearest neighbours.
# imp_var = FALSE stops kNN() from appending a logical "Petal.Width_imp"
# indicator column, so dirty_iris keeps its original five columns.
dirty_iris <- kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  imp_var = FALSE
)
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0
dirty_iris$Petal.Width
## [1] 1.5 2.5 2.3 0.4 1.0 0.2 1.9 1.8 1.0 0.2 0.6 1.6 1.8 1.4 0.4 1.0 1.3 0.2
## [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 2.0 2.0 1.3 0.2 1.5 1.3 1.3 1.3 0.1
## [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
## [55] 1.6 2.2 0.3 1.5 1.8 2.1 1.1 1.9 0.1 0.2 1.6 1.5 1.1 2.3 0.2 0.3 1.4 2.3
## [73] 0.3 1.5 2.0 1.9 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 0.3 1.3 1.3 0.2 1.9
## [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 1.9 1.3 0.4 1.0 2.0 0.2
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 0.2 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 1.1