## Question 1
# Load the dirty iris data set from GitHub, then count how many
# Petal.Length entries are missing.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19
## Question 2
# Number of rows that have no missing value in any column.
complete_cases_dirty_iris <- sum(complete.cases(dirty_iris))
# Total number of rows; computed once and reused for the percentage below.
total_obsv_dirty_iris <- nrow(dirty_iris)
complete_cases_dirty_iris
## [1] 96
# Share of complete rows, expressed as a percentage.
(complete_cases_dirty_iris / total_obsv_dirty_iris) * 100
## [1] 64
## Question 3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Total number of missing cells across the whole table.
sum(is.na(dirty_iris))
## [1] 58
# Inf, -Inf and NaN can only occur in numeric columns, so gather those
# columns into one vector and test the values directly. Pattern-matching the
# data frame as text would coerce each column to a single string (counting
# columns rather than cells), and the pattern "Inf" would also match "-Inf".
numeric_values <- unlist(
  dirty_iris[vapply(dirty_iris, is.numeric, logical(1))],
  use.names = FALSE
)
# Cells equal to positive infinity.
sum(is.infinite(numeric_values) & numeric_values > 0)
## [1] 1
# Cells equal to negative infinity.
sum(is.infinite(numeric_values) & numeric_values < 0)
## [1] 0
# Cells that are NaN.
sum(is.nan(numeric_values))
## [1] 0
## Question 4
# Replace every infinite value (Inf or -Inf) with NA, column by column.
# is.infinite() is FALSE for all non-numeric entries, so other columns pass
# through untouched; assigning into dirty_iris[] keeps the data frame
# structure intact.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  column[is.infinite(column)] <- NA
  column
})
# Verify the replacement: count the infinite cells that remain (expected 0).
sum(vapply(dirty_iris, function(column) sum(is.infinite(column)), integer(1)))
## [1] 0
## Question 5
# Tally rule violations: sepal widths that are not strictly positive plus
# sepal lengths above 30. which() skips NA comparisons, so missing
# measurements are not counted.
total_violations <- with(
  dirty_iris,
  length(which(Sepal.Width <= 0)) + length(which(Sepal.Length > 30))
)
total_violations
## [1] 4
## Question 6
# Mark sepal widths of exactly zero as missing.
is.na(dirty_iris$Sepal.Width) <- which(dirty_iris$Sepal.Width == 0)
# Collect the rows that still break a rule: a non-positive sepal width or a
# sepal length above 30. subset() drops rows whose condition evaluates to NA,
# so missing measurements never flag a row on their own.
errors <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
errors
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16             5          -3          3.5           1 versicolor
## 28            73          29         63.0          NA  virginica
## 125           49          30         14.0           2     setosa
## Question 7: Part 1
# Mean imputation: fill each missing Sepal.Width with the mean of the
# observed (non-missing) widths.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Show any Sepal.Width values that are still missing (none expected).
dirty_iris$Sepal.Width[which(is.na(dirty_iris$Sepal.Width))]
## numeric(0)
## Question 7: Part 2
# Median imputation: fill each missing Petal.Length with the median of the
# observed (non-missing) lengths.
dirty_iris[["Petal.Length"]] <- replace(
  dirty_iris[["Petal.Length"]],
  is.na(dirty_iris[["Petal.Length"]]),
  median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
)
# Show any Petal.Length values that are still missing (none expected).
dirty_iris[["Petal.Length"]][which(is.na(dirty_iris[["Petal.Length"]]))]
## numeric(0)
## Question 7: Part 3
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
# Impute Petal.Width with VIM's kNN(), then display the contiguous run of
# columns from Sepal.Length through Species.
iris2 <- kNN(dirty_iris, variable = "Petal.Width")
iris2[match("Sepal.Length", names(iris2)):match("Species", names(iris2))]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            6.4    3.200000        4.500         1.5 versicolor
## 2            6.3    3.300000        6.000         2.5  virginica
## 3            6.2    3.416667        5.400         2.3  virginica
## 4            5.0    3.400000        1.600         0.4     setosa
## 5            5.7    2.600000        3.500         1.0 versicolor
## 6            5.3    3.416667        4.500         0.2     setosa
## 7            6.4    2.700000        5.300         1.9  virginica
## 8            5.9    3.000000        5.100         1.8  virginica
## 9            5.8    2.700000        4.100         1.0 versicolor
## 10           4.8    3.100000        1.600         0.2     setosa
## 11           5.0    3.500000        1.600         0.6     setosa
## 12           6.0    2.700000        5.100         1.6 versicolor
## 13           6.0    3.000000        4.800         1.8  virginica
## 14           6.8    2.800000        4.800         1.4 versicolor
## 15            NA    3.900000        1.700         0.4     setosa
## 16           5.0   -3.000000        3.500         1.0 versicolor
## 17           5.5    3.416667        4.000         1.3 versicolor
## 18           4.7    3.200000        1.300         0.2     setosa
## 19            NA    4.000000        4.500         0.2     setosa
## 20           5.6    3.416667        4.200         1.3 versicolor
## 21           4.9    3.600000        4.500         0.1     setosa
## 22           5.4    3.416667        4.500         1.5 versicolor
## 23           6.2    2.800000        4.500         1.8  virginica
## 24           6.7    3.300000        5.700         2.5  virginica
## 25            NA    3.000000        5.900         2.1  virginica
## 26           4.6    3.200000        1.400         0.2     setosa
## 27           4.9    3.100000        1.500         0.1     setosa
## 28          73.0   29.000000       63.000         2.0  virginica
## 29           6.5    3.200000        5.100         2.0  virginica
## 30            NA    2.800000        0.820         1.3 versicolor
## 31           4.4    3.200000        4.500         0.2     setosa
## 32           5.9    3.200000        4.800         1.5 versicolor
## 33           5.7    2.800000        4.500         1.3 versicolor
## 34           6.2    2.900000        4.500         1.3 versicolor
## 35           6.6    2.900000       23.000         1.3 versicolor
## 36           4.8    3.000000        1.400         0.1     setosa
## 37           6.5    3.000000        5.500         1.8  virginica
## 38           6.2    2.200000        4.500         1.5 versicolor
## 39           6.7    2.500000        5.800         1.8  virginica
## 40           5.0    3.000000        1.600         0.2     setosa
## 41           5.0    3.416667        1.200         0.2     setosa
## 42           5.8    2.700000        3.900         1.2 versicolor
## 43           0.0    3.416667        1.300         0.4     setosa
## 44           5.8    2.700000        5.100         1.9  virginica
## 45           5.5    4.200000        1.400         0.2     setosa
## 46           7.7    2.800000        6.700         2.0  virginica
## 47           5.7    3.416667        4.500         0.4     setosa
## 48           7.0    3.200000        4.700         1.4 versicolor
## 49           6.5    3.000000        5.800         2.2  virginica
## 50           6.0    3.400000        4.500         1.6 versicolor
## 51           5.5    2.600000        4.400         1.2 versicolor
## 52           4.9    3.100000        4.500         0.2     setosa
## 53           5.2    2.700000        3.900         1.4 versicolor
## 54           4.8    3.400000        1.600         0.2     setosa
## 55           6.3    3.300000        4.700         1.6 versicolor
## 56           7.7    3.800000        6.700         2.2  virginica
## 57           5.1    3.800000        1.500         0.3     setosa
## 58            NA    2.900000        4.500         1.5 versicolor
## 59           6.4    2.800000        5.600         1.8  virginica
## 60           6.4    2.800000        5.600         2.1  virginica
## 61           5.0    2.300000        3.300         1.1 versicolor
## 62           7.4    2.800000        6.100         1.9  virginica
## 63           4.3    3.000000        1.100         0.1     setosa
## 64           5.0    3.300000        1.400         0.2     setosa
## 65           7.2    3.000000        5.800         1.6  virginica
## 66           6.3    2.500000        4.900         1.5 versicolor
## 67           5.1    2.500000        4.500         1.1 versicolor
## 68            NA    3.200000        5.700         2.3  virginica
## 69           5.1    3.500000        4.500         0.2     setosa
## 70           5.0    3.500000        1.300         0.3     setosa
## 71           6.1    3.000000        4.600         1.4 versicolor
## 72           6.9    3.100000        5.100         2.3  virginica
## 73           5.1    3.500000        1.400         0.3     setosa
## 74           6.5    3.416667        4.600         1.5 versicolor
## 75           5.6    2.800000        4.900         2.0  virginica
## 76           4.9    2.500000        4.500         1.9  virginica
## 77           5.5    3.500000        1.300         0.2     setosa
## 78           7.6    3.000000        6.600         2.1  virginica
## 79           5.1    3.800000        0.000         0.2     setosa
## 80           7.9    3.800000        6.400         2.0  virginica
## 81           6.1    2.600000        5.600         1.4  virginica
## 82           5.4    3.400000        1.700         0.2     setosa
## 83           6.1    2.900000        4.700         1.4 versicolor
## 84           5.4    3.700000        1.500         0.2     setosa
## 85           6.7    3.000000        5.200         2.3  virginica
## 86           5.1    3.800000        1.900         0.2     setosa
## 87           6.4    2.900000        4.300         1.3 versicolor
## 88           5.7    2.900000        4.200         1.3 versicolor
## 89           4.4    2.900000        1.400         0.2     setosa
## 90           6.3    2.500000        5.000         1.9  virginica
## 91           7.2    3.200000        6.000         1.8  virginica
## 92           4.9    3.416667        3.300         1.0 versicolor
## 93           5.2    3.400000        1.400         0.2     setosa
## 94           5.8    2.700000        5.100         1.9  virginica
## 95           6.0    2.200000        5.000         1.5  virginica
## 96           6.9    3.100000        4.500         1.5 versicolor
## 97           5.5    2.300000        4.000         1.3 versicolor
## 98           6.7    3.416667        5.000         1.7 versicolor
## 99           5.7    3.000000        4.200         1.2 versicolor
## 100          6.3    2.800000        5.100         1.5  virginica
## 101          5.4    3.400000        1.500         0.4     setosa
## 102          7.2    3.600000        4.500         2.5  virginica
## 103          6.3    2.700000        4.900         1.9  virginica
## 104          5.6    3.000000        4.100         1.3 versicolor
## 105          5.1    3.700000        4.500         0.4     setosa
## 106          5.5    3.416667        0.925         1.0 versicolor
## 107          6.5    3.000000        5.200         2.0  virginica
## 108          4.8    3.000000        1.400         0.2     setosa
## 109          6.1    2.800000        4.500         1.3 versicolor
## 110          4.6    3.400000        1.400         0.3     setosa
## 111          6.3    3.400000        4.500         2.4  virginica
## 112          5.0    3.400000        1.500         0.2     setosa
## 113          5.1    3.400000        1.500         0.2     setosa
## 114           NA    3.300000        5.700         2.1  virginica
## 115          6.7    3.100000        4.700         1.5 versicolor
## 116          7.7    2.600000        6.900         2.3  virginica
## 117          6.3    3.416667        4.400         1.3 versicolor
## 118          4.6    3.100000        1.500         0.2     setosa
## 119           NA    3.000000        5.500         2.1  virginica
## 120           NA    2.800000        4.700         1.2 versicolor
## 121          5.9    3.000000        4.500         1.5 versicolor
## 122          4.5    2.300000        1.300         0.3     setosa
## 123          6.4    3.200000        5.300         2.3  virginica
## 124          5.2    4.100000        1.500         0.1     setosa
## 125         49.0   30.000000       14.000         2.0     setosa
## 126          5.6    2.900000        3.600         1.3 versicolor
## 127          6.8    3.200000        5.900         2.3  virginica
## 128          5.8    3.416667        5.100         2.4  virginica
## 129          4.6    3.600000        4.500         0.2     setosa
## 130          5.7    3.416667        1.700         0.3     setosa
## 131          5.6    2.500000        3.900         1.1 versicolor
## 132          6.7    3.100000        4.400         1.4 versicolor
## 133          4.8    3.416667        1.900         0.2     setosa
## 134          5.1    3.300000        1.700         0.5     setosa
## 135          4.4    3.000000        1.300         0.2     setosa
## 136          7.7    3.000000        4.500         2.3  virginica
## 137          4.7    3.200000        1.600         0.2     setosa
## 138           NA    3.000000        4.900         1.8  virginica
## 139          6.9    3.100000        5.400         2.1  virginica
## 140          6.0    2.200000        4.000         1.0 versicolor
## 141          5.0    3.416667        1.400         0.2     setosa
## 142          5.5    3.416667        3.800         1.1 versicolor
## 143          6.6    3.000000        4.400         1.4 versicolor
## 144          6.3    2.900000        5.600         1.8  virginica
## 145          5.7    2.500000        5.000         2.0  virginica
## 146          6.7    3.100000        5.600         2.4  virginica
## 147          5.6    3.000000        4.500         1.5 versicolor
## 148          5.2    3.500000        1.500         0.2     setosa
## 149          6.4    3.100000        4.500         1.8  virginica
## 150          5.8    2.600000        4.000         1.1 versicolor
## Question 7: Part 4
# Reload the raw data so the regression imputation starts from the original
# values rather than the earlier imputations.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Linear regression of sepal length on sepal width and petal length.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length,
            data = dirty_iris)

# Flag the rows whose sepal length is missing and overwrite them with the
# model's predictions for those same rows.
I <- is.na(dirty_iris[["Sepal.Length"]])
dirty_iris[I, "Sepal.Length"] <- predict(model, newdata = dirty_iris[I, ])

dirty_iris
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1       6.400000         3.2        4.500         1.5 versicolor
## 2       6.300000         3.3        6.000         2.5  virginica
## 3       6.200000          NA        5.400         2.3  virginica
## 4       5.000000         3.4        1.600         0.4     setosa
## 5       5.700000         2.6        3.500         1.0 versicolor
## 6       5.300000          NA           NA         0.2     setosa
## 7       6.400000         2.7        5.300          NA  virginica
## 8       5.900000         3.0        5.100         1.8  virginica
## 9       5.800000         2.7        4.100         1.0 versicolor
## 10      4.800000         3.1        1.600         0.2     setosa
## 11      5.000000         3.5        1.600         0.6     setosa
## 12      6.000000         2.7        5.100         1.6 versicolor
## 13      6.000000         3.0        4.800          NA  virginica
## 14      6.800000         2.8        4.800         1.4 versicolor
## 15      6.138018         3.9        1.700         0.4     setosa
## 16      5.000000        -3.0        3.500         1.0 versicolor
## 17      5.500000          NA        4.000         1.3 versicolor
## 18      4.700000         3.2        1.300         0.2     setosa
## 19            NA         4.0           NA         0.2     setosa
## 20      5.600000          NA        4.200         1.3 versicolor
## 21      4.900000         3.6           NA         0.1     setosa
## 22      5.400000          NA        4.500         1.5 versicolor
## 23      6.200000         2.8           NA         1.8  virginica
## 24      6.700000         3.3        5.700         2.5  virginica
## 25      6.872562         3.0        5.900         2.1  virginica
## 26      4.600000         3.2        1.400         0.2     setosa
## 27      4.900000         3.1        1.500         0.1     setosa
## 28     73.000000        29.0       63.000          NA  virginica
## 29      6.500000         3.2        5.100         2.0  virginica
## 30      4.174884         2.8        0.820         1.3 versicolor
## 31      4.400000         3.2           NA         0.2     setosa
## 32      5.900000         3.2        4.800          NA versicolor
## 33      5.700000         2.8        4.500         1.3 versicolor
## 34      6.200000         2.9           NA         1.3 versicolor
## 35      6.600000         2.9       23.000         1.3 versicolor
## 36      4.800000         3.0        1.400         0.1     setosa
## 37      6.500000         3.0        5.500         1.8  virginica
## 38      6.200000         2.2        4.500         1.5 versicolor
## 39      6.700000         2.5        5.800         1.8  virginica
## 40      5.000000         3.0        1.600         0.2     setosa
## 41      5.000000          NA        1.200         0.2     setosa
## 42      5.800000         2.7        3.900         1.2 versicolor
## 43      0.000000          NA        1.300         0.4     setosa
## 44      5.800000         2.7        5.100         1.9  virginica
## 45      5.500000         4.2        1.400         0.2     setosa
## 46      7.700000         2.8        6.700         2.0  virginica
## 47      5.700000          NA           NA         0.4     setosa
## 48      7.000000         3.2        4.700         1.4 versicolor
## 49      6.500000         3.0        5.800         2.2  virginica
## 50      6.000000         3.4        4.500         1.6 versicolor
## 51      5.500000         2.6        4.400         1.2 versicolor
## 52      4.900000         3.1           NA         0.2     setosa
## 53      5.200000         2.7        3.900         1.4 versicolor
## 54      4.800000         3.4        1.600         0.2     setosa
## 55      6.300000         3.3        4.700         1.6 versicolor
## 56      7.700000         3.8        6.700         2.2  virginica
## 57      5.100000         3.8        1.500         0.3     setosa
## 58      6.066090         2.9        4.500         1.5 versicolor
## 59      6.400000         2.8        5.600          NA  virginica
## 60      6.400000         2.8        5.600         2.1  virginica
## 61      5.000000         2.3        3.300          NA versicolor
## 62      7.400000         2.8        6.100         1.9  virginica
## 63      4.300000         3.0        1.100         0.1     setosa
## 64      5.000000         3.3        1.400         0.2     setosa
## 65      7.200000         3.0        5.800         1.6  virginica
## 66      6.300000         2.5        4.900         1.5 versicolor
## 67      5.100000         2.5           NA         1.1 versicolor
## 68      7.058221         3.2        5.700         2.3  virginica
## 69      5.100000         3.5           NA          NA     setosa
## 70      5.000000         3.5        1.300         0.3     setosa
## 71      6.100000         3.0        4.600         1.4 versicolor
## 72      6.900000         3.1        5.100         2.3  virginica
## 73      5.100000         3.5        1.400         0.3     setosa
## 74      6.500000          NA        4.600         1.5 versicolor
## 75      5.600000         2.8        4.900         2.0  virginica
## 76      4.900000         2.5        4.500          NA  virginica
## 77      5.500000         3.5        1.300         0.2     setosa
## 78      7.600000         3.0        6.600         2.1  virginica
## 79      5.100000         3.8        0.000         0.2     setosa
## 80      7.900000         3.8        6.400         2.0  virginica
## 81      6.100000         2.6        5.600         1.4  virginica
## 82      5.400000         3.4        1.700         0.2     setosa
## 83      6.100000         2.9        4.700         1.4 versicolor
## 84      5.400000         3.7        1.500         0.2     setosa
## 85      6.700000         3.0        5.200         2.3  virginica
## 86      5.100000         3.8        1.900         Inf     setosa
## 87      6.400000         2.9        4.300         1.3 versicolor
## 88      5.700000         2.9        4.200         1.3 versicolor
## 89      4.400000         2.9        1.400         0.2     setosa
## 90      6.300000         2.5        5.000         1.9  virginica
## 91      7.200000         3.2        6.000         1.8  virginica
## 92      4.900000          NA        3.300         1.0 versicolor
## 93      5.200000         3.4        1.400         0.2     setosa
## 94      5.800000         2.7        5.100         1.9  virginica
## 95      6.000000         2.2        5.000         1.5  virginica
## 96      6.900000         3.1           NA         1.5 versicolor
## 97      5.500000         2.3        4.000         1.3 versicolor
## 98      6.700000          NA        5.000         1.7 versicolor
## 99      5.700000         3.0        4.200         1.2 versicolor
## 100     6.300000         2.8        5.100         1.5  virginica
## 101     5.400000         3.4        1.500         0.4     setosa
## 102     7.200000         3.6           NA         2.5  virginica
## 103     6.300000         2.7        4.900          NA  virginica
## 104     5.600000         3.0        4.100         1.3 versicolor
## 105     5.100000         3.7           NA         0.4     setosa
## 106     5.500000          NA        0.925         1.0 versicolor
## 107     6.500000         3.0        5.200         2.0  virginica
## 108     4.800000         3.0        1.400          NA     setosa
## 109     6.100000         2.8           NA         1.3 versicolor
## 110     4.600000         3.4        1.400         0.3     setosa
## 111     6.300000         3.4           NA         2.4  virginica
## 112     5.000000         3.4        1.500         0.2     setosa
## 113     5.100000         3.4        1.500         0.2     setosa
## 114     7.198627         3.3        5.700         2.1  virginica
## 115     6.700000         3.1        4.700         1.5 versicolor
## 116     7.700000         2.6        6.900         2.3  virginica
## 117     6.300000          NA        4.400         1.3 versicolor
## 118     4.600000         3.1        1.500         0.2     setosa
## 119     6.682257         3.0        5.500         2.1  virginica
## 120     6.020837         2.8        4.700         1.2 versicolor
## 121     5.900000         3.0           NA         1.5 versicolor
## 122     4.500000         2.3        1.300         0.3     setosa
## 123     6.400000         3.2        5.300         2.3  virginica
## 124     5.200000         4.1        1.500         0.1     setosa
## 125    49.000000        30.0       14.000         2.0     setosa
## 126     5.600000         2.9        3.600         1.3 versicolor
## 127     6.800000         3.2        5.900         2.3  virginica
## 128     5.800000          NA        5.100         2.4  virginica
## 129     4.600000         3.6           NA         0.2     setosa
## 130     5.700000         0.0        1.700         0.3     setosa
## 131     5.600000         2.5        3.900         1.1 versicolor
## 132     6.700000         3.1        4.400         1.4 versicolor
## 133     4.800000          NA        1.900         0.2     setosa
## 134     5.100000         3.3        1.700         0.5     setosa
## 135     4.400000         3.0        1.300          NA     setosa
## 136     7.700000         3.0           NA         2.3  virginica
## 137     4.700000         3.2        1.600         0.2     setosa
## 138     6.396801         3.0        4.900         1.8  virginica
## 139     6.900000         3.1        5.400         2.1  virginica
## 140     6.000000         2.2        4.000         1.0 versicolor
## 141     5.000000          NA        1.400         0.2     setosa
## 142     5.500000          NA        3.800         1.1 versicolor
## 143     6.600000         3.0        4.400         1.4 versicolor
## 144     6.300000         2.9        5.600         1.8  virginica
## 145     5.700000         2.5        5.000         2.0  virginica
## 146     6.700000         3.1        5.600         2.4  virginica
## 147     5.600000         3.0        4.500         1.5 versicolor
## 148     5.200000         3.5        1.500         0.2     setosa
## 149     6.400000         3.1           NA         1.8  virginica
## 150     5.800000         2.6        4.000          NA versicolor