# Download the dirty_iris CSV from the datacleaning repository into a data
# frame, then print every row.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
print(dirty_iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            6.4         3.2        4.500         1.5 versicolor
## 2            6.3         3.3        6.000         2.5  virginica
## 3            6.2          NA        5.400         2.3  virginica
## 4            5.0         3.4        1.600         0.4     setosa
## 5            5.7         2.6        3.500         1.0 versicolor
## 6            5.3          NA           NA         0.2     setosa
## 7            6.4         2.7        5.300          NA  virginica
## 8            5.9         3.0        5.100         1.8  virginica
## 9            5.8         2.7        4.100         1.0 versicolor
## 10           4.8         3.1        1.600         0.2     setosa
## 11           5.0         3.5        1.600         0.6     setosa
## 12           6.0         2.7        5.100         1.6 versicolor
## 13           6.0         3.0        4.800          NA  virginica
## 14           6.8         2.8        4.800         1.4 versicolor
## 15            NA         3.9        1.700         0.4     setosa
## 16           5.0        -3.0        3.500         1.0 versicolor
## 17           5.5          NA        4.000         1.3 versicolor
## 18           4.7         3.2        1.300         0.2     setosa
## 19            NA         4.0           NA         0.2     setosa
## 20           5.6          NA        4.200         1.3 versicolor
## 21           4.9         3.6           NA         0.1     setosa
## 22           5.4          NA        4.500         1.5 versicolor
## 23           6.2         2.8           NA         1.8  virginica
## 24           6.7         3.3        5.700         2.5  virginica
## 25            NA         3.0        5.900         2.1  virginica
## 26           4.6         3.2        1.400         0.2     setosa
## 27           4.9         3.1        1.500         0.1     setosa
## 28          73.0        29.0       63.000          NA  virginica
## 29           6.5         3.2        5.100         2.0  virginica
## 30            NA         2.8        0.820         1.3 versicolor
## 31           4.4         3.2           NA         0.2     setosa
## 32           5.9         3.2        4.800          NA versicolor
## 33           5.7         2.8        4.500         1.3 versicolor
## 34           6.2         2.9           NA         1.3 versicolor
## 35           6.6         2.9       23.000         1.3 versicolor
## 36           4.8         3.0        1.400         0.1     setosa
## 37           6.5         3.0        5.500         1.8  virginica
## 38           6.2         2.2        4.500         1.5 versicolor
## 39           6.7         2.5        5.800         1.8  virginica
## 40           5.0         3.0        1.600         0.2     setosa
## 41           5.0          NA        1.200         0.2     setosa
## 42           5.8         2.7        3.900         1.2 versicolor
## 43           0.0          NA        1.300         0.4     setosa
## 44           5.8         2.7        5.100         1.9  virginica
## 45           5.5         4.2        1.400         0.2     setosa
## 46           7.7         2.8        6.700         2.0  virginica
## 47           5.7          NA           NA         0.4     setosa
## 48           7.0         3.2        4.700         1.4 versicolor
## 49           6.5         3.0        5.800         2.2  virginica
## 50           6.0         3.4        4.500         1.6 versicolor
## 51           5.5         2.6        4.400         1.2 versicolor
## 52           4.9         3.1           NA         0.2     setosa
## 53           5.2         2.7        3.900         1.4 versicolor
## 54           4.8         3.4        1.600         0.2     setosa
## 55           6.3         3.3        4.700         1.6 versicolor
## 56           7.7         3.8        6.700         2.2  virginica
## 57           5.1         3.8        1.500         0.3     setosa
## 58            NA         2.9        4.500         1.5 versicolor
## 59           6.4         2.8        5.600          NA  virginica
## 60           6.4         2.8        5.600         2.1  virginica
## 61           5.0         2.3        3.300          NA versicolor
## 62           7.4         2.8        6.100         1.9  virginica
## 63           4.3         3.0        1.100         0.1     setosa
## 64           5.0         3.3        1.400         0.2     setosa
## 65           7.2         3.0        5.800         1.6  virginica
## 66           6.3         2.5        4.900         1.5 versicolor
## 67           5.1         2.5           NA         1.1 versicolor
## 68            NA         3.2        5.700         2.3  virginica
## 69           5.1         3.5           NA          NA     setosa
## 70           5.0         3.5        1.300         0.3     setosa
## 71           6.1         3.0        4.600         1.4 versicolor
## 72           6.9         3.1        5.100         2.3  virginica
## 73           5.1         3.5        1.400         0.3     setosa
## 74           6.5          NA        4.600         1.5 versicolor
## 75           5.6         2.8        4.900         2.0  virginica
## 76           4.9         2.5        4.500          NA  virginica
## 77           5.5         3.5        1.300         0.2     setosa
## 78           7.6         3.0        6.600         2.1  virginica
## 79           5.1         3.8        0.000         0.2     setosa
## 80           7.9         3.8        6.400         2.0  virginica
## 81           6.1         2.6        5.600         1.4  virginica
## 82           5.4         3.4        1.700         0.2     setosa
## 83           6.1         2.9        4.700         1.4 versicolor
## 84           5.4         3.7        1.500         0.2     setosa
## 85           6.7         3.0        5.200         2.3  virginica
## 86           5.1         3.8        1.900         Inf     setosa
## 87           6.4         2.9        4.300         1.3 versicolor
## 88           5.7         2.9        4.200         1.3 versicolor
## 89           4.4         2.9        1.400         0.2     setosa
## 90           6.3         2.5        5.000         1.9  virginica
## 91           7.2         3.2        6.000         1.8  virginica
## 92           4.9          NA        3.300         1.0 versicolor
## 93           5.2         3.4        1.400         0.2     setosa
## 94           5.8         2.7        5.100         1.9  virginica
## 95           6.0         2.2        5.000         1.5  virginica
## 96           6.9         3.1           NA         1.5 versicolor
## 97           5.5         2.3        4.000         1.3 versicolor
## 98           6.7          NA        5.000         1.7 versicolor
## 99           5.7         3.0        4.200         1.2 versicolor
## 100          6.3         2.8        5.100         1.5  virginica
## 101          5.4         3.4        1.500         0.4     setosa
## 102          7.2         3.6           NA         2.5  virginica
## 103          6.3         2.7        4.900          NA  virginica
## 104          5.6         3.0        4.100         1.3 versicolor
## 105          5.1         3.7           NA         0.4     setosa
## 106          5.5          NA        0.925         1.0 versicolor
## 107          6.5         3.0        5.200         2.0  virginica
## 108          4.8         3.0        1.400          NA     setosa
## 109          6.1         2.8           NA         1.3 versicolor
## 110          4.6         3.4        1.400         0.3     setosa
## 111          6.3         3.4           NA         2.4  virginica
## 112          5.0         3.4        1.500         0.2     setosa
## 113          5.1         3.4        1.500         0.2     setosa
## 114           NA         3.3        5.700         2.1  virginica
## 115          6.7         3.1        4.700         1.5 versicolor
## 116          7.7         2.6        6.900         2.3  virginica
## 117          6.3          NA        4.400         1.3 versicolor
## 118          4.6         3.1        1.500         0.2     setosa
## 119           NA         3.0        5.500         2.1  virginica
## 120           NA         2.8        4.700         1.2 versicolor
## 121          5.9         3.0           NA         1.5 versicolor
## 122          4.5         2.3        1.300         0.3     setosa
## 123          6.4         3.2        5.300         2.3  virginica
## 124          5.2         4.1        1.500         0.1     setosa
## 125         49.0        30.0       14.000         2.0     setosa
## 126          5.6         2.9        3.600         1.3 versicolor
## 127          6.8         3.2        5.900         2.3  virginica
## 128          5.8          NA        5.100         2.4  virginica
## 129          4.6         3.6           NA         0.2     setosa
## 130          5.7         0.0        1.700         0.3     setosa
## 131          5.6         2.5        3.900         1.1 versicolor
## 132          6.7         3.1        4.400         1.4 versicolor
## 133          4.8          NA        1.900         0.2     setosa
## 134          5.1         3.3        1.700         0.5     setosa
## 135          4.4         3.0        1.300          NA     setosa
## 136          7.7         3.0           NA         2.3  virginica
## 137          4.7         3.2        1.600         0.2     setosa
## 138           NA         3.0        4.900         1.8  virginica
## 139          6.9         3.1        5.400         2.1  virginica
## 140          6.0         2.2        4.000         1.0 versicolor
## 141          5.0          NA        1.400         0.2     setosa
## 142          5.5          NA        3.800         1.1 versicolor
## 143          6.6         3.0        4.400         1.4 versicolor
## 144          6.3         2.9        5.600         1.8  virginica
## 145          5.7         2.5        5.000         2.0  virginica
## 146          6.7         3.1        5.600         2.4  virginica
## 147          5.6         3.0        4.500         1.5 versicolor
## 148          5.2         3.5        1.500         0.2     setosa
## 149          6.4         3.1           NA         1.8  virginica
## 150          5.8         2.6        4.000          NA versicolor
# Number of Petal.Length entries that are missing (NA): take the positions
# flagged by is.na() and count them.
missing_petal_length <- length(which(is.na(dirty_iris[["Petal.Length"]])))
print(missing_petal_length)
## [1] 19

# Question 4

The number and the percentage of observations that are complete.

# Flag the rows that contain no NA in any column, then report how many there
# are and what share of all rows they make up (as a percentage).
is_complete <- complete.cases(dirty_iris)
complete_obs <- sum(is_complete)
percent_complete <- mean(is_complete) * 100

print(complete_obs)
## [1] 96
print(percent_complete)
## [1] 64

# Question 5

# Logical mask over all rows: TRUE where Petal.Length is missing.
print(is.na(dirty_iris[["Petal.Length"]]))
##   [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [49] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [97] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [109]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE  TRUE FALSE

# Question 6

Index the rows with missing information and replace the missing values with `NA`.

# Show the full records whose Petal.Length is missing, selected by row position.
print(dirty_iris[which(is.na(dirty_iris[["Petal.Length"]])), ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 6            5.3          NA           NA         0.2     setosa
## 19            NA         4.0           NA         0.2     setosa
## 21           4.9         3.6           NA         0.1     setosa
## 23           6.2         2.8           NA         1.8  virginica
## 31           4.4         3.2           NA         0.2     setosa
## 34           6.2         2.9           NA         1.3 versicolor
## 47           5.7          NA           NA         0.4     setosa
## 52           4.9         3.1           NA         0.2     setosa
## 67           5.1         2.5           NA         1.1 versicolor
## 69           5.1         3.5           NA          NA     setosa
## 96           6.9         3.1           NA         1.5 versicolor
## 102          7.2         3.6           NA         2.5  virginica
## 105          5.1         3.7           NA         0.4     setosa
## 109          6.1         2.8           NA         1.3 versicolor
## 111          6.3         3.4           NA         2.4  virginica
## 121          5.9         3.0           NA         1.5 versicolor
## 129          4.6         3.6           NA         0.2     setosa
## 136          7.7         3.0           NA         2.3  virginica
## 149          6.4         3.1           NA         1.8  virginica
# Normalise every non-usable Petal.Length entry to NA.
# The previous test, is.na(), only matched entries that were already NA, so the
# assignment changed nothing. !is.finite() also matches Inf, -Inf and NaN, so
# any such special values are turned into NA; entries that are already NA stay
# NA.
dirty_iris$Petal.Length[!is.finite(dirty_iris$Petal.Length)] <- NA

# Question 7

# Collect the rows that break either plausibility rule: a Sepal.Width that is
# not strictly positive, or a Sepal.Length above 30.
#
# The comparison yields NA wherever the deciding measurement is missing, and a
# logical row index containing NA makes `[` return an all-NA row for each NA.
# Those phantom rows were previously counted as violations. which() keeps only
# the positions where the test is definitely TRUE.
rule_broken <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
violations <- dirty_iris[which(rule_broken), ]

violations
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
nrow(violations)
## [1] 4

# Question 9

# Impute Sepal.Width: every missing entry takes the mean of the observed
# (non-NA) values.
mean_sw <- mean(dirty_iris[["Sepal.Width"]], na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris[["Sepal.Width"]],
  is.na(dirty_iris[["Sepal.Width"]]),
  mean_sw
)

# Impute Petal.Length: every missing entry takes the median of the observed
# (non-NA) values.
median_pl <- median(dirty_iris[["Petal.Length"]], na.rm = TRUE)
dirty_iris$Petal.Length <- replace(
  dirty_iris[["Petal.Length"]],
  is.na(dirty_iris[["Petal.Length"]]),
  median_pl
)