# Load the intentionally messy iris data set directly from GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length measurements.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# Percentage of observations that have no missing value in any column.
total_rows <- nrow(dirty_iris)
num_complete <- sum(complete.cases(dirty_iris))
percent_complete <- 100 * (num_complete / total_rows)
# Inspect the distinct values of every measurement column to spot
# suspicious entries (zeros, negatives, extreme values, Inf) by eye.
unique(dirty_iris[["Sepal.Length"]])
##  [1]  6.4  6.3  6.2  5.0  5.7  5.3  5.9  5.8  4.8  6.0  6.8   NA  5.5  4.7  5.6
## [16]  4.9  5.4  6.7  4.6 73.0  6.5  4.4  6.6  0.0  7.7  7.0  5.2  5.1  7.4  4.3
## [31]  7.2  6.1  6.9  7.6  7.9  4.5 49.0
unique(dirty_iris[["Sepal.Width"]])
##  [1]  3.2  3.3   NA  3.4  2.6  2.7  3.0  3.1  3.5  2.8  3.9 -3.0  4.0  3.6 29.0
## [16]  2.9  2.2  2.5  4.2  3.8  2.3  3.7  4.1 30.0  0.0
unique(dirty_iris[["Petal.Length"]])
##  [1]  4.500  6.000  5.400  1.600  3.500     NA  5.300  5.100  4.100  4.800
## [11]  1.700  4.000  1.300  4.200  5.700  5.900  1.400  1.500 63.000  0.820
## [21] 23.000  5.500  5.800  1.200  3.900  6.700  4.700  4.400  5.600  3.300
## [31]  6.100  1.100  4.900  4.600  6.600  0.000  6.400  5.200  1.900  4.300
## [41]  5.000  0.925  6.900 14.000  3.600  3.800
unique(dirty_iris[["Petal.Width"]])
##  [1] 1.5 2.5 2.3 0.4 1.0 0.2  NA 1.8 0.6 1.6 1.4 1.3 0.1 2.1 2.0 1.2 1.9 2.2 0.3
## [20] 1.1 Inf 1.7 2.4 0.5
# Replace every special numeric value (NaN, Inf, -Inf) with NA so that
# downstream checks only have to deal with one kind of "missing".
# Testing is.nan() alone is not enough: Petal.Width contains Inf, which
# is.nan() does not flag. Non-numeric columns are returned untouched.
dirty_iris[] <- lapply(dirty_iris, function(x) {
  if (is.numeric(x)) {
    x[is.nan(x) | is.infinite(x)] <- NA
  }
  x
})

# Verify: count the special values left in each column (expected: all zero).
# vapply() pins the result to one integer per column.
vapply(
  dirty_iris,
  function(x) sum(is.nan(x) | is.infinite(x)),
  integer(1)
)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0
# Collect the rows that break the plausibility rules:
#   * Sepal.Width must be present and strictly positive
#   * Sepal.Length must not exceed 30
# The mask is NA wherever Sepal.Length is missing and Sepal.Width is valid.
# Indexing a data frame with an NA logical produces all-NA phantom rows, so
# which() is used to keep only the rows where the mask is definitely TRUE.
violations <- dirty_iris[
  which(
    is.na(dirty_iris$Sepal.Width) |
      dirty_iris$Sepal.Width <= 0 |
      dirty_iris$Sepal.Length > 30
  ),
]

violations
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 3            6.2          NA        5.400         2.3  virginica
## 6            5.3          NA           NA         0.2     setosa
## 16           5.0          -3        3.500         1.0 versicolor
## 17           5.5          NA        4.000         1.3 versicolor
## 20           5.6          NA        4.200         1.3 versicolor
## 22           5.4          NA        4.500         1.5 versicolor
## 28          73.0          29       63.000          NA  virginica
## 41           5.0          NA        1.200         0.2     setosa
## 43           0.0          NA        1.300         0.4     setosa
## 47           5.7          NA           NA         0.4     setosa
## 74           6.5          NA        4.600         1.5 versicolor
## 92           4.9          NA        3.300         1.0 versicolor
## 98           6.7          NA        5.000         1.7 versicolor
## 106          5.5          NA        0.925         1.0 versicolor
## 117          6.3          NA        4.400         1.3 versicolor
## 125         49.0          30       14.000         2.0     setosa
## 128          5.8          NA        5.100         2.4  virginica
## 130          5.7           0        1.700         0.3     setosa
## 133          4.8          NA        1.900         0.2     setosa
## 141          5.0          NA        1.400         0.2     setosa
## 142          5.5          NA        3.800         1.1 versicolor
nrow(violations)
## [1] 21
# Repair non-positive Sepal.Width values:
#   * negative widths are treated as sign errors and replaced by abs(value)
#   * a width of exactly zero cannot be repaired and is set to NA
# which() drops NA comparisons, so already-missing widths are left alone.
invalid_sepal_width <- which(dirty_iris$Sepal.Width <= 0)

# Vectorised repair: abs() flips the negatives, and the zeros (unchanged by
# abs()) are then blanked out. Unlike sapply() with a scalar `if`, this stays
# a numeric vector even when there are no invalid rows (e.g. on a re-run).
fixed_width <- abs(dirty_iris$Sepal.Width[invalid_sepal_width])
fixed_width[fixed_width == 0] <- NA
dirty_iris$Sepal.Width[invalid_sepal_width] <- fixed_width

# Verify: no non-positive widths should remain.
sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
## [1] 0