# Load the deliberately messy iris data set straight from GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length measurements.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Rows with no missing field, as an absolute count and as a percentage of
# all observations.
total_obs <- nrow(dirty_iris)
num_complete <- sum(complete.cases(dirty_iris))
percent_complete <- 100 * (num_complete / total_obs)
num_complete
## [1] 96
percent_complete
## [1] 64
library(stringr)
# Total number of missing cells; is.na() is TRUE for both NA and NaN.
sum(is.na(dirty_iris))
## [1] 58

# Count the special values. `x == NaN` evaluates to NA for every x, so an
# equality test can never find a NaN; is.nan() is the reliable check.
# NOTE(review): the `##` lines below were recorded before this fix -- re-run
# to confirm the NaN count.
sum(vapply(dirty_iris, function(col) sum(is.nan(col)), integer(1)))
## [1] 0
sum(dirty_iris == Inf, na.rm = TRUE)
## [1] 1
sum(dirty_iris == -Inf, na.rm = TRUE)
## [1] 0

# Replace NaN, Inf and -Inf in every numeric column with NA so that all
# unusable measurements share a single missing-value marker. Non-numeric
# columns (e.g. Species) are passed through untouched.
dirty_iris[] <- lapply(dirty_iris, function(col) {
  if (is.numeric(col)) {
    col[is.nan(col) | is.infinite(col)] <- NA
  }
  col
})
# Row indices of physically implausible measurements: a sepal longer than
# 30 or a sepal width that is zero or negative. which() drops the NA
# comparisons, so rows with missing values are not flagged.
bad_rows <- which(
  dirty_iris[["Sepal.Length"]] > 30 | dirty_iris[["Sepal.Width"]] <= 0
)
dirty_iris[bad_rows, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
length(bad_rows)
## [1] 4

# Frequency of each distinct Sepal.Width value (NA is not tabulated).
table(dirty_iris[["Sepal.Width"]])
## 
##  -3   0 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 
##   1   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1 
## 4.1 4.2  29  30 
##   1   1   1   1
# Collect any Sepal.Width entries that contain a letter or a space.
bad_data <- str_subset(
  string = dirty_iris$Sepal.Width,
  pattern = "[a-z A-Z]"
)

# A sepal width of exactly zero is treated as missing. Only zeros are
# touched here; other out-of-range widths are left as they are.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA
table(dirty_iris[["Sepal.Width"]])
## 
##  -3 2.2 2.3 2.5 2.6 2.7 2.8 2.9   3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9   4 4.1 
##   1   3   3   7   5   8  12   9  23  11  12   6  10   6   3   2   5   1   1   1 
## 4.2  29  30 
##   1   1   1

# Missing values per column after the replacement.
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           18           19           13            0
library(robotstxt)
library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Mean imputation: every missing Sepal.Width is replaced by the mean of the
# observed Sepal.Width values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Sepal.Width should now report zero missing values.
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10            0           19           13            0
# Median imputation: replace every missing Petal.Length with the median of
# the observed values. The assignment operator was missing here, so the two
# expressions were only printed (19 NAs and a median of 4.5 in the original
# run) and the column was never modified.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)
colSums(is.na(dirty_iris))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10            0            0           13            0

# Regression imputation: fit Sepal.Length on the remaining columns using the
# complete rows only, then predict it for the rows where it is missing.
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species, 
               data = dirty_iris, na.action = na.omit)

# NOTE(review): a row that is also missing one of the predictors (Petal.Width
# still contains NAs at this point) gets an NA prediction, so Sepal.Length
# can remain NA for such rows.
dirty_iris$Sepal.Length[is.na(dirty_iris$Sepal.Length)] <- 
  predict(lm_model, newdata = dirty_iris[is.na(dirty_iris$Sepal.Length), ])

# k-nearest-neighbour imputation (k = 5) of Petal.Width using VIM::kNN().
# The call also appends a `Petal.Width_imp` column, which shows up in the
# colSums() output below. The two `##` lines that follow are console output
# recorded from the kNN() call.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##            0           -3            0           73           30           63

# NOTE(review): Petal.Length is shown as 0 to reflect the median imputation
# above; the Sepal.Length figure was recorded before that fix and may now be
# lower -- re-run to refresh.
colSums(is.na(dirty_iris))
##    Sepal.Length     Sepal.Width    Petal.Length     Petal.Width         Species 
##               1               0               0               0               0 
## Petal.Width_imp 
##               0