# Load the dirty_iris CSV from the datacleaning GitHub repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length entries.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Count of rows without any missing value, and that count as a percentage
# of all rows.
total_obs <- nrow(dirty_iris)
num_complete <- length(which(complete.cases(dirty_iris)))
percent_complete <- (num_complete / total_obs) * 100
num_complete
## [1] 96
percent_complete
## [1] 64
library(stringr)
# Total number of missing cells in the data frame (is.na() is TRUE for both
# NA and NaN).
sum(is.na(dirty_iris))
## [1] 58

# Count NaN cells. `x == NaN` evaluates to NA for every x, so an equality
# test can never find a NaN. is.nan() is used instead, applied column by
# column because it cannot be called on a data frame directly.
# NOTE(review): the 0 below was recorded with the old `== NaN` comparison,
# which reports 0 regardless of the data; re-run to confirm.
sum(vapply(dirty_iris, function(col) sum(is.nan(col)), integer(1)))
## [1] 0

# Count +Inf and -Inf cells; comparisons against NA cells give NA and are
# dropped by na.rm = TRUE.
sum(dirty_iris == Inf, na.rm = TRUE)
## [1] 1
sum(dirty_iris == -Inf, na.rm = TRUE)
## [1] 0

# Recode every special value (NaN, Inf, -Inf) in the numeric columns as NA.
# Assigning through `dirty_iris[]` keeps the object a data frame with the
# same columns; non-numeric columns pass through untouched.
dirty_iris[] <- lapply(dirty_iris, function(col) {
  if (is.numeric(col)) {
    col[is.nan(col) | is.infinite(col)] <- NA
  }
  col
})
# Row indices that break the plausibility rules: a Sepal.Width that is not
# strictly positive, or a Sepal.Length above 30. which() drops rows where
# the condition is NA.
bad_rows <- with(dirty_iris, which(Sepal.Width <= 0 | Sepal.Length > 30))

# Show the offending observations.
dirty_iris[bad_rows, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa

# Number of rule-violating rows.
length(bad_rows)
## [1] 4
# Frequency table of Sepal.Width before any correction.
table(dirty_iris$Sepal.Width)
##
## -3 0 2.2 2.3 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4
## 1 1 3 3 7 5 8 12 9 23 11 12 6 10 6 3 2 5 1 1
## 4.1 4.2 29 30
## 1 1 1 1

# Sepal.Width entries whose text form contains an ASCII letter or a space.
# The result is not used in the visible part of this script.
bad_data <- str_subset(dirty_iris$Sepal.Width, "[a-z A-Z]")

# Replace zero widths with NA. The match is against the number 0 rather than
# the string "0", so it does not depend on how values print as text.
# %in% never returns NA, so rows that are already missing are skipped
# without a which() wrapper.
# NOTE(review): the negative width (-3) also fails the `Sepal.Width <= 0`
# rule but is left in place here, as the table below shows. Confirm that
# this is intended.
dirty_iris$Sepal.Width[dirty_iris$Sepal.Width %in% 0] <- NA

# Frequency table after the correction: the 0 entry is gone.
table(dirty_iris$Sepal.Width)
##
## -3 2.2 2.3 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1
## 1 3 3 7 5 8 12 9 23 11 12 6 10 6 3 2 5 1 1 1
## 4.2 29 30
## 1 1 1

# Missing values per column.
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 18 19 13 0
library(robotstxt)
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Mean imputation: fill missing Sepal.Width with the mean of the observed
# values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 0 19 13 0

# Median imputation: fill missing Petal.Length with the median of the
# observed values. The earlier version only printed the missing entries and
# the median without assigning, so Petal.Length kept all 19 NAs.
median(dirty_iris$Petal.Length, na.rm = TRUE)
## [1] 4.5
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 0 0 13 0

# Regression imputation: fit Sepal.Length on the other columns using the
# rows without missing values (na.omit), then predict it for the rows where
# it is missing.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species,
  data = dirty_iris, na.action = na.omit
)
# predict() yields NA for a row with a missing predictor, so a row that
# still lacks Petal.Width at this point keeps its NA Sepal.Length.
dirty_iris$Sepal.Length[is.na(dirty_iris$Sepal.Length)] <-
  predict(lm_model, newdata = dirty_iris[is.na(dirty_iris$Sepal.Length), ])

# k-nearest-neighbour imputation (VIM::kNN, k = 5) for Petal.Width. The
# result gains a Petal.Width_imp column, visible in the colSums() output
# below.
# NOTE(review): the two kNN message lines below were recorded before the
# median fix above and have not been re-run; refresh them on the next knit.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0 -3 0 73 30 63

# NOTE(review): Petal.Length is updated by hand to 0 to reflect the median
# fix. The Sepal.Length count is carried over from the original run (1) and
# drops to 0 if that row's only missing predictor was Petal.Length; re-run
# to confirm.
colSums(is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 0 0 0 0
## Petal.Width_imp
## 0