1. Trực quan dữ liệu lỗi
library(dlookr) # for exploratory data analysis and imputation
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
# Bind the built-in airquality data set to the working name used below.
dulieu <-airquality
# Number of rows and columns.
dim(dulieu)
## [1] 153 6
# Pareto chart of missing values; only_na = TRUE restricts the chart to
# variables that actually contain NAs.
plot_na_pareto(dulieu, only_na = TRUE)

#plot_na_intersect(dulieu)
library(visdat) # for visualizing NAs
library(plotly) # for interactive visualization
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Draw the missingness map of the data and hand the resulting plot to
# ggplotly() to make it interactive.
vis_miss(dulieu) %>% ggplotly()
2. Impute đơn biến
2.1 Đơn biến liên tục
library(grid)
# produce more NAs with missRanger package
library(missRanger)
# Fix the RNG state before injecting additional missing values.
set.seed(111)
# Step 1: add extra NAs to the data.
dulieu_NA <- generateNA(dulieu)
# Step 2: store Month as a factor so it can be treated as a discrete variable.
dulieu_NA <- mutate(dulieu_NA, Month = factor(Month))
# First rows of the original data ...
head(dulieu)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# ... and of the version with the extra NAs.
head(dulieu_NA)
## Ozone Solar.R Wind Temp Month Day
## 1 41 NA NA 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 <NA> 6
# Inspect the missing values per variable after the extra NAs were added.
plot_na_pareto(dulieu_NA)

# Impute X (Ozone) with respect to Y (Temp), first with method = "mean".
vidu1 <- imputate_na(dulieu_NA, xvar=Ozone, yvar=Temp, method = "mean")
# Plot the imputation result.
plot(vidu1)

# Row 5 was NA in Ozone and now holds the imputed value.
head(vidu1)
## [1] 41.00000 36.00000 12.00000 18.00000 42.80189 28.00000
# Same imputation with method = "rpart".
vidu2 <- imputate_na(dulieu_NA, xvar=Ozone, yvar=Temp, method = "rpart")
plot(vidu2)

head(vidu2)
## [1] 41.00000 36.00000 12.00000 18.00000 17.66667 28.00000
# Same imputation with method = "mice"; the call prints its iteration log.
vidu3 <- imputate_na(dulieu_NA, xvar=Ozone, yvar=Temp, method = "mice")
##
## iter imp variable
## 1 1 Ozone Solar.R Wind Month Day
## 1 2 Ozone Solar.R Wind Month Day
## 1 3 Ozone Solar.R Wind Month Day
## 1 4 Ozone Solar.R Wind Month Day
## 1 5 Ozone Solar.R Wind Month Day
## 2 1 Ozone Solar.R Wind Month Day
## 2 2 Ozone Solar.R Wind Month Day
## 2 3 Ozone Solar.R Wind Month Day
## 2 4 Ozone Solar.R Wind Month Day
## 2 5 Ozone Solar.R Wind Month Day
## 3 1 Ozone Solar.R Wind Month Day
## 3 2 Ozone Solar.R Wind Month Day
## 3 3 Ozone Solar.R Wind Month Day
## 3 4 Ozone Solar.R Wind Month Day
## 3 5 Ozone Solar.R Wind Month Day
## 4 1 Ozone Solar.R Wind Month Day
## 4 2 Ozone Solar.R Wind Month Day
## 4 3 Ozone Solar.R Wind Month Day
## 4 4 Ozone Solar.R Wind Month Day
## 4 5 Ozone Solar.R Wind Month Day
## 5 1 Ozone Solar.R Wind Month Day
## 5 2 Ozone Solar.R Wind Month Day
## 5 3 Ozone Solar.R Wind Month Day
## 5 4 Ozone Solar.R Wind Month Day
## 5 5 Ozone Solar.R Wind Month Day
# Plot the mice-based imputation result, then show its first values.
plot(vidu3)

head(vidu3)
## [1] 41 36 12 18 24 28
#vidu4 <- imputate_na(dulieu_NA, xvar=Ozone, yvar=Temp, method = "knn")
#plot(vidu4)
#head(vidu4)
2.2 Đơn biến rời rạc
# Impute the discrete variable Month (a factor in dulieu_NA) with Temp as
# yvar, using method = "mice"; seed = 111 is passed for the mice run.
vidu4 <- imputate_na(dulieu_NA, Month, Temp, method = "mice", seed = 111)
##
## iter imp variable
## 1 1 Ozone Solar.R Wind Month Day
## 1 2 Ozone Solar.R Wind Month Day
## 1 3 Ozone Solar.R Wind Month Day
## 1 4 Ozone Solar.R Wind Month Day
## 1 5 Ozone Solar.R Wind Month Day
## 2 1 Ozone Solar.R Wind Month Day
## 2 2 Ozone Solar.R Wind Month Day
## 2 3 Ozone Solar.R Wind Month Day
## 2 4 Ozone Solar.R Wind Month Day
## 2 5 Ozone Solar.R Wind Month Day
## 3 1 Ozone Solar.R Wind Month Day
## 3 2 Ozone Solar.R Wind Month Day
## 3 3 Ozone Solar.R Wind Month Day
## 3 4 Ozone Solar.R Wind Month Day
## 3 5 Ozone Solar.R Wind Month Day
## 4 1 Ozone Solar.R Wind Month Day
## 4 2 Ozone Solar.R Wind Month Day
## 4 3 Ozone Solar.R Wind Month Day
## 4 4 Ozone Solar.R Wind Month Day
## 4 5 Ozone Solar.R Wind Month Day
## 5 1 Ozone Solar.R Wind Month Day
## 5 2 Ozone Solar.R Wind Month Day
## 5 3 Ozone Solar.R Wind Month Day
## 5 4 Ozone Solar.R Wind Month Day
## 5 5 Ozone Solar.R Wind Month Day
# Visual check of the imputed Month values.
plot(vidu4)

# Month frequencies in the original data (Month is the fifth column).
table(dulieu["Month"])
## Month
## 5 6 7 8 9
## 31 30 31 31 30
nrow(dulieu)
## [1] 153
# Month frequencies after the extra NAs were injected.
table(dulieu_NA["Month"])
## Month
## 5 6 7 8 9
## 26 29 29 28 26
nrow(dulieu_NA)
## [1] 153
# Month frequencies after imputation.
table(vidu4)
## vidu4
## 5 6 7 8 9
## 27 32 33 32 29
# Wrap the imputed vector in a data frame to count its rows.
vidu4 <- data.frame(vidu4)
nrow(vidu4)
## [1] 153
## [1] 153
4. Mở rộng thêm
# Inject missing values into iris (p = 0.2 is the proportion passed to
# generateNA) and keep the result in `dlieu`.
dlieu <- generateNA(iris, p = 0.2)
# Original data for comparison.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Data with the injected NAs.
head(dlieu)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 NA 3.0 1.4 0.2 <NA>
## 3 4.7 NA 1.3 0.2 setosa
## 4 4.6 3.1 NA 0.2 setosa
## 5 5.0 3.6 1.4 NA setosa
## 6 5.4 3.9 NA 0.4 setosa
# Impute with missRanger (pmm.k = 3, 100 trees, silent). The result is stored
# under a separate name: overwriting `dlieu` would replace the NA-laden data
# with complete data, leaving nothing to impute for any later
# missRanger(dlieu, ...) call.
dlieu_imputed <- missRanger(dlieu, pmm.k = 3, num.trees = 100, verbose = 0)
head(dlieu_imputed)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.0 1.4 0.2 setosa
## 3 4.7 3.0 1.3 0.2 setosa
## 4 4.6 3.1 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.3 setosa
## 6 5.4 3.9 1.4 0.4 setosa
# Run missRanger on dlieu with only 10 trees and an explicit seed (seed = 1),
# then show the first rows of the result.
m <- missRanger(dlieu, pmm.k = 3, num.trees = 10, seed = 1, verbose = 0)
head(m)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.0 1.4 0.2 setosa
## 3 4.7 3.0 1.3 0.2 setosa
## 4 4.6 3.1 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.3 setosa
## 6 5.4 3.9 1.4 0.4 setosa
# Formula `. ~ . - Species`: impute every variable, using all variables
# except Species on the right-hand side.
m <- missRanger(
  dlieu,
  formula = . ~ . - Species,
  pmm.k = 3,
  num.trees = 10,
  verbose = 0
)
head(m)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.0 1.4 0.2 setosa
## 3 4.7 3.0 1.3 0.2 setosa
## 4 4.6 3.1 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.3 setosa
## 6 5.4 3.9 1.4 0.4 setosa
# Formula `Sepal.Width ~ Species`: impute Sepal.Width only, by Species.
m <- missRanger(
  dlieu,
  formula = Sepal.Width ~ Species,
  pmm.k = 3,
  num.trees = 10,
  verbose = 0
)
head(m)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.0 1.4 0.2 setosa
## 3 4.7 3.0 1.3 0.2 setosa
## 4 4.6 3.1 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.3 setosa
## 6 5.4 3.9 1.4 0.4 setosa
# Formula `Sepal.Width + Species ~ Species`: two variables on the left-hand
# side (Sepal.Width and Species), with Species on the right-hand side.
m <- missRanger(
  dlieu,
  formula = Sepal.Width + Species ~ Species,
  pmm.k = 3,
  num.trees = 10,
  verbose = 0
)
head(m)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.0 1.4 0.2 setosa
## 3 4.7 3.0 1.3 0.2 setosa
## 4 4.6 3.1 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.3 setosa
## 6 5.4 3.9 1.4 0.4 setosa
# Formula `. ~ 1`: impute all variables univariately (no covariates on the
# right-hand side).
m <- missRanger(
  dlieu,
  formula = . ~ 1,
  verbose = 0
)
head(m)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5.0 3.0 1.4 0.2 setosa
## 3 4.7 3.0 1.3 0.2 setosa
## 4 4.6 3.1 1.6 0.2 setosa
## 5 5.0 3.6 1.4 0.3 setosa
## 6 5.4 3.9 1.4 0.4 setosa