1. Trực quan hóa dữ liệu thiếu (NA)

library(dlookr)      # for exploratory data analysis and imputation
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
# Bind the built-in airquality data set to the working name used below.
dulieu <- airquality

# Row and column counts of the data set.
dim(dulieu)
## [1] 153   6
# Pareto chart of missing values; only_na = TRUE is passed to dlookr here.
plot_na_pareto(dulieu, only_na = TRUE)

# Disabled alternative view:
# plot_na_intersect(dulieu)
library(visdat)      # for visualizing NAs
library(plotly)      # for interactive visualization
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Build the visdat missingness map, then hand it to plotly for an
# interactive rendering.
ggplotly(vis_miss(dulieu))

2. Impute đơn biến

2.1 Đơn biến liên tục

library(grid)
# produce more NAs with missRanger package
library(missRanger)
# Fix the RNG state before injecting additional missing values.
set.seed(111)
# Add extra NAs with missRanger::generateNA(), then turn Month into a factor.
dulieu_NA <- mutate(generateNA(dulieu), Month = factor(Month))
# First rows of the original data.
head(dulieu)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# The same rows after NA injection.
head(dulieu_NA)
##   Ozone Solar.R Wind Temp Month Day
## 1    41      NA   NA   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66  <NA>   6
# Inspect the missing values per variable.
plot_na_pareto(dulieu_NA)

# Impute X (Ozone) with respect to Y (Temp) using the "mean" method.
vidu1 <- imputate_na(
  dulieu_NA,
  xvar = Ozone,
  yvar = Temp,
  method = "mean"
)
plot(vidu1)

head(vidu1)
## [1] 41.00000 36.00000 12.00000 18.00000 42.80189 28.00000
# Same variables, this time with the "rpart" method.
vidu2 <- imputate_na(
  dulieu_NA,
  xvar = Ozone,
  yvar = Temp,
  method = "rpart"
)
plot(vidu2)

head(vidu2)
## [1] 41.00000 36.00000 12.00000 18.00000 17.66667 28.00000
# Impute Ozone (xvar) with Temp as yvar using the "mice" method; the
# iteration log printed by the call is kept below.
vidu3 <- imputate_na(
  dulieu_NA,
  xvar = Ozone,
  yvar = Temp,
  method = "mice"
)
## 
##  iter imp variable
##   1   1  Ozone  Solar.R  Wind  Month  Day
##   1   2  Ozone  Solar.R  Wind  Month  Day
##   1   3  Ozone  Solar.R  Wind  Month  Day
##   1   4  Ozone  Solar.R  Wind  Month  Day
##   1   5  Ozone  Solar.R  Wind  Month  Day
##   2   1  Ozone  Solar.R  Wind  Month  Day
##   2   2  Ozone  Solar.R  Wind  Month  Day
##   2   3  Ozone  Solar.R  Wind  Month  Day
##   2   4  Ozone  Solar.R  Wind  Month  Day
##   2   5  Ozone  Solar.R  Wind  Month  Day
##   3   1  Ozone  Solar.R  Wind  Month  Day
##   3   2  Ozone  Solar.R  Wind  Month  Day
##   3   3  Ozone  Solar.R  Wind  Month  Day
##   3   4  Ozone  Solar.R  Wind  Month  Day
##   3   5  Ozone  Solar.R  Wind  Month  Day
##   4   1  Ozone  Solar.R  Wind  Month  Day
##   4   2  Ozone  Solar.R  Wind  Month  Day
##   4   3  Ozone  Solar.R  Wind  Month  Day
##   4   4  Ozone  Solar.R  Wind  Month  Day
##   4   5  Ozone  Solar.R  Wind  Month  Day
##   5   1  Ozone  Solar.R  Wind  Month  Day
##   5   2  Ozone  Solar.R  Wind  Month  Day
##   5   3  Ozone  Solar.R  Wind  Month  Day
##   5   4  Ozone  Solar.R  Wind  Month  Day
##   5   5  Ozone  Solar.R  Wind  Month  Day
plot(vidu3)

head(vidu3)
## [1] 41 36 12 18 24 28
# Disabled example using the "knn" method:
# vidu4 <- imputate_na(dulieu_NA, xvar = Ozone, yvar = Temp, method = "knn")
# plot(vidu4)
# head(vidu4)

2.2 Đơn biến rời rạc

# Impute the categorical Month column (xvar) with Temp as yvar, using the
# "mice" method and an explicit seed.
vidu4 <- imputate_na(
  dulieu_NA,
  xvar = Month,
  yvar = Temp,
  method = "mice",
  seed = 111
)
## 
##  iter imp variable
##   1   1  Ozone  Solar.R  Wind  Month  Day
##   1   2  Ozone  Solar.R  Wind  Month  Day
##   1   3  Ozone  Solar.R  Wind  Month  Day
##   1   4  Ozone  Solar.R  Wind  Month  Day
##   1   5  Ozone  Solar.R  Wind  Month  Day
##   2   1  Ozone  Solar.R  Wind  Month  Day
##   2   2  Ozone  Solar.R  Wind  Month  Day
##   2   3  Ozone  Solar.R  Wind  Month  Day
##   2   4  Ozone  Solar.R  Wind  Month  Day
##   2   5  Ozone  Solar.R  Wind  Month  Day
##   3   1  Ozone  Solar.R  Wind  Month  Day
##   3   2  Ozone  Solar.R  Wind  Month  Day
##   3   3  Ozone  Solar.R  Wind  Month  Day
##   3   4  Ozone  Solar.R  Wind  Month  Day
##   3   5  Ozone  Solar.R  Wind  Month  Day
##   4   1  Ozone  Solar.R  Wind  Month  Day
##   4   2  Ozone  Solar.R  Wind  Month  Day
##   4   3  Ozone  Solar.R  Wind  Month  Day
##   4   4  Ozone  Solar.R  Wind  Month  Day
##   4   5  Ozone  Solar.R  Wind  Month  Day
##   5   1  Ozone  Solar.R  Wind  Month  Day
##   5   2  Ozone  Solar.R  Wind  Month  Day
##   5   3  Ozone  Solar.R  Wind  Month  Day
##   5   4  Ozone  Solar.R  Wind  Month  Day
##   5   5  Ozone  Solar.R  Wind  Month  Day
plot(vidu4)

# Month frequencies in the original data (Month is the fifth column).
table(dulieu["Month"])
## Month
##  5  6  7  8  9 
## 31 30 31 31 30
nrow(dulieu)
## [1] 153
# Month frequencies after NA injection; NAs are not counted by table().
table(dulieu_NA["Month"])
## Month
##  5  6  7  8  9 
## 26 29 29 28 26
nrow(dulieu_NA)
## [1] 153
# Month frequencies after imputation.
table(vidu4)
## vidu4
##  5  6  7  8  9 
## 27 32 33 32 29
# Wrap the imputed vector in a data frame and confirm the row count.
vidu4 <- data.frame(vidu4)
nrow(vidu4)
## [1] 153

3. Impute đa biến

# Multivariate imputation of the NA-injected airquality data with missRanger.
dulieu2 <- missRanger(
  dulieu_NA,
  pmm.k = 3,
  num.trees = 100,
  verbose = 0
)
head(dulieu2)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     188 13.2   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    19     220 14.3   56     5   5
## 6    28     236 14.9   66     9   6

4. Mở rộng thêm

# Inject missing values into iris (p = 0.2). `dlieu` must keep its NAs for the
# rest of this section so every missRanger() call below has gaps to fill.
dlieu <- generateNA(iris, p = 0.2)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(dlieu)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2           NA         3.0          1.4         0.2    <NA>
## 3          4.7          NA          1.3         0.2  setosa
## 4          4.6         3.1           NA         0.2  setosa
## 5          5.0         3.6          1.4          NA  setosa
## 6          5.4         3.9           NA         0.4  setosa
# Full imputation, stored under its own name. Previously the result was
# assigned back to `dlieu`, which removed all NAs and turned every later
# missRanger() call in this section into a no-op (identical outputs).
dlieu_imp <- missRanger(dlieu, pmm.k = 3, num.trees = 100, verbose = 0)
head(dlieu_imp)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          5.0         3.0          1.4         0.2  setosa
## 3          4.7         3.0          1.3         0.2  setosa
## 4          4.6         3.1          1.6         0.2  setosa
## 5          5.0         3.6          1.4         0.3  setosa
## 6          5.4         3.9          1.4         0.4  setosa

# NOTE: the printed results for the calls below are omitted. The earlier
# listings were produced from already-imputed data and were all identical;
# re-knit the document to regenerate them from the NA-containing `dlieu`.

# Fewer trees and an explicit seed.
m <- missRanger(dlieu, pmm.k = 3, num.trees = 10, seed = 1, verbose = 0)
head(m)

# Impute all variables with all except Species
m <- missRanger(dlieu, . ~ . - Species, pmm.k = 3, num.trees = 10, verbose = 0)
head(m)

# Impute Sepal.Width by Species
# NOTE(review): Species itself contains NAs here and is not on the left-hand
# side; confirm how missRanger treats such a predictor.
m <- missRanger(dlieu, Sepal.Width ~ Species, pmm.k = 3, num.trees = 10, verbose = 0)
head(m)

# Impute Sepal.Width and Species, with Species on the right-hand side
m <- missRanger(dlieu, Sepal.Width + Species ~ Species, pmm.k = 3, num.trees = 10, verbose = 0)
head(m)

# Impute all variables univariately
m <- missRanger(dlieu, . ~ 1, verbose = 0)
head(m)