library(tidyverse)
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다

\(~\)

이번에는 다양한 R 내장 데이터 사용 예정.

\(~\)

1. Basic of “naniar” package

library(naniar)

n_miss

n_miss(airquality)
## [1] 44
n_miss(airquality$Ozone)
## [1] 37

n_complete

n_complete(airquality)
## [1] 874
n_complete(airquality$Ozone)
## [1] 116

prop_miss_

prop_miss(airquality)
## [1] 0.04793028
prop_miss_var(airquality) #variable with NA (column)
## [1] 0.3333333
prop_miss_case(airquality) #row with NA (row)
## [1] 0.2745098

prop_complete_

prop_complete(airquality)
## [1] 0.9520697
prop_complete_var(airquality)
## [1] 0.6666667
prop_complete_case(airquality)
## [1] 0.7254902

\(~\)

2. Summarize missing values

miss_var_summary

miss_var_summary(airquality)
## # A tibble: 6 x 3
##   variable n_miss pct_miss
##   <chr>     <int>    <dbl>
## 1 Ozone        37    24.2 
## 2 Solar.R       7     4.58
## 3 Wind          0     0   
## 4 Temp          0     0   
## 5 Month         0     0   
## 6 Day           0     0
airquality %>% group_by(Month) %>% miss_var_summary
## # A tibble: 25 x 4
## # Groups:   Month [5]
##    Month variable n_miss pct_miss
##    <int> <chr>     <int>    <dbl>
##  1     5 Ozone         5     16.1
##  2     5 Solar.R       4     12.9
##  3     5 Wind          0      0  
##  4     5 Temp          0      0  
##  5     5 Day           0      0  
##  6     6 Ozone        21     70  
##  7     6 Solar.R       0      0  
##  8     6 Wind          0      0  
##  9     6 Temp          0      0  
## 10     6 Day           0      0  
## # ... with 15 more rows

miss_case_summary

miss_case_summary(airquality)
## # A tibble: 153 x 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     5      2     33.3
##  2    27      2     33.3
##  3     6      1     16.7
##  4    10      1     16.7
##  5    11      1     16.7
##  6    25      1     16.7
##  7    26      1     16.7
##  8    32      1     16.7
##  9    33      1     16.7
## 10    34      1     16.7
## # ... with 143 more rows
airquality %>% group_by(Month) %>% miss_case_summary
## # A tibble: 153 x 4
## # Groups:   Month [5]
##    Month  case n_miss pct_miss
##    <int> <int>  <int>    <dbl>
##  1     5     5      2       40
##  2     5    27      2       40
##  3     5     6      1       20
##  4     5    10      1       20
##  5     5    11      1       20
##  6     5    25      1       20
##  7     5    26      1       20
##  8     5     1      0        0
##  9     5     2      0        0
## 10     5     3      0        0
## # ... with 143 more rows

miss_var_table

miss_var_table(airquality)
## # A tibble: 3 x 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      4     66.7
## 2             7      1     16.7
## 3            37      1     16.7

miss_case_table

miss_case_table(airquality)
## # A tibble: 3 x 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     111     72.5 
## 2              1      40     26.1 
## 3              2       2      1.31

\(~\)

3. Find patterns of missing values

miss_var_run()

miss_var_run(airquality, var = Ozone)
## # A tibble: 35 x 2
##    run_length is_na   
##         <int> <chr>   
##  1          4 complete
##  2          1 missing 
##  3          4 complete
##  4          1 missing 
##  5         14 complete
##  6          3 missing 
##  7          4 complete
##  8          6 missing 
##  9          1 complete
## 10          1 missing 
## # ... with 25 more rows

miss_var_span()

miss_var_span(airquality, var = Ozone, span_every = 4000)
## # A tibble: 1 x 6
##   span_counter n_miss n_complete prop_miss prop_complete n_in_span
##          <int>  <int>      <int>     <dbl>         <dbl>     <int>
## 1            1     37        116     0.242         0.758       153

\(~\)

4. Visualization

이번에는 “riskfactors” dataset을 사용해보자

vis_miss

vis_miss(riskfactors)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

vis_miss(riskfactors, cluster = T)

vis_miss(riskfactors, sort_miss = T)

gg_miss_

gg_miss_case(riskfactors)

gg_miss_case(riskfactors, facet = education)

gg_miss_var(riskfactors)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

gg_miss_var(riskfactors, facet = education)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

gg_miss_upset(riskfactors)

marital status(명목변수)에 따른 Missing Values

gg_miss_fct(x = riskfactors, fct = marital)

\(~\)

5. Replace missing values

miss_scan_count

NA로 코딩되어있지 않은 경우 찾기

miss_scan_count(data = exampledata
                , search = list
                ("N/A", "missing", "na", " ", "."))

replace 1

exampledata <- replace_with_na(exampledata,
                               replace = list(
                                 V1 = c("N/A", "na"),
                                 V2 = c("N/A", "na")
                               ))

replace 2: shoter code!

replace_with_na_at(exampledata,
                   .vars = c("V1", "V2", "V3"), 
                   ~.x %in% c("N/A", "missing", "na", " "))

replace 3: all variables at once

replace_with_na_all(exampledata, ~.x %in% c("N/A", "missing", "na", " "))

\(~\)

6. Implicit missing values

implicit missing value란 명시적으로 na를 갖고 있지 않지만, 구조상 보았을 때 특정 정보들이 누락되어있는 경우를 의미한다.

아래서 tibble 하나를 만들어보고, 그 구조를 잘 보자.

frogger <-
tribble(
  ~name, ~time, ~value,
  "jesse", "moring", 6678,
  "jesse", "afternoon", 800060,
  "jesse", "evening", 475528,
  "jesse", "late_night", 143533,
  "andy", "morning", 425115,
  "andy", "afternoon", 587468,
  "andy", "late_night", 111000,
  "nic", "afternoon", 588532,
  "nic", "late_night", 915533,
  "dan", "morning", 388148,
  "dan", "evening", 180912,
  "alex", "morning", 552670,
  "alex", "afternoon", 98355,
  "alex", "evening", 266055,
  "alex", "late_night", 121056,
  
)


print(frogger)
## # A tibble: 15 x 3
##    name  time        value
##    <chr> <chr>       <dbl>
##  1 jesse moring       6678
##  2 jesse afternoon  800060
##  3 jesse evening    475528
##  4 jesse late_night 143533
##  5 andy  morning    425115
##  6 andy  afternoon  587468
##  7 andy  late_night 111000
##  8 nic   afternoon  588532
##  9 nic   late_night 915533
## 10 dan   morning    388148
## 11 dan   evening    180912
## 12 alex  morning    552670
## 13 alex  afternoon   98355
## 14 alex  evening    266055
## 15 alex  late_night 121056

complete(): 누락된 정보들에 NA를 생성하라

(https://tidyr.tidyverse.org/reference/complete.html)

frogger_2 <- frogger %>% 
  complete(time, value) #기준 = 2개 이상 해야지 서로 빈 곳에 샥샥 들어감
print(frogger)
## # A tibble: 15 x 3
##    name  time        value
##    <chr> <chr>       <dbl>
##  1 jesse moring       6678
##  2 jesse afternoon  800060
##  3 jesse evening    475528
##  4 jesse late_night 143533
##  5 andy  morning    425115
##  6 andy  afternoon  587468
##  7 andy  late_night 111000
##  8 nic   afternoon  588532
##  9 nic   late_night 915533
## 10 dan   morning    388148
## 11 dan   evening    180912
## 12 alex  morning    552670
## 13 alex  afternoon   98355
## 14 alex  evening    266055
## 15 alex  late_night 121056
print(frogger_2)
## # A tibble: 75 x 3
##    time       value name 
##    <chr>      <dbl> <chr>
##  1 afternoon   6678 <NA> 
##  2 afternoon  98355 alex 
##  3 afternoon 111000 <NA> 
##  4 afternoon 121056 <NA> 
##  5 afternoon 143533 <NA> 
##  6 afternoon 180912 <NA> 
##  7 afternoon 266055 <NA> 
##  8 afternoon 388148 <NA> 
##  9 afternoon 425115 <NA> 
## 10 afternoon 475528 <NA> 
## # ... with 65 more rows

fill()

(https://tidyr.tidyverse.org/reference/fill.html)

print(frogger_2)
## # A tibble: 75 x 3
##    time       value name 
##    <chr>      <dbl> <chr>
##  1 afternoon   6678 <NA> 
##  2 afternoon  98355 alex 
##  3 afternoon 111000 <NA> 
##  4 afternoon 121056 <NA> 
##  5 afternoon 143533 <NA> 
##  6 afternoon 180912 <NA> 
##  7 afternoon 266055 <NA> 
##  8 afternoon 388148 <NA> 
##  9 afternoon 425115 <NA> 
## 10 afternoon 475528 <NA> 
## # ... with 65 more rows
frogger_2 %>% fill(name)
## # A tibble: 75 x 3
##    time       value name 
##    <chr>      <dbl> <chr>
##  1 afternoon   6678 <NA> 
##  2 afternoon  98355 alex 
##  3 afternoon 111000 alex 
##  4 afternoon 121056 alex 
##  5 afternoon 143533 alex 
##  6 afternoon 180912 alex 
##  7 afternoon 266055 alex 
##  8 afternoon 388148 alex 
##  9 afternoon 425115 alex 
## 10 afternoon 475528 alex 
## # ... with 65 more rows

\(~\)

7. Missing data dependence

MCAR: missing completely at random

delete obs is possible

oceanbuoys %>% arrange(wind_ew) %>% vis_miss()

MAR: missing at random

deleting obs not ideal

oceanbuoys %>% arrange(latitude) %>% vis_miss()

MNARL: missing not at random

data will be biased from both deletion and imputation

oceanbuoys %>% arrange(year) %>% vis_miss()

\(~\)

8. The shadow matrix and nabular data: df with NA or !NA

’한 변수’와 ’다른 변수의 결측값’의 관계를 볼 수 있음

shadow matrix

as_shadow(oceanbuoys) %>% head()
## # A tibble: 6 x 8
##   year_NA latitude_NA longitude_NA sea_temp_c_NA air_temp_c_NA humidity_NA
##   <fct>   <fct>       <fct>        <fct>         <fct>         <fct>      
## 1 !NA     !NA         !NA          !NA           !NA           !NA        
## 2 !NA     !NA         !NA          !NA           !NA           !NA        
## 3 !NA     !NA         !NA          !NA           !NA           !NA        
## 4 !NA     !NA         !NA          !NA           !NA           !NA        
## 5 !NA     !NA         !NA          !NA           !NA           !NA        
## 6 !NA     !NA         !NA          !NA           !NA           !NA        
## # ... with 2 more variables: wind_ew_NA <fct>, wind_ns_NA <fct>

nabular data: Having data in this format, with the shadow matrix column bound to the regular data is called nabular data.

oceanbuoys
## # A tibble: 736 x 8
##     year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
##    <dbl>    <dbl>     <dbl>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>
##  1  1997        0      -110       27.6       27.1     79.6   -6.40    5.40
##  2  1997        0      -110       27.5       27.0     75.8   -5.30    5.30
##  3  1997        0      -110       27.6       27       76.5   -5.10    4.5 
##  4  1997        0      -110       27.6       26.9     76.2   -4.90    2.5 
##  5  1997        0      -110       27.6       26.8     76.4   -3.5     4.10
##  6  1997        0      -110       27.8       26.9     76.7   -4.40    1.60
##  7  1997        0      -110       28.0       27.0     76.5   -2       3.5 
##  8  1997        0      -110       28.0       27.1     78.3   -3.70    4.5 
##  9  1997        0      -110       28.0       27.2     78.6   -4.20    5   
## 10  1997        0      -110       28.0       27.2     76.9   -3.60    3.5 
## # ... with 726 more rows

뒤에 어떤 변수가 추가되는지 확인해볼 것

bind_shadow(oceanbuoys)
## # A tibble: 736 x 16
##     year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
##    <dbl>    <dbl>     <dbl>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>
##  1  1997        0      -110       27.6       27.1     79.6   -6.40    5.40
##  2  1997        0      -110       27.5       27.0     75.8   -5.30    5.30
##  3  1997        0      -110       27.6       27       76.5   -5.10    4.5 
##  4  1997        0      -110       27.6       26.9     76.2   -4.90    2.5 
##  5  1997        0      -110       27.6       26.8     76.4   -3.5     4.10
##  6  1997        0      -110       27.8       26.9     76.7   -4.40    1.60
##  7  1997        0      -110       28.0       27.0     76.5   -2       3.5 
##  8  1997        0      -110       28.0       27.1     78.3   -3.70    4.5 
##  9  1997        0      -110       28.0       27.2     78.6   -4.20    5   
## 10  1997        0      -110       28.0       27.2     76.9   -3.60    3.5 
## # ... with 726 more rows, and 8 more variables: year_NA <fct>,
## #   latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## #   air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>

Bind only the variables with missing values

bind_shadow(oceanbuoys, only_miss = TRUE) 
## # A tibble: 736 x 11
##     year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
##    <dbl>    <dbl>     <dbl>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>
##  1  1997        0      -110       27.6       27.1     79.6   -6.40    5.40
##  2  1997        0      -110       27.5       27.0     75.8   -5.30    5.30
##  3  1997        0      -110       27.6       27       76.5   -5.10    4.5 
##  4  1997        0      -110       27.6       26.9     76.2   -4.90    2.5 
##  5  1997        0      -110       27.6       26.8     76.4   -3.5     4.10
##  6  1997        0      -110       27.8       26.9     76.7   -4.40    1.60
##  7  1997        0      -110       28.0       27.0     76.5   -2       3.5 
##  8  1997        0      -110       28.0       27.1     78.3   -3.70    4.5 
##  9  1997        0      -110       28.0       27.2     78.6   -4.20    5   
## 10  1997        0      -110       28.0       27.2     76.9   -3.60    3.5 
## # ... with 726 more rows, and 3 more variables: sea_temp_c_NA <fct>,
## #   air_temp_c_NA <fct>, humidity_NA <fct>

“humidity”가 결측/비결측일때 각각 “wind_we”의 mean과 sd를 구하시오

oceanbuoys %>%
  bind_shadow() %>%
  group_by(humidity_NA) %>% 
  summarize(wind_ew_mean = mean(wind_ew)
            , wind_ew_sd = sd(wind_ew)
            , N = n())
## # A tibble: 2 x 4
##   humidity_NA wind_ew_mean wind_ew_sd     N
##   <fct>              <dbl>      <dbl> <int>
## 1 !NA                -3.78       1.90   643
## 2 NA                 -3.30       2.31    93

\(~\)

9. Visualization with nabular data

air_temp_c가 각각 NA, !NA일 때 wind_ew의 분포

bind_shadow(oceanbuoys) %>%
  ggplot(aes(x = wind_ew, 
             color = air_temp_c_NA)) + 
  geom_density()

### with color() and facet_wrap()

oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = wind_ew,
             color = humidity_NA)) + 
  geom_density() + 
  facet_wrap(~air_temp_c_NA)

### boxplot()

oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = air_temp_c_NA,
             y = wind_ew)) + 
  geom_boxplot() +
  facet_wrap(~ humidity_NA)

geom_miss_point(): 비/결측치를 산포도로

ggplot(oceanbuoys,
       aes(x = humidity,
           y = air_temp_c)) + 
  geom_miss_point() + 
  facet_wrap(~year)

How the missingness in wind_ew and air_temp_c is different for missingness of humidity and by year?

bind_shadow(oceanbuoys) %>%
  ggplot(aes(x = wind_ew,
             y = air_temp_c)) + 
  geom_miss_point() + 
  facet_grid(humidity_NA~year)

\(~\)

10. Imputation (1)

impute_below_all(): NA를 최솟값 아래 값으로 impute

ocean_imp <- impute_below_all(oceanbuoys)
ggplot(ocean_imp, 
       aes(x = wind_ew, y = air_temp_c)) +  
  geom_point()

### Track imputation 1: bind_shadow() + impute_below_all() + geom_point()

nabular data를 통해 NA/!NA를 구분한 뒤 impute를 해서 어떤 녀석들이 imputed인지 track 가능!

ocean_imp_track <- bind_shadow(oceanbuoys) %>% 
  impute_below_all() %>%
  add_label_shadow()
ocean_imp_track
## # A tibble: 736 x 17
##     year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
##    <dbl>    <dbl>     <dbl>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>
##  1  1997        0      -110       27.6       27.1     79.6   -6.40    5.40
##  2  1997        0      -110       27.5       27.0     75.8   -5.30    5.30
##  3  1997        0      -110       27.6       27       76.5   -5.10    4.5 
##  4  1997        0      -110       27.6       26.9     76.2   -4.90    2.5 
##  5  1997        0      -110       27.6       26.8     76.4   -3.5     4.10
##  6  1997        0      -110       27.8       26.9     76.7   -4.40    1.60
##  7  1997        0      -110       28.0       27.0     76.5   -2       3.5 
##  8  1997        0      -110       28.0       27.1     78.3   -3.70    4.5 
##  9  1997        0      -110       28.0       27.2     78.6   -4.20    5   
## 10  1997        0      -110       28.0       27.2     76.9   -3.60    3.5 
## # ... with 726 more rows, and 9 more variables: year_NA <fct>,
## #   latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## #   air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>,
## #   any_missing <chr>
ggplot(ocean_imp_track,
       aes(x = wind_ew, y = air_temp_c, color = air_temp_c_NA)) + 
  geom_point()

ggplot(ocean_imp_track,
       aes(x = humidity, y = air_temp_c, color = any_missing)) + 
  geom_point()

Track imputation 2: bind_shadow() + impute_below_all() + geom_histogram()

# Explore the values of air_temp_c, visualizing the amount of missings with `air_temp_c_NA`.
p <- ggplot(ocean_imp_track, aes(x = air_temp_c, fill = air_temp_c_NA)) + geom_histogram()
# Explore the missings in air_temp_c according to year, using `facet_wrap(~year)`.
p + facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Expore the missings in humidity using humidity_NA
p2 <- ggplot(ocean_imp_track, aes(x = humidity, fill = humidity_NA)) + geom_histogram()
# Explore the missings in humidity according to year, using `facet_wrap(~year)`.
p2 + facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

\(~\)

11. Imputation (2)

impute_mean_all(): 평균으로 imputation

ocean_imp_mean <- bind_shadow(oceanbuoys) %>% 
  impute_mean_all() %>% 
  add_label_shadow()

Box_plot()으로 결과 확인

ggplot(ocean_imp_mean, 
       aes(x = humidity_NA, y = humidity)) + 
  geom_boxplot()

### geom_point()로 결과 확인

ggplot(ocean_imp_mean, 
       aes(x = air_temp_c, y = humidity, color = any_missing)) + 
  geom_point()

Imputation 3: simputation package (Linear Imputation)

library(simputation)
## 
## 다음의 패키지를 부착합니다: 'simputation'
## The following object is masked from 'package:naniar':
## 
##     impute_median

simputation: impute_lm

air_temp_c와 humidity 변수를 impute_lm할 건데 이때 독립변수는 각각 두개씩(wind_ew, wind_ns)

ocean_imp_lm_wind <- oceanbuoys %>% 
    bind_shadow() %>%
    impute_lm(air_temp_c ~ wind_ew + wind_ns) %>% 
    impute_lm(humidity ~ wind_ew + wind_ns) %>%
    add_label_shadow()

Track above

ggplot(ocean_imp_lm_wind, 
       aes(x = air_temp_c, y = humidity, color = any_missing)) + 
  geom_point()

\(~\)

12. 결측치 삭제하기

데이터 전체에서 결측치 있는 행 삭제

na.omit(airquality)

특정 변수 하나에서 결측치 있는 행 삭제

airquality[complete.cases(airquality$Ozone), ]

특정 변수 둘 이상에서 결측치 있는 행 삭제

airquality[complete.cases(airquality$Ozone, airquality$Solar.R), ]