library(tidyverse)
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다
\(~\)
\(~\)
library(naniar)
n_miss(airquality)
## [1] 44
n_miss(airquality$Ozone)
## [1] 37
n_complete(airquality)
## [1] 874
n_complete(airquality$Ozone)
## [1] 116
prop_miss(airquality)
## [1] 0.04793028
prop_miss_var(airquality) #variable with NA (column)
## [1] 0.3333333
prop_miss_case(airquality) #row with NA (row)
## [1] 0.2745098
prop_complete(airquality)
## [1] 0.9520697
prop_complete_var(airquality)
## [1] 0.6666667
prop_complete_case(airquality)
## [1] 0.7254902
\(~\)
miss_var_summary(airquality)
## # A tibble: 6 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Ozone 37 24.2
## 2 Solar.R 7 4.58
## 3 Wind 0 0
## 4 Temp 0 0
## 5 Month 0 0
## 6 Day 0 0
airquality %>% group_by(Month) %>% miss_var_summary
## # A tibble: 25 x 4
## # Groups: Month [5]
## Month variable n_miss pct_miss
## <int> <chr> <int> <dbl>
## 1 5 Ozone 5 16.1
## 2 5 Solar.R 4 12.9
## 3 5 Wind 0 0
## 4 5 Temp 0 0
## 5 5 Day 0 0
## 6 6 Ozone 21 70
## 7 6 Solar.R 0 0
## 8 6 Wind 0 0
## 9 6 Temp 0 0
## 10 6 Day 0 0
## # ... with 15 more rows
miss_case_summary(airquality)
## # A tibble: 153 x 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 5 2 33.3
## 2 27 2 33.3
## 3 6 1 16.7
## 4 10 1 16.7
## 5 11 1 16.7
## 6 25 1 16.7
## 7 26 1 16.7
## 8 32 1 16.7
## 9 33 1 16.7
## 10 34 1 16.7
## # ... with 143 more rows
airquality %>% group_by(Month) %>% miss_case_summary
## # A tibble: 153 x 4
## # Groups: Month [5]
## Month case n_miss pct_miss
## <int> <int> <int> <dbl>
## 1 5 5 2 40
## 2 5 27 2 40
## 3 5 6 1 20
## 4 5 10 1 20
## 5 5 11 1 20
## 6 5 25 1 20
## 7 5 26 1 20
## 8 5 1 0 0
## 9 5 2 0 0
## 10 5 3 0 0
## # ... with 143 more rows
miss_var_table(airquality)
## # A tibble: 3 x 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 4 66.7
## 2 7 1 16.7
## 3 37 1 16.7
miss_case_table(airquality)
## # A tibble: 3 x 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 111 72.5
## 2 1 40 26.1
## 3 2 2 1.31
\(~\)
miss_var_run(airquality, var = Ozone)
## # A tibble: 35 x 2
## run_length is_na
## <int> <chr>
## 1 4 complete
## 2 1 missing
## 3 4 complete
## 4 1 missing
## 5 14 complete
## 6 3 missing
## 7 4 complete
## 8 6 missing
## 9 1 complete
## 10 1 missing
## # ... with 25 more rows
miss_var_span(airquality, var = Ozone, span_every = 4000)
## # A tibble: 1 x 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 37 116 0.242 0.758 153
\(~\)
vis_miss(riskfactors)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
vis_miss(riskfactors, cluster = T)
vis_miss(riskfactors, sort_miss = T)
gg_miss_case(riskfactors)
gg_miss_case(riskfactors, facet = education)
gg_miss_var(riskfactors)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
gg_miss_var(riskfactors, facet = education)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
gg_miss_upset(riskfactors)
gg_miss_fct(x = riskfactors, fct = marital)
\(~\)
miss_scan_count(data = exampledata
, search = list
("N/A", "missing", "na", " ", "."))
exampledata <- replace_with_na(exampledata,
replace = list(
V1 = c("N/A", "na"),
V2 = c("N/A", "na")
))
replace_with_na_at(exampledata,
.vars = c("V1", "V2", "V3"),
~.x %in% c("N/A", "missing", "na", " "))
replace_with_na_all(exampledata, ~.x %in% c("N/A", "missing", "na", " "))
\(~\)
frogger <-
tribble(
~name, ~time, ~value,
"jesse", "moring", 6678,
"jesse", "afternoon", 800060,
"jesse", "evening", 475528,
"jesse", "late_night", 143533,
"andy", "morning", 425115,
"andy", "afternoon", 587468,
"andy", "late_night", 111000,
"nic", "afternoon", 588532,
"nic", "late_night", 915533,
"dan", "morning", 388148,
"dan", "evening", 180912,
"alex", "morning", 552670,
"alex", "afternoon", 98355,
"alex", "evening", 266055,
"alex", "late_night", 121056,
)
print(frogger)
## # A tibble: 15 x 3
## name time value
## <chr> <chr> <dbl>
## 1 jesse moring 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
frogger_2 <- frogger %>%
complete(time, value) #기준 = 2개 이상 해야지 서로 빈 곳에 샥샥 들어감
print(frogger)
## # A tibble: 15 x 3
## name time value
## <chr> <chr> <dbl>
## 1 jesse moring 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
print(frogger_2)
## # A tibble: 75 x 3
## time value name
## <chr> <dbl> <chr>
## 1 afternoon 6678 <NA>
## 2 afternoon 98355 alex
## 3 afternoon 111000 <NA>
## 4 afternoon 121056 <NA>
## 5 afternoon 143533 <NA>
## 6 afternoon 180912 <NA>
## 7 afternoon 266055 <NA>
## 8 afternoon 388148 <NA>
## 9 afternoon 425115 <NA>
## 10 afternoon 475528 <NA>
## # ... with 65 more rows
print(frogger_2)
## # A tibble: 75 x 3
## time value name
## <chr> <dbl> <chr>
## 1 afternoon 6678 <NA>
## 2 afternoon 98355 alex
## 3 afternoon 111000 <NA>
## 4 afternoon 121056 <NA>
## 5 afternoon 143533 <NA>
## 6 afternoon 180912 <NA>
## 7 afternoon 266055 <NA>
## 8 afternoon 388148 <NA>
## 9 afternoon 425115 <NA>
## 10 afternoon 475528 <NA>
## # ... with 65 more rows
frogger_2 %>% fill(name)
## # A tibble: 75 x 3
## time value name
## <chr> <dbl> <chr>
## 1 afternoon 6678 <NA>
## 2 afternoon 98355 alex
## 3 afternoon 111000 alex
## 4 afternoon 121056 alex
## 5 afternoon 143533 alex
## 6 afternoon 180912 alex
## 7 afternoon 266055 alex
## 8 afternoon 388148 alex
## 9 afternoon 425115 alex
## 10 afternoon 475528 alex
## # ... with 65 more rows
\(~\)
oceanbuoys %>% arrange(wind_ew) %>% vis_miss()
oceanbuoys %>% arrange(latitude) %>% vis_miss()
oceanbuoys %>% arrange(year) %>% vis_miss()
\(~\)
as_shadow(oceanbuoys) %>% head()
## # A tibble: 6 x 8
## year_NA latitude_NA longitude_NA sea_temp_c_NA air_temp_c_NA humidity_NA
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA
## # ... with 2 more variables: wind_ew_NA <fct>, wind_ns_NA <fct>
oceanbuoys
## # A tibble: 736 x 8
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows
bind_shadow(oceanbuoys)
## # A tibble: 736 x 16
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows, and 8 more variables: year_NA <fct>,
## # latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>
bind_shadow(oceanbuoys, only_miss = TRUE)
## # A tibble: 736 x 11
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows, and 3 more variables: sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>
oceanbuoys %>%
bind_shadow() %>%
group_by(humidity_NA) %>%
summarize(wind_ew_mean = mean(wind_ew)
, wind_ew_sd = sd(wind_ew)
, N = n())
## # A tibble: 2 x 4
## humidity_NA wind_ew_mean wind_ew_sd N
## <fct> <dbl> <dbl> <int>
## 1 !NA -3.78 1.90 643
## 2 NA -3.30 2.31 93
\(~\)
bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew,
color = air_temp_c_NA)) +
geom_density()
### with color() and facet_wrap()
oceanbuoys %>%
bind_shadow() %>%
ggplot(aes(x = wind_ew,
color = humidity_NA)) +
geom_density() +
facet_wrap(~air_temp_c_NA)
### boxplot()
oceanbuoys %>%
bind_shadow() %>%
ggplot(aes(x = air_temp_c_NA,
y = wind_ew)) +
geom_boxplot() +
facet_wrap(~ humidity_NA)
ggplot(oceanbuoys,
aes(x = humidity,
y = air_temp_c)) +
geom_miss_point() +
facet_wrap(~year)
bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew,
y = air_temp_c)) +
geom_miss_point() +
facet_grid(humidity_NA~year)
\(~\)
ocean_imp <- impute_below_all(oceanbuoys)
ggplot(ocean_imp,
aes(x = wind_ew, y = air_temp_c)) +
geom_point()
### Track imputation 1: bind_shadow() + impute_below_all() + geom_point()
ocean_imp_track <- bind_shadow(oceanbuoys) %>%
impute_below_all() %>%
add_label_shadow()
ocean_imp_track
## # A tibble: 736 x 17
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # ... with 726 more rows, and 9 more variables: year_NA <fct>,
## # latitude_NA <fct>, longitude_NA <fct>, sea_temp_c_NA <fct>,
## # air_temp_c_NA <fct>, humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>,
## # any_missing <chr>
ggplot(ocean_imp_track,
aes(x = wind_ew, y = air_temp_c, color = air_temp_c_NA)) +
geom_point()
ggplot(ocean_imp_track,
aes(x = humidity, y = air_temp_c, color = any_missing)) +
geom_point()
# Explore the values of air_temp_c, visualizing the amount of missings with `air_temp_c_NA`.
p <- ggplot(ocean_imp_track, aes(x = air_temp_c, fill = air_temp_c_NA)) + geom_histogram()
# Explore the missings in air_temp_c according to year, using `facet_wrap(~year)`.
p + facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Expore the missings in humidity using humidity_NA
p2 <- ggplot(ocean_imp_track, aes(x = humidity, fill = humidity_NA)) + geom_histogram()
# Explore the missings in humidity according to year, using `facet_wrap(~year)`.
p2 + facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\(~\)
ocean_imp_mean <- bind_shadow(oceanbuoys) %>%
impute_mean_all() %>%
add_label_shadow()
ggplot(ocean_imp_mean,
aes(x = humidity_NA, y = humidity)) +
geom_boxplot()
### geom_point()로 결과 확인
ggplot(ocean_imp_mean,
aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point()
library(simputation)
##
## 다음의 패키지를 부착합니다: 'simputation'
## The following object is masked from 'package:naniar':
##
## impute_median
ocean_imp_lm_wind <- oceanbuoys %>%
bind_shadow() %>%
impute_lm(air_temp_c ~ wind_ew + wind_ns) %>%
impute_lm(humidity ~ wind_ew + wind_ns) %>%
add_label_shadow()
ggplot(ocean_imp_lm_wind,
aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point()
\(~\)
na.omit(airquality)
airquality[complete.cases(airquality$Ozone), ]
airquality[complete.cases(airquality$Ozone, airquality$Solar.R), ]