repeat.before = function(x) { # repeats the last non NA value. Keeps leading NA
ind = which(!is.na(x)) # get positions of nonmissing values
if(is.na(x[1])) # if it begins with a missing, add the
ind = c(1,ind) # first position to the indices
rep(x[ind], times = diff( # repeat the values at these indices
c(ind, length(x) + 1) )) # diffing the indices + length yields how often
} # they need to be repeated
repeat_last = function(x, forward = TRUE, maxgap = Inf, na.rm = FALSE) { # repeats the last non NA value.
if (!forward) x = rev(x) # reverse x twice if carrying backward
ind = which(!is.na(x)) # get positions of nonmissing values
if (is.na(x[1]) && !na.rm) # if it begins with NA
ind = c(1,ind) # add first pos
rep_times = diff( # diffing the indices + length yields how often
c(ind, length(x) + 1) ) # they need to be repeated
if (maxgap < Inf) {
exceed = rep_times - 1 > maxgap # exceeding maxgap
if (any(exceed)) { # any exceed?
ind = sort(c(ind[exceed] + 1, ind)) # add NA following large gaps to indices
rep_times = diff(c(ind, length(x) + 1) ) # diff again
}
}
x = rep(x[ind], times = rep_times) # repeat the values at these indices
if (!forward) x = rev(x) # second reversion
x
}
# replaces all NA values with last non-NA values
na.lomf <- function(x) {
na.lomf.0 <- function(x) {
non.na.idx <- which(!is.na(x))
if (is.na(x[1L])) {
non.na.idx <- c(1L, non.na.idx)
}
rep.int(x[non.na.idx], diff(c(non.na.idx, length(x) + 1L)))
}
dim.len <- length(dim(x))
if (dim.len == 0L) {
na.lomf.0(x)
} else {
apply(x, dim.len, na.lomf.0)
}
}
require(data.table)
## Loading required package: data.table
replaceNaWithLatest <- function(
dfIn,
nameColNa = names(dfIn)[1]
){
dtTest <- data.table(dfIn)
setnames(dtTest, nameColNa, "colNa")
dtTest[, segment := cumsum(!is.na(colNa))]
dtTest[, colNa := colNa[1], by = "segment"]
dtTest[, segment := NULL]
setnames(dtTest, "colNa", nameColNa)
return(dtTest)
}
attention: carries forward and backward in case of NAs at beginning
replace_na_with_last<-function(x,a=!is.na(x)){
x[which(a)[c(1,1:sum(a))][cumsum(a)+1]]
}
library(Rcpp)
cppFunction('IntegerVector na_locf(IntegerVector x) {
int n = x.size();
for(int i = 0; i<n; i++) {
if((i > 0) && (x[i] == NA_INTEGER) & (x[i-1] != NA_INTEGER)) {
x[i] = x[i-1];
}
}
return x;
}')
cppFunction('NumericVector na_locf_numeric(NumericVector x) {
int n = x.size();
LogicalVector ina = is_na(x);
for(int i = 1; i<n; i++) {
if((ina[i] == TRUE) & (ina[i-1] != TRUE)) {
x[i] = x[i-1];
}
}
return x;
}')
tidyr_fill = tidyr:::fillDown
edge_cases = function(x) {
ref = zoo::na.locf(x, na.rm = F)
results = c(
tryCatch(all.equal(ref, repeat.before(x)), error = warning)
,tryCatch(all.equal(ref, na_locf(x)), error = warning)
,tryCatch(all.equal(ref, na_locf_numeric(x)), error = warning)
,tryCatch(all.equal(ref, spacetime::na.locf(x,na.rm=F)), error = warning)
,tryCatch(all.equal(ref, imputeTS::na.locf(x)), error = warning)
,tryCatch(all.equal(ref, na.lomf(x)), error = warning)
,tryCatch(all.equal(ref, replace_na_with_last(x)), error = warning)
,tryCatch(all.equal(ref, tidyr_fill(x)), error = warning)
)
names(results) = c("repeat.before","na_locf", "na_locf_numeric", "spacetime", "imputeTS","na.lomf", "replace_na_with_last","tidyr_fill")
if(all(results == TRUE)) T else results[results!=T]
}
edge_cases(c(NA,1))
## imputeTS
## "'is.NA' value mismatch: 0 in current 1 in target"
## replace_na_with_last
## "'is.NA' value mismatch: 0 in current 1 in target"
edge_cases(c(1, NA))
## [1] TRUE
edge_cases(c( NA))
## na_locf
## "Modes: logical, numeric"
## na_locf_numeric
## "target is logical, current is numeric"
## spacetime
## "Modes: logical, numeric"
## imputeTS
## "target is logical, current is numeric"
## replace_na_with_last
## "Input x is not numeric"
edge_cases(c( NA, T))
## na_locf
## "Modes: logical, numeric"
## na_locf_numeric
## "target is logical, current is numeric"
## spacetime
## "Modes: logical, numeric"
## imputeTS
## "target is logical, current is numeric"
## replace_na_with_last
## "Input x is not numeric"
## <NA>
## "'is.NA' value mismatch: 0 in current 1 in target"
edge_cases(c( T, NA))
## na_locf
## "Modes: logical, numeric"
## na_locf_numeric
## "target is logical, current is numeric"
## spacetime
## "Modes: logical, numeric"
## imputeTS
## "target is logical, current is numeric"
## replace_na_with_last
## "Input x is not numeric"
edge_cases(numeric())
## [1] TRUE
edge_cases(character())
## na_locf na_locf_numeric
## "not compatible with requested type" "not compatible with requested type"
edge_cases(c(NA, "1", NA))
## na_locf
## "not compatible with requested type"
## na_locf_numeric
## "not compatible with requested type"
## imputeTS
## "Input x is not numeric"
## replace_na_with_last
## "'is.NA' value mismatch: 0 in current 1 in target"
edge_cases(as.Date(c(NA, "1970-01-01" , NA)))
## imputeTS
## "Input x is not numeric"
## na.lomf
## "Attributes: < Modes: list, NULL >"
## replace_na_with_last
## "Attributes: < Lengths: 1, 0 >"
## tidyr_fill
## "Attributes: < names for target but not for current >"
## <NA>
## "Attributes: < current is not list-like >"
## <NA>
## "target is Date, current is numeric"
## <NA>
## "'is.NA' value mismatch: 0 in current 1 in target"
edge_cases(c(NA, 100.1000, NA))
## na_locf
## "Mean relative difference: 0.001"
## imputeTS
## "'is.NA' value mismatch: 0 in current 1 in target"
## replace_na_with_last
## "'is.NA' value mismatch: 0 in current 1 in target"
x = c(NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,2,3,4,NA,NA,NA,NA,NA,5)
library(microbenchmark)
bench_em <- function(x, times = 2000, count = 1000) {
x <- sample(x,count,replace = TRUE)
res = microbenchmark(
na_locf(x),
na.lomf(x),
zoo::na.locf(x, na.rm = F),
spacetime::na.locf(x,na.rm=F),
imputeTS::na.locf(x, na.remaining = "keep"),
repeat.before(x),
repeat_last(x),
tidyr_fill(x),
times = times)
print(res, order = "mean", digits = 1)
ggplot2::autoplot(res)
}
bench_em(x)
## Unit: microseconds
## expr min lq mean median uq max
## na_locf(x) 6 10 15 12 15 1561
## tidyr_fill(x) 6 11 19 14 19 1678
## repeat.before(x) 21 28 46 34 44 1940
## repeat_last(x) 22 30 50 36 48 2018
## na.lomf(x) 22 30 92 37 47 74631
## zoo::na.locf(x, na.rm = F) 75 92 147 112 144 2174
## spacetime::na.locf(x, na.rm = F) 78 94 159 113 147 2217
## imputeTS::na.locf(x, na.remaining = "keep") 149 174 260 213 269 2302
## neval cld
## 2000 a
## 2000 a
## 2000 ab
## 2000 ab
## 2000 bc
## 2000 cd
## 2000 d
## 2000 e
edge_cases_maxgap = function(x, maxgap = 3, fromLast = F) {
ref = zoo::na.locf(x, na.rm = F, maxgap = maxgap, fromLast = fromLast)
tryCatch(all.equal(ref, repeat_last(x, maxgap = maxgap, forward = !fromLast)), error = warning)
}
edge_cases_maxgap(c(1,NA, 2, NA, NA, NA, 3))
## [1] TRUE
edge_cases_maxgap(c(1,NA, 2, NA, NA, NA, NA, 3))
## [1] TRUE
edge_cases_maxgap(c(1,NA, 2, NA, NA, NA, NA, 3))
## [1] TRUE
edge_cases_maxgap(c(1,NA, 2, NA, NA, NA, NA), fromLast = F)
## [1] TRUE
edge_cases_maxgap(c(1,NA, 2, NA, NA, NA, NA), fromLast = F, maxgap = Inf)
## [1] TRUE
edge_cases_maxgap(as.Date(c(1,NA, 2, NA, NA, NA, NA), origin = "1970-01-01"), fromLast = F, maxgap = Inf)
## [1] TRUE
bench_em <- function(x,count = 1000) {
x <- sample(x,count,replace = TRUE)
res = microbenchmark(
zoo::na.locf(x, na.rm = F, maxgap = 3),
repeat_last(x, maxgap = 3),
times = 2000)
print(res, order = "mean", digits = 1)
ggplot2::autoplot(res)
}
bench_em(x)
## Unit: microseconds
## expr min lq mean median uq max neval
## repeat_last(x, maxgap = 3) 57 71 104 81 105 2649 2000
## zoo::na.locf(x, na.rm = F, maxgap = 3) 215 232 379 287 362 3423 2000
## cld
## a
## b
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] microbenchmark_1.4-2.1 Rcpp_0.12.9 data.table_1.10.4
##
## loaded via a namespace (and not attached):
## [1] plyr_1.8.4 tseries_0.10-37 xts_0.9-7
## [4] tools_3.3.2 digest_0.6.12 evaluate_0.10
## [7] tibble_1.2 gtable_0.2.0 lattice_0.20-34
## [10] Matrix_1.2-8 yaml_2.1.14 parallel_3.3.2
## [13] mvtnorm_1.0-5 stringr_1.1.0 knitr_1.15.1
## [16] imputeTS_1.8 rprojroot_1.2 grid_3.3.2
## [19] nnet_7.3-12 forecast_7.3 spacetime_1.2-0
## [22] survival_2.40-1 rmarkdown_1.3 multcomp_1.4-6
## [25] sp_1.2-4 TH.data_1.0-8 tidyr_0.6.1
## [28] ggplot2_2.2.1 magrittr_1.5 codetools_0.2-15
## [31] MASS_7.3-45 splines_3.3.2 backports_1.0.5
## [34] scales_0.4.1 intervals_0.15.1 htmltools_0.3.5
## [37] assertthat_0.1 timeDate_3012.100 colorspace_1.3-2
## [40] fracdiff_1.4-2 quadprog_1.5-5 sandwich_2.3-4
## [43] stringi_1.1.2 stinepack_1.3 lazyeval_0.2.0
## [46] munsell_0.4.3 zoo_1.7-14