fs <- raadfiles::ncep2_uwnd_6hr_files()

print(fs$file[1])
## [1] "data/ftp.cdc.noaa.gov/Datasets/ncep.reanalysis2/gaussian_grid/uwnd.10m.gauss.1979.nc"
library(tidync)
library(dplyr)
## get the full table for only the first time step
system.time({
  fnc <- tidync(fs$fullname[1])
  tab1 <- fnc %>% 
    hyper_filter(time = index == 1) %>% 
    hyper_tibble()
})
##    user  system elapsed 
##   1.538   0.027   1.573
tab1
## # A tibble: 18,048 x 5
##         uwnd    lon    lat level    time
##  *     <dbl>  <dbl>  <dbl> <dbl>   <dbl>
##  1 -2.380001  0.000 88.542    10 1569072
##  2 -2.560001  1.875 88.542    10 1569072
##  3 -2.720001  3.750 88.542    10 1569072
##  4 -2.880001  5.625 88.542    10 1569072
##  5 -3.030001  7.500 88.542    10 1569072
##  6 -3.170001  9.375 88.542    10 1569072
##  7 -3.310001 11.250 88.542    10 1569072
##  8 -3.440001 13.125 88.542    10 1569072
##  9 -3.560001 15.000 88.542    10 1569072
## 10 -3.680001 16.875 88.542    10 1569072
## # ... with 18,038 more rows
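## hyper_filter() can also slice by coordinate value rather than by index;
## a sketch (not run here) pulling a southern-hemisphere window from the
## same first time step:
## fnc %>% 
##   hyper_filter(time = index == 1, lat = lat < 0) %>% 
##   hyper_tibble()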
## now, get all the data from the entire file and flesh
## out the big table
system.time({
  slabs <- hyper_slice(fnc)  ## (renamed hyper_array() in later tidync)
})
##    user  system elapsed 
##   4.092   0.955   5.054
str(slabs)
## List of 1
##  $ uwnd: num [1:192, 1:94, 1:1460] -2.38 -2.56 -2.72 -2.88 -3.03 ...
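## the arrays come back bare (no dimnames); the axis values are still
## available from the grid transforms (assuming tidync's hyper_transforms(),
## which returns one coordinate table per dimension):
trans <- hyper_transforms(fnc)
time_vals <- trans$time$time  ## the 1460 six-hourly time steps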
## here we are not getting the right time value (time is the only dimension
## varying in the loop), but this gives an indication of the best timing we
## might get by shortcutting that
system.time({
  d <- bind_rows(lapply(seq_len(dim(slabs[[1]])[3]), function(i) {
    tab1 %>% mutate(uwnd = c(slabs[[1]][, , i]))
  }))
})
##    user  system elapsed 
##   8.787   2.867  11.678
d
## # A tibble: 26,350,080 x 5
##         uwnd    lon    lat level    time
##        <dbl>  <dbl>  <dbl> <dbl>   <dbl>
##  1 -2.380001  0.000 88.542    10 1569072
##  2 -2.560001  1.875 88.542    10 1569072
##  3 -2.720001  3.750 88.542    10 1569072
##  4 -2.880001  5.625 88.542    10 1569072
##  5 -3.030001  7.500 88.542    10 1569072
##  6 -3.170001  9.375 88.542    10 1569072
##  7 -3.310001 11.250 88.542    10 1569072
##  8 -3.440001 13.125 88.542    10 1569072
##  9 -3.560001 15.000 88.542    10 1569072
## 10 -3.680001 16.875 88.542    10 1569072
## # ... with 26,350,070 more rows
pryr::object_size(d)
## 1.05 GB
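## the wrong-time problem above is cheap to fix: overwrite the time column
## with the value for each step (using time_vals recovered above, and
## assuming the slab's third-dimension order matches the transform order):
d_fixed <- bind_rows(lapply(seq_along(time_vals), function(i) {
  tab1 %>% mutate(uwnd = c(slabs[[1]][, , i]), time = time_vals[i])
}))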
## now, what if we save ourselves that work and do the full expansion for
## all 4 dimensions? (level is degenerate, but that's no big deal; at least
## it generalizes)
system.time({
  d1 <- hyper_tibble(fnc)
})
##    user  system elapsed 
##   4.590   5.934  10.537
d1
## # A tibble: 26,350,080 x 5
##         uwnd    lon    lat level    time
##  *     <dbl>  <dbl>  <dbl> <dbl>   <dbl>
##  1 -2.380001  0.000 88.542    10 1569072
##  2 -2.560001  1.875 88.542    10 1569072
##  3 -2.720001  3.750 88.542    10 1569072
##  4 -2.880001  5.625 88.542    10 1569072
##  5 -3.030001  7.500 88.542    10 1569072
##  6 -3.170001  9.375 88.542    10 1569072
##  7 -3.310001 11.250 88.542    10 1569072
##  8 -3.440001 13.125 88.542    10 1569072
##  9 -3.560001 15.000 88.542    10 1569072
## 10 -3.680001 16.875 88.542    10 1569072
## # ... with 26,350,070 more rows
d1 %>% distinct(time)
## # A tibble: 1,460 x 1
##       time
##      <dbl>
##  1 1569072
##  2 1569078
##  3 1569084
##  4 1569090
##  5 1569096
##  6 1569102
##  7 1569108
##  8 1569114
##  9 1569120
## 10 1569126
## # ... with 1,450 more rows
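## the raw time values are in the file's time units; for these NCEP files
## that is "hours since 1800-01-01" (worth verifying against the units
## attribute, e.g. with ncmeta::nc_atts), so conversion is simple:
dates <- as.POSIXct("1800-01-01", tz = "UTC") + d1$time * 3600
## 1569072 hours is 65378 days, i.e. 1979-01-01 00:00 UTC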
d1 %>% distinct(level)
## # A tibble: 1 x 1
##   level
##   <dbl>
## 1    10
d1 %>% distinct(lon)
## # A tibble: 192 x 1
##       lon
##     <dbl>
##  1  0.000
##  2  1.875
##  3  3.750
##  4  5.625
##  5  7.500
##  6  9.375
##  7 11.250
##  8 13.125
##  9 15.000
## 10 16.875
## # ... with 182 more rows
pryr::object_size(d1)
## 1.05 GB
file.info(fs$fullname[1])$size/1e6
## [1] 52.71544
## ~300 hours for 100K of these ~53 MB files, at ~11 s each
(1e5 * 11)/3600
## [1] 305.5556
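## that estimate assumes a simple serial loop over files; the shape of that
## loop (hypothetical "out/" directory, writing each year to rds rather than
## holding everything in memory) would be something like:
## for (f in fs$fullname) {
##   saveRDS(hyper_tibble(tidync(f)), file.path("out", paste0(basename(f), ".rds")))
## }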