tibble-timings.R

## file listing for the NCEP2 6-hourly u-wind collection, as returned by
## raadfiles; this script reads its `file` and `fullname` columns
fs <- raadfiles::ncep2_uwnd_6hr_files()

## show the first entry of the `file` column
print(fs$file[1])

## [1] "data/ftp.cdc.noaa.gov/Datasets/ncep.reanalysis2/gaussian_grid/uwnd.10m.gauss.1979.nc"

library(tidync)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## open the first file, then time how long it takes to expand just the
## first time step (time index 1) into a full table
system.time({
  fnc <- tidync(fs$fullname[1])
  tab1 <- hyper_tibble(hyper_filter(fnc, time = index == 1))
})

##    user  system elapsed 
##   1.538   0.027   1.573

## auto-print the table built from the first time step
tab1

## # A tibble: 18,048 x 5
##         uwnd    lon    lat level    time
##  *     <dbl>  <dbl>  <dbl> <dbl>   <dbl>
##  1 -2.380001  0.000 88.542    10 1569072
##  2 -2.560001  1.875 88.542    10 1569072
##  3 -2.720001  3.750 88.542    10 1569072
##  4 -2.880001  5.625 88.542    10 1569072
##  5 -3.030001  7.500 88.542    10 1569072
##  6 -3.170001  9.375 88.542    10 1569072
##  7 -3.310001 11.250 88.542    10 1569072
##  8 -3.440001 13.125 88.542    10 1569072
##  9 -3.560001 15.000 88.542    10 1569072
## 10 -3.680001 16.875 88.542    10 1569072
## # ... with 18,038 more rows

## next, time reading all of the data in the file as raw array slabs (no
## filter is active on `fnc`); these are fleshed out into the big table below
## NOTE(review): newer tidync releases appear to name this hyper_array() --
## confirm against the installed version before changing the call
system.time({
  slabs <- hyper_slice(fnc)
})

##    user  system elapsed 
##   4.092   0.955   5.054

## inspect the structure of what hyper_slice() returned
str(slabs)

## List of 1
##  $ uwnd: num [1:192, 1:94, 1:1460] -2.38 -2.56 -2.72 -2.88 -3.03 ...

## NOTE: the `time` column is NOT right in this result -- time is the only
## dimension that varies across the loop, yet every slab reuses the values
## already in tab1. This only gives an indication of something like the best
## timing we might get by shortcutting the expansion this way.
system.time({
  d <- bind_rows(
    lapply(
      seq_len(dim(slabs[[1]])[3]),
      function(k) mutate(tab1, uwnd = c(slabs[[1]][, , k]))
    )
  )
})

##    user  system elapsed 
##   8.787   2.867  11.678

## # A tibble: 26,350,080 x 5
##         uwnd    lon    lat level    time
##        <dbl>  <dbl>  <dbl> <dbl>   <dbl>
##  1 -2.380001  0.000 88.542    10 1569072
##  2 -2.560001  1.875 88.542    10 1569072
##  3 -2.720001  3.750 88.542    10 1569072
##  4 -2.880001  5.625 88.542    10 1569072
##  5 -3.030001  7.500 88.542    10 1569072
##  6 -3.170001  9.375 88.542    10 1569072
##  7 -3.310001 11.250 88.542    10 1569072
##  8 -3.440001 13.125 88.542    10 1569072
##  9 -3.560001 15.000 88.542    10 1569072
## 10 -3.680001 16.875 88.542    10 1569072
## # ... with 26,350,070 more rows

## in-memory size of the table assembled by the loop above
pryr::object_size(d)

## 1.05 GB

## now, what if we save ourselves that work and let hyper_tibble() do the
## full expansion for all 4 dimensions (level is degenerate, but no big deal --
## at least it generalizes)
system.time({
  d1 <- hyper_tibble(fnc)
})

##    user  system elapsed 
##   4.590   5.934  10.537

## auto-print the fully expanded table
d1

## # A tibble: 26,350,080 x 5
##         uwnd    lon    lat level    time
##  *     <dbl>  <dbl>  <dbl> <dbl>   <dbl>
##  1 -2.380001  0.000 88.542    10 1569072
##  2 -2.560001  1.875 88.542    10 1569072
##  3 -2.720001  3.750 88.542    10 1569072
##  4 -2.880001  5.625 88.542    10 1569072
##  5 -3.030001  7.500 88.542    10 1569072
##  6 -3.170001  9.375 88.542    10 1569072
##  7 -3.310001 11.250 88.542    10 1569072
##  8 -3.440001 13.125 88.542    10 1569072
##  9 -3.560001 15.000 88.542    10 1569072
## 10 -3.680001 16.875 88.542    10 1569072
## # ... with 26,350,070 more rows

## unique values of the time coordinate in the expanded table
distinct(d1, time)

## # A tibble: 1,460 x 1
##       time
##      <dbl>
##  1 1569072
##  2 1569078
##  3 1569084
##  4 1569090
##  5 1569096
##  6 1569102
##  7 1569108
##  8 1569114
##  9 1569120
## 10 1569126
## # ... with 1,450 more rows

## unique values of the level coordinate in the expanded table
distinct(d1, level)

## # A tibble: 1 x 1
##   level
##   <dbl>
## 1    10

## unique values of the lon coordinate in the expanded table
distinct(d1, lon)

## # A tibble: 192 x 1
##       lon
##     <dbl>
##  1  0.000
##  2  1.875
##  3  3.750
##  4  5.625
##  5  7.500
##  6  9.375
##  7 11.250
##  8 13.125
##  9 15.000
## 10 16.875
## # ... with 182 more rows

## in-memory size of the fully expanded table
pryr::object_size(d1)

## 1.05 GB

## on-disk size of the first file, in units of 1e6 bytes
file.size(fs$fullname[1]) / 1e6

## [1] 52.71544

## roughly 300 hours to expand 100K files of ~52 MB each, at about 11 seconds
## per file (the elapsed timings above are about 10.5-11.7 s); seconds -> hours
1e5 * 11 / 3600

## [1] 305.5556

tibble-timings.R

mdsumner

Fri Jul 21 20:57:57 2017