## list the files reported by raadfiles::ncep2_uwnd_6hr_files() and show the
## `file` entry of the first one
fs <- raadfiles::ncep2_uwnd_6hr_files()
print(fs[["file"]][1])
## [1] "data/ftp.cdc.noaa.gov/Datasets/ncep.reanalysis2/gaussian_grid/uwnd.10m.gauss.1979.nc"
library(tidync)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## read only the first time step of the first file into a table,
## timing the open + filter + read together
system.time({
  fnc <- tidync(fs$fullname[1])
  tab1 <- hyper_tibble(hyper_filter(fnc, time = index == 1))
})
## user system elapsed
## 1.538 0.027 1.573
## inspect the table read for the first time step only
tab1
## # A tibble: 18,048 x 5
## uwnd lon lat level time
## * <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2.380001 0.000 88.542 10 1569072
## 2 -2.560001 1.875 88.542 10 1569072
## 3 -2.720001 3.750 88.542 10 1569072
## 4 -2.880001 5.625 88.542 10 1569072
## 5 -3.030001 7.500 88.542 10 1569072
## 6 -3.170001 9.375 88.542 10 1569072
## 7 -3.310001 11.250 88.542 10 1569072
## 8 -3.440001 13.125 88.542 10 1569072
## 9 -3.560001 15.000 88.542 10 1569072
## 10 -3.680001 16.875 88.542 10 1569072
## # ... with 18,038 more rows
## next, get all the data from the entire file as raw arrays, ready to
## flesh out the big table
system.time(slabs <- hyper_slice(fnc))
## user system elapsed
## 4.092 0.955 5.054
## structure of the list returned by hyper_slice()
str(slabs)
## List of 1
## $ uwnd: num [1:192, 1:94, 1:1460] -2.38 -2.56 -2.72 -2.88 -3.03 ...
## NOTE: the time value here is NOT right -- every slice reuses tab1, so time
## keeps its first-step value even though it is the only dimension varying in
## the loop. This just gives an indication of something like the best timing
## we might get by shortcutting the full expansion.
system.time({
  d <- bind_rows(
    lapply(
      seq_len(dim(slabs[[1]])[3]),
      function(step) mutate(tab1, uwnd = c(slabs[[1]][, , step]))
    )
  )
})
## user system elapsed
## 8.787 2.867 11.678
## inspect the stacked table built from the per-slice copies of tab1
d
## # A tibble: 26,350,080 x 5
## uwnd lon lat level time
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2.380001 0.000 88.542 10 1569072
## 2 -2.560001 1.875 88.542 10 1569072
## 3 -2.720001 3.750 88.542 10 1569072
## 4 -2.880001 5.625 88.542 10 1569072
## 5 -3.030001 7.500 88.542 10 1569072
## 6 -3.170001 9.375 88.542 10 1569072
## 7 -3.310001 11.250 88.542 10 1569072
## 8 -3.440001 13.125 88.542 10 1569072
## 9 -3.560001 15.000 88.542 10 1569072
## 10 -3.680001 16.875 88.542 10 1569072
## # ... with 26,350,070 more rows
## in-memory size of the stacked table d
pryr::object_size(d)
## 1.05 GB
## alternatively, save ourselves that manual work and let hyper_tibble() do the
## full expansion over all 4 dimensions (level is degenerate, but that is no
## big deal and at least it generalizes)
system.time(d1 <- hyper_tibble(fnc))
## user system elapsed
## 4.590 5.934 10.537
## inspect the full table returned by hyper_tibble(fnc)
d1
## # A tibble: 26,350,080 x 5
## uwnd lon lat level time
## * <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2.380001 0.000 88.542 10 1569072
## 2 -2.560001 1.875 88.542 10 1569072
## 3 -2.720001 3.750 88.542 10 1569072
## 4 -2.880001 5.625 88.542 10 1569072
## 5 -3.030001 7.500 88.542 10 1569072
## 6 -3.170001 9.375 88.542 10 1569072
## 7 -3.310001 11.250 88.542 10 1569072
## 8 -3.440001 13.125 88.542 10 1569072
## 9 -3.560001 15.000 88.542 10 1569072
## 10 -3.680001 16.875 88.542 10 1569072
## # ... with 26,350,070 more rows
## unique values of the time column in the full table
distinct(d1, time)
## # A tibble: 1,460 x 1
## time
## <dbl>
## 1 1569072
## 2 1569078
## 3 1569084
## 4 1569090
## 5 1569096
## 6 1569102
## 7 1569108
## 8 1569114
## 9 1569120
## 10 1569126
## # ... with 1,450 more rows
## unique values of the level column in the full table
distinct(d1, level)
## # A tibble: 1 x 1
## level
## <dbl>
## 1 10
## unique values of the lon column in the full table
distinct(d1, lon)
## # A tibble: 192 x 1
## lon
## <dbl>
## 1 0.000
## 2 1.875
## 3 3.750
## 4 5.625
## 5 7.500
## 6 9.375
## 7 11.250
## 8 13.125
## 9 15.000
## 10 16.875
## # ... with 182 more rows
## in-memory size of the fully expanded table d1
pryr::object_size(d1)
## 1.05 GB
## on-disk size of the first file, in bytes divided by 1e6
file.size(fs$fullname[1]) / 1e6
## [1] 52.71544
## roughly 300 hours for 100K files of ~52 MB each, at ~11 s elapsed per file
## (seconds converted to hours)
1e5 * 11 / 3600
## [1] 305.5556