library(magrittr)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.3-4, (SVN revision 766)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: /usr/share/gdal/2.2
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.3-1
library(raster)
## 
## Attaching package: 'raster'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(gdalUtils)
library(fst)
library(future.apply)
## Loading required package: future
## 
## Attaching package: 'future'
## The following object is masked from 'package:raster':
## 
##     values
## 
## Attaching package: 'future.apply'
## The following object is masked from 'package:future':
## 
##     future_lapply
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:raster':
## 
##     shift
library(here)
## here() starts at /home/michael
###################################################################
# Fixed parameters

# Number of processes to run at once
future_workers = 3L

# Variables to keep in final table
to_keep = c(
  #"filename", 
  "date",
  "lon",
  "lat",
  "LST_Day_1km", 
  "QC_Day", 
  "Day_view_time", 
  # "Day_view_angl", 
  "LST_Night_1km", 
  "QC_Night", 
  "Night_view_time", 
  # "Night_view_angl", 
  "Emis_31", 
  "Emis_32"#, 
  # "Clear_day_cov", 
  # "Clear_night_cov",
)

# Directory with HDF files, possibly in sub-directories
dir = "/home/michael/Dropbox/BGU/Itai_Kloog/p_55_processing_MODIS_LST_from_HDF_to_table/example_02_mexico/data"
# dir <- here("data")

# Output directory
out = "/home/michael/Dropbox/BGU/Itai_Kloog/p_55_processing_MODIS_LST_from_HDF_to_table/example_02_mexico"
# out <- here("intermediate")

###################################################################

# Start
start = Sys.time()

# Get list of files
files = list.files(path = dir, pattern = "\\.hdf$", recursive = TRUE, full.names = TRUE)

# Function for reading HDF file and turning it into table
read_hdf = function(input_file) {
  
  # Get subdataset paths & variable names
  s = get_subdatasets(input_file)
  vars = gsub(".*:", "", s)
  
  # Read from HDF to multi-band raster
  r = stack()
  for(i in s) {
    tmp = i %>% readGDAL(silent = TRUE) %>% raster
    r = stack(r, tmp)
  }
  names(r) = vars
  
  # To points
  pnt = rasterToPoints(r, spatial = TRUE)
  pnt = spTransform(pnt, "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
  
  # To table
  dat = as.data.frame(pnt)
  
  # coerce to data.table
  setDT(dat)
  
  # Rename 'x-y' to 'lon-lat'
  setnames(dat, c("x", "y"), c("lon", "lat"))
  
  # Add metadata
  # dat$filename = basename(input_file) # too redundant to store
  dat[, day := 
    basename(input_file) %>% 
    strsplit("\\.") %>% 
    sapply("[", 2) %>% 
    gsub("A", "", .) %>% 
    as.Date(format = "%Y%j")] # day is a better name because date conflicts with many R functions
  
  # Subset variables to keep
  dat[, setdiff(names(dat), to_keep) := NULL]
  
  # Return
  return(dat)
  
}

# Tell future to use the multiprocess backend
plan(multiprocess, workers = future_workers)

# Send work to the workers
output = future_lapply(files, FUN = read_hdf)

# Combine to one table with dat.table
final = rbindlist(output)

# Write table
write.fst(x = final, path = file.path(out, "final.fst"), compress = 80)

# End
end = Sys.time()

# Overall timing
end - start
## Time difference of 25.17534 secs
# Examine final table
head(final)
##    LST_Day_1km QC_Day Day_view_time LST_Night_1km QC_Night Night_view_time
## 1:      303.38     81          10.5        285.98       17            22.4
## 2:      302.92     81          10.5        285.72       17            22.4
## 3:      301.68     81          10.5        285.98       17            22.4
## 4:      301.46     81          10.5        286.54       17            22.4
## 5:      300.96     81          10.5        287.04       17            22.4
## 6:      300.82     81          10.5        286.86       17            22.4
##    Emis_31 Emis_32       lon      lat
## 1:   0.972   0.976 -115.4604 29.99583
## 2:   0.972   0.976 -115.4508 29.99583
## 3:   0.972   0.976 -115.4412 29.99583
## 4:   0.972   0.976 -115.4315 29.99583
## 5:   0.972   0.976 -115.4219 29.99583
## 6:   0.972   0.976 -115.4123 29.99583
tail(final)
##    LST_Day_1km QC_Day Day_view_time LST_Night_1km QC_Night Night_view_time
## 1:          NA      3            NA            NA        3              NA
## 2:          NA      3            NA            NA        3              NA
## 3:          NA      3            NA            NA        3              NA
## 4:          NA      3            NA            NA        3              NA
## 5:          NA      3            NA            NA        3              NA
## 6:          NA      3            NA            NA        3              NA
##    Emis_31 Emis_32       lon      lat
## 1:      NA      NA -91.43611 10.00417
## 2:      NA      NA -91.42765 10.00417
## 3:      NA      NA -91.41918 10.00417
## 4:      NA      NA -91.41072 10.00417
## 5:      NA      NA -91.40226 10.00417
## 6:      NA      NA -91.39380 10.00417