The task was to predict soil moisture contents at 6000 points (49 locations and 5534 time points). The training dataset consisted of 6,947,622 points (198 locations and 35,089 times), collected by soil probes at a 30-minute interval. All target time points were part of the training dataset, but the target locations lay outside the zone covered by the training stations, although still within the same geographical area: both the training and target measurement stations are located in the Bohemian Karst region in the Czech Republic. The prediction competition was part of the GEOSTAT summer school, organised by the OpenGeoHUB foundation (https://opengeohub.org/) and hosted at the IBOT institute in Pruhonice, Czech Republic (www.ibot.cas.cz/en/). IBOT was responsible for the data collection.
Derive covariates: For spatial information, we used the DEM, the forest cover, and covariates derived from the DEM. For temporal information, we used the date (Julian day; hour) and the mean soil moisture content across all training locations (a proxy for precipitation at the temporal resolution of the test data).
Subsetting the training data: The whole dataset was too large to train models on. We therefore subsetted the training data as follows: we omitted all time points that were not part of the target population (thanks to Tobias Rentschler, University of Tuebingen, Germany, for sharing this great idea!) and all locations that were not covered by the DEM layer. From the remaining 1,101,148 observations we selected 50,000 using a balanced probability sampling method.
Model: We trained a random forest model using target-oriented cross-validation and forward feature selection.
Setting up the work environment and seed, and loading libraries
rm(list=ls())
set.seed(123)
library(readxl) # read_excel()
library(raster) # raster(), projectRaster(), extract()
library(sf) # st_as_sf()
library(mapview) # mapview()
library(ggplot2)
library(data.table) # data.table()
library(mlr)
Load training data…
train.time = readRDS("bohsom.train.rds")
train.time$type = "train"
train.space = read_excel("GEOSTAT_SPCg_2018.xlsx",sheet="Training_stations")
train.space$type = "train"
… and target data
target.time = read_excel("GEOSTAT_SPCg_2018.xlsx",sheet="Validation")
target.time$type = "target"
target.space = read_excel("GEOSTAT_SPCg_2018.xlsx",sheet="Validation_stations")
target.space$type = "target"
First, we check the spatial arrangement of the training locations and identify clusters (transects):
train.space$transect = ifelse(train.space$GPS_LON > 14.4, "A",
ifelse(train.space$GPS_LON > 14.383, "B",
ifelse(train.space$GPS_LAT < 50.87, "C",
ifelse(train.space$GPS_LAT < 50.878,"D","E"))))
Then, load raster data (DEM, forest cover and wetness proxies derived from DEM) and extract values for training and target stations.
DEM = raster("DEM_5m.tif")
DEM = projectRaster(DEM, crs="+init=epsg:4326")
Cover = raster("Forest_types_10m.tif")
Cover = projectRaster(Cover, crs="+init=epsg:4326")
# The following raster maps were derived from the DEM in QGIS 2.18 (default settings)
saga.wet = raster("Topographic wetness index.tif") # saga:sagawetnessindex
saga.wet = projectRaster(saga.wet, crs="+init=epsg:4326")
slope = raster("slope.tif") # grass7:r.slope
slope = projectRaster(slope, crs="+init=epsg:4326")
valley = raster("valley depth.tif") # saga:relativeheightsandslopepositions
valley = projectRaster(valley, crs="+init=epsg:4326")
height.norm = raster("Normalized height.tif") # saga:relativeheightsandslopepositions
height.norm = projectRaster(height.norm, crs="+init=epsg:4326")
slope.height = raster("Slope Height.tif") # saga:relativeheightsandslopepositions
slope.height = projectRaster(slope.height, crs="+init=epsg:4326")
flow = raster("Flow accumulation.tif") # saga:slopelimitedflowaccumulation
flow = projectRaster(flow, crs="+init=epsg:4326")
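The eight load-and-reproject steps above could also be collapsed into a single loop. A minimal sketch, assuming the same file names and working directory; the named list layers is a hypothetical container, not part of the original workflow:
# Sketch: load and reproject all raster layers in one pass
files = c(DEM = "DEM_5m.tif", Cover = "Forest_types_10m.tif",
          saga.wet = "Topographic wetness index.tif", slope = "slope.tif",
          valley = "valley depth.tif", height.norm = "Normalized height.tif",
          slope.height = "Slope Height.tif", flow = "Flow accumulation.tif")
layers = lapply(files, function(f) projectRaster(raster(f), crs = "+init=epsg:4326"))
# Access the layers as layers$DEM, layers$flow, etc.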
# Extract information for train and target points
space = plyr::rbind.fill(train.space,target.space) # Merge train and target locations
space = unique(space) # Observation number does not change
# --> Target stations not part of training stations
space = st_as_sf(space,
coords = c("GPS_LON", "GPS_LAT"),
crs = 4326)
space$extr_DEM = extract(DEM,space)
space$extr_F = extract(Cover,space)
space$saga.wet = extract(saga.wet,space)
space$slope = extract(slope,space)
space$valley = extract(valley,space)
space$height.norm = extract(height.norm,space)
space$slope.height = extract(slope.height,space)
space$flow = extract(flow,space)
# Omit outliers in flow
space$flow[space$flow>2000] = NA # For our final predictions, this step was unnecessary and omitted several training stations after na.omit(bigData)
Visualize spatial data:
mapviewOptions(basemaps = c("Esri.WorldImagery"))
mapview(space, burst=T)
# Mean soil moisture per time step
time.agg = data.table(train.time)
time.agg = time.agg[,list(soil.moist.perc.mean = mean(soil.moist.perc,na.rm=T),
soil.moist.perc.sd = sd(soil.moist.perc,na.rm=T)),
by=c("ctime")]
time.agg = as.data.frame(time.agg)
# Bind mean soil moisture to train and target times
# Merge train and target times
train.time$ctime = as.character(train.time$ctime) # Convert ctime to same format...
target.time$ctime = substr(as.character(target.time$ctime),1,nchar(as.character(target.time$ctime))-3)
target.time$ctime[!(target.time$ctime %in% train.time$ctime)] # --> All target times are part of training set!
## character(0)
time = plyr::rbind.fill(train.time[,c("type",
"station_id",
"ctime",
"soil.moist.perc")],
target.time)
time = merge(time,time.agg,by="ctime",all=T)
# Derive covariates from date
# We used CET (default)
# ... accepting NAs
# See comment of Timothy Pollington on Google Plus
time$ctime2 = as.POSIXct(time$ctime, format = "%Y-%m-%d %H:%M", tz = "CET")
time$Date = sapply(strsplit(as.character(time$ctime), " "), "[", 1)
time$julian = julian(as.Date(time$Date), origin=as.Date("1970-01-01"))
time$Hour = as.numeric(format(time$ctime2, '%H', tz = "CET"))
time$minute = as.numeric(format(time$ctime2, '%M', tz = "CET"))
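As an aside, parsing local CET timestamps can produce NA values around daylight-saving transitions (we hit such a bug further below, around 2012-03-25). A hedged sketch of the difference, assuming the system's time-zone database treats "CET" as a zone with DST:
t_utc = as.POSIXct("2012-03-25 02:00", format = "%Y-%m-%d %H:%M", tz = "UTC")
t_cet = as.POSIXct("2012-03-25 02:00", format = "%Y-%m-%d %H:%M", tz = "CET")
t_utc # always a valid time stamp: UTC has no DST gaps
t_cet # may be NA on some platforms: 02:00 local time does not exist that night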
Visualize temporal data:
# Mean trend of soil moisture over time
# https://www.r-graph-gallery.com/317-time-series-with-the-dygraphs-library/
library(dygraphs)
library(xts)
time.agg$ctime = as.POSIXct(time.agg$ctime, format = "%Y-%m-%d %H:%M")
time.agg = subset(time.agg,ctime < as.POSIXct("2011-04-25", format = "%Y-%m-%d", tz = "UTC"))
plot = xts(x = time.agg$soil.moist.perc.mean, order.by = time.agg$ctime)
dygraph(plot) %>%
dyOptions(labelsUTC = TRUE, fillGraph=TRUE, fillAlpha=0.1, drawGrid = FALSE, colors="#D8AE5A") %>%
dyRangeSelector() %>%
dyCrosshair(direction = "vertical") %>%
dyHighlight(highlightCircleSize = 5, highlightSeriesBackgroundAlpha = 0.2, hideOnMouseOut = FALSE) %>%
dyRoller(rollPeriod = 1)
# Variability of soil moisture across locations
ggplot(subset(time.agg,ctime < as.POSIXct("2010-06-26", format = "%Y-%m-%d", tz = "UTC")), aes(ctime)) +
geom_ribbon(aes(ymin = soil.moist.perc.mean - soil.moist.perc.sd, ymax = soil.moist.perc.mean + soil.moist.perc.sd), fill = "grey70") +
geom_line(aes(y = soil.moist.perc.mean), color = "red") +
theme_bw()
# Merge temporal and spatial data --> bigData
st_geometry(space) = NULL
bigData = merge(space,
time[,c("station_id",
"ctime",
"ctime2",
"Date",
"julian",
"Hour",
"minute",
"soil.moist.perc",
"soil.moist.perc.mean")],
by="station_id",
all=T)
### Subset bigData
# A) Train the model only on the time steps that are needed for the game
# Following the idea of Tobias Rentschler (University of Tuebingen, Germany)!
bigData = bigData[bigData$ctime %in% target.time$ctime,]
# B) Exclude all rows with NAs in any of the predictors
bigData = bigData[!is.na(bigData$flow) & !is.na(bigData$slope),]
bigData = data.frame(bigData)
# Load old version
bigData_30_08 <- read.table("bigData_30_08.txt", header = T, sep="\t")
##### Major stuff
### Bug in time variables
# NAs for "ctime2", "Hour" & "minute" at ctime of "2012-03-25 01:00" and "2012-03-25 01:30"
# Probably a time zone (daylight-saving) issue... but we cannot reproduce this error (maybe because it was originally run on a Mac?)
bigData[bigData$ctime=="2012-03-25 01:00" | bigData$ctime=="2012-03-25 01:30",
c("ctime2","Hour","minute")] = NA
### Bug in "soil.moist.perc.mean"
# For bigData_30_08, we set...
bigData$soil.moist.perc[bigData$type=="target"] = 999
# Afterwards, I wrongly calculated "soil.moist.perc.mean" by including these 999 values:
temp.agg = data.table(bigData)
temp.agg = temp.agg[,list(soil.moist.perc.mean = mean(soil.moist.perc,na.rm=T)),
by=c("ctime")]
temp.agg = as.data.frame(temp.agg)
bigData = merge(bigData[,!(names(bigData)%in%"soil.moist.perc.mean")],temp.agg,by="ctime",all.x=T)
##### (Hopefully only) minor stuff
### Allocate transect IDs to target stations
# I did this by mistake in bigData_30_08!
target.space$transect = ifelse(target.space$GPS_LON > 14.4, "A",
ifelse(target.space$GPS_LON > 14.383, "B",
ifelse(target.space$GPS_LAT < 50.87, "C",
ifelse(target.space$GPS_LAT < 50.878,"D","E"))))
for(i in 1:nrow(target.space)){
bigData$transect[bigData$station_id == target.space$station_id[i]] = target.space$transect[i]
}
### Sort columns
bigData = bigData[,names(bigData_30_08)]
### Format data types
# Character to factor
bigData[sapply(bigData, is.character)] = lapply(bigData[sapply(bigData, is.character)],
as.factor)
# POSIX date to factor
bigData$ctime2 = as.factor(bigData$ctime2)
# POSIX numeric to integer
bigData[,c("julian","Hour","minute")] = lapply(bigData[,c("julian","Hour","minute")],
as.integer)
##### Now, bigData exactly resembles bigData_30_08:
temp0 = mlr::summarizeColumns(bigData)
temp1 = mlr::summarizeColumns(bigData_30_08)
target = bigData[bigData$type == "target",]
train = bigData[bigData$type == "train",]
Variable overview of training set
mlr::summarizeColumns(train)
## name type na mean disp median
## 1 ctime factor 0 NA 0.9998193 NA
## 2 station_id factor 0 NA 0.9945355 NA
## 3 soil.moist.perc numeric 0 3.067270e+01 15.1697872 2.987150e+01
## 4 type factor 0 NA 0.0000000 NA
## 5 extr_DEM numeric 0 3.432927e+02 34.2176824 3.426459e+02
## 6 extr_F numeric 0 3.265347e+00 3.2300931 1.000000e+00
## 7 saga.wet numeric 0 3.792499e+00 1.1564269 3.692123e+00
## 8 slope numeric 0 2.814718e+01 14.4548966 2.801676e+01
## 9 slope.height numeric 0 1.377809e+01 11.6646395 1.045187e+01
## 10 flow numeric 0 2.967658e+02 360.8200469 1.490075e+02
## 11 valley numeric 0 1.665085e+01 13.2220252 1.363804e+01
## 12 height.norm numeric 0 4.677295e-01 0.2914717 4.370896e-01
## 13 transect factor 0 NA 0.6284153 NA
## 14 soil.moist.perc.mean numeric 0 3.533560e+01 5.9402527 3.564203e+01
## 15 ctime2 factor 366 NA NA NA
## 16 Date factor 0 NA 0.9971088 NA
## 17 julian integer 0 1.509027e+04 209.5512724 1.509100e+04
## 18 Hour integer 366 1.132032e+01 6.9800183 1.100000e+01
## 19 minute integer 366 1.508134e+01 14.9997868 3.000000e+01
## mad min max nlevs
## 1 NA 1.830000e+02 1.830000e+02 5534
## 2 NA 0.000000e+00 5.534000e+03 183
## 3 1.426335e+01 0.000000e+00 8.159500e+01 0
## 4 NA 0.000000e+00 1.012722e+06 1
## 5 3.353319e+01 2.694940e+02 4.607665e+02 0
## 6 4.938050e-16 1.000000e+00 1.400000e+01 0
## 7 1.073867e+00 1.386252e+00 9.321682e+00 0
## 8 1.705878e+01 2.076621e+00 5.857793e+01 0
## 9 9.044813e+00 7.115742e-01 6.792996e+01 0
## 10 1.283448e+02 2.814662e+01 1.605187e+03 0
## 11 1.203807e+01 5.496106e-01 6.696100e+01 0
## 12 3.800299e-01 2.700234e-02 9.752347e-01 0
## 13 NA 9.961200e+04 3.763120e+05 5
## 14 6.075929e+00 1.240066e+01 5.571722e+01 0
## 15 NA 1.830000e+02 1.830000e+02 5532
## 16 NA 1.830000e+02 2.928000e+03 724
## 17 2.683506e+02 1.473000e+04 1.545400e+04 0
## 18 8.895600e+00 0.000000e+00 2.300000e+01 0
## 19 0.000000e+00 0.000000e+00 3.000000e+01 0
Set-up
# load required packages
library(raster)
library(rgdal)
library(mapview)
library(RNetCDF)
library(ncdf4)
library(sf)
#library(R.utils)
library(data.table)
library(BalancedSampling)
library(CAST)
library(caret)
library(sp)
library(ranger)
library(gridExtra)
library(png)
# re-read geo-referenced measurement locations
space = plyr::rbind.fill(train.space,target.space) # Merge train and target locations
space = unique(space) # Observation number does not change
latlon = "+init=epsg:4326" # defining EPSG code for WGS84 coordinate system
# --> Target stations not part of training stations
space = st_as_sf(space,
coords = c("GPS_LON", "GPS_LAT"),
crs = latlon)
Read in the 250 m soil hydraulic parameter grids (EU-SoilHydroGrids). Data available from: https://eusoilhydrogrids.rissac.hu/250.php
The imported raster data have an equal extent.
# list .tif files in current working directory for saturated hydraulic conductivity on 5 cm depth
Ks_1 <- list.files(path="./5cm_Ks", pattern = "*.tif",full.names=TRUE,recursive = TRUE)
# create raster list
Ks_1.list <- list()
for(i in 1:length(Ks_1)) { Ks_1.list[[i]] <- raster::raster(Ks_1[i]) }
# read in merge
Ks5.merge <- do.call(merge, Ks_1.list)
Ks5.merge <- projectRaster(Ks5.merge, crs = latlon)
Ks5.merge@crs <- CRS(latlon)
plot(Ks5.merge,
main = "Saturated hydraulic conductivity (Ks) for 5 cm topsoil",
ylab = "Latitude",
xlab = "Longitude")# check whether raster have been read in properly
Repeat the above process for the saturated hydraulic conductivity at 15 and 30 cm depth (a more compact alternative is sketched after this block).
Ks_2 <- list.files(path="./15cm_Ks",pattern="*.tif",full.names=TRUE,recursive=TRUE)
# create raster list
Ks_2.list <- list()
for(i in 1:length(Ks_2)) { Ks_2.list[[i]] <- raster(Ks_2[i]) }
# read in as a merged raster and project
Ks15.merge <- do.call(merge,Ks_2.list)
Ks15.merge <- projectRaster(Ks15.merge, crs = latlon)
Ks15.merge@crs <- CRS(latlon)
# 30 cm Ks
Ks_3 <- list.files(path="./30cm_Ks", pattern = "*.tif",full.names=T,recursive = T)
# create raster list
Ks_3.list <- list()
for(i in 1:length(Ks_3)) { Ks_3.list[[i]] <- raster(Ks_3[i]) }
# read in merge
Ks30.merge <- do.call(merge, Ks_3.list)
Ks30.merge <- projectRaster(Ks30.merge, crs = latlon)
Ks30.merge@crs <- CRS(latlon)
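As announced above, the three nearly identical read-merge-project blocks can be wrapped in a small helper. A minimal sketch; read_ks is a hypothetical helper name, not part of the original script:
# Sketch: one helper for the repeated list.files/merge/project steps
read_ks = function(path, crs = latlon) {
  tifs = list.files(path = path, pattern = "*.tif", full.names = TRUE, recursive = TRUE)
  merged = do.call(merge, lapply(tifs, raster::raster)) # merge tiles into one raster
  projectRaster(merged, crs = crs)
}
# Ks5.merge = read_ks("./5cm_Ks"); Ks15.merge = read_ks("./15cm_Ks"); Ks30.merge = read_ks("./30cm_Ks")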
Stack the Ks data and visualize.
Ks_all <- stack(Ks15.merge, Ks5.merge, Ks30.merge)
# read in DEM for cropping extent of stacked Ks data
DEM <- raster("DEM_5m.tif")
DEM <- projectRaster(DEM, crs = latlon)
DEM@crs <- CRS(latlon)
# crop Ks data to DEM
Ks_cr <- crop(Ks_all, DEM)
space$Ks5 <- raster::extract(Ks5.merge, space, method = "simple")
space$Ks15 <- raster::extract(Ks15.merge, space, method = "simple")
space$Ks30 <- raster::extract(Ks30.merge, space, method = "simple")
mapviewOptions(basemaps = c("Esri.WorldImagery"))
mapview(space, burst = "Ks15") # Variation in saturated hydraulic conductivity takes place on a larger spatial scale than the extent of the area we model
Precipitation and temperature data from the European Climate Assessment have been explored for the area as well. Since these did not capture the required spatial resolution either, they were not included in the modelling exercise. If interested, this climate data is available from: https://www.ecad.eu/download/ensembles/download.php#datafiles

The dataset that we are currently dealing with is the training dataset at the 30-minute time interval (i.e. the original temporal resolution). Earlier in the script we created a subset that contains only the time points for which soil moisture values need to be predicted at the target stations.
Additionally, we take the mean soil moisture across the 198 sampling stations in the training dataset as a proxy signal for precipitation.
Since this reduced dataset still amounts to over 1 million points, we used a balanced sampling method to get a representative sample of the entire population in both space and time. For additional information on the balanced sampling method, we recommend the following documentation:
https://cran.r-project.org/web/packages/BalancedSampling/BalancedSampling.pdf
http://www.math.helsinki.fi/msm/banocoss/Deville_Tille_2004.pdf
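As a toy illustration (a minimal sketch, independent of our data): placing the inclusion probabilities in the first balancing column yields a fixed sample size, while the remaining columns keep the sample balanced on the auxiliary variables.
# Toy sketch of BalancedSampling::cube()
library(BalancedSampling)
set.seed(1)
x = runif(100)              # one auxiliary variable
p = rep(10/100, 100)        # equal inclusion probabilities, sum = 10
s = cube(p, cbind(p, x))    # p in the first column fixes the sample size
length(s)                   # 10 units selected
c(mean(x[s]), mean(x))      # sample mean close to the population mean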
# Important! Ensure that [Hour] and [minute] columns are numeric
#train$Date <- as.factor(train$Date)
#train$Hour <- as.numeric(train$Hour)
#train$minute <- as.numeric(train$minute)
# Replace the NA values within 'Hour' and 'minute' with a '999' value, as the sampling algorithm cannot handle NAs
train$Hour[is.na(train$Hour)] <- 999
train$minute[is.na(train$minute)] <- 999
target$Hour[is.na(target$Hour)] <- 999
target$minute[is.na(target$minute)] <- 999
While constructing the random forest model, the total sample size was iteratively increased up to a sample size that still led to a reasonable processing time.
A processing-time plot was generated by iteratively training with [n] = 300, 600, 1200, 2400, 4800, 9600, 19200 and plotting the system.time() outcome of the model training against sample size. For brevity, the plot is not included in this R Markdown document, but it can be recreated by running the lines for both the balanced sampling design and the random forest while altering the sample size [n], as sketched below.
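A minimal sketch of such a timing experiment, assuming the train data frame from above; the predictor set preds and the loop structure are illustrative choices, not our exact setup:
# Sketch: time the model training for increasing sample sizes n
ns = c(300, 600, 1200, 2400, 4800, 9600, 19200)
preds = c("extr_DEM", "saga.wet", "slope", "julian", "soil.moist.perc.mean")
timings = sapply(ns, function(n) {
  p = rep(n/nrow(train), nrow(train))   # equal inclusion probabilities
  s = BalancedSampling::cube(p, scale(cbind(p, train$julian, train$soil.moist.perc.mean)))
  d = train[s, ]                        # balanced subsample of size n
  system.time(caret::train(d[, preds], d$soil.moist.perc,
                           method = "rf", tuneLength = 1,
                           trControl = trainControl(method = "cv", number = 5)))["elapsed"]
})
plot(ns, timings, type = "b", xlab = "sample size [n]", ylab = "elapsed time [s]")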
# Assign variables based on data frame
N = nrow(train) # population size
n = 50000 # sample size
p2 = rep(n/N,N) # inclusion probabilities for subsample
plot(p2) # check inclusion probability distribution
Create a matrix with the auxiliary variables and the inclusion probability vector. The variables are scaled (standardized) to ensure the same amount of spread in each dimension.
set.seed(11041991)
# 4 auxiliary variables and inclusion probability vector
Xa = scale(cbind(p2,
train$julian,
train$Hour,
train$minute,
train$soil.moist.perc))
# idem
Xb = scale(cbind(p2,
train$julian,
train$Hour,
train$minute,
train$soil.moist.perc.mean))
# 5 auxiliary variables and inclusion probability vector
Xc = scale(cbind(p2,
train$julian,
train$Hour,
train$minute,
train$soil.moist.perc,
train$soil.moist.perc.mean))
# Create matrix with 7 auxiliary variables and inclusion probability vector
X = scale(cbind(p2,
train$julian,
train$Hour,
train$minute,
train$soil.moist.perc,
train$soil.moist.perc.mean,
train$extr_DEM,
train$saga.wet))
The vector of generated inclusion probabilities is placed in the first column of each matrix to guarantee a fixed sample size when using the cube method.
ya <- BalancedSampling::cube(p2, Xa)
yb <- BalancedSampling::cube(p2, Xb)
yc <- BalancedSampling::cube(p2, Xc)
y <- BalancedSampling::cube(p2, X)
# Select rows from original data file for all sampling techniques
Xa_df <- as.data.frame(train[ya, ])
Xb_df <- as.data.frame(train[yb, ])
Xc_df <- as.data.frame(train[yc, ])
X_df <- as.data.frame(train[y, ])
# Label each data.frame with its sampling design (stored in 'type') to allow comparison after rbind()
train$type = "original"
Xa_df$type = "Xa"
Xb_df$type = "Xb"
Xc_df$type = "Xc"
X_df$type = "X"
# rbind() all data.frame outputs for various auxiliary matrices
all.samples <- rbind(train, Xa_df, Xb_df, X_df, Xc_df)
Boxplots to assess the spread of the variables in the original and sampled data, i.e. the extent to which the balanced sampling has been effective, by visually comparing the sample distribution for each variable:
par(mfrow=c(3,3))
boxplot(soil.moist.perc ~ type, all.samples, main = "Moisture")
boxplot(soil.moist.perc.mean ~ type, all.samples, main = "Mean_30m_moisture")
boxplot(extr_DEM ~ type, all.samples, main = "DEM")
boxplot(saga.wet ~ type, all.samples, main = "Saga_wet")
boxplot(slope ~ type, all.samples, main = "slope")
boxplot(height.norm ~type, all.samples, main = "height.norm")
boxplot(julian ~ type, all.samples, main = "julian day")
boxplot(scale(Hour) ~ type, all.samples, main = "Hours")
boxplot(minute ~ type, all.samples, main = "Minute")
par(mfrow=c(1,1))
It appears that a balanced sampling approach makes the difference for a representative sample in particular for the temporal and moisture covariates. Spatial variables such as DEM and saga.wet appear to be accounted for by the increased sample size alone.
# allow for parallel processing
library(parallelMap)
library(parallel)
parallelStartSocket(parallel::detectCores())
# Assign predictor character vectors
predictors_a <- c("extr_DEM",
"saga.wet",
"slope",
"julian",
"Hour",
"soil.moist.perc.mean")
predictors_b <- c("extr_DEM",
"saga.wet",
"slope",
"julian",
"soil.moist.perc.mean")
predictors_c <- c("extr_DEM",
"saga.wet",
"soil.moist.perc.mean",
"julian",
"Hour",
"minute")
Run the model with system.time(), used to create the processing-time plots as a function of sample size:
# You can try this out yourself by changing the values of [n] in line ...
#set.seed(11041991)
#model <- system.time(train(Xc_df[,predictors_b],Xc_df$soil.moist.perc,
# method = "rf", tuneLength = 1, importance = T,
# trControl = trainControl(method="cv",number=5,
# allowParallel = T)))
# print processing time
#model
Again, for reasons of brevity, here we illustrate model runs on a single sampling matrix with two different subsets of predictor variables
# Run model for sampling matrix Xa using predictor set [b]
#model2 <- train(Xa_df[,predictors_b],Xa_df$soil.moist.perc,
# method = "rf", tuneLength = 1, importance = T,
# trControl = trainControl(method="cv",number=5))
#model2
# Run model on same sampling matrix but with a different predictor vector
#model3 <- train(Xa_df[,predictors_c],Xa_df$soil.moist.perc,
# method = "rf", tuneLength = 1, importance = T,
# trControl = trainControl(method="cv",number=5))
#model3
Cross-validation procedures in random forest modelling
# Using the CAST package, create space-time folds for spatial and spatio-temporal cross-validation
set.seed(11041991)
indices_1 <- CreateSpacetimeFolds(Xc_df, spacevar = "transect",
k = 5)
indices_2 <- CreateSpacetimeFolds(Xc_df, spacevar = "transect", timevar = "julian",
k = 5)
# choose the correct predictors! ([a], [b] or [c])
# select correct indices! ([1], [2])
set.seed(11041991)
model_LLO <- train(Xc_df[,predictors_b],
Xc_df$soil.moist.perc,
method = "rf",
tuneLength = 1,
importance = TRUE,
trControl = trainControl(method = "cv",
index = indices_1$index,
allowParallel = TRUE))
model_LLO
## Random Forest
##
## 50000 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 31362, 44950, 37245, 44756, 41687
## Resampling results:
##
## RMSE Rsquared MAE
## 15.6368 0.06476153 12.80418
##
## Tuning parameter 'mtry' was held constant at a value of 2
#p1 <- plot(varImp(model2),
# main = 'model 2')
#p2 <- plot(varImp(model3),
# main = 'model 3')
#p3 <- plot(varImp(model_LLO),
# main = 'model_LLO')
#par(mfrow=c(1,3), mar=c(4,4,4,1), oma=c(0.5,0.5,0.5,0))
#layout(t(1:3))
#p1
#p2
#p3
#p4 <- gridExtra::grid.arrange(arrangeGrob(p1,
# p2,
# p3,
# ncol = 3))
#p4
However, we found that the importance of the variables in these three different models was strongly dependent on the actual sample size used to train the model.
Hence, we performed a forward feature selection to get a more detailed idea of the effect of the number of predictor variables on the model outcome. That is, since we assume we have a balanced sample in time, we should include variables that make sense with respect to our cross-validation, i.e. predictor variables that carry spatial meaning.
# Run FFS-model for spatial variables
# including predictors vector 2+ variables
#set.seed(11041991)
#ffsmodel_LLO_b <- ffs(Xc_df[,predictors_b],
# Xc_df$soil.moist.perc,
# metric = "Rsquared",
# method = "ranger",
# tuneLength = 1,
# verbose = F,
# trControl = trainControl(method = "cv",
# index = indices_1$index,
# allowParallel = T))
# Any difference in coefficient of determination with model_LLO?
#ffsmodel_LLO_b
#ffsmodel_LLO_b$selectedvars
###############################################
###############################################
# Forward feature selection for both spatial variables and time
#ffsmodel_LLO_c <- ffs(Xc_df[,predictors_c], # including predictors vector 2+ variables
# Xc_df$soil.moist.perc,
# metric = "Rsquared",
# method = "ranger",
# tuneLength = 1,
# verbose = F,
# trControl = trainControl(method = "cv",
# index = indices_2$index, # select indices_2
# allowParallel = T))
#ffsmodel_LLO_c # difference with model_LLO on only spatial variables?
#ffsmodel_LLO_c$selectedvars
# plotting and grid.arrange()
#p5 <- plot_ffs(ffsmodel_LLO_b)
#p6 <- plot_ffs(ffsmodel_LLO_c)
#p7 <- gridExtra::grid.arrange(arrangeGrob(p5,
# p6,
# ncol = 2))
#p7 # Left pane shows FFS with only spatial variables, Right pane with both spatial and temporal variables
Forward feature selection with both space and time folds indicates a relatively good spread in the time dimension, i.e. the spatial variables come out as important.
# Code for testing predictions of previous model runs
# Extract relevant columns according to the predictor variable used in that model
#target2 <- target[,c("extr_DEM",
# "saga.wet",
# "slope",
# "julian",
# "Hour",
# "soil.moist.perc.mean")]
# Use predict.train() from caret to generate the estimated soil moisture content across the time series
#target2$soil.moist.perc <- predict.train(model2, newdata = target2)
# Equal procedure for model_3
#target3 <- target[,c("extr_DEM",
# "saga.wet",
# "soil.moist.perc.mean",
# "julian",
# "Hour",
# "minute")]
# idem as for model2
#target3$soil.moist.perc <- predict.train(model3, newdata = target3)
# idem for model_LLO
target_LLO <- target[,c("extr_DEM",
"saga.wet",
"slope",
"julian",
"soil.moist.perc.mean")]
target_LLO$soil.moist.predicted <- predict.train(model_LLO, newdata = target_LLO)
# Final model predictions (model_LLO)
# Done on 50,000 samples using predictor subset [b] and sampling matrix Xc
# Only spatial cross-validation as we assume a balanced sample in both julian day and hour dimensions, space-folds were created along the 'transect' variable
pred = read_excel("GEOSTAT_SPCg_2018.xlsx",sheet="Predictions",n_max=6001) # Load results from google sheet
pred$ctime = round(pred$ctime,"min") # Round to minutes...
names(pred)[names(pred) == 'soil.moist.perc - MEASURED'] = 'measured'
names(pred)[names(pred) == 'Florian - Timo'] = 'predicted_Timo.Flo'
names(pred)[names(pred) == 'alois'] = 'predicted_alois'
pred = pred[1:6000, c("station_id","ctime","measured","predicted_Timo.Flo","predicted_alois")]
# Align the predictions with station_id and ctime via row names
bigData = as.data.frame(setDT(bigData, keep.rownames = TRUE)[]) # extract row names from bigData
bigData$rn = as.numeric(bigData$rn)
target_x = as.data.frame(setDT(target_LLO, keep.rownames = TRUE)[]) # extract row names from target
target_x$rn = as.numeric(target_x$rn)
final_v1 = merge(target_x[,c("rn","soil.moist.predicted")],
bigData[,c("rn","station_id","ctime")],
all.x=T) # Merge "bigData" and "target" by row names
google = read_excel("GEOSTAT_SPCg_2018.xlsx",sheet="Validation") # Loading Google Table - Sheet: Validation
google$sort = 1:nrow(google)
google$ctime = substr(as.character(google$ctime),1,nchar(as.character(google$ctime))-3) # Convert ctime to same format as in final_v1
final_v2 = merge(google,final_v1[,c("station_id",
"ctime",
"soil.moist.predicted")], by=c("station_id","ctime"),all.x = T) # Merge all 6000 rows from google sheet with "final_v1"
# merging pred and final_v2
pred$ctime = substr(as.character(pred$ctime),1,nchar(as.character(pred$ctime))-3) # Convert ctime
pred2 <- merge(pred, final_v2[,c("station_id", "ctime", "soil.moist.predicted")],
by = c("station_id","ctime"))
plot(pred2$soil.moist.predicted, pred2$predicted_Timo.Flo)
# Our original results...
error = na.omit(pred2$measured - pred2$predicted_Timo.Flo)
sqrt(mean(error^2)) # RMSE in Google Sheet = 10.4??
## [1] 11.46773
# Our new results...
error = na.omit(pred2$measured - pred2$soil.moist.predicted)
sqrt(mean(error^2))
## [1] 11.42193
# Visualisation based on
# https://stackoverflow.com/a/46421908
ggplot(pred2,aes(soil.moist.predicted,measured)) +
stat_density_2d(aes(fill = ..density..), geom = 'raster', contour = FALSE) +
scale_fill_viridis_c() +
coord_cartesian(expand = FALSE) +
geom_point(shape = '.', col = 'white') +
geom_abline(slope=1,intercept=0,color="red") # Oh oh... quite some scatter
# Why are there negative values...
# ...in soil.moist.perc - MEASURED??
This has been a great summer school with great people! A special thanks goes to Tobias Rentschler and Timothy Pollington for sharing ideas on the prediction game and Marieke Dirksen - without her we would have missed the deadline! The calibration and validation point data have been made available by the Department of GIS and Remote Sensing, Institute of Botany / Academy of Sciences of the Czech Republic, for the purpose of the “Spatial Prediction Competition GEOSTAT 2018”.