Load in the data and convert the times to a POSIXct class, then make them numeric because gam models don’t always play nice with this class. So we have seconds from the UNIX epoch.

north=read.csv("north.csv")
south=read.csv("south.csv")

north$time = as.POSIXct(paste0("1970-01-01 ",north$time))
south$time = as.POSIXct(paste0("1970-01-01 ",south$time))


north$time=as.numeric(north$time)
south$time=as.numeric(south$time)

We’re going to do generalized additive models that allow you to fit a spline smooth through a predictor variable.

library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-3. For overview type 'help("mgcv-package")'.
n_mod=mgcv::gam(duration ~ s(time),data=north)
s_mod=mgcv::gam(duration ~ s(time),data=south)

If we look at the summaries for these, the smooth of time is significant for the north journey, but there’s nothing going on for south.

summary(n_mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## duration ~ s(time)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1022.59      18.66   54.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F  p-value    
## s(time) 4.142  5.069 6.382 4.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.288   Deviance explained = 32.5%
## GCV =  29414  Scale est. = 27499     n = 79
summary(s_mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## duration ~ s(time)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   874.23      26.25    33.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value
## s(time) 1.482  1.805 1.987   0.145
## 
## R-sq.(adj) =  0.0267   Deviance explained = 4.49%
## GCV =  56901  Scale est. = 55135     n = 80

Let’s look at the north model - generate predictions, with standard errors, for each possible time between the minimum and maximum.

pred_frame=data.frame(time=min(north$time):max(north$time))

n_predict=predict(n_mod,newdata = pred_frame,se.fit=TRUE)

pred_frame$pred=n_predict$fit
pred_frame$upper=n_predict$fit + n_predict$se.fit
pred_frame$lower=n_predict$fit - n_predict$se.fit

Now prepare a graph - plot the predicted fit with the error bars. Convert the times back into POSIXct for nice plotting too

pgon_x = c(pred_frame$time,rev(pred_frame$time))
pgon_y = c(pred_frame$upper,rev(pred_frame$lower))

pred_frame$time_po=as.POSIXct(pred_frame$time,origin="1970-01-01")

plot(pred_frame$pred ~ pred_frame$time_po,type="l",ylim=c(700,1500),xlab="Time",ylab="Duration")
polygon(pgon_x,pgon_y,col="#44444444")
points(north$duration ~ north$time)

Hmmm. Looks like the increase is driven by very few points.

Let’s try a regression tree.

library(rpart)
rp_mod=rpart(duration ~ time,data=north)
rp_predict=predict(rp_mod,newdata=pred_frame)
pred_frame$pred_rt = rp_predict
plot(pred_frame$pred_rt ~ pred_frame$time_po,col="red")
points(north$duration ~ north$time)

Regression tree doesn’t pick up that early journeys are shorter, like the gam did, I think because there just isn’t enough data. Does pick up a declining journey time after 18:00.

Let’s do a randomForest, which is really intended to use loads of variables.

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
rf_mod=randomForest(duration ~ time,data=north)

This is explaining 16% of the variance.

print(rf_mod)
## 
## Call:
##  randomForest(formula = duration ~ time, data = north) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 31233.01
##                     % Var explained: 18.05

Let’s plot the model predictions.

rf_predict=predict(rf_mod,newdata=pred_frame,type="response")
pred_frame$pred_rf = rf_predict
plot(pred_frame$pred_rf ~ pred_frame$time_po,col="red")
points(north$duration ~ north$time)

So that’s completely overfitted, but does seem to capture some of what’s going on if you squint at it.

For fun, let’s look at a randomForest for the south dataset.

rf_smod=randomForest(duration ~ time,data=south)
pred_frame_s=data.frame(time=min(south$time):max(south$time))
rf_predict_s=predict(rf_smod,newdata=pred_frame_s,type="response")
pred_frame_s$pred_rf = rf_predict_s
pred_frame_s$time_po = as.POSIXct(pred_frame_s$time,origin="1970-01-01")
plot(pred_frame_s$pred_rf ~ pred_frame_s$time_po,col="red")
points(south$duration ~ south$time)

Lol nope. % variance explained was negative, by the way, which is never a good sign.