# This script uses the code made available at
# https://www.r-bloggers.com/cross-validation-for-predictive-analytics-using-r/
# and at http://davide.eynard.it/teaching/2017_ML/overfitting.R
require(splines)
## Loading required package: splines
rm(list=ls())
seed <- 12345
set.seed(seed)
cat("
-------------------------------------------------------------------------------------
The R code below implements these idea via simulated data. In particular, this is a simulation of 100 training sets each of size 50 from a polynomial regression model, and for each a sequence fit of cubic spline models with degrees of freedom from 1 to 30.
")
##
## -------------------------------------------------------------------------------------
##
## The R code below implements these idea via simulated data. In particular, this is a simulation of 100 training sets each of size 50 from a polynomial regression model, and for each a sequence fit of cubic spline models with degrees of freedom from 1 to 30.
invisible(readline(prompt = "Press [enter] to continue"))
## Press [enter] to continue
gen_data <- function(n, beta, sigma_eps) {
  # Simulate n observations from the polynomial model y = X %*% beta + eps,
  # with x uniform on [0, 100] and Gaussian noise of standard deviation sigma_eps
  eps <- rnorm(n, 0, sigma_eps)
  x <- sort(runif(n, 0, 100))
  X <- cbind(1, poly(x, degree = (length(beta) - 1), raw = TRUE))
  y <- as.numeric(X %*% beta + eps)
  return(data.frame(x = x, y = y))
}
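# A quick sanity check of gen_data() (a sketch added here, not part of the original
# script): draw a handful of points with an illustrative coefficient vector and
# noise level, and print them. demo_beta is only an illustrative name; it reuses
# the same values assigned to 'beta' below. Runs only in an interactive session.
if (interactive()) {
  demo_beta <- c(5, -0.1, 0.004, -3e-05)  # illustrative cubic coefficients
  print(gen_data(5, demo_beta, sigma_eps = 0.5))  # 5-row data.frame: sorted x in [0, 100] and noisy y
}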
# Set the simulation parameters
n_rep <- 100                         # number of simulated training sets
n_df <- 30                           # maximum degrees of freedom for the spline fits
df <- 1:n_df                         # flexibilities (spline df) to try
beta <- c(5, -0.1, 0.004, -3e-05)    # coefficients of the true cubic polynomial
n_train <- 50                        # size of each training set
n_test <- 1000                       # size of the test set
sigma_eps <- 0.5                     # standard deviation of the noise
xy <- res <- list()
xy_test <- gen_data(n_test, beta, sigma_eps)
# Fit the models: for each simulated training set, fit one natural cubic spline per df value
for (i in 1:n_rep) {
  xy[[i]] <- gen_data(n_train, beta, sigma_eps)
  x <- xy[[i]][, "x"]
  y <- xy[[i]][, "y"]
  res[[i]] <- lapply(df, function(degf) lm(y ~ ns(x, df = degf)))
}
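# Optional check (a sketch, not part of the original code): inspect one of the
# fitted models, e.g. the df = 5 natural-spline fit on the first training set.
if (interactive()) {
  print(summary(res[[1]][[5]]))  # lm summary: intercept plus 5 spline basis coefficients
}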
# Plot the data from one training set together with the true function and three of its fits
plot_idx <- 1
x <- xy[[plot_idx]]$x
X <- cbind(1, poly(x, degree = (length(beta) - 1), raw = TRUE))  # design matrix of the true polynomial
y <- xy[[plot_idx]]$y
plot(y ~ x, col = "gray", lwd = 2)
lines(x, X %*% beta, lwd = 3, col = "black")
lines(x, fitted(res[[plot_idx]][[1]]), lwd = 3, col = "palegreen3")
lines(x, fitted(res[[plot_idx]][[5]]), lwd = 3, col = "darkorange")
lines(x, fitted(res[[plot_idx]][[25]]), lwd = 3, col = "steelblue")
legend(x = "topleft", legend = c("True function", "Linear fit (df = 1)", "Best model (df = 5)",
"Overfitted model (df = 25)"), lwd = rep(3, 4),
col = c("black", "palegreen3", "darkorange", "steelblue"),text.width = 32, cex = 0.85)
cat("
In the plot you can see data points generated by a polynomial regression model (the
true function is plotted in black) and three piecewise-cubic spline models with different
flexibility (expressed in terms of degrees of freedom of the model - see \"?ns\").
")
##
## In the plot you can see data points generated by a polynomial regression model (the
## true function is plotted in black) and three piecewise-cubic spline models with different
## flexibility (expressed in terms of degrees of freedom of the model - see "?ns").
invisible(readline(prompt = "Press [enter] to continue"))
## Press [enter] to continue
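# The degrees of freedom passed to ns() control how many basis columns (and hence
# how many regression coefficients) each spline fit uses. A minimal illustration,
# added here as a sketch (x_demo is just an illustrative grid, not used elsewhere):
if (interactive()) {
  x_demo <- seq(0, 100, length.out = 50)
  print(dim(ns(x_demo, df = 1)))   # 50 x 1: essentially a linear fit
  print(dim(ns(x_demo, df = 5)))   # 50 x 5: moderately flexible
  print(dim(ns(x_demo, df = 25)))  # 50 x 25: very flexible, prone to overfitting
}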
# Compute the training and test errors for each model
pred <- list()
mse <- te <- matrix(NA, nrow = n_df, ncol = n_rep)
for (i in 1:n_rep) {
  # Training MSE: residual sum of squares divided by the number of observations
  mse[, i] <- sapply(res[[i]], function(obj) deviance(obj) / nobs(obj))
  # Predictions of each fitted model on the common test set (one column per df)
  pred[[i]] <- sapply(res[[i]], function(obj) predict(obj, newdata = data.frame(x = xy_test$x)))
  # Test MSE of each model on the test set
  te[, i] <- colMeans((xy_test$y - pred[[i]])^2)
}
# Compute the average training and test errors
av_mse <- rowMeans(mse)
av_te <- rowMeans(te)
# Plot the errors
plot(df, av_mse, type = "l", lwd = 2, col = gray(0.4), ylab = "Prediction error",
     xlab = "Flexibility (spline's degrees of freedom [log scaled])",
     ylim = c(0, 1), log = "x")
abline(h = sigma_eps, lty = 2, lwd = 0.5)  # reference line at the noise level sigma_eps (note the irreducible MSE is sigma_eps^2)
# Overlay the individual test-error curves (pink) and training-error curves (gray)
for (i in 1:n_rep) {
  lines(df, te[, i], col = "lightpink")
}
for (i in 1:n_rep) {
  lines(df, mse[, i], col = gray(0.8))
}
lines(df, av_mse, lwd = 2, col = gray(0.4))
lines(df, av_te, lwd = 2, col = "darkred")
# Highlight the linear fit (df = 1), the model with the lowest average test error,
# and the overfitted model (df = 25) on both the training and test error curves
points(df[1], av_mse[1], col = "palegreen3", pch = 17, cex = 1.5)
points(df[1], av_te[1], col = "palegreen3", pch = 17, cex = 1.5)
points(df[which.min(av_te)], av_mse[which.min(av_te)], col = "darkorange", pch = 16, cex = 1.5)
points(df[which.min(av_te)], av_te[which.min(av_te)], col = "darkorange", pch = 16, cex = 1.5)
points(df[25], av_mse[25], col = "steelblue", pch = 15, cex = 1.5)
points(df[25], av_te[25], col = "steelblue", pch = 15, cex = 1.5)
legend(x = "top", legend = c("Training error", "Test error"), lwd = rep(2, 2),
col = c(gray(0.4), "darkred"), text.width = 0.3, cex = 0.85)
cat("
In the plot you can see training and test errors for the previously generated
datasets, and their average (in thicker lines). The orange dots show the value
of the df parameter where the test error is lowest. As you can see, while the
training error decreases when the flexibility grows, the test error first
decreases too, but after some point (on the right of the orange dot) it becomes
larger and larger.
")
##
## In the plot you can see training and test errors for the previously generated
## datasets, and their average (in thicker lines). The orange dots show the value
## of the df parameter where the test error is lowest. As you can see, while the
## training error decreases when the flexibility grows, the test error first
## decreases too, but after some point (on the right of the orange dot) it becomes
## larger and larger.
invisible(readline(prompt = "Press [enter] to continue"))
## Press [enter] to continue
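# The same pattern can be read off numerically (a small sketch added here, not part
# of the original script; 'err_summary' is an illustrative name): the average
# training error keeps decreasing as the flexibility grows, while the average test
# error is lowest at the orange dot and grows again beyond it.
err_summary <- data.frame(df = c(1, which.min(av_te), 25),
                          avg_training_error = av_mse[c(1, which.min(av_te), 25)],
                          avg_test_error = av_te[c(1, which.min(av_te), 25)])
print(err_summary)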