Load in the necessary packages and datasets:
#setwd.loadpac()
if(require('pacman')){
library('pacman')
}else{
install.packages('pacman')
library('pacman')
}
## Loading required package: pacman
pacman::p_load(tidyverse, reshape2, ggplot2, rmarkdown, dplyr, plotly, rstudioapi, wesanderson, RColorBrewer)
##Load Dataset
all.df <- read.csv(file = "practical04b.csv", header = TRUE, sep = ",")
head(all.df)
## X input output
## 1 1 6.525006 29021003
## 2 2 29.197010 39938554
## 3 3 104.897110 310171606
## 4 4 7.490008 7571733
## 5 5 75.419083 278974401
## 6 6 93.049239 279107301
In order to fit a model to the data, it is necessary to use the following workflow:
Setting the seed allows R to produce random numbers that will be consistent across like scripts, it is analogous to remembering which column and row random numbers are drawn from a table.
Strictly speaking we don’t need to set the seed, this data was created as 5th degree polynomial with added noise and this process will always provide a model that is a 5th degree polynomial, moreover the plots are coded relatively such that the crosshairs are always drawn at the point of lowest validation error, it can be a good habit to remember to do this though:
set.seed(3141)
There are two common ways to work with training, validation and test sets, depending on how much data is available:
Where there is insufficient data, cross-validation may be employed, in this case we will perform a random split of the data:
##So we want to create 3 random samples of data
N <- nrow(all.df)
idval <- sample(N)
train.ID <- idval[0:(N*0.5)]
val.ID <- idval[((N*0.5)+1):(N*0.75)]
test.ID <- idval[((N*0.75)+1):N]
train.df <- all.df[train.ID,]
val.df <- all.df[val.ID,]
test.df <- all.df[test.ID,]
#Order the Data Frame
train.df <- train.df[order(train.df$X),]
val.df <- val.df[order(val.df$X),]
test.df <- test.df[order(test.df$X),]
It isn’t always possible to visualise the data, however where it is possible it should be done.
In this case the data is only two-dimensional so it can be easily visualised:
# Visualise the Data ------------------------------------------------------
##This isn't always possible but in this case it is
all.df$sample <- FALSE
all.df[train.ID,]$sample <- "train"
all.df[val.ID,]$sample <- "val"
all.df[test.ID,]$sample <- "test"
head(all.df)
## X input output sample
## 1 1 6.525006 29021003 val
## 2 2 29.197010 39938554 test
## 3 3 104.897110 310171606 test
## 4 4 7.490008 7571733 test
## 5 5 75.419083 278974401 test
## 6 6 93.049239 279107301 train
all.plot <- ggplot(all.df, aes(x = input)) +
geom_point(data = all.df, size = 1, aes(y = output, col = sample)) +
theme_classic() +
labs(x = "Predictor", y = "Response", col = "Set", title = "Data to Model",
subtitle = "Seperated into Sets",
caption = "Sets created using uniform random values")
# scale_color_brewer(palette="Pastel2")
all.plot
Training a model refers to fitting models to the set of training data.
We don’t know which model will be appropriate for this data, but the data appears to have a natural curve so we will fit a few polynomial models and see how they perform.
It would be wise to create these models through a loop (because it would prevent inconsistent errors and make it possible to produce large numbers of models) however that is a little tricky to do for models so for now that will be left.
Be aware, that when making models ~I(X^n) or ~ poly(x, n) can both be used, it has something to do with correlated coefficients and orthogonal coefficients; but I couldn’t figure out which to use and why.
# Train a Model -----------------------------------------------------------
#train models of varying complexity
# attach(train.df)
head(train.df)
## X input output
## 6 6 93.049239 279107301
## 9 9 68.625796 200200695
## 10 10 -3.239293 -3428366
## 12 12 28.081122 47764570
## 13 13 104.135610 334400319
## 16 16 -14.197866 -27757917
mod.lm <- lm(output ~ input, data = train.df)
mod.p2 <- lm(output ~ I(input^2) + input ,data = train.df)
mod.p3 <- lm(output ~ I(input^3) + I(input^2) + input, data = train.df)
mod.p4 <- lm(output ~ I(input^4) + I(input^3) + I(input^2) + input, data = train.df)
mod.p5 <- lm(output ~ I(input^5) + I(input^4) + I(input^3) + I(input^2) + input, data = train.df)
mod.p6 <- lm(output ~ I(input^6) + I(input^5) + I(input^4) + I(input^3) + I(input^2) + input, data = train.df)
mod.p7 <- lm(output ~ I(input^7) + I(input^6) + I(input^5) + I(input^4) + I(input^3) + I(input^2) + input, data = train.df)
From these models create a dataframe of prediction values for the models and throw it all into a list to store it for now:
#Create Predictions and throw everything into a list
train.df <- data.frame(train.df,
mod_lm = predict(mod.lm),
mod_p2 = predict(mod.p2),
mod_p3 = predict(mod.p3),
mod_p4 = predict(mod.p4),
mod_p5 = predict(mod.p5),
mod_p6 = predict(mod.p6),
mod_p7 = predict(mod.p7)
)[order(train.df$X),]
train.mod.df <- melt(data = train.df, id.vars = c("X", "input", "output"))
train.mod.list <- list(mod.lm, mod.p2, mod.p3, mod.p4, mod.p5, mod.p6, mod.p7)
training.list <- list(models = train.mod.list, training_data = train.df, model_predictions = train.mod.df )
Now we can visualise these models, but first we will fit some pretty colours:
#Visualise the Models
##Create label and colour vectors
plot.labels <- c("Linear Model", "Quadratic", "Cubic", "4th Order",
"5th Order", "6th Order", "7th Order",
"Test Set", "Training Set", "Validation Set")
colfunc <- colorRampPalette(c("purple", "Red"))
mycol <- c(colfunc(7), "#1B065E", "#30343F", "#60E1E0")
plot(rep(1,length(mycol)),col=mycol,pch=19,cex=3)
The colours seem reasonable so we will use them for our plot:
##Plot the models over the Data
all.plot +
geom_line(data = train.mod.df, aes(x = input, y = value, col = variable)) +
scale_color_manual(values = mycol, labels = plot.labels)
So it can be seen that there are a few models, that appear to predict (not necessarily forecast) the data quite well. In order to select which model is appropriate, we will use the Root Mean Square Error \(\left( RMSE = \sqrt{ ( \frac{1}{n}\sum_{i=1}^{n}[(y_i-\hat{y_i})^2] } \right)\) which is a predictor of the average error from data point to model.
So it is necessary to calculate and plot the validation error and training error in order to decide on the most appropriate model.
The training Error provides a benchmark to understand the validation error, it can be calculated thusly:
train.rmse.lm <- sqrt(sum((train.df$output-predict(mod.lm, newdata = train.df))^2)/nrow(train.df))
Scrictly speaking there is no cause to specify the newdata in the call to predict, the predict function will use the input data that was used to create the model to predict model values where ‘newdata’ is not specified. I have specified the parameter here, so later, when I create a for loop and function, I don’t forget the need to specify the input data.
Because:
A function will be used to calculate the RMSE:
rmse <- function(dframe, model){
y <- dframe[,names(dframe) == "output"]
y.hat <- predict(object = model, newdata = dframe)
resid <- y-y.hat
RSS <- sum(resid^2)
T.Error <- RSS/nrow(dframe)
RMSE <- sqrt(T.Error)
return(RMSE)
}
Now we can use a loop to calculate the RMSE Values, this is important because:
Before executing the loop a data frame to store values will need to be created. Where possible a fixed static vector or data frame should be used within a loop because it will execute much faster than a dynamic vector that grows with each repetition, this can require some forethought but will lead to more efficient execution.
##Use a for loop to calculate all RMSE Values
##Create a data frame to store the values
error.df <- data.frame(matrix(
ncol = 4, nrow = length(training.list$models)))
colnames(error.df) <- c("Order",
"Training Error",
"Validation Error",
"Test Error")
##Execute the loop
for (i in 1:length(train.mod.list)) {
rmse.val <- rmse(train.df, training.list$models[[i]])
error.df[i,] <- c(order = i,
training_error = rmse.val,
validation_error = NA,
test_error = NA)
if (i==length(train.mod.list)) {
print(head(error.df))
}
}
## Order Training Error Validation Error Test Error
## 1 1 47168577 NA NA
## 2 2 37817753 NA NA
## 3 3 26004785 NA NA
## 4 4 25963398 NA NA
## 5 5 25962156 NA NA
## 6 6 25907613 NA NA
Calculating the validation error is now simply a process of specifying the validation data frame val.df rather than the training data frame.
# Calculate the Validation Errors -----------------------------------------
##Use a for loop to calculate all validation error values
#use the error.df data frame from before
for(i in 1:7){
rmse.val <- rmse(val.df, training.list$models[[i]])
error.df[i,3] <- rmse.val
if (i==length(train.mod.list)) {
print(head(error.df))
}
}
## Order Training Error Validation Error Test Error
## 1 1 47168577 49517836 NA
## 2 2 37817753 36797112 NA
## 3 3 26004785 29586385 NA
## 4 4 25963398 29620189 NA
## 5 5 25962156 29688027 NA
## 6 6 25907613 29604562 NA
Before the errors can be plotted it will be necessary to create a tidy data frame and determine the co-ordinates of the minimum RMSE:
error.df.melt <- melt(error.df[,-4], id.vars = "Order", variable.name = "Set", value.name = "Error")
error.df.melt
## Order Set Error
## 1 1 Training Error 47168577
## 2 2 Training Error 37817753
## 3 3 Training Error 26004785
## 4 4 Training Error 25963398
## 5 5 Training Error 25962156
## 6 6 Training Error 25907613
## 7 7 Training Error 25761159
## 8 1 Validation Error 49517836
## 9 2 Validation Error 36797112
## 10 3 Validation Error 29586385
## 11 4 Validation Error 29620189
## 12 5 Validation Error 29688027
## 13 6 Validation Error 29604562
## 14 7 Validation Error 29626615
#Mininmum Validation Error
min.val.error.y <- min(error.df$`Validation Error`)
min.val.error.x <- error.df[
error.df[,names(error.df)=="Validation Error"
]==
min(error.df$`Validation Error`),]$Order
Now using this new dataframe and the intercept values a plot can be made and a crosshair drawn in aswell:
error.plot <- ggplot(error.df.melt, aes(x = Order, y = Error, col = Set)) +
geom_line(size = 4, alpha = 0.6) +
geom_point(size = 6) +
theme_classic() +
labs(x = "Predictor", y = "Response", col = "Set", title = "Model Error",
subtitle = "Training Error vs Validation Error",
caption = "Observe that a 5th order Polynomial represents the lowest validation Error") +
geom_vline(xintercept = min.val.error.x, col = "Purple") +
geom_hline(yintercept = min.val.error.y, col = "Purple") +
scale_color_brewer(palette="Accent")
error.plot
depending on the seed, ocassionally the crosshair is drawn in wrong, this could be done in another way (perhaps using the diff() function), but it isn’t necessarily worth the effort
So it can be seen here that the model that performs the best, that isn’t overfit judging by the behaviour of the validation error, is the 3rd degree polynomial model.
Now that we have determined the appropriate model, we can plot it over all of the data:
##Specify the Correct Model
correct.mod.degree <- error.df[error.df$`Validation Error`==min(
error.df$`Validation Error`),]$Order
correct.mod <- train.mod.list[[correct.mod.degree]]
##Create a sufficiently large dataframe of predictions
#such that the model is smooth
correct.mod.df <- data.frame(input = seq(from = min(all.df$input),
to = max(all.df$input), length.out = 10^4))
correct.mod.df$mod_correct <- predict(object = correct.mod,
newdata = correct.mod.df)
##Plot the predictions over the data
all.plot +
geom_line(data = correct.mod.df, size = 2,
alpha = 1,
aes(y = mod_correct,
col = "purple")) +
scale_color_brewer(palette="Set3",
labels = c("3rd Degree Polynomial",
"Test Set",
"Training Set",
"Validation Set")) +
guides(col = guide_legend(title = NULL)) +
labs(title = "Modelled Data", subtitle = NULL)
The RMSE value of the model, relative to the test set is:
rmse.test <- rmse(test.df, correct.mod)
sd.test <- sd(test.df$output)
ratio <- rmse(test.df, correct.mod)/sd(test.df$output)
performance <- data.frame(RMSE = rmse.test, SD = sd.test, Ratio = ratio)
print(performance)
## RMSE SD Ratio
## 1 23085544 117559998 0.1963724
The rmse is a measure of the expected error from data point to model value, the rmse is 20% the size of the standard deviation, hence the model can predict on the test set with greater accuracy than merely taking the average output value
If the model is descriptive of the data (not merely predictive), the error observed between the model and the data should be normally distributed, white noise:
ggplot(data = data.frame(residuals = correct.mod$residuals), aes(residuals)) +
geom_histogram(bins = 20, aes(y = ..density..), fill = 'indianred', col = 'purple') +
geom_density(col = "Royalblue", lwd = 2) +
theme_classic() +
labs(title = "Model Residuals")
ggplot(data = data.frame(residuals = rnorm(300)), aes(residuals)) +
geom_histogram(bins = 20, aes(y = ..density..), fill = 'indianred', col = 'purple') +
geom_density(col = "Royalblue", lwd = 2) +
theme_classic() +
labs(title = "Random Normal Data")
It can be seen that the residuals are sufficiently normally distributed, and sufficiently centred about 0, to accespt that this is an appopriate model for the data (i.e. this might predict but also describe the data with added noise).