We want to predict individual credit balance, based on 10 predictor variables

Credit Balance is distributed with the following density in our sample of 400 individuals:

require(dplyr)
dat <- read.table("Assignment_1_Data.txt",header = T)
require(stringr)

#overall density
plot(density(dat$Balance),main="Distribution of Credit Balance",col="orange",lwd=3,sub="Summary statistics included as vertical lines")

abline(v=as.numeric(summary(dat$Balance)[c(1,2,3,4,5,6)]),col=c("grey","blue","red","purple","blue","grey"),lty=c("dotted","dashed","solid","solid","dashed","dotted"),lwd=c(1,1,3,3,1,1))

legend("topright",legend=c("Density","Range","IQR","Mean","Median"),lty=c("solid","dotted","dashed","solid","solid"),col=c("orange","grey","blue","red","purple"),lwd=c(2,1,1,2,2))

Looking at Categorical Variables, we can draw the following boxplots and density plots for the distribution of credit balance, per variable level

par(mfrow=c(1,2))

#individual densities and boxplots

boxplot(Balance~as.factor(Gender),data=dat,col=c("blue","deeppink"),main="Gender",ylab="Credit Balance",xlab="Gender")

males <- dat %>% subset(trimws(as.character(dat$Gender))=="Male")
females <- setdiff(dat,males)
plot(density(females$Balance),col="deeppink",main="Gender",xlab="")
lines(density(males$Balance),col="blue")

#

boxplot(Balance~as.factor(Student),data=dat,col=c("purple","orange"),main="Student Status",ylab="Credit Balance",xlab="Student Status")

students <- dat %>% subset(trimws(as.character(dat$Student))=="Yes")
not_students <- setdiff(dat,students)
plot(density(not_students$Balance),col="purple",main="Student Status",xlab="")
lines(density(students$Balance),col="orange")

#

boxplot(Balance~Ethnicity,data=dat,col=rainbow(3),main="Ethnicity",ylab="Credit Balance",xlab="Ethnicity")

whites <- dat %>% subset(trimws(as.character(dat$Ethnicity))=="Caucasian")
blacks <- dat %>% subset(trimws(as.character(dat$Ethnicity))=="African American")
asians <- dat %>% subset(trimws(as.character(dat$Ethnicity))=="Asian")

plot(density(whites$Balance),col="#0000FFFF",main="Ethnicity",xlab="")
lines(density(blacks$Balance),col="#FF0000FF")
lines(density(asians$Balance),col="#00FF00FF")

boxplot(Balance~as.factor(Married),data=dat,col=c("cornflowerblue","darkgoldenrod"),main="Marital status",ylab="Credit Balance",xlab="Marital Status")

married <- dat %>% subset(trimws(as.character(dat$Married))=="Yes")
not_married <- setdiff(dat,married)
plot(density(not_married$Balance),col="cornflowerblue",main="Marital Status",xlab="")
lines(density(married$Balance),col="darkgoldenrod")

For discrete variables, we draw boxplots only: Density plots are less informative…

par(mfrow=c(1,1))

boxplot(Balance~as.factor(Cards),data=dat,col=rainbow(9),main="Number of Cards",ylab="Credit Balance",xlab="Number of Cards")

boxplot(Balance~as.factor(Education),data=dat,col=rev(heat.colors(15)),main="Education",ylab="Credit Balance",xlab="Years of Education")

For continuous variables, we fit linear models to our sample data points, and investigate the error of these functions

#linear models
par(mfrow=c(1,1))
income.lm <- lm(Balance~Income,data=dat)
balinc <- dat %>% select(Balance,Income) %>% arrange(Income)
plot(Balance~Income,data=balinc,col="red",type="p",ylim=c(-10,40),main="Balance vs Income",pch=16)
abline(income.lm,col="purple",lwd=3)
error <-balinc$Balance-predict(income.lm,balinc)
lines(error,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

limit.lm <- lm(Balance~Limit,data=dat)
ballim <- dat %>% select(Balance,Limit) %>% arrange(Limit)
plot(Balance~Limit,data=ballim,type="p",pch=16,col="red",ylim=c(-10,40),main="Credit Balance vs Credit Limit")
abline(limit.lm,col="purple",lwd=3)
error <-ballim$Balance-predict(limit.lm,ballim)
lines(y=error,x=ballim$Limit,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

rating.lm <- lm(Balance~Rating,data=dat)
balrat <- dat %>% select(Balance,Rating) %>% arrange(Rating)
plot(Balance~Rating,data=balrat,type="p",pch=16,col="red",ylim=c(-10,40),main="Credit Balance vs Credit Rating")
abline(rating.lm,col="purple",lwd=3)
error <-balrat$Balance-predict(rating.lm,balrat)
lines(y=error,x=balrat$Rating,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

age.lm <- lm(Balance~Age,data=dat)
balage <- dat %>% select(Balance,Age) %>% arrange(Age)
plot(Balance~Age,data=balage,type="p",pch=16,col="red",ylim=c(-10,40),main="Credit Balance vs Age")
abline(age.lm,col="purple",lwd=3)
error <-balage$Balance-predict(age.lm,balage)
lines(y=error,x=balage$Age,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

We scale our numerical variables (but keep an unscaled version of our outcome variable!), and plot the correlations between the scaled variables:

Orig.Balance <- dat$Balance
Balance <- scale(dat$Balance)
Income <- scale(dat$Income)
Limit <- scale(dat$Limit)
Rating <- scale(dat$Rating)
Cards <- scale(dat$Cards)
Age <- scale(dat$Age)
Education <- scale(dat$Education)

numerical <- cbind(Balance,Income,Limit,Rating,Cards,Age,Education)
numerical <- as.data.frame(numerical)
names(numerical) <- names(dat)[1:7]

require(PerformanceAnalytics)

chart.Correlation(numerical)

And look at the covariance matrix:

M <- cov(numerical)
require(corrplot)
par(mfrow=c(1,1))
corrplot(M,type="lower",method="pie",diag=F,sub="Covariance")

We create dummy variables for the categorical variables, and add categorical variables, corresponding to each “Decade of Life”

We then draw a correlation matrix for these variables as well:

Decade <- str_split_fixed(dat$Age,pattern = "",n=2)
Decade <- Decade[,1]

age_twenties <- ifelse(Decade==2,1,0)
age_thirties <- ifelse(Decade==3,1,0)
age_fourties <- ifelse(Decade==4,1,0)
age_fifties <- ifelse(Decade==5,1,0)
age_sixties <- ifelse(Decade==6,1,0)
age_seventies <- ifelse(Decade==7,1,0)
age_eighties <- ifelse(Decade==8,1,0)
age_nineties <- ifelse(Decade==9,1,0)

Male <- ifelse(trimws(as.character(dat$Gender))=="Male",1,0)
Female <- ifelse(trimws(as.character(dat$Gender))=="Female",1,0)

Student <- ifelse(trimws(as.character(dat$Student))=="Yes",1,0)
Non_Student <- ifelse(trimws(as.character(dat$Student))=="No",1,0)

Married <- ifelse(trimws(as.character(dat$Married))=="Yes",1,0)
Not_Married <- ifelse(trimws(as.character(dat$Married))=="No",1,0)


Caucasian <- ifelse(trimws(as.character(dat$Ethnicity))=="Caucasian",1,0)
AfricanAmerican <- ifelse(trimws(as.character(dat$Ethnicity))=="African American",1,0)
Asian <- ifelse(trimws(as.character(dat$Ethnicity))=="Asian",1,0)

categorical <- cbind(Balance,age_twenties,age_thirties,age_fourties,age_fifties,age_sixties,age_seventies,age_eighties,age_nineties,Male,Female,Student,Non_Student,Married,Not_Married,Caucasian,AfricanAmerican,Asian)

categorical <- as.data.frame(categorical)
names(categorical)[1] <- c("Balance")

less.categories <- categorical %>% select(-c(Female,Non_Student,Not_Married,Asian))

#less.categories <- scale(less.categories)

N <- cor(less.categories)
corrplot(N,type = "upper",diag=F,method="color")

categorical <- less.categories

We build simple linear models for normalized numerical variables:

nineties.only <- categorical %>% select(Balance,age_nineties)
categorical <- categorical %>% select(-c(age_nineties))

categorical <- as.data.frame(categorical)

categorical$Balance <- Orig.Balance

numerical$Balance <- Orig.Balance

numerical.base.model <- lm(Balance~.,data = numerical)
categorical.base.model <- lm(Balance~.,data = categorical)
scaled.dat <- full_join(numerical,categorical,by="Balance")
full.model <- lm(Balance~.,data=scaled.dat)

require(sjPlot)
sjt.lm(numerical.base.model)
    Balance
    B CI p
(Intercept)   13.43 13.29 – 13.57 <.001
Income   5.39 5.16 – 5.63 <.001
Limit   0.68 -1.45 – 2.81 .529
Rating   -0.65 -2.79 – 1.48 .549
Cards   0.09 -0.08 – 0.25 .320
Age   0.37 0.23 – 0.51 <.001
Education   0.18 0.04 – 0.33 .011
Observations   400
R2 / adj. R2   .937 / .936

As well as categorical variables:

sjt.lm(categorical.base.model)
    Balance
    B CI p
(Intercept)   30.88 23.15 – 38.60 <.001
age_twenties   -20.54 -28.38 – -12.70 <.001
age_thirties   -18.84 -26.50 – -11.19 <.001
age_fourties   -17.93 -25.55 – -10.30 <.001
age_fifties   -17.67 -25.32 – -10.02 <.001
age_sixties   -16.99 -24.63 – -9.36 <.001
age_seventies   -18.10 -25.75 – -10.45 <.001
age_eighties   -13.84 -21.57 – -6.11 <.001
Male   -0.06 -1.13 – 1.01 .909
Student   1.30 -0.49 – 3.08 .155
Married   0.18 -0.94 – 1.29 .756
Caucasian   -0.22 -1.52 – 1.08 .741
AfricanAmerican   0.07 -1.45 – 1.58 .932
Observations   400
R2 / adj. R2   .121 / .093

Finally, we build a linear model including all these transformed variables:

sjt.lm(full.model)
    Balance
    B CI p
(Intercept)   12.09 9.29 – 14.89 <.001
Income   5.44 5.20 – 5.67 <.001
Limit   0.82 -1.29 – 2.93 .444
Rating   -0.77 -2.88 – 1.35 .477
Cards   0.09 -0.07 – 0.26 .281
Age   0.14 -0.72 – 1.00 .746
Education   0.19 0.05 – 0.33 .008
age_twenties   1.31 -2.66 – 5.29 .516
age_thirties   1.31 -2.30 – 4.93 .475
age_fourties   1.13 -2.07 – 4.32 .489
age_fifties   1.88 -0.96 – 4.72 .194
age_sixties   1.81 -0.68 – 4.30 .154
age_seventies   1.92 -0.35 – 4.18 .097
age_eighties   1.73 -0.39 – 3.85 .109
Male   -0.05 -0.33 – 0.22 .705
Student   0.67 0.20 – 1.13 .005
Married   -0.40 -0.69 – -0.11 .007
Caucasian   -0.03 -0.36 – 0.31 .883
AfricanAmerican   -0.11 -0.50 – 0.28 .588
Observations   400
R2 / adj. R2   .942 / .940
nineties.model <- lm(Balance~age_nineties,data=nineties.only)

sjt.lm(nineties.model)
    Balance
    B CI p
(Intercept)   -0.02 -0.11 – 0.08 .751
age_nineties   3.11 1.74 – 4.47 <.001
Observations   400
R2 / adj. R2   .048 / .046
#which(age_nineties==1)

Looking at the patterns in, and signifance of the results, in the above work, we learn the following:

This is the function we’ll use for our very first model:

first.model <- lm(Balance~Income,data=dat)

predictions.first.model <- unlist(predict(first.model,dat))

error <- dat$Balance-predictions.first.model


FM <- cbind(dat$Balance,predictions.first.model,error)

FM <- as.data.frame(FM)

names(FM) <- c("y","y_hat","error")

Let’s look at the model predictions vs. actual values, and look at the distribution of the error:

plot(FM$y,FM$y_hat,ppch=16,col="red",ylim = c(-10,40),ylab="Y hat",xlab="Y",main="Linear model Estimates vs Actual Values\n Error indicated in BLUE")
lines(FM$error,type="h",col="blue",lwd=3)

hist(error,col="khaki")

The error is normally distributed, which is a good thing

Let’s see how we can explain the error in our model with a tree-based model:

numerical <- dat %>% select(2:7)

error.data <- cbind(error,numerical,categorical)

error.data <- error.data %>% select(-Balance)
require(rpart)

tree1 <- rpart(error~.,data = error.data)

require(rpart.plot)

rpart.plot(tree1)

error.prediction <- unlist(predict(tree1,error.data))

error.in.error <- error.prediction-error

error.table <- cbind(error.prediction,error,error.in.error)

error.table <- as.data.frame(error.table)

hist(error.in.error,col="deeppink",main="Error in error-model")

plot(error.table,col="deeppink",pch=16)

It seems modeling the error on its own is probably not such a good idea, although interestingly, there is a linear relationship between the real error and the error in the error prediction….

We’ll rather try to model the original response variable with a tree-based model:

dat <- cbind(dat[,1:7],categorical[,2:13],nineties.only[,2])
names(dat)[20] <- "age_nineties"

We see that Income overwhelms the decision splits, since it is such an influential variable:

tree2 <- rpart(Balance~.,data = dat)

rpart.plot(tree2)

We find that the error is not much different from the linear model:

error.tree2 <- dat$Balance-as.numeric(unlist(predict(tree2,dat)))

plot(dat$Balance,as.numeric(unlist(predict(tree2,dat))),xlab="Real Balance",ylab="Tree model predictions",ylim=c(-10,35),pch=16,col="red",main="Predicted Balance vs Real Balance",sub="Error inndicated in Orange")
lines(error.tree2,type="h")

par(mfrow=c(1,2))

plot(error,type="h",col="orange",main="Error in Linear Model")
plot(error.tree2,type="h",col="orange",main="Error in Tree-based model")

#png("Error_Plot.png")
par(mfrow=c(1,2))

hist(error,main="Histogram of Error\nin Linear Model",col="khaki")
hist(error.tree2,main="Histogram of Error\nin Tree Model",col="khaki")

#dev.off()

Income is dominating our predictive models, in either case

(This is probably why our error distributions are so similar)…

Random Forests allow us to prevent this from happening, so let’s try that:

require(party)
require(randomForest)
forest1 <- randomForest(Balance~.,data=dat)

forest.error <- dat$Balance-as.numeric(unlist(predict(forest1,dat)))

forest.prediction <- as.numeric(unlist(predict(forest1,dat)))

forest.prediction <- cbind(dat$Balance,forest.prediction,forest.error)

forest.prediction <- as.data.frame(forest.prediction)

names(forest.prediction) <- c("Balance","Prediction_Random_Forest","Error")

plot(forest.prediction$Balance,forest.prediction$Prediction_Random_Forest,pch=16,col="red",ylim=c(-10,40),ylab="Random Forest Prediction",xlab="True Credit Balance")
lines(forest.prediction$Error~forest.prediction$Balance,type="h",col="orange")

hist(forest.prediction$Error,col="khaki",main="Histogram of Random Forest Model\nError",xlab="Random Forest Model Error")

It looks like our error has improved, but let’s get a standard measurement of our model errors so far, the sum of squared error:

LinearModelError <- sum(error^2)

TreeModelError <- sum(error.tree2^2)

RandomForestError <- sum(forest.error^2)
require(knitr)
kable(cbind(LinearModelError,TreeModelError,RandomForestError))
LinearModelError TreeModelError RandomForestError
871.9598 993.7124 268.1694

This is misleading

*The sum of squared errors comes to zero, which means it performs better in aggregate, but the range of errors for individual observations is much wider:

par(mfrow=c(1,3))
plot(dat$Balance,error,type="h",col="orange",ylab="Error",xlab="Balance",main="Linear Model Error")
plot(dat$Balance,error.tree2,type="h",col="blue",xlab="Balance",main="Tree Model Error",ylab="Error")
plot(dat$Balance,forest.error,type="h",col="green",xlab="Balance",main="Forest Model Error",ylab="Error")

It’s apparent that the errors are being made for similar observations…

Let’s try neural networks…

dat[,2:7] <- scale(dat[,2:7])
train <- sample_frac(dat,0.75)
test <- setdiff(dat,train)
library(neuralnet)

feats <- names(train)[-c(1)]

f <- paste(feats,collapse=' + ')

f <- paste('Balance ~',f)
f <- as.formula(f)


nn <- neuralnet(f,data=train,hidden=c(10,10,10),linear.output=FALSE)

mu <- mean(test$Balance)
sigma <- sd(test$Balance)


p <- compute(nn,test[2:20])

p <- p$net.result

#p <- (p+mu)*sigma

neural.model.1 <- cbind(scale(test$Balance),p)
neural.model.1 <- as.data.frame(neural.model.1)
names(neural.model.1) <- c("Balance","NNPrediction")

neural.model.1 <- neural.model.1+mu
neural.model.1 <- neural.model.1*sigma

neural.model.1$NN.error <- neural.model.1$Balance-neural.model.1$NNPrediction

plot(x=neural.model.1$Balance,y=neural.model.1$NNPrediction,dat=neural.model.1,col="red",pch=16,ylab="Neural Network Prediction",xlab="True Credit Balance (Training Set)")

plot(neural.model.1$NN.error~neural.model.1$Balance,type="h",col="orange",xlab="True Training Set Credit Balance",ylab="Neural Network Prediction:Taining Set")

hist(neural.model.1$NN.error,col="khaki",main="Histogram of Training Set NN Error",xlab="")

Let’s try an ensembled approach

dat <- read.table("Assignment_1_Data.txt",header=T)

train <- sample_frac(dat,0.75)
test <- setdiff(dat,train)
linear.model <- lm(Balance~Income,data=train)

linear.model.prediction <- predict(linear.model,test)

tree.model <- rpart(Balance~.,train)

tree.model.prediction <- predict(tree.model,test)

random.forest.model <- randomForest(Balance~.,train)

random.forest.prediction <- predict(random.forest.model,test)

final.prediction <- cbind(linear.model.prediction,tree.model.prediction,random.forest.prediction)

final.prediction <- rowMeans(final.prediction)

final.prediction <- cbind(dat$Balance,final.prediction)

final.error <- (final.prediction[,1]-final.prediction[,2])

final.prediction <- cbind(final.prediction,final.error)

final.prediction <- as.data.frame(final.prediction)

names(final.prediction) <- c("True Credit Balance","Ensemble Prediction","Ensemble Error")
plot(final.prediction$`True Credit Balance`~final.prediction$`Ensemble Prediction`,pch=16,col="red",ylim=c(-10,50),main="True Credit Balance vs Ensemble Prediction",sub="Error indicated in Orange",xlab="Ensemble Prediction",ylab="True Credit Balance")
lines(final.prediction$`Ensemble Error`~final.prediction$`True Credit Balance`,type="h",col="orange")

This is only on 100 test observations, let’s test this ensemble model on all data:

dat <- read.table("Assignment_1_Data.txt",header=T)


linear.model <- lm(Balance~Income,data=dat)

linear.model.prediction <- predict(linear.model,dat)

tree.model <- rpart(Balance~.,dat)

tree.model.prediction <- predict(tree.model,dat)

random.forest.model <- randomForest(Balance~.,dat)

random.forest.prediction <- predict(random.forest.model,dat)

final.prediction <- cbind(linear.model.prediction,tree.model.prediction,random.forest.prediction)

final.prediction <- rowMeans(final.prediction)

final.prediction <- cbind(dat$Balance,final.prediction)

final.error <- (final.prediction[,1]-final.prediction[,2])

final.prediction <- cbind(final.prediction,final.error)

final.prediction <- as.data.frame(final.prediction)

names(final.prediction) <- c("True Credit Balance","Ensemble Prediction","Ensemble Error")
plot(final.prediction$`True Credit Balance`~final.prediction$`Ensemble Prediction`,pch=16,col="red",ylim=c(-10,50),main="True Credit Balance vs Ensemble Prediction",sub="Error indicated in Orange",xlab="Ensemble Prediction",ylab="True Credit Balance")
lines(final.prediction$`Ensemble Error`~final.prediction$`True Credit Balance`,type="h",col="orange")

par(mfrow=c(1,2))
hist(final.prediction$`Ensemble Error`,col="khaki",main="Histogram of Ensemble Error",xlab="")
plot(density(final.prediction$`Ensemble Error`),col="red",main="Density of Ensemble Error")

lin.mod.plot <- cbind(linear.model.prediction-dat$Balance)

hist(lin.mod.plot,col="khaki",main="Histogram of Linear Model Error",xlab="")
plot(density(lin.mod.plot),col="red",main="Density of Ensemble Error")

Our model has improved, from the initial linear model, based only on Income…

barplot(c(sum(abs(final.prediction$`Ensemble Error`)),sum(abs(lin.mod.plot))),main="Sum of the Absolute Value of Model Error",names.arg = c("Ensemble Model","Linear Model Based on Income"),col=c("darkblue","deeppink"))

Still, that’s a lot of error…

Bearing in mind the sample range of our outcome variable

RANGE <- as.matrix(range(dat$Balance),ncol=2)
RANGE <- as.data.frame(t(RANGE))
kable(RANGE,col.names = c("Minimum Balance","Maximum Balance"))
Minimum Balance Maximum Balance
3.749402976 38.78512301

And our sample size…

kable(nrow(dat),col.names = "Sample Size")
Sample Size
400

We end up with the following scale of error:

kable(sum(abs(final.prediction$`Ensemble Error`))/nrow(dat),col.names = "Absolute value of error per observation")
Absolute value of error per observation
0.8840001534

If we’re smart about it, we’ll use General Additive Models from the MGCV package:

dat <- read.table("Assignment_1_Data.txt",header=T)
train <- sample_frac(dat,0.75)
test <- setdiff(dat,train)
require(mgcv)

my.gam <- gam(Balance~s(Income,k=3)+s(Limit,k=3)+s(Rating,k=30)+s(Cards,k=8)+s(Age,k=10)+s(Education,k=15),data=train)
par(mfrow=c(2,2))
gam.check(my.gam)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 17 iterations.
## The RMS GCV score gradient at convergence was 0.00000005132490567 .
## The Hessian was positive definite.
## Model rank =  64 / 64 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                 k'   edf k-index p-value
## s(Income)     2.00  1.00    0.99    0.38
## s(Limit)      2.00  2.00    1.03    0.66
## s(Rating)    29.00  3.03    1.06    0.86
## s(Cards)      7.00  1.00    1.10    0.95
## s(Age)        9.00  4.34    1.04    0.64
## s(Education) 14.00  3.95    1.03    0.73
gam.pred <- as.numeric(unlist(predict(my.gam,test)))
GAM.model <- cbind(test$Balance,gam.pred)
GAM.model <- as.data.frame(GAM.model)
GAM.error <- as.numeric(unlist(predict(my.gam,test)))-test$Balance
GAM.model <- cbind(GAM.model,GAM.error)
names(GAM.model) <- c("True Balance","GAM Prediction","GAM Error")

plot(GAM.model$`True Balance`~GAM.model$`GAM Prediction`,pch=16,col="red",ylim=c(-5,40),ylab="True Credit Balance Test Set",xlab="GAM prediction Test Set")
lines(GAM.model$`GAM Error`~GAM.model$`True Balance`,type="h",col="orange")

all.data.gam.pred <- cbind(dat$Balance,as.numeric(unlist(predict(my.gam,dat))))
gam.error.all.data <- dat$Balance-as.numeric(unlist(predict(my.gam,dat)))

all.data.gam.pred <- cbind(all.data.gam.pred,gam.error.all.data)


all.data.gam.pred <- as.data.frame(all.data.gam.pred)

names(all.data.gam.pred) <- c("True Credit Balance","Gam C Balance Pred","Gam Error")

plot(all.data.gam.pred$`True Credit Balance`~all.data.gam.pred$`Gam C Balance Pred`,ylim=c(-10,40),pch=16,col="red",ylab="True Credit Balance All Data",xlab="GAM Prediction All Data")
lines(all.data.gam.pred$`Gam Error`~all.data.gam.pred$`True Credit Balance`,type="h",col="orange")

hist(all.data.gam.pred$`Gam Error`,col="khaki",main="Histogram of GAM Error on All Data",xlab="")

The Sum of Absolute Error in the GAM model is:

kable(sum(abs(all.data.gam.pred$`Gam Error`)),col.names = "GAM Model: Sum of Absolue  Error")
GAM Model: Sum of Absolue Error
407.5650139

In summary, error is a pesky thing to get rid of…

plot(dat$Balance,all.data.gam.pred$`Gam Error`,pch=16,col="red",xlab="True Credit Balance",ylab="GAM model error")
abline(h=0,col="blue",lty="dotted")