We want to predict individual credit balance, based on 10 predictor variables
Credit Balance is distributed with the following density in our sample of 400 individuals:
require(dplyr)
dat <- read.table("Assignment_1_Data.txt",header = T)
require(stringr)
#overall density
plot(density(dat$Balance),main="Distribution of Credit Balance",col="orange",lwd=3,sub="Summary statistics included as vertical lines")
abline(v=as.numeric(summary(dat$Balance)[c(1,2,3,4,5,6)]),col=c("grey","blue","red","purple","blue","grey"),lty=c("dotted","dashed","solid","solid","dashed","dotted"),lwd=c(1,1,3,3,1,1))
legend("topright",legend=c("Density","Range","IQR","Mean","Median"),lty=c("solid","dotted","dashed","solid","solid"),col=c("orange","grey","blue","red","purple"),lwd=c(2,1,1,2,2))

Looking at Categorical Variables, we can draw the following boxplots and density plots for the distribution of credit balance, per variable level
par(mfrow=c(1,2))
#individual densities and boxplots
boxplot(Balance~as.factor(Gender),data=dat,col=c("blue","deeppink"),main="Gender",ylab="Credit Balance",xlab="Gender")
males <- dat %>% subset(trimws(as.character(dat$Gender))=="Male")
females <- setdiff(dat,males)
plot(density(females$Balance),col="deeppink",main="Gender",xlab="")
lines(density(males$Balance),col="blue")

#
boxplot(Balance~as.factor(Student),data=dat,col=c("purple","orange"),main="Student Status",ylab="Credit Balance",xlab="Student Status")
students <- dat %>% subset(trimws(as.character(dat$Student))=="Yes")
not_students <- setdiff(dat,students)
plot(density(not_students$Balance),col="purple",main="Student Status",xlab="")
lines(density(students$Balance),col="orange")

#
boxplot(Balance~Ethnicity,data=dat,col=rainbow(3),main="Ethnicity",ylab="Credit Balance",xlab="Ethnicity")
whites <- dat %>% subset(trimws(as.character(dat$Ethnicity))=="Caucasian")
blacks <- dat %>% subset(trimws(as.character(dat$Ethnicity))=="African American")
asians <- dat %>% subset(trimws(as.character(dat$Ethnicity))=="Asian")
plot(density(whites$Balance),col="#0000FFFF",main="Ethnicity",xlab="")
lines(density(blacks$Balance),col="#FF0000FF")
lines(density(asians$Balance),col="#00FF00FF")

boxplot(Balance~as.factor(Married),data=dat,col=c("cornflowerblue","darkgoldenrod"),main="Marital status",ylab="Credit Balance",xlab="Marital Status")
married <- dat %>% subset(trimws(as.character(dat$Married))=="Yes")
not_married <- setdiff(dat,married)
plot(density(not_married$Balance),col="cornflowerblue",main="Marital Status",xlab="")
lines(density(married$Balance),col="darkgoldenrod")

For continuous variables, we fit linear models to our sample data points, and investigate the error of these functions
#linear models
par(mfrow=c(1,1))
income.lm <- lm(Balance~Income,data=dat)
balinc <- dat %>% select(Balance,Income) %>% arrange(Income)
plot(Balance~Income,data=balinc,col="red",type="p",ylim=c(-10,40),main="Balance vs Income",pch=16)
abline(income.lm,col="purple",lwd=3)
error <-balinc$Balance-predict(income.lm,balinc)
lines(error,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

limit.lm <- lm(Balance~Limit,data=dat)
ballim <- dat %>% select(Balance,Limit) %>% arrange(Limit)
plot(Balance~Limit,data=ballim,type="p",pch=16,col="red",ylim=c(-10,40),main="Credit Balance vs Credit Limit")
abline(limit.lm,col="purple",lwd=3)
error <-ballim$Balance-predict(limit.lm,ballim)
lines(y=error,x=ballim$Limit,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

rating.lm <- lm(Balance~Rating,data=dat)
balrat <- dat %>% select(Balance,Rating) %>% arrange(Rating)
plot(Balance~Rating,data=balrat,type="p",pch=16,col="red",ylim=c(-10,40),main="Credit Balance vs Credit Rating")
abline(rating.lm,col="purple",lwd=3)
error <-balrat$Balance-predict(rating.lm,balrat)
lines(y=error,x=balrat$Rating,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

age.lm <- lm(Balance~Age,data=dat)
balage <- dat %>% select(Balance,Age) %>% arrange(Age)
plot(Balance~Age,data=balage,type="p",pch=16,col="red",ylim=c(-10,40),main="Credit Balance vs Age")
abline(age.lm,col="purple",lwd=3)
error <-balage$Balance-predict(age.lm,balage)
lines(y=error,x=balage$Age,type="h",col="orange")
legend(x="topleft",legend = c("Data Points","Linear Model","Error in Linear Model"),col=c("red","purple","orange"),lty=c("solid"))

We scale our numerical variables (but keep an unscaled version of our outcome variable!), and plot the correlations between the scaled variables:
Orig.Balance <- dat$Balance
Balance <- scale(dat$Balance)
Income <- scale(dat$Income)
Limit <- scale(dat$Limit)
Rating <- scale(dat$Rating)
Cards <- scale(dat$Cards)
Age <- scale(dat$Age)
Education <- scale(dat$Education)
numerical <- cbind(Balance,Income,Limit,Rating,Cards,Age,Education)
numerical <- as.data.frame(numerical)
names(numerical) <- names(dat)[1:7]
require(PerformanceAnalytics)
chart.Correlation(numerical)

And look at the covariance matrix:
M <- cov(numerical)
require(corrplot)
par(mfrow=c(1,1))
corrplot(M,type="lower",method="pie",diag=F,sub="Covariance")

We create dummy variables for the categorical variables, and add categorical variables, corresponding to each “Decade of Life”
We then draw a correlation matrix for these variables as well:
Decade <- str_split_fixed(dat$Age,pattern = "",n=2)
Decade <- Decade[,1]
age_twenties <- ifelse(Decade==2,1,0)
age_thirties <- ifelse(Decade==3,1,0)
age_fourties <- ifelse(Decade==4,1,0)
age_fifties <- ifelse(Decade==5,1,0)
age_sixties <- ifelse(Decade==6,1,0)
age_seventies <- ifelse(Decade==7,1,0)
age_eighties <- ifelse(Decade==8,1,0)
age_nineties <- ifelse(Decade==9,1,0)
Male <- ifelse(trimws(as.character(dat$Gender))=="Male",1,0)
Female <- ifelse(trimws(as.character(dat$Gender))=="Female",1,0)
Student <- ifelse(trimws(as.character(dat$Student))=="Yes",1,0)
Non_Student <- ifelse(trimws(as.character(dat$Student))=="No",1,0)
Married <- ifelse(trimws(as.character(dat$Married))=="Yes",1,0)
Not_Married <- ifelse(trimws(as.character(dat$Married))=="No",1,0)
Caucasian <- ifelse(trimws(as.character(dat$Ethnicity))=="Caucasian",1,0)
AfricanAmerican <- ifelse(trimws(as.character(dat$Ethnicity))=="African American",1,0)
Asian <- ifelse(trimws(as.character(dat$Ethnicity))=="Asian",1,0)
categorical <- cbind(Balance,age_twenties,age_thirties,age_fourties,age_fifties,age_sixties,age_seventies,age_eighties,age_nineties,Male,Female,Student,Non_Student,Married,Not_Married,Caucasian,AfricanAmerican,Asian)
categorical <- as.data.frame(categorical)
names(categorical)[1] <- c("Balance")
less.categories <- categorical %>% select(-c(Female,Non_Student,Not_Married,Asian))
#less.categories <- scale(less.categories)
N <- cor(less.categories)
corrplot(N,type = "upper",diag=F,method="color")

categorical <- less.categories
We build simple linear models for normalized numerical variables:
nineties.only <- categorical %>% select(Balance,age_nineties)
categorical <- categorical %>% select(-c(age_nineties))
categorical <- as.data.frame(categorical)
categorical$Balance <- Orig.Balance
numerical$Balance <- Orig.Balance
numerical.base.model <- lm(Balance~.,data = numerical)
categorical.base.model <- lm(Balance~.,data = categorical)
scaled.dat <- full_join(numerical,categorical,by="Balance")
full.model <- lm(Balance~.,data=scaled.dat)
require(sjPlot)
sjt.lm(numerical.base.model)
|
Â
|
Â
|
Balance
|
|
Â
|
Â
|
B
|
CI
|
p
|
|
(Intercept)
|
Â
|
13.43
|
13.29 – 13.57
|
<.001
|
|
Income
|
Â
|
5.39
|
5.16 – 5.63
|
<.001
|
|
Limit
|
Â
|
0.68
|
-1.45 – 2.81
|
.529
|
|
Rating
|
Â
|
-0.65
|
-2.79 – 1.48
|
.549
|
|
Cards
|
Â
|
0.09
|
-0.08 – 0.25
|
.320
|
|
Age
|
Â
|
0.37
|
0.23 – 0.51
|
<.001
|
|
Education
|
Â
|
0.18
|
0.04 – 0.33
|
.011
|
|
Observations
|
Â
|
400
|
|
R2 / adj. R2
|
Â
|
.937 / .936
|
As well as categorical variables:
sjt.lm(categorical.base.model)
|
Â
|
Â
|
Balance
|
|
Â
|
Â
|
B
|
CI
|
p
|
|
(Intercept)
|
Â
|
30.88
|
23.15 – 38.60
|
<.001
|
|
age_twenties
|
Â
|
-20.54
|
-28.38 – -12.70
|
<.001
|
|
age_thirties
|
Â
|
-18.84
|
-26.50 – -11.19
|
<.001
|
|
age_fourties
|
Â
|
-17.93
|
-25.55 – -10.30
|
<.001
|
|
age_fifties
|
Â
|
-17.67
|
-25.32 – -10.02
|
<.001
|
|
age_sixties
|
Â
|
-16.99
|
-24.63 – -9.36
|
<.001
|
|
age_seventies
|
Â
|
-18.10
|
-25.75 – -10.45
|
<.001
|
|
age_eighties
|
Â
|
-13.84
|
-21.57 – -6.11
|
<.001
|
|
Male
|
Â
|
-0.06
|
-1.13 – 1.01
|
.909
|
|
Student
|
Â
|
1.30
|
-0.49 – 3.08
|
.155
|
|
Married
|
Â
|
0.18
|
-0.94 – 1.29
|
.756
|
|
Caucasian
|
Â
|
-0.22
|
-1.52 – 1.08
|
.741
|
|
AfricanAmerican
|
Â
|
0.07
|
-1.45 – 1.58
|
.932
|
|
Observations
|
Â
|
400
|
|
R2 / adj. R2
|
Â
|
.121 / .093
|
Looking at the patterns in, and signifance of the results, in the above work, we learn the following:
- Credit Balance is steeply and significantly linearly dependent on Income
- Decide of life is a reliable intercept modifier
- Our best naive model would be to: use the linear model values for Balance~Income
This is the function we’ll use for our very first model:
first.model <- lm(Balance~Income,data=dat)
predictions.first.model <- unlist(predict(first.model,dat))
error <- dat$Balance-predictions.first.model
FM <- cbind(dat$Balance,predictions.first.model,error)
FM <- as.data.frame(FM)
names(FM) <- c("y","y_hat","error")
Let’s look at the model predictions vs. actual values, and look at the distribution of the error:
plot(FM$y,FM$y_hat,ppch=16,col="red",ylim = c(-10,40),ylab="Y hat",xlab="Y",main="Linear model Estimates vs Actual Values\n Error indicated in BLUE")
lines(FM$error,type="h",col="blue",lwd=3)

hist(error,col="khaki")

The error is normally distributed, which is a good thing
Let’s see how we can explain the error in our model with a tree-based model:
numerical <- dat %>% select(2:7)
error.data <- cbind(error,numerical,categorical)
error.data <- error.data %>% select(-Balance)
require(rpart)
tree1 <- rpart(error~.,data = error.data)
require(rpart.plot)
rpart.plot(tree1)

error.prediction <- unlist(predict(tree1,error.data))
error.in.error <- error.prediction-error
error.table <- cbind(error.prediction,error,error.in.error)
error.table <- as.data.frame(error.table)
hist(error.in.error,col="deeppink",main="Error in error-model")

plot(error.table,col="deeppink",pch=16)

It seems modeling the error on its own is probably not such a good idea, although interestingly, there is a linear relationship between the real error and the error in the error prediction….
We’ll rather try to model the original response variable with a tree-based model:
dat <- cbind(dat[,1:7],categorical[,2:13],nineties.only[,2])
names(dat)[20] <- "age_nineties"
We see that Income overwhelms the decision splits, since it is such an influential variable:
tree2 <- rpart(Balance~.,data = dat)
rpart.plot(tree2)

We find that the error is not much different from the linear model:
error.tree2 <- dat$Balance-as.numeric(unlist(predict(tree2,dat)))
plot(dat$Balance,as.numeric(unlist(predict(tree2,dat))),xlab="Real Balance",ylab="Tree model predictions",ylim=c(-10,35),pch=16,col="red",main="Predicted Balance vs Real Balance",sub="Error inndicated in Orange")
lines(error.tree2,type="h")

par(mfrow=c(1,2))
plot(error,type="h",col="orange",main="Error in Linear Model")
plot(error.tree2,type="h",col="orange",main="Error in Tree-based model")

#png("Error_Plot.png")
par(mfrow=c(1,2))
hist(error,main="Histogram of Error\nin Linear Model",col="khaki")
hist(error.tree2,main="Histogram of Error\nin Tree Model",col="khaki")

#dev.off()
Income is dominating our predictive models, in either case
(This is probably why our error distributions are so similar)…
Random Forests allow us to prevent this from happening, so let’s try that:
require(party)
require(randomForest)
forest1 <- randomForest(Balance~.,data=dat)
forest.error <- dat$Balance-as.numeric(unlist(predict(forest1,dat)))
forest.prediction <- as.numeric(unlist(predict(forest1,dat)))
forest.prediction <- cbind(dat$Balance,forest.prediction,forest.error)
forest.prediction <- as.data.frame(forest.prediction)
names(forest.prediction) <- c("Balance","Prediction_Random_Forest","Error")
plot(forest.prediction$Balance,forest.prediction$Prediction_Random_Forest,pch=16,col="red",ylim=c(-10,40),ylab="Random Forest Prediction",xlab="True Credit Balance")
lines(forest.prediction$Error~forest.prediction$Balance,type="h",col="orange")

hist(forest.prediction$Error,col="khaki",main="Histogram of Random Forest Model\nError",xlab="Random Forest Model Error")

It looks like our error has improved, but let’s get a standard measurement of our model errors so far, the sum of squared error:
LinearModelError <- sum(error^2)
TreeModelError <- sum(error.tree2^2)
RandomForestError <- sum(forest.error^2)
require(knitr)
kable(cbind(LinearModelError,TreeModelError,RandomForestError))
| 871.9598 |
993.7124 |
268.1694 |
This is misleading
*The sum of squared errors comes to zero, which means it performs better in aggregate, but the range of errors for individual observations is much wider:
par(mfrow=c(1,3))
plot(dat$Balance,error,type="h",col="orange",ylab="Error",xlab="Balance",main="Linear Model Error")
plot(dat$Balance,error.tree2,type="h",col="blue",xlab="Balance",main="Tree Model Error",ylab="Error")
plot(dat$Balance,forest.error,type="h",col="green",xlab="Balance",main="Forest Model Error",ylab="Error")

It’s apparent that the errors are being made for similar observations…
Let’s try neural networks…
dat[,2:7] <- scale(dat[,2:7])
train <- sample_frac(dat,0.75)
test <- setdiff(dat,train)
library(neuralnet)
feats <- names(train)[-c(1)]
f <- paste(feats,collapse=' + ')
f <- paste('Balance ~',f)
f <- as.formula(f)
nn <- neuralnet(f,data=train,hidden=c(10,10,10),linear.output=FALSE)
mu <- mean(test$Balance)
sigma <- sd(test$Balance)
p <- compute(nn,test[2:20])
p <- p$net.result
#p <- (p+mu)*sigma
neural.model.1 <- cbind(scale(test$Balance),p)
neural.model.1 <- as.data.frame(neural.model.1)
names(neural.model.1) <- c("Balance","NNPrediction")
neural.model.1 <- neural.model.1+mu
neural.model.1 <- neural.model.1*sigma
neural.model.1$NN.error <- neural.model.1$Balance-neural.model.1$NNPrediction
plot(x=neural.model.1$Balance,y=neural.model.1$NNPrediction,dat=neural.model.1,col="red",pch=16,ylab="Neural Network Prediction",xlab="True Credit Balance (Training Set)")

plot(neural.model.1$NN.error~neural.model.1$Balance,type="h",col="orange",xlab="True Training Set Credit Balance",ylab="Neural Network Prediction:Taining Set")

hist(neural.model.1$NN.error,col="khaki",main="Histogram of Training Set NN Error",xlab="")

Let’s try an ensembled approach
dat <- read.table("Assignment_1_Data.txt",header=T)
train <- sample_frac(dat,0.75)
test <- setdiff(dat,train)
linear.model <- lm(Balance~Income,data=train)
linear.model.prediction <- predict(linear.model,test)
tree.model <- rpart(Balance~.,train)
tree.model.prediction <- predict(tree.model,test)
random.forest.model <- randomForest(Balance~.,train)
random.forest.prediction <- predict(random.forest.model,test)
final.prediction <- cbind(linear.model.prediction,tree.model.prediction,random.forest.prediction)
final.prediction <- rowMeans(final.prediction)
final.prediction <- cbind(dat$Balance,final.prediction)
final.error <- (final.prediction[,1]-final.prediction[,2])
final.prediction <- cbind(final.prediction,final.error)
final.prediction <- as.data.frame(final.prediction)
names(final.prediction) <- c("True Credit Balance","Ensemble Prediction","Ensemble Error")
plot(final.prediction$`True Credit Balance`~final.prediction$`Ensemble Prediction`,pch=16,col="red",ylim=c(-10,50),main="True Credit Balance vs Ensemble Prediction",sub="Error indicated in Orange",xlab="Ensemble Prediction",ylab="True Credit Balance")
lines(final.prediction$`Ensemble Error`~final.prediction$`True Credit Balance`,type="h",col="orange")

This is only on 100 test observations, let’s test this ensemble model on all data:
dat <- read.table("Assignment_1_Data.txt",header=T)
linear.model <- lm(Balance~Income,data=dat)
linear.model.prediction <- predict(linear.model,dat)
tree.model <- rpart(Balance~.,dat)
tree.model.prediction <- predict(tree.model,dat)
random.forest.model <- randomForest(Balance~.,dat)
random.forest.prediction <- predict(random.forest.model,dat)
final.prediction <- cbind(linear.model.prediction,tree.model.prediction,random.forest.prediction)
final.prediction <- rowMeans(final.prediction)
final.prediction <- cbind(dat$Balance,final.prediction)
final.error <- (final.prediction[,1]-final.prediction[,2])
final.prediction <- cbind(final.prediction,final.error)
final.prediction <- as.data.frame(final.prediction)
names(final.prediction) <- c("True Credit Balance","Ensemble Prediction","Ensemble Error")
plot(final.prediction$`True Credit Balance`~final.prediction$`Ensemble Prediction`,pch=16,col="red",ylim=c(-10,50),main="True Credit Balance vs Ensemble Prediction",sub="Error indicated in Orange",xlab="Ensemble Prediction",ylab="True Credit Balance")
lines(final.prediction$`Ensemble Error`~final.prediction$`True Credit Balance`,type="h",col="orange")

par(mfrow=c(1,2))
hist(final.prediction$`Ensemble Error`,col="khaki",main="Histogram of Ensemble Error",xlab="")
plot(density(final.prediction$`Ensemble Error`),col="red",main="Density of Ensemble Error")

lin.mod.plot <- cbind(linear.model.prediction-dat$Balance)
hist(lin.mod.plot,col="khaki",main="Histogram of Linear Model Error",xlab="")
plot(density(lin.mod.plot),col="red",main="Density of Ensemble Error")

Our model has improved, from the initial linear model, based only on Income…
barplot(c(sum(abs(final.prediction$`Ensemble Error`)),sum(abs(lin.mod.plot))),main="Sum of the Absolute Value of Model Error",names.arg = c("Ensemble Model","Linear Model Based on Income"),col=c("darkblue","deeppink"))

Still, that’s a lot of error…
Bearing in mind the sample range of our outcome variable
RANGE <- as.matrix(range(dat$Balance),ncol=2)
RANGE <- as.data.frame(t(RANGE))
kable(RANGE,col.names = c("Minimum Balance","Maximum Balance"))
And our sample size…
kable(nrow(dat),col.names = "Sample Size")
We end up with the following scale of error:
kable(sum(abs(final.prediction$`Ensemble Error`))/nrow(dat),col.names = "Absolute value of error per observation")
If we’re smart about it, we’ll use General Additive Models from the MGCV package:
dat <- read.table("Assignment_1_Data.txt",header=T)
train <- sample_frac(dat,0.75)
test <- setdiff(dat,train)
require(mgcv)
my.gam <- gam(Balance~s(Income,k=3)+s(Limit,k=3)+s(Rating,k=30)+s(Cards,k=8)+s(Age,k=10)+s(Education,k=15),data=train)
par(mfrow=c(2,2))
gam.check(my.gam)

##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 17 iterations.
## The RMS GCV score gradient at convergence was 0.00000005132490567 .
## The Hessian was positive definite.
## Model rank = 64 / 64
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Income) 2.00 1.00 0.99 0.38
## s(Limit) 2.00 2.00 1.03 0.66
## s(Rating) 29.00 3.03 1.06 0.86
## s(Cards) 7.00 1.00 1.10 0.95
## s(Age) 9.00 4.34 1.04 0.64
## s(Education) 14.00 3.95 1.03 0.73
gam.pred <- as.numeric(unlist(predict(my.gam,test)))
GAM.model <- cbind(test$Balance,gam.pred)
GAM.model <- as.data.frame(GAM.model)
GAM.error <- as.numeric(unlist(predict(my.gam,test)))-test$Balance
GAM.model <- cbind(GAM.model,GAM.error)
names(GAM.model) <- c("True Balance","GAM Prediction","GAM Error")
plot(GAM.model$`True Balance`~GAM.model$`GAM Prediction`,pch=16,col="red",ylim=c(-5,40),ylab="True Credit Balance Test Set",xlab="GAM prediction Test Set")
lines(GAM.model$`GAM Error`~GAM.model$`True Balance`,type="h",col="orange")

all.data.gam.pred <- cbind(dat$Balance,as.numeric(unlist(predict(my.gam,dat))))
gam.error.all.data <- dat$Balance-as.numeric(unlist(predict(my.gam,dat)))
all.data.gam.pred <- cbind(all.data.gam.pred,gam.error.all.data)
all.data.gam.pred <- as.data.frame(all.data.gam.pred)
names(all.data.gam.pred) <- c("True Credit Balance","Gam C Balance Pred","Gam Error")
plot(all.data.gam.pred$`True Credit Balance`~all.data.gam.pred$`Gam C Balance Pred`,ylim=c(-10,40),pch=16,col="red",ylab="True Credit Balance All Data",xlab="GAM Prediction All Data")
lines(all.data.gam.pred$`Gam Error`~all.data.gam.pred$`True Credit Balance`,type="h",col="orange")

hist(all.data.gam.pred$`Gam Error`,col="khaki",main="Histogram of GAM Error on All Data",xlab="")

The Sum of Absolute Error in the GAM model is:
kable(sum(abs(all.data.gam.pred$`Gam Error`)),col.names = "GAM Model: Sum of Absolue Error")
In summary, error is a pesky thing to get rid of…
plot(dat$Balance,all.data.gam.pred$`Gam Error`,pch=16,col="red",xlab="True Credit Balance",ylab="GAM model error")
abline(h=0,col="blue",lty="dotted")
