library(MASS)
library(mgcv)
library(splines)
data('mcycle')
head(mcycle)
## times accel
## 1 2.4 0.0
## 2 2.6 -1.3
## 3 3.2 -2.7
## 4 3.6 0.0
## 5 4.0 -2.7
## 6 6.2 -2.7
dim(mcycle) #133 2
## [1] 133 2
str(mcycle)
## 'data.frame': 133 obs. of 2 variables:
## $ times: num 2.4 2.6 3.2 3.6 4 6.2 6.6 6.8 7.8 8.2 ...
## $ accel: num 0 -1.3 -2.7 0 -2.7 -2.7 -2.7 -1.3 -2.7 -2.7 ...
summary(mcycle)
## times accel
## Min. : 2.40 Min. :-134.00
## 1st Qu.:15.60 1st Qu.: -54.90
## Median :23.40 Median : -13.30
## Mean :25.18 Mean : -25.55
## 3rd Qu.:34.80 3rd Qu.: 0.00
## Max. :57.60 Max. : 75.00
Y <- mcycle$accel
X <- mcycle$times
First, we explore the bs() function using different number of knots
#knot=1
# Specifying equally spaced knots:
number.knots <- 1
spacings <- seq(from=min(X),to=max(X),length=number.knots+2)[2:(number.knots+1)]
regr.spline <- lm(Y ~ bs(X, df = NULL, knots=spacings, degree = 3, intercept=T))
#K=5
number.knots5 <- 5
spacings5 <- seq(from=min(X),to=max(X),length=number.knots5+2)[2:(number.knots5+1)]
regr.spline5 <- lm(Y ~ bs(X, df = NULL, knots=spacings5, degree = 3, intercept=T))
#K=10
number.knots10 <- 10
spacings10 <- seq(from=min(X),to=max(X),length=number.knots10+2)[2:(number.knots10+1)]
regr.spline10 <- lm(Y ~ bs(X, df = NULL, knots=spacings10, degree = 3, intercept=T))
#K=20
number.knots20 <- 20
spacings20 <- seq(from=min(X),to=max(X),length=number.knots20+2)[2:(number.knots20+1)]
regr.spline20 <- lm(Y ~ bs(X, df = NULL, knots=spacings20, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(regr.spline, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(regr.spline5, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(regr.spline10, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(regr.spline20, data.frame(X=x.values)))
Next, we explore the bs() function using the same number of knots, but different degree of polynomial.
K=1
#K=1
number.knots1 <- 1
spacings1 <- seq(from=min(X),to=max(X),length=number.knots1+2)[2:(number.knots1+1)]
deg1_1 <- lm(Y ~ bs(X, df = NULL, knots=spacings1, degree = 1, intercept=T))
deg2_1 <- lm(Y ~ bs(X, df = NULL, knots=spacings1, degree = 2, intercept=T))
deg3_1 <- lm(Y ~ bs(X, df = NULL, knots=spacings1, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(deg1_1, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(deg2_1, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(deg3_1, data.frame(X=x.values)))
K=5
#K=5
number.knots5 <- 5
spacings5 <- seq(from=min(X),to=max(X),length=number.knots5+2)[2:(number.knots5+1)]
deg1_5 <- lm(Y ~ bs(X, df = NULL, knots=spacings5, degree = 1, intercept=T))
deg2_5 <- lm(Y ~ bs(X, df = NULL, knots=spacings5, degree = 2, intercept=T))
deg3_5 <- lm(Y ~ bs(X, df = NULL, knots=spacings5, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(deg1_5, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(deg2_5, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(deg3_5, data.frame(X=x.values)))
K=10
#K=10
number.knots10 <- 10
spacings10 <- seq(from=min(X),to=max(X),length=number.knots10+2)[2:(number.knots10+1)]
deg1_10 <- lm(Y ~ bs(X, df = NULL, knots=spacings10, degree = 1, intercept=T))
deg2_10 <- lm(Y ~ bs(X, df = NULL, knots=spacings10, degree = 2, intercept=T))
deg3_10 <- lm(Y ~ bs(X, df = NULL, knots=spacings10, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(deg1_10, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(deg2_10, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(deg3_10, data.frame(X=x.values)))
K=20
#K=20
number.knots5 <- 20
spacings20 <- seq(from=min(X),to=max(X),length=number.knots20+2)[2:(number.knots20+1)]
deg1_20 <- lm(Y ~ bs(X, df = NULL, knots=spacings20, degree = 1, intercept=T))
deg2_20 <- lm(Y ~ bs(X, df = NULL, knots=spacings20, degree = 2, intercept=T))
deg3_20 <- lm(Y ~ bs(X, df = NULL, knots=spacings20, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(deg1_20, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(deg2_20, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(deg3_20, data.frame(X=x.values)))
Next, we explore bs() using different K, degree=3, and different df Despite having a degree of polynomial which is 3 and varying the df, the curves produced are identical. This is probably because of K=1. You still get a curve because degree=3, but it’s not a very flexible curve because k=1.
K=1, degree=3
#df=1
df1_1 <- lm(Y ~ bs(X, df = 1, knots=spacings1, degree = 3, intercept=T))
#df=5
df1_5 <- lm(Y ~ bs(X, df = 5, knots=spacings1, degree = 3, intercept=T))
#df=10
df1_10 <- lm(Y ~ bs(X, df = 10, knots=spacings1, degree = 3, intercept=T))
#df=20
df1_20 <- lm(Y ~ bs(X, df = 20, knots=spacings1, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(df1_1, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(df1_5, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df1_10, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df1_20, data.frame(X=x.values)))
K=5, degree=3
The curves here as quite similar to the previous sets of curves for k=1. Does this mean that we would achieve identical curves with the same k and degree, despite changing the degrees of freedom? Let’s see what happens if we vary the degree.
#df=1
df5_1 <- lm(Y ~ bs(X, df = 1, knots=spacings5, degree = 3, intercept=T))
#df=5
df5_5 <- lm(Y ~ bs(X, df = 5, knots=spacings5, degree = 3, intercept=T))
#df=10
df5_10 <- lm(Y ~ bs(X, df = 10, knots=spacings5, degree = 3, intercept=T))
#df=20
df5_20 <- lm(Y ~ bs(X, df = 20, knots=spacings5, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(df5_1, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(df5_5, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df5_10, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df5_20, data.frame(X=x.values)))
K=10 The curves here as quite similar to the previous sets of curves for k=1. Does this mean that we would achieve identical curves with the same k and degree, despite changing the degrees of freedom? Let’s see what happens if we vary the degree.
#df=1
df10_1 <- lm(Y ~ bs(X, df = 1, knots=spacings10, degree = 3, intercept=T))
#df=5
df10_5 <- lm(Y ~ bs(X, df = 5, knots=spacings10, degree = 3, intercept=T))
#df=10
df10_10 <- lm(Y ~ bs(X, df = 10, knots=spacings10, degree = 3, intercept=T))
#df=20
df10_20 <- lm(Y ~ bs(X, df = 20, knots=spacings10, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(df10_1, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(df10_5, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_10, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_20, data.frame(X=x.values)))
Here we try to vary the degree and while keeping the df constant to see what happens.
#df=1
df10_1a <- lm(Y ~ bs(X, df = 1, knots=spacings10, degree = 1, intercept=T))
#df=5
df10_5a <- lm(Y ~ bs(X, df = 1, knots=spacings10, degree = 2, intercept=T))
#df=10
df10_10a <- lm(Y ~ bs(X, df = 1, knots=spacings10, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(df10_1a, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(df10_5a, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_10a, data.frame(X=x.values)))
Still not sure what df does, so we explore with larger values of df. No difference still. Let’s try degree of 1
#df=10
df10_1b <- lm(Y ~ bs(X, df = 10, knots=spacings10, degree = 3, intercept=T))
#df=100
df10_5b <- lm(Y ~ bs(X, df = 100, knots=spacings10, degree = 3, intercept=T))
#df=1000
df10_10b <- lm(Y ~ bs(X, df = 1000, knots=spacings10, degree = 3, intercept=T))
#df=10000
df10_20b <- lm(Y ~ bs(X, df = 10000, knots=spacings10, degree = 3, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(df10_1b, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(df10_5b, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_10b, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_20b, data.frame(X=x.values)))
No major difference still. So, we decided to not specify the number of knots. what is the effect of df? When degree 1 and df=1, we get a straight line. With same degree of freedom, the df controls the flexibility of the curve. So, you can specify the df instead of the knots. If you specify the K and then the df, the df has no effect! Seems like the degree produces the curve and controls the curvature, while the df and k control its smoothness, continuity, and flexibility.
#df=1
df10_1c <- lm(Y ~ bs(X, df = 1, knots=NULL, degree = 1, intercept=T))
#df=5
df10_5c <- lm(Y ~ bs(X, df = 5, knots=NULL, degree = 3, intercept=T))
#df=1000
df10_10c <- lm(Y ~ bs(X, df = 10, knots=NULL, degree = 3, intercept=T))
#df=10000
df10_20c<- lm(Y ~ bs(X, df = 20, knots=NULL, degree = 1, intercept=T))
# plotting the data with the regression spline overlain:
par(mfrow = c(2, 2))
x.values <- seq(from=min(X), to=max(X), length=200)
plot(X, Y); lines(x.values, predict(df10_1c, data.frame(X=x.values)))
plot(X, Y); lines(x.values,predict(df10_5c, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_10c, data.frame(X=x.values)))
plot(X, Y); lines(x.values, predict(df10_20c, data.frame(X=x.values)))
#About gam()
#input:
#formula: exactly like the formula for a GLM except that
#smooth terms, s, te, ti and t2, can be added to the right
#hand side to specify that the linear predictor depends on
#smooth functions of predictors (or linear functionals of these).
#family: a family object specifying the distribution and link to use in fitting, e.g binomial
#data: A data frame or list containing the model response variable and covariates
#weights: prior weights on the contribution of the data to the log likelihood.
#subset:an optional vector specifying a subset of observations to be used in the fitting process
#control: A list of fit control parameters to replace defaults returned by gam.control. Values not set assume default values.
#method: The smoothing parameter estimation method: GCV, REML (default), ML
#optimizer: An array specifying the numerical optimization method to use to optimize the smoothing parameter estimation criterion (given by method)
#knots: optional list containing user specified knot values to be used for basis construction.
#sp: A vector of smoothing parameters can be provided here.
#Negative elements indicate that the parameter should be estimated, and hence a mixture of fixed and estimated parameters is possible
#Output:
## Create a formula for a model:
gam_formula <- as.formula(paste("Y~s(X)"))
gam_mod <- gam(formula = gam_formula,data=mcycle);
summary(gam_mod)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Y ~ s(X)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -25.546 1.951 -13.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(X) 8.693 8.972 53.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.783 Deviance explained = 79.8%
## GCV = 545.78 Scale est. = 506 n = 133
#plot the model
plot(gam_mod, residuals = TRUE, pch = 1)
Setting the complexity of the model
Now use different basis function k=1,5,10,20
gam_mod_k1 <- gam(Y ~ s(X, k = 1), data = mcycle)
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
gam_mod_k5 <- gam(Y ~ s(X, k = 5), data = mcycle)
gam_mod_k10 <- gam(Y ~ s(X, k = 10), data = mcycle)
gam_mod_k20 <- gam(Y ~ s(X, k = 20), data = mcycle)
#visualize the GAMs
par(mfrow = c(2, 2))
plot(gam_mod_k1, residuals = TRUE, pch = 1)
plot(gam_mod_k5, residuals = TRUE, pch = 1)
plot(gam_mod_k10, residuals = TRUE, pch = 1)
plot(gam_mod_k20, residuals = TRUE, pch = 1)
Using different smoothing parameter
The smoothing parameter balances between likelihood and wiggliness to optimize model fit. Here, we examine smoothing parameters and fit models with different fixed smoothing parameters.
# Extract the smoothing parameter
#View the value of the smoothing parameter lambda of the provided gam_mod model
gam_mod1 <- gam(Y ~ s(X), data = mcycle, method = "REML")
gam_mod1$sp #0.0007758036
## s(X)
## 0.0007758036
# Fix the smoothing parameter at 0.1
gam_mod_s1 <- gam(Y ~ s(X), data = mcycle, sp = 0.1)
# Fix the smoothing parameter at 0.0001
gam_mod_s2 <- gam(Y ~ s(X), data = mcycle, sp = 0.0001)
# Plot both models
par(mfrow = c(1, 2))
plot(gam_mod_s1, residuals = TRUE, pch = 1)
plot(gam_mod_s2, residuals = TRUE, pch = 1)
Using complexity and smoothing together
The number of basis functions and the smoothing parameters interact to control the wiggliness of a smooth function. Here you will see how changing both together affects model behavior.
#a smoothing parameter of 0.0001
gam_mod_sk1 <- gam(Y ~ s(X, k = 1), data = mcycle, sp = 0.0001)
gam_mod_sk5 <- gam(Y ~ s(X, k = 5), data = mcycle, sp = 0.0001)
gam_mod_sk10 <- gam(Y ~ s(X, k = 10), data = mcycle, sp = 0.0001)
gam_mod_sk20 <- gam(Y ~ s(X, k = 20), data = mcycle, sp = 0.0001)
# Visualize the model
par(mfrow = c(2, 2))
plot(gam_mod_sk1, residuals = TRUE, pch = 1)
plot(gam_mod_sk5, residuals = TRUE, pch = 1)
plot(gam_mod_sk10, residuals = TRUE, pch = 1)
plot(gam_mod_sk20, residuals = TRUE, pch = 1)
We fit a GAM which is Non linear in ‘age’ and ‘year’ with 6 degrees of freedom because they are fitted using Smoothing Splines, whereas it is Linear in terms of variable ‘education’.
require(ISLR)
library(gam)
attach(Wage) #Mid-Atlantic Wage Data
#?Wage # To search more on the dataset
#?gam() # To search on the gam function
gam1 <- gam(wage~s(age, df=6)+s(year,df=6)+education ,data = Wage)
summary(gam1)
##
## Call: gam(formula = wage ~ s(age, df = 6) + s(year, df = 6) + education,
## data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.89 -19.73 -3.28 14.27 214.45
##
## (Dispersion Parameter for gaussian family taken to be 1235.516)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3685543 on 2983 degrees of freedom
## AIC: 29890.31
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(age, df = 6) 1 200717 200717 162.456 < 2.2e-16 ***
## s(year, df = 6) 1 22090 22090 17.879 2.425e-05 ***
## education 4 1069323 267331 216.372 < 2.2e-16 ***
## Residuals 2983 3685543 1236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(age, df = 6) 5 26.2089 <2e-16 ***
## s(year, df = 6) 5 1.0144 0.4074
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,3))
plot(gam1, se=TRUE ,col="blue")
Salary first increases with ‘age’ then decreases after around 60. For variable ‘year’ the salaries tend to increase, and it seems that there is a decrease in salary at around year 2007 or 2008. And for the categorical variable ‘education’, Salary also increases monotonically.
credit.data <- read.csv("credit0.csv", header=T)
#str(credit.data) 5,000 obs of 63 variables
#remove X9 and id from the data since we will not be using them for prediction.
credit.data$X9 = NULL
credit.data$id = NULL
credit.data$Y = as.factor(credit.data$Y)
#split the data 90/10 as training/testing datasets:
set.seed(1234)
id_train <- sample(nrow(credit.data),nrow(credit.data)*0.90)
credit.train = credit.data[id_train,]
credit.test = credit.data[-id_train,]
str(credit.train) #4,500 obs of 61 variables
## 'data.frame': 4500 obs. of 61 variables:
## $ Y : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ X2 : num -0.934 -0.349 1.072 -0.432 1.49 ...
## $ X3 : num -0.181 -0.307 0.827 -0.433 -0.307 ...
## $ X4 : num -0.695 -0.426 1.459 -0.291 2.671 ...
## $ X5 : num -0.825 -0.825 2.58 -0.825 -0.17 -0.432 -0.825 -0.432 -0.694 -0.694 ...
## $ X6 : num -0.537 -0.537 0.943 -0.537 -0.537 ...
## $ X7 : num -0.528 -0.071 0.339 0.025 0.086 -0.355 -0.421 -0.626 -0.276 -0.485 ...
## $ X8 : num -0.302 0.798 -0.302 -0.302 -0.302 ...
## $ X10_2 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ X11_2 : int 1 1 1 0 1 1 1 1 1 1 ...
## $ X12_2 : int 1 1 1 1 1 1 1 0 1 1 ...
## $ X13_2 : int 1 1 1 1 1 0 1 0 1 1 ...
## $ X14_2 : int 1 1 1 1 1 1 0 1 1 1 ...
## $ X15_2 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ X15_3 : int 1 1 0 0 0 0 1 0 0 0 ...
## $ X15_4 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ X15_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X15_6 : int 0 0 0 0 0 0 0 1 1 1 ...
## $ X16_2 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ X16_3 : int 0 1 0 0 1 0 0 0 0 1 ...
## $ X16_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X16_5 : int 1 0 0 0 0 1 1 0 1 0 ...
## $ X16_6 : int 0 0 0 1 0 0 0 1 0 0 ...
## $ X17_2 : int 0 0 0 0 0 0 1 0 0 0 ...
## $ X17_3 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ X17_4 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ X17_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X17_6 : int 1 0 1 1 1 1 0 0 1 1 ...
## $ X18_2 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ X18_3 : int 0 0 1 0 1 0 0 0 0 0 ...
## $ X18_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X18_5 : int 1 1 0 0 0 0 1 1 0 0 ...
## $ X18_6 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ X18_7 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ X19_2 : int 0 1 0 0 0 0 0 0 1 0 ...
## $ X19_3 : int 1 0 0 1 0 0 0 0 0 0 ...
## $ X19_4 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ X19_5 : int 0 0 0 0 0 1 0 1 0 0 ...
## $ X19_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X19_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X19_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X19_9 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ X19_10: int 0 0 0 0 0 0 1 0 0 0 ...
## $ X20_2 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ X20_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X20_4 : int 1 0 0 1 0 0 1 0 1 1 ...
## $ X21_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X21_3 : int 1 1 1 1 1 1 0 0 1 1 ...
## $ X22_2 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ X22_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X22_4 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ X22_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X22_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X22_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X22_8 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ X22_9 : int 0 1 1 0 0 0 0 0 0 0 ...
## $ X22_10: int 0 0 0 0 0 0 1 0 0 0 ...
## $ X22_11: int 1 0 0 0 0 0 0 0 0 0 ...
## $ X23_2 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ X23_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X24_2 : int 0 0 0 0 0 0 0 0 0 0 ...
Model and plots
## Create a formula for a model with a large number of variables:
gam_formula <- as.formula(paste("Y~s(X2)+s(X3)+s(X4)+s(X5)+", paste(colnames(credit.train)[6:61], collapse= "+")))
#(colnames(credit.train)[6:61]: add all other variables to the model from variable 6 given that we have listed up to X5
credit.gam <- gam(formula = gam_formula, family=binomial,data=credit.train);
summary(credit.gam)
##
## Call: gam(formula = gam_formula, family = binomial, data = credit.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4925 -0.3516 -0.2220 -0.1287 3.2362
##
## (Dispersion Parameter for binomial family taken to be 1)
##
## Null Deviance: 2037.201 on 4499 degrees of freedom
## Residual Deviance: 1629.217 on 4427 degrees of freedom
## AIC: 1775.217
##
## Number of Local Scoring Iterations: 8
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(X2) 1 4.6 4.629 5.0266 0.0250101 *
## s(X3) 1 22.8 22.783 24.7391 6.813e-07 ***
## s(X4) 1 0.4 0.408 0.4435 0.5054765
## s(X5) 1 0.6 0.583 0.6331 0.4262525
## X6 1 0.2 0.223 0.2423 0.6225702
## X7 1 2.5 2.489 2.7025 0.1002651
## X8 1 38.3 38.284 41.5711 1.258e-10 ***
## X10_2 1 12.6 12.589 13.6700 0.0002206 ***
## X11_2 1 41.5 41.508 45.0710 2.140e-11 ***
## X12_2 1 3.8 3.765 4.0879 0.0432514 *
## X13_2 1 7.8 7.843 8.5164 0.0035374 **
## X14_2 1 0.3 0.324 0.3518 0.5531412
## X15_2 1 0.3 0.271 0.2939 0.5877321
## X15_3 1 2.5 2.512 2.7282 0.0986636 .
## X15_4 1 4.1 4.134 4.4885 0.0341794 *
## X15_5 1 0.2 0.159 0.1726 0.6778008
## X15_6 1 10.3 10.295 11.1787 0.0008343 ***
## X16_2 1 12.5 12.488 13.5601 0.0002338 ***
## X16_3 1 0.0 0.000 0.0002 0.9893479
## X16_4 1 2.1 2.112 2.2937 0.1299707
## X16_5 1 0.2 0.154 0.1674 0.6824351
## X16_6 1 0.0 0.017 0.0181 0.8931110
## X17_2 1 1.9 1.872 2.0323 0.1540614
## X17_3 1 3.4 3.424 3.7177 0.0539019 .
## X17_4 1 1.5 1.487 1.6142 0.2039759
## X17_5 1 5.2 5.238 5.6876 0.0171266 *
## X17_6 1 38.7 38.681 42.0017 1.012e-10 ***
## X18_2 1 1.4 1.381 1.4998 0.2207716
## X18_3 1 2.5 2.502 2.7167 0.0993738 .
## X18_4 1 6.8 6.754 7.3343 0.0067910 **
## X18_5 1 5.6 5.566 6.0442 0.0139898 *
## X18_6 1 0.4 0.426 0.4624 0.4965518
## X18_7 1 5.5 5.451 5.9192 0.0150160 *
## X19_2 1 0.2 0.233 0.2532 0.6148492
## X19_3 1 3.8 3.809 4.1359 0.0420425 *
## X19_4 1 0.4 0.415 0.4509 0.5019231
## X19_5 1 0.1 0.089 0.0962 0.7564315
## X19_6 1 0.2 0.182 0.1978 0.6565367
## X19_7 1 0.7 0.715 0.7763 0.3783208
## X19_8 1 5.0 5.049 5.4823 0.0192542 *
## X19_9 1 0.2 0.170 0.1849 0.6672221
## X19_10 1 2.0 1.975 2.1445 0.1431488
## X20_2 1 1.6 1.585 1.7215 0.1895733
## X20_3 1 0.0 0.047 0.0508 0.8216754
## X20_4 1 2.3 2.324 2.5234 0.1122396
## X21_2 1 0.2 0.171 0.1861 0.6662417
## X21_3 1 16.4 16.379 17.7856 2.522e-05 ***
## X22_2 1 6.8 6.759 7.3389 0.0067736 **
## X22_3 1 2.0 1.976 2.1460 0.1430138
## X22_4 1 0.5 0.490 0.5324 0.4656573
## X22_5 1 0.4 0.416 0.4512 0.5017953
## X22_6 1 0.0 0.018 0.0191 0.8902176
## X22_7 1 0.0 0.010 0.0110 0.9164708
## X22_8 1 3.1 3.136 3.4055 0.0650462 .
## X22_9 1 3.6 3.640 3.9522 0.0468707 *
## X22_10 1 2.1 2.121 2.3029 0.1292059
## X22_11 1 1.3 1.295 1.4061 0.2357763
## X23_2 1 0.0 0.009 0.0101 0.9200586
## X23_3 1 0.6 0.638 0.6932 0.4051333
## X24_2 1 1.5 1.520 1.6509 0.1989034
## Residuals 4427 4077.0 0.921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar Chisq P(Chi)
## (Intercept)
## s(X2) 3 1.1245 0.771137
## s(X3) 3 1.0675 0.784917
## s(X4) 3 6.3605 0.095348 .
## s(X5) 3 12.2901 0.006454 **
## X6
## X7
## X8
## X10_2
## X11_2
## X12_2
## X13_2
## X14_2
## X15_2
## X15_3
## X15_4
## X15_5
## X15_6
## X16_2
## X16_3
## X16_4
## X16_5
## X16_6
## X17_2
## X17_3
## X17_4
## X17_5
## X17_6
## X18_2
## X18_3
## X18_4
## X18_5
## X18_6
## X18_7
## X19_2
## X19_3
## X19_4
## X19_5
## X19_6
## X19_7
## X19_8
## X19_9
## X19_10
## X20_2
## X20_3
## X20_4
## X21_2
## X21_3
## X22_2
## X22_3
## X22_4
## X22_5
## X22_6
## X22_7
## X22_8
## X22_9
## X22_10
## X22_11
## X23_2
## X23_3
## X24_2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,4))
plot(credit.gam, shade=TRUE,seWithMean=TRUE,scale=0, pages=1)
Model AIC/BIC and mean residual deviance
AIC(credit.gam)
## [1] 1775.217
BIC(credit.gam)
## [1] 2166.339
credit.gam$deviance
## [1] 1629.217
In-sample fit performance
#get our confusion matrix
pcut.gam <- .08
prob.gam.in <- predict(credit.gam,credit.train,type="response")
pred.gam.in <- (prob.gam.in>=pcut.gam)*1
table(credit.train$Y,pred.gam.in,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 3418 813
## 1 96 173
#misclassification rate:
mean(ifelse(credit.train$Y != pred.gam.in, 1, 0))
## [1] 0.202
Search for optimal cut-off probability
#define the search grid from 0.01 to 0.20
searchgrid = seq(0.01, 0.20, 0.01)
#the 1st col stores the cut-off p, the 2nd column stores the cost
result.gam = cbind(searchgrid, NA)
#in the cost function, both r and pi are vectors, r=Observed, pi=predicted probability
cost1 <- function(r, pi){
weight1 = 10
weight0 = 1
c1 = (r==1)&(pi<pcut) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi>pcut) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
for(i in 1:length(searchgrid))
{
pcut <- result.gam[i,1]
#assign the cost to the 2nd col
result.gam[i,2] <- cost1(credit.train$Y, predict(credit.gam,type="response"))
}
plot(result.gam, ylab="Cost in Training Set")
index.min<-which.min(result.gam[,2])#find the index of minimum value
result.gam[index.min,2] #min cost
##
## 0.3882222
result.gam[index.min,1] #optimal cutoff probability
## searchgrid
## 0.1
Out-of-sample fit performance
pcut <- result.gam[index.min,1]
prob.gam.out<-predict(credit.gam,credit.test,type="response")
pred.gam.out<-(prob.gam.out>=pcut)*1
table(credit.test$Y,pred.gam.out,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 392 77
## 1 19 12
mean(ifelse(credit.test$Y != pred.gam.out, 1, 0))
## [1] 0.192
#Cost associated with misclassification is
creditcost <- function(observed, predicted){
weight1 = 10
weight0 = 1
c1 = (observed==1)&(predicted == 0) #logical vector - true if actual 1 but predict 0
c0 = (observed==0)&(predicted == 1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
creditcost(credit.test$Y, pred.gam.out)
## [1] 0.534
The addition of a truncated power basis function to the model for a cubic polynomial will lead to a discontinuity in only the third derivative; the function will remain continuous, with continuous first and second derivatives, at each of the knots. Hence the spline function will be smooth and continuous.
The truncated power basis is used for imposing a certain degree of smoothness at the knot locations. This is done by incorporating additional constraints at the knot locations
“bs=tr”: truncated power basis m = 2: quadratic spline default dimension of the spline basis, k, is set to be 4 = 1 + degree of spline (2) + number of knots (1). “method=GCV.Cp”: specifies the smoothing parameter selection method scale = -1, implies its unknown and needs to be estimated fx=fx :
#k<-1
#tr1 <- gam(Y~s(X,bs="tr",m=2,k=k),
#family=gaussian,method="GCV.Cp",scale=-1)