We will use insights from a gradient boosting machine (gbm) to build better linear regression model.
There are two key limitations of linear regression:
Our example dataset is from the uci-repository on abalone; a type of shelled sea-animal. The researchers were interested in predicting the age (Rings) of the abalones from different covariates such as weight, diameter, and length.
The data has 4200 rows and 9 columns with the first few lines as follows:
data=read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",sep=",")
colnames(data)=c("Sex","Length","Diameter","Height","Whole_w",
"Shucked_w","Viscera_w","Shell_w","Rings")
head(data,3)
## Sex Length Diameter Height Whole_w Shucked_w Viscera_w Shell_w Rings
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.15 15
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.07 7
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.21 9
Before building any models, some exploratory data analysis reveals that we should log the response.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
ggplot(data,aes(x=Rings))+geom_histogram()
and after logging, we get something that looks normal
data['logged']=log(data$Rings+1)
ggplot(data,aes(x=logged))+geom_histogram()
We will split the dataset into training/test sets (60/40) for model validation.
d = sort(sample(nrow(data), nrow(data)*.6))
#select training sample
train<-data[d,]
test<-data[-d,]
Now, we will try a few linear models.
library(car)
## Warning: package 'car' was built under R version 3.1.2
library(forecast)
## Warning: package 'forecast' was built under R version 3.1.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.1.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: timeDate
## Warning: package 'timeDate' was built under R version 3.1.2
## This is forecast 5.7
library(lattice)
# null model
m0=glm(logged~1,data=train)
plot(m0)
## hat values (leverages) are all = 0.0003990423
## and there are no factor predictors; no plot no. 5
# saturated model
m1<-glm(logged~. -Rings,data=train,family=gaussian())
plot(m1)
# check a few plots outliers?
crPlots(m1)
# delete outlier
train=train[-which(train$Height>0.5),]
vif(m1)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.558752 2 1.117363
## Length 40.582312 1 6.370425
## Diameter 42.797253 1 6.541961
## Height 6.058339 1 2.461369
## Whole_w 101.313986 1 10.065485
## Shucked_w 27.240498 1 5.219243
## Viscera_w 17.499280 1 4.183214
## Shell_w 20.696339 1 4.549323
#update model
m1<-glm(logged~.-Rings -Whole_w -Length,data=train,family=gaussian())
plot(m1)
crPlots(m1)
# residual diag
plot(m1$residuals)
lines(lowess(m1$residuals),col='red',lwd=3)
lines(rep(0,length(m1$residuals)),col='green',lwd=3)
#acf
tsdisplay(m1$residuals)
#dist
histogram(m1$residuals,breaks=50)
hist(m1$residuals,breaks=50,freq=F)
xfit<-seq(min(m1$residuals),max(m1$residuals),length=100)
yfit<-dnorm(xfit,mean=mean(m1$residuals),sd=sd(m1$residuals))
lines(xfit, yfit,col='red',lwd=3)
#update m0
m0=glm(logged~1,data=train,family=gaussian)
# model comparison
anova(m0,m1)
## Analysis of Deviance Table
##
## Model 1: logged ~ 1
## Model 2: logged ~ (Sex + Length + Diameter + Height + Whole_w + Shucked_w +
## Viscera_w + Shell_w + Rings) - Rings - Whole_w - Length
## Resid. Df Resid. Dev Df Deviance
## 1 2504 201.873
## 2 2497 81.686 7 120.19
And the p-value is 0 so that our saturated model fits considerably better.
We will look at a scatterplot matrix to see relationships between variables
And we will arbitrarily delete Length and Whole_w since they are highly correlated. The new model is
m2=update(m1,.~.-Length)
anova(m2,m1)
## Analysis of Deviance Table
##
## Model 1: logged ~ Sex + Diameter + Height + Shucked_w + Viscera_w + Shell_w
## Model 2: logged ~ (Sex + Length + Diameter + Height + Whole_w + Shucked_w +
## Viscera_w + Shell_w + Rings) - Rings - Whole_w - Length
## Resid. Df Resid. Dev Df Deviance
## 1 2497 81.686
## 2 2497 81.686 0 0
The error rate on the test set for model m1 is 9.677 and for m2 it is 9.677, so we will keep m1 for now.
We will now run a tree to see if we can get any additional insights.
And the tree model has error 6.233
Which gives us the following tree
##
## Regression tree:
## rpart(formula = logged ~ . - Rings - Diameter - Whole_w, data = train)
##
## Variables actually used in tree construction:
## [1] Shell_w Shucked_w
##
## Root node error: 201.87/2505 = 0.080588
##
## n= 2505
##
## CP nsplit rel error xerror xstd
## 1 0.358690 0 1.00000 1.00027 0.032998
## 2 0.079850 1 0.64131 0.66521 0.021362
## 3 0.047068 2 0.56146 0.57033 0.018857
## 4 0.020590 3 0.51439 0.53459 0.017465
## 5 0.013358 5 0.47321 0.50573 0.016322
## 6 0.012052 6 0.45985 0.49226 0.016253
## 7 0.010999 8 0.43575 0.48151 0.015790
## 8 0.010000 9 0.42475 0.47063 0.015644
## Call:
## rpart(formula = logged ~ . - Rings - Diameter - Whole_w, data = train)
## n= 2505
##
## CP nsplit rel error xerror xstd
## 1 0.35869039 0 1.0000000 1.0002659 0.03299844
## 2 0.07985029 1 0.6413096 0.6652126 0.02136164
## 3 0.04706763 2 0.5614593 0.5703311 0.01885672
## 4 0.02058965 3 0.5143917 0.5345923 0.01746473
## 5 0.01335790 5 0.4732124 0.5057275 0.01632201
## 6 0.01205228 6 0.4598545 0.4922566 0.01625334
## 7 0.01099898 8 0.4357499 0.4815115 0.01579045
## 8 0.01000000 9 0.4247509 0.4706301 0.01564415
##
## Variable importance
## Shell_w Length Shucked_w Viscera_w Height Sex
## 24 20 19 19 17 1
##
## Node number 1: 2505 observations, complexity param=0.3586904
## mean=2.34561, MSE=0.08058808
## left son=2 (525 obs) right son=3 (1980 obs)
## Primary splits:
## Shell_w < 0.1115 to the left, improve=0.3586904, (0 missing)
## Height < 0.1125 to the left, improve=0.3411889, (0 missing)
## Length < 0.4375 to the left, improve=0.3303993, (0 missing)
## Viscera_w < 0.08175 to the left, improve=0.3222184, (0 missing)
## Shucked_w < 0.15775 to the left, improve=0.3022013, (0 missing)
## Surrogate splits:
## Length < 0.4275 to the left, agree=0.972, adj=0.869, (0 split)
## Viscera_w < 0.08025 to the left, agree=0.966, adj=0.836, (0 split)
## Shucked_w < 0.16075 to the left, agree=0.955, adj=0.787, (0 split)
## Height < 0.1025 to the left, agree=0.948, adj=0.750, (0 split)
## Sex splits as RLR, agree=0.791, adj=0.002, (0 split)
##
## Node number 2: 525 observations, complexity param=0.04706763
## mean=2.015432, MSE=0.05344547
## left son=4 (66 obs) right son=5 (459 obs)
## Primary splits:
## Shell_w < 0.02375 to the left, improve=0.3386341, (0 missing)
## Length < 0.2575 to the left, improve=0.3273922, (0 missing)
## Viscera_w < 0.01725 to the left, improve=0.3138988, (0 missing)
## Shucked_w < 0.03525 to the left, improve=0.3132131, (0 missing)
## Height < 0.0625 to the left, improve=0.2911978, (0 missing)
## Surrogate splits:
## Length < 0.2525 to the left, agree=0.990, adj=0.924, (0 split)
## Shucked_w < 0.02825 to the left, agree=0.983, adj=0.864, (0 split)
## Viscera_w < 0.01675 to the left, agree=0.975, adj=0.803, (0 split)
## Height < 0.0575 to the left, agree=0.954, adj=0.636, (0 split)
##
## Node number 3: 1980 observations, complexity param=0.07985029
## mean=2.433157, MSE=0.0512143
## left son=6 (825 obs) right son=7 (1155 obs)
## Primary splits:
## Shell_w < 0.24925 to the left, improve=0.15896390, (0 missing)
## Height < 0.1525 to the left, improve=0.12643500, (0 missing)
## Viscera_w < 0.13125 to the left, improve=0.08709626, (0 missing)
## Sex splits as RLR, improve=0.07063830, (0 missing)
## Length < 0.5325 to the left, improve=0.06530452, (0 missing)
## Surrogate splits:
## Viscera_w < 0.18025 to the left, agree=0.874, adj=0.698, (0 split)
## Length < 0.5525 to the left, agree=0.872, adj=0.692, (0 split)
## Height < 0.1475 to the left, agree=0.862, adj=0.668, (0 split)
## Shucked_w < 0.36425 to the left, agree=0.849, adj=0.639, (0 split)
## Sex splits as RLR, agree=0.718, adj=0.322, (0 split)
##
## Node number 4: 66 observations
## mean=1.660655, MSE=0.03313713
##
## Node number 5: 459 observations, complexity param=0.0133579
## mean=2.066446, MSE=0.03566478
## left son=10 (126 obs) right son=11 (333 obs)
## Primary splits:
## Shell_w < 0.05125 to the left, improve=0.1647269, (0 missing)
## Length < 0.3325 to the left, improve=0.1376584, (0 missing)
## Sex splits as RLR, improve=0.1346215, (0 missing)
## Viscera_w < 0.03225 to the left, improve=0.1207861, (0 missing)
## Height < 0.0875 to the left, improve=0.1150279, (0 missing)
## Surrogate splits:
## Length < 0.3275 to the left, agree=0.948, adj=0.810, (0 split)
## Shucked_w < 0.06525 to the left, agree=0.928, adj=0.738, (0 split)
## Viscera_w < 0.03225 to the left, agree=0.902, adj=0.643, (0 split)
## Height < 0.0775 to the left, agree=0.880, adj=0.563, (0 split)
##
## Node number 6: 825 observations, complexity param=0.01205228
## mean=2.326397, MSE=0.03932197
## left son=12 (335 obs) right son=13 (490 obs)
## Primary splits:
## Shell_w < 0.16775 to the left, improve=0.07024574, (0 missing)
## Sex splits as RLR, improve=0.04775830, (0 missing)
## Height < 0.1225 to the left, improve=0.03790206, (0 missing)
## Viscera_w < 0.12125 to the left, improve=0.02057904, (0 missing)
## Shucked_w < 0.301 to the right, improve=0.01565452, (0 missing)
## Surrogate splits:
## Length < 0.4875 to the left, agree=0.833, adj=0.588, (0 split)
## Viscera_w < 0.11975 to the left, agree=0.819, adj=0.555, (0 split)
## Shucked_w < 0.22225 to the left, agree=0.782, adj=0.463, (0 split)
## Height < 0.1225 to the left, agree=0.756, adj=0.400, (0 split)
## Sex splits as RLR, agree=0.612, adj=0.045, (0 split)
##
## Node number 7: 1155 observations, complexity param=0.02058965
## mean=2.509415, MSE=0.04575243
## left son=14 (829 obs) right son=15 (326 obs)
## Primary splits:
## Shell_w < 0.39175 to the left, improve=0.07503220, (0 missing)
## Shucked_w < 0.44375 to the right, improve=0.05003366, (0 missing)
## Height < 0.1725 to the left, improve=0.04010657, (0 missing)
## Length < 0.5675 to the right, improve=0.01723062, (0 missing)
## Viscera_w < 0.16875 to the right, improve=0.01235743, (0 missing)
## Surrogate splits:
## Length < 0.6525 to the left, agree=0.854, adj=0.482, (0 split)
## Viscera_w < 0.31125 to the left, agree=0.829, adj=0.393, (0 split)
## Height < 0.1825 to the left, agree=0.822, adj=0.368, (0 split)
## Shucked_w < 0.63625 to the left, agree=0.822, adj=0.368, (0 split)
##
## Node number 10: 126 observations
## mean=1.94184, MSE=0.02334352
##
## Node number 11: 333 observations
## mean=2.113594, MSE=0.03222897
##
## Node number 12: 335 observations
## mean=2.262835, MSE=0.03100399
##
## Node number 13: 490 observations, complexity param=0.01205228
## mean=2.369854, MSE=0.04035812
## left son=26 (388 obs) right son=27 (102 obs)
## Primary splits:
## Shucked_w < 0.2445 to the right, improve=0.13083110, (0 missing)
## Length < 0.5025 to the right, improve=0.08145319, (0 missing)
## Sex splits as RLL, improve=0.03477035, (0 missing)
## Viscera_w < 0.18025 to the right, improve=0.03034473, (0 missing)
## Shell_w < 0.19425 to the left, improve=0.01214378, (0 missing)
## Surrogate splits:
## Length < 0.4925 to the right, agree=0.851, adj=0.284, (0 split)
## Viscera_w < 0.10975 to the right, agree=0.841, adj=0.235, (0 split)
##
## Node number 14: 829 observations, complexity param=0.02058965
## mean=2.472673, MSE=0.03992455
## left son=28 (496 obs) right son=29 (333 obs)
## Primary splits:
## Shucked_w < 0.44375 to the right, improve=0.13136930, (0 missing)
## Length < 0.5675 to the right, improve=0.06093165, (0 missing)
## Viscera_w < 0.16875 to the right, improve=0.04037415, (0 missing)
## Shell_w < 0.33925 to the left, improve=0.01231247, (0 missing)
## Height < 0.1625 to the left, improve=0.01040135, (0 missing)
## Surrogate splits:
## Length < 0.5875 to the right, agree=0.806, adj=0.517, (0 split)
## Viscera_w < 0.21525 to the right, agree=0.789, adj=0.474, (0 split)
## Shell_w < 0.29075 to the right, agree=0.682, adj=0.207, (0 split)
## Height < 0.1475 to the right, agree=0.654, adj=0.138, (0 split)
## Sex splits as LRL, agree=0.622, adj=0.060, (0 split)
##
## Node number 15: 326 observations, complexity param=0.01099898
## mean=2.602847, MSE=0.0484098
## left son=30 (242 obs) right son=31 (84 obs)
## Primary splits:
## Shucked_w < 0.56625 to the right, improve=0.14069540, (0 missing)
## Viscera_w < 0.28775 to the right, improve=0.08646927, (0 missing)
## Shell_w < 0.50925 to the left, improve=0.06610217, (0 missing)
## Length < 0.6475 to the right, improve=0.05854863, (0 missing)
## Height < 0.2125 to the left, improve=0.02075267, (0 missing)
## Surrogate splits:
## Viscera_w < 0.2955 to the right, agree=0.859, adj=0.452, (0 split)
## Length < 0.6375 to the right, agree=0.853, adj=0.429, (0 split)
## Height < 0.1575 to the right, agree=0.758, adj=0.060, (0 split)
## Shell_w < 0.39475 to the right, agree=0.745, adj=0.012, (0 split)
##
## Node number 26: 388 observations
## mean=2.332597, MSE=0.02690101
##
## Node number 27: 102 observations
## mean=2.511575, MSE=0.0661827
##
## Node number 28: 496 observations
## mean=2.413333, MSE=0.02237007
##
## Node number 29: 333 observations
## mean=2.561059, MSE=0.05301473
##
## Node number 30: 242 observations
## mean=2.554225, MSE=0.03389019
##
## Node number 31: 84 observations
## mean=2.742927, MSE=0.06380679
##
## Regression tree:
## rpart(formula = logged ~ . - Rings - Diameter - Whole_w, data = train)
##
## Variables actually used in tree construction:
## [1] Shell_w Shucked_w
##
## Root node error: 201.87/2505 = 0.080588
##
## n= 2505
##
## CP nsplit rel error xerror xstd
## 1 0.358690 0 1.00000 1.00027 0.032998
## 2 0.079850 1 0.64131 0.66521 0.021362
## 3 0.047068 2 0.56146 0.57033 0.018857
## 4 0.020590 3 0.51439 0.53459 0.017465
## 5 0.013358 5 0.47321 0.50573 0.016322
## 6 0.012052 6 0.45985 0.49226 0.016253
## 7 0.010999 8 0.43575 0.48151 0.015790
## 8 0.010000 9 0.42475 0.47063 0.015644
We will now fit the gradient boosting machine.
## Warning: package 'gbm' was built under R version 3.1.1
## Warning: package 'dismo' was built under R version 3.1.2
## Warning: package 'raster' was built under R version 3.1.2
## Warning: package 'sp' was built under R version 3.1.2
gbm=gbm.step(data=train, gbm.x = 1:8, gbm.y = 10,
family = "gaussian", tree.complexity = 5,
learning.rate = 0.01, bag.fraction = 0.5)
##
##
## GBM STEP - version 2.9
##
## Performing cross-validation optimisation of a boosted regression tree model
## for logged and using a family of gaussian
## Using 2505 observations and 8 predictors
## creating 10 initial models of 50 trees
##
## folds are unstratified
## total mean deviance = 0.0806
## tolerance is fixed at 1e-04
## ntrees resid. dev.
## 50 0.0535
## now adding trees...
## 100 0.0421
## 150 0.0368
## 200 0.0341
## 250 0.0326
## 300 0.0317
## 350 0.0311
## 400 0.0307
## 450 0.0304
## 500 0.0302
## 550 0.0301
## 600 0.0299
## 650 0.0299
## 700 0.0298
## 750 0.0297
## 800 0.0297
## 850 0.0297
## 900 0.0296
## 950 0.0296
## 1000 0.0296
## 1050 0.0296
## 1100 0.0295
## 1150 0.0296
## 1200 0.0296
## 1250 0.0295
## 1300 0.0295
## 1350 0.0295
## 1400 0.0295
## 1450 0.0295
## 1500 0.0295
## 1550 0.0295
## 1600 0.0296
## 1650 0.0296
## fitting final gbm model with a fixed number of 1350 trees for logged
##
## mean total deviance = 0.081
## mean residual deviance = 0.022
##
## estimated cv deviance = 0.03 ; se = 0.001
##
## training data correlation = 0.85
## cv correlation = 0.796 ; se = 0.01
##
## elapsed time - 0.03 minutes
Which has prediction error 4.927.
Then we can plot the marginal effects
par(mfrow=c(3,3))
#par(mar=c(5,5,5,5))
#gbm.plot(gbm4,n.plots=8,write.title=F,common.scale=T)
gbm.plot(gbm,n.plots=8,write.title=F,common.scale=F,smooth=T,plot.layout=c(3,3))
We can make some key observations here: - Shell_w looks quadratic. - Shucked_w also looks quadratic. - Whole_w has a different slope before 0.5.
And the important variables
summary(gbm)
## var rel.inf
## Shell_w Shell_w 59.864851
## Shucked_w Shucked_w 13.113362
## Height Height 8.100374
## Whole_w Whole_w 5.189147
## Viscera_w Viscera_w 4.386936
## Diameter Diameter 3.695383
## Sex Sex 3.019249
## Length Length 2.630698
We can also find interactions between variables as follows
find.int <- gbm.interactions(gbm)
## gbm.interactions - version 2.9
## Cross tabulating interactions for gbm model with 8 predictors
## 1 2 3 4 5 6 7
find.int$interactions
## Sex Length Diameter Height Whole_w Shucked_w Viscera_w Shell_w
## Sex 0 0 0.00 0.08 0.01 0.45 0.01 0.02
## Length 0 0 0.01 0.05 0.11 0.10 0.02 0.02
## Diameter 0 0 0.00 0.01 0.16 0.13 0.01 0.05
## Height 0 0 0.00 0.00 0.03 0.46 0.11 0.30
## Whole_w 0 0 0.00 0.00 0.00 4.28 0.17 0.40
## Shucked_w 0 0 0.00 0.00 0.00 0.00 0.08 1.10
## Viscera_w 0 0 0.00 0.00 0.00 0.00 0.00 0.11
## Shell_w 0 0 0.00 0.00 0.00 0.00 0.00 0.00
find.int$rank.list
## var1.index var1.names var2.index var2.names int.size
## 1 6 Shucked_w 5 Whole_w 4.28
## 2 8 Shell_w 6 Shucked_w 1.10
## 3 6 Shucked_w 4 Height 0.46
Now lets look at the various interactions between the variables
par(mfrow=c(1,1))
gbm.perspec(gbm, 6, 5,phi=40,z.range=c(min(gbm$fitted),max(gbm$fitted)))
## maximum value = 3.12
## Warning in persp.default(x = x.var, y = y.var, z = pred.matrix, zlim =
## z.range, : surface extends beyond the box
gbm.perspec(gbm, 8,6,phi=40,z.range=c(min(gbm$fitted),max(gbm$fitted)))
## maximum value = 3.03
gbm.perspec(gbm, 8,7,phi=40,z.range=c(min(gbm$fitted),max(gbm$fitted)))
## maximum value = 2.81
And then using the insights from the gbm, lets update the linear model as follows
# First update the from the marginal plot from crPlot
m3=update(m1,.~.+I(ifelse(Diameter<0.2,Diameter,0))+I(ifelse(Shucked_w<0.2,Shucked_w,0))+I(ifelse(Viscera_w<0.05,Viscera_w,0))+I(ifelse(Shell_w<0.1,Shell_w,0)))
summary(m3)
##
## Call:
## glm(formula = logged ~ Sex + Diameter + Height + Shucked_w +
## Viscera_w + Shell_w + I(ifelse(Diameter < 0.2, Diameter,
## 0)) + I(ifelse(Shucked_w < 0.2, Shucked_w, 0)) + I(ifelse(Viscera_w <
## 0.05, Viscera_w, 0)) + I(ifelse(Shell_w < 0.1, Shell_w, 0)),
## family = gaussian(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.72365 -0.12022 -0.01737 0.09726 0.72082
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.7043260 0.0435257 39.157
## SexI -0.0792549 0.0107825 -7.350
## SexM 0.0003995 0.0087816 0.045
## Diameter 1.0304949 0.1381525 7.459
## Height 2.1056283 0.2451313 8.590
## Shucked_w -0.8612026 0.0487708 -17.658
## Viscera_w -0.1826866 0.1108659 -1.648
## Shell_w 1.2837705 0.0787043 16.311
## I(ifelse(Diameter < 0.2, Diameter, 0)) -1.0461459 0.1713049 -6.107
## I(ifelse(Shucked_w < 0.2, Shucked_w, 0)) 0.1150130 0.0816736 1.408
## I(ifelse(Viscera_w < 0.05, Viscera_w, 0)) -0.7331219 0.4483705 -1.635
## I(ifelse(Shell_w < 0.1, Shell_w, 0)) -0.3831860 0.2062253 -1.858
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## SexI 2.67e-13 ***
## SexM 0.9637
## Diameter 1.20e-13 ***
## Height < 2e-16 ***
## Shucked_w < 2e-16 ***
## Viscera_w 0.0995 .
## Shell_w < 2e-16 ***
## I(ifelse(Diameter < 0.2, Diameter, 0)) 1.17e-09 ***
## I(ifelse(Shucked_w < 0.2, Shucked_w, 0)) 0.1592
## I(ifelse(Viscera_w < 0.05, Viscera_w, 0)) 0.1022
## I(ifelse(Shell_w < 0.1, Shell_w, 0)) 0.0633 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.03216569)
##
## Null deviance: 201.873 on 2504 degrees of freedom
## Residual deviance: 80.189 on 2493 degrees of freedom
## AIC: -1486.5
##
## Number of Fisher Scoring iterations: 2
# 2nd update the from the marginal plot from gbm
m4=update(m1,.~.+I(ifelse(Height<0.05,Height,0))+I(ifelse(Shucked_w>0.5,Shucked_w,0))+I(ifelse(Shell_w>0.5,Shell_w,0)))
summary(m4)
##
## Call:
## glm(formula = logged ~ Sex + Diameter + Height + Shucked_w +
## Viscera_w + Shell_w + I(ifelse(Height < 0.05, Height, 0)) +
## I(ifelse(Shucked_w > 0.5, Shucked_w, 0)) + I(ifelse(Shell_w >
## 0.5, Shell_w, 0)), family = gaussian(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.72463 -0.11825 -0.01648 0.09887 0.71971
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.622637 0.034281 47.333
## SexI -0.079047 0.010833 -7.297
## SexM -0.001197 0.008812 -0.136
## Diameter 1.244626 0.133522 9.322
## Height 2.090156 0.249159 8.389
## Shucked_w -0.868003 0.064698 -13.416
## Viscera_w -0.224125 0.111325 -2.013
## Shell_w 1.304874 0.092849 14.054
## I(ifelse(Height < 0.05, Height, 0)) -5.467165 1.002865 -5.452
## I(ifelse(Shucked_w > 0.5, Shucked_w, 0)) -0.015903 0.026181 -0.607
## I(ifelse(Shell_w > 0.5, Shell_w, 0)) -0.063035 0.043075 -1.463
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## SexI 3.94e-13 ***
## SexM 0.8919
## Diameter < 2e-16 ***
## Height < 2e-16 ***
## Shucked_w < 2e-16 ***
## Viscera_w 0.0442 *
## Shell_w < 2e-16 ***
## I(ifelse(Height < 0.05, Height, 0)) 5.48e-08 ***
## I(ifelse(Shucked_w > 0.5, Shucked_w, 0)) 0.5436
## I(ifelse(Shell_w > 0.5, Shell_w, 0)) 0.1435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.03235472)
##
## Null deviance: 201.873 on 2504 degrees of freedom
## Residual deviance: 80.693 on 2494 degrees of freedom
## AIC: -1472.8
##
## Number of Fisher Scoring iterations: 2
plot(m4)
crPlots(m4)
## Warning in smoother(.x, partial.res[, var], col = col.lines[2], log.x =
## FALSE, : could not fit smooth
## Warning in smoother(.x, partial.res[, var], col = col.lines[2], log.x =
## FALSE, : could not fit smooth
## Warning in smoother(.x, partial.res[, var], col = col.lines[2], log.x =
## FALSE, : could not fit smooth
anova(m4,m3)
## Analysis of Deviance Table
##
## Model 1: logged ~ Sex + Diameter + Height + Shucked_w + Viscera_w + Shell_w +
## I(ifelse(Height < 0.05, Height, 0)) + I(ifelse(Shucked_w >
## 0.5, Shucked_w, 0)) + I(ifelse(Shell_w > 0.5, Shell_w, 0))
## Model 2: logged ~ Sex + Diameter + Height + Shucked_w + Viscera_w + Shell_w +
## I(ifelse(Diameter < 0.2, Diameter, 0)) + I(ifelse(Shucked_w <
## 0.2, Shucked_w, 0)) + I(ifelse(Viscera_w < 0.05, Viscera_w,
## 0)) + I(ifelse(Shell_w < 0.1, Shell_w, 0))
## Resid. Df Resid. Dev Df Deviance
## 1 2494 80.693
## 2 2493 80.189 1 0.5036
Which has a prediction error of 8.059.
Now add the interactions into the model
m5=update(m4,.~. +Shucked_w*Whole_w+Shucked_w*Shell_w+Shucked_w*Height)
summary(m5)
##
## Call:
## glm(formula = logged ~ Sex + Diameter + Height + Shucked_w +
## Viscera_w + Shell_w + I(ifelse(Height < 0.05, Height, 0)) +
## I(ifelse(Shucked_w > 0.5, Shucked_w, 0)) + I(ifelse(Shell_w >
## 0.5, Shell_w, 0)) + Whole_w + Shucked_w:Whole_w + Shucked_w:Shell_w +
## Height:Shucked_w, family = gaussian(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.87749 -0.11218 -0.01130 0.09323 0.70240
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.5209929 0.0395568 38.451
## SexI -0.0675335 0.0105187 -6.420
## SexM -0.0008613 0.0084710 -0.102
## Diameter 0.7088869 0.1561637 4.539
## Height 4.3495643 0.4367144 9.960
## Shucked_w -0.5954379 0.1434998 -4.149
## Viscera_w -0.7575524 0.1325888 -5.714
## Shell_w 2.3185891 0.2426245 9.556
## I(ifelse(Height < 0.05, Height, 0)) -3.6753005 1.0042433 -3.660
## I(ifelse(Shucked_w > 0.5, Shucked_w, 0)) 0.0197865 0.0271499 0.729
## I(ifelse(Shell_w > 0.5, Shell_w, 0)) -0.0333759 0.0517313 -0.645
## Whole_w 0.0727921 0.0902658 0.806
## Shucked_w:Whole_w 1.0580790 0.1032086 10.252
## Shucked_w:Shell_w -3.0092620 0.4310528 -6.981
## Height:Shucked_w -6.9144414 1.0075669 -6.863
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## SexI 1.62e-10 ***
## SexM 0.919025
## Diameter 5.91e-06 ***
## Height < 2e-16 ***
## Shucked_w 3.45e-05 ***
## Viscera_w 1.24e-08 ***
## Shell_w < 2e-16 ***
## I(ifelse(Height < 0.05, Height, 0)) 0.000258 ***
## I(ifelse(Shucked_w > 0.5, Shucked_w, 0)) 0.466200
## I(ifelse(Shell_w > 0.5, Shell_w, 0)) 0.518872
## Whole_w 0.420078
## Shucked_w:Whole_w < 2e-16 ***
## Shucked_w:Shell_w 3.74e-12 ***
## Height:Shucked_w 8.51e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.02986995)
##
## Null deviance: 201.873 on 2504 degrees of freedom
## Residual deviance: 74.376 on 2490 degrees of freedom
## AIC: -1669
##
## Number of Fisher Scoring iterations: 2
anova(m5,m4)
## Analysis of Deviance Table
##
## Model 1: logged ~ Sex + Diameter + Height + Shucked_w + Viscera_w + Shell_w +
## I(ifelse(Height < 0.05, Height, 0)) + I(ifelse(Shucked_w >
## 0.5, Shucked_w, 0)) + I(ifelse(Shell_w > 0.5, Shell_w, 0)) +
## Whole_w + Shucked_w:Whole_w + Shucked_w:Shell_w + Height:Shucked_w
## Model 2: logged ~ Sex + Diameter + Height + Shucked_w + Viscera_w + Shell_w +
## I(ifelse(Height < 0.05, Height, 0)) + I(ifelse(Shucked_w >
## 0.5, Shucked_w, 0)) + I(ifelse(Shell_w > 0.5, Shell_w, 0))
## Resid. Df Resid. Dev Df Deviance
## 1 2490 74.376
## 2 2494 80.693 -4 -6.3165
#residuals
plot(m5)
tsdisplay(m5$residuals)
#m
library(earth)
## Warning: package 'earth' was built under R version 3.1.2
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.1.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.1.2
m6=earth(logged~. -Rings,data=train)
m7=earth(logged~. -Rings -Diameter,data=train)
summary(m6)
## Call: earth(formula=logged~.-Rings, data=train)
##
## coefficients
## (Intercept) 1.04872728
## SexI -0.05632461
## h(0.31-Length) -1.42859256
## h(Length-0.31) -0.86566465
## h(0.56-Diameter) -0.74570061
## h(0.185-Height) -1.43499064
## h(0.9615-Whole_w) -1.20551010
## h(Whole_w-0.9615) 0.60821468
## h(Shucked_w-0.452) 1.13894282
## h(1.1335-Shucked_w) 2.21045071
## h(Shucked_w-1.1335) -1.42057718
## h(0.207-Viscera_w) 1.21971172
## h(Viscera_w-0.207) -0.51508318
## h(0.1125-Shell_w) -2.74300468
## h(Shell_w-0.1125) 1.61606870
## h(Shell_w-0.2) -1.04480812
##
## Selected 16 of 18 terms, and 8 of 9 predictors
## Termination condition: Reached nk 21
## Importance: Shell_w, Shucked_w, Whole_w, Length, SexI, Height, ...
## Number of terms at each degree of interaction: 1 15 (additive model)
## GCV 0.02891552 RSS 70.65172 GRSq 0.6414799 RSq 0.6500192
e1=evimp(m6,trim=F)
e1
## nsubsets gcv rss
## Shell_w 15 100.0 100.0
## Shucked_w 13 42.7 43.6
## Whole_w 12 29.7 31.2
## Length 10 22.2 23.9
## SexI 9 18.8 20.6
## Height 7 13.4 15.4
## Viscera_w 6 11.3 13.3
## Diameter 1 3.8 4.8
## SexM-unused 0 0.0 0.0
plot(m6)
plotmo(m6,clip=F)
##
## grid: Sex Length Diameter Height Whole_w Shucked_w Viscera_w Shell_w
## F 0.545 0.425 0.14 0.804 0.3365 0.1715 0.2345
## Rings
## 9
plot(e1)
#resid
hist(gbm$residuals,breaks=50)
tsdisplay(gbm$residuals)
plot(gbm$fitted,gbm$residuals)
plot(gbm$residuals)
# Fit gbm residuals
lm_gbm=glm(gbm$residuals~.-Rings -logged,data=train)
##prediction error
sum((predict(m0,newdata=test)-test$logged)^2)
## [1] 140.5901
sum((predict(m1,newdata=test)-test$logged)^2)
## [1] 63.00566
sum((predict(m2,newdata=test)-test$logged)^2)
## [1] 63.00566
sum((predict(m3,newdata=test)-test$logged)^2)
## [1] 60.81818
sum((predict(m4,newdata=test)-test$logged)^2)
## [1] 61.78671
sum((predict(m5,newdata=test)-test$logged)^2)
## [1] 57.10017
sum((predict(m6,newdata=test)-test$logged)^2)
## [1] 50.78596
sum((predict(m7,newdata=test)-test$logged)^2)
## [1] 50.72114
sum((predict(gbm,newdata=test,n.trees=1400)-test$logged)^2)
## Warning in predict.gbm(gbm, newdata = test, n.trees = 1400): Number of
## trees not specified or exceeded number fit so far. Using 1350.
## [1] 49.80458
sum((predict(tree,newdata=test)-test$logged)^2)
## [1] 65.12776
# some blending
gbm_pred=predict(gbm,newdata=test,n.trees=1400)
## Warning in predict.gbm(gbm, newdata = test, n.trees = 1400): Number of
## trees not specified or exceeded number fit so far. Using 1350.
glm_pred=predict(m5,newdata=test)
mars_pred=predict(m6,newdata=test)
sum(((gbm_pred+mars_pred)/2-test$logged)^2)
## [1] 49.23321
sum(((gbm_pred+glm_pred)/2-test$logged)^2)
## [1] 51.343
sum(((glm_pred+mars_pred)/2-test$logged)^2)
## [1] 52.14234
Which has a prediction error of 7.941.
Finally, compare the models with their out-of-sample error rates:
| Model | Error |
|---|---|
| Full Model | 9.677 |
| Model with additive (m3) | 8.059 |
| Model with interactions (m4) | 7.941 |
| GBM | 4.927 |
| Tree | 6.233 |