#1) Problem 3.1 from the text
#3.1a) Fit the Gaussian GAM model
library(AER)
data(HousePrices)
library(mgcv)
fitGaussAM<-gam(price~s(lotsize,k=27)+bedrooms
+factor(bathrooms)+factor(stories)
+factor(driveway)+factor(recreation)
+factor(fullbase)+factor(gasheat)
+factor(aircon)+garage+factor(prefer),
data=HousePrices, family=gaussian)
#3.1b) Check residuals for Gaussian fit
par(mfrow=c(2,2))
gam.check(fitGaussAM)
gam.check plots (Documents/Grad School/STAT 689/Homework/gaussgam.png) FIX
The Gaussian model appears to fit fairly well; the plot of the residuals looks mostly random, and the Normal QQ plot produces an acceptable fit. The only area of concern is that the model does not seem to be a good fit for several values in the middle of the data set as shown by the deviance in the upper part of the QQ plot and the right skew of the residuals histogram.
#3.1c) Fit the Gamma GAM model
fitGammaAM<-gam(price~s(lotsize,k=27)+bedrooms
+factor(bathrooms)+factor(stories)
+factor(driveway)+factor(recreation)
+factor(fullbase)+factor(gasheat)
+factor(aircon)+garage+factor(prefer),
data=HousePrices, family=Gamma)
#3.1d) Check residual for Gamma fit
par(mfrow=c(2,2))
gam.check(fitGammaAM)
INSERT PLOTS
The Gamma GAM model appears to be a very good fit. The residual plot looks completely random, the histogram of residuals looks mostly symmetrical and Normally distributed, and the QQ plot shows an almost perfect fit. Based on these plots the Gamma model is a better fit for this data than the Gaussian model.
#2) Apply stepwise & variable selection methods to the 3.1 data
#2.a) Stepwise regression
detach("package:mgcv", unload=TRUE)
library(gam)
#baseline model
fitinit<-gam::gam(price~lotsize+bedrooms
+as.factor(bathrooms)+as.factor(stories)
+as.factor(driveway)+as.factor(recreation)
+as.factor(fullbase)+as.factor(gasheat)
+as.factor(aircon)+garage+as.factor(prefer),
data=HousePrices, family=gaussian)
summary(fitinit)
#stepwise model
fitstep = gam::step.Gam(fitinit,scope =
list("lotsize" = ~1 + lotsize +s(lotsize,df=2),
"bedrooms" = ~1 + bedrooms,
"bathrooms" = ~1 + as.factor(bathrooms),
"stories" = ~1 + as.factor(stories),
"driveway" = ~1 + as.factor(driveway),
"recreation" = ~1 + as.factor(recreation),
"fullbase" = ~1 + as.factor(fullbase),
"gasheat" = ~1 + as.factor(gasheat),
"aircon" = ~1 + as.factor(aircon),
"garage" = ~1 + garage,
"prefer" = ~1 + as.factor(prefer)),
family = gaussian, data = HousePrices)
print(names(fitstep$"model")[-1])
INSERT CONSOLE OUTPUT
#2.b) Variable selection model
detach("package:gam", unload=TRUE)
library(mgcv)
fitvs = mgcv::gam(price~lotsize+s(lotsize,k=27)+bedrooms
+factor(bathrooms)+factor(stories)
+factor(driveway)+factor(recreation)
+factor(fullbase)+factor(gasheat)
+factor(aircon)+garage+factor(prefer),
data=HousePrices, family=gaussian, select=TRUE)
summary(fitvs)
#2.c) Variable analysis
Both the stepwise and variable selection methods select a large number of variables for the model - the stepwise regression kept all variables except the linear “lotsize” term while the variable selection method kept lotsize as a linear term as well as bathrooms, stories, driveway, fullbase, gasheat, aircon, garage, and prefer. Thus the variables that both models have in common are the linear predictors bathrooms, stories, driveway, fullbase, gasheat, aircon, garage, and prefer.
#3) Working with problem 3.3 from the text
#3.a) Install gss package & look at ozone data
#install.packages("gss")
library(gss)
data(ozone)
pairs(ozone)
help(ozone)
This series of commands makes the ozone data available in the R session [data(ozone)], gives graphs that illustrate the relationships between all the predictor pairs [pairs(ozone)], and provides context for the data set as well as descriptions of each of the variables [help(ozone)].
#3.b) Meaning of “upo3” variable
The “upo3” variable consists of 330 measurements of the Upland ozone concentration, in parts per million, for the Los Angeles Basin in 1976.
#3.c) Fit a stepwise model
detach("package:gam", unload=TRUE)
detach("package:mgcv", unload=TRUE)
library(gam)
#fit baseline model
ozoneinit = gam::gam(upo3~vdht+wdsp+hmdt+sbtp+ibht+dgpg
+ibtp+vsty+day, family=gaussian, data=ozone)
#fit stepwise model
ozonestep = gam::step.Gam(ozoneinit,scope =
list("vdht" = ~1 + vdht + s(vdht,df=6),
"wdsp" = ~1 + wdsp + s(wdsp,df=6),
"hmdt" = ~1 + hmdt + s(hmdt,df=6),
"sbtp" = ~1 + sbtp + s(sbtp,df=6),
"ibht" = ~1 + ibht + s(ibht,df=6),
"dgpg" = ~1 + dgpg + s(dgpg,df=6),
"ibtp" = ~1 + ibtp + s(ibtp,df=6),
"vsty" = ~1 + vsty + s(vsty,df=6),
"day" = ~1 + day + s(day)),
family = gaussian, data = ozone)
print(names(ozonestep$"model")[-1])
The final model using the stepwise selection includes vdht, wdsp, hmdt, and ibtp as linear components as well as sbtp, dgpg, vsty, and day as spline components. This yields a model with AIC=1816.
#3.d) Fit variable selection model
detach("package:gam", unload=TRUE)
library(mgcv)
ozonevs = mgcv::gam(upo3~1 + vdht + s(vdht)
+ wdsp + s(wdsp)
+ hmdt + s(hmdt)
+ sbtp + s(sbtp)
+ ibht + s(ibht)
+ dgpg + s(dgpg)
+ ibtp + s(ibtp)
+ vsty + s(vsty)
+ day + s(day), family=gaussian, data=ozone)
#3.e) Numerical summaries of variable selection model
summary(ozonevs)
Using the variable selection method, we get a model where the only significant linear component is wdsp and the significant spline components are vdht, sbtp, dgpg, and day.
#3.f) Fitting variable selection model with select=TRUE
ozonevs_st = mgcv::gam(upo3~1 + vdht + s(vdht)
+ wdsp + s(wdsp)
+ hmdt + s(hmdt)
+ sbtp + s(sbtp)
+ ibht + s(ibht)
+ dgpg + s(dgpg)
+ ibtp + s(ibtp)
+ vsty + s(vsty)
+ day + s(day), family=gaussian, data=ozone, select=TRUE)
Using the select=TRUE option, we get a model that has vdht as the only significant linear term as well as sbtp, dgpg, vsty, and day as the significant spline terms.
#3.g) Do a model selection using gamsel
library(Ecdat)
library(gamsel)
y=ozone$upo3
x=data.frame(ozone$vdht,ozone$wdsp,ozone$hmdt,ozone$sbtp,ozone$ibht,
ozone$dgpg,ozone$ibtp,ozone$vsty,ozone$day)
names(x)=c("vdht","wdsp","hmdt","sbtp","ibht","dgpg","ibtp","vsty","day")
ozonegamsel<-cv.gamsel(x,y)
print(ozonegamsel$gamsel.fit)
par(mfrow=c(1,1))
plot(ozonegamsel)
Not sure how to complete this…
#4) Problem 3.4 from the text
#3.4a) Explore icu data
library(aplore3)
library(gam)
data(icu)
help(icu)
#3.4b) Create logistic gam model using stepwise
#fit baseline model
icuinit = gam::gam(sta~age+as.factor(gender)+as.factor(race)
+as.factor(ser)+as.factor(can)+as.factor(crn)
+as.factor(inf)+as.factor(cpr)+sys+hra
+as.factor(pre)+as.factor(type)+as.factor(fra)
+as.factor(po2)+as.factor(ph)+as.factor(pco)
+as.factor(bic)+as.factor(cre)+as.factor(loc),
family=binomial, data=icu)
#fit stepwise model
icustep = gam::step.Gam(icuinit,scope =
list("age" = ~1 + age + s(age,df=2),
"gender" = ~1 + as.factor(gender),
"race" = ~1 + as.factor(race),
"ser" = ~1 + as.factor(ser),
"can" = ~1 + as.factor(can),
"crn" = ~1 + as.factor(crn),
"inf" = ~1 + as.factor(inf),
"cpr" = ~1 + as.factor(cpr),
"sys" = ~1 + sys + s(sys,df=2),
"hra" = ~1 + hra + s(hra,df=2),
"pre" = ~1 + as.factor(pre),
"type" = ~1 + as.factor(type),
"fra" = ~1 + as.factor(fra),
"po2" = ~1 + as.factor(po2),
"ph" = ~1 + as.factor(ph),
"pco" = ~1 + as.factor(pco),
"bic" = ~1 + as.factor(bic),
"cre" = ~1 + as.factor(cre),
"loc" = ~1 + as.factor(loc)),
family = binomial, data = icu)
print(names(icustep$"model")[-1])
#3.4c) Use mgcv to fit a logistic model using variable selection
detach("package:gam", unload=TRUE)
library(mgcv)
icuvs = mgcv::gam(sta~1 + age + s(age)
+ as.factor(gender) + as.factor(race)
+ as.factor(ser) + as.factor(can)
+ as.factor(crn) + as.factor(inf)
+ as.factor(cpr) + sys + s(sys)
+ hra + s(hra) + as.factor(pre)
+ as.factor(type) + as.factor(fra)
+ as.factor(po2) + as.factor(ph)
+ as.factor(pco) + as.factor(bic)
+ as.factor(cre) + as.factor(loc),
family=binomial, data=icu)
summary(icuvs)
Looking at the summary of the variable selection model, we see that the only significant linear terms are ca, sys, pre, type, and loc. None of the spline terms are significant.
#4.e) Fit variable selection model with select=TRUE
detach("package:gam", unload=TRUE)
library(mgcv)
icuvs_st = mgcv::gam(sta~1 + age + s(age)
+ as.factor(gender) + as.factor(race)
+ as.factor(ser) + as.factor(can)
+ as.factor(crn) + as.factor(inf)
+ as.factor(cpr) + sys + s(sys)
+ hra + s(hra) + as.factor(pre)
+ as.factor(type) + as.factor(fra)
+ as.factor(po2) + as.factor(ph)
+ as.factor(pco) + as.factor(bic)
+ as.factor(cre) + as.factor(loc),
family=binomial, data=icu, select=TRUE)
summary(icuvs_st)
Looking at the summary of the variable selection model with select=TRUE, we see that there are even fewer significant linear terms than the previous variable selection mode; the only significant terms are can, pre, type, and loc. None of the spline terms are significant. This is a much more reduced model than what is produced by the stepwise method.