# Record when this script was run.
print(date())
## [1] "Tue Nov 20 16:18:50 2012"
Due Date: November 20, 2012
Total Points: 30
1 Use the petrol consumption data set from Lecture 16 and build a regression tree to predict petrol consumption based on petrol tax, average income, amount of pavement and the proportion of the population with drivers licences. Plot the tree. Which variables are split first and second? Prune the tree leaving only three terminal nodes. Plot the final tree. (10)
# Read the petrol consumption table (first row holds the column names) and
# show its first rows as a sanity check.
PC <- read.table(
  file = "http://myweb.fsu.edu/jelsner/PetrolConsumption.txt",
  header = TRUE
)
print(head(PC))
## Petrol.Tax Avg.Inc Pavement Prop.DL Petrol.Consumption
## 1 9.0 3571 1976 0.525 541
## 2 9.0 4092 1250 0.572 524
## 3 9.0 3865 1586 0.580 561
## 4 7.5 4870 2351 0.529 414
## 5 8.0 4399 431 0.544 410
## 6 10.0 5342 1333 0.571 457
# Attach the tree package. library() stops with an error when the package is
# not installed, whereas require() only returns FALSE and lets the script
# fail later at the first tree() call.
suppressPackageStartupMessages(library(tree))
## Warning: package 'tree' was built under R version 2.15.2
# Fit a tree for Petrol.Consumption using every other column of PC as a
# candidate predictor, draw it with its split labels, and print the node table.
tr <- tree(formula = Petrol.Consumption ~ ., data = PC)
plot(tr)
text(tr)
print(tr)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 48 6e+05 600
## 2) Prop.DL < 0.646 42 3e+05 600
## 4) Avg.Inc < 4395 27 9e+04 600
## 8) Petrol.Tax < 7.75 16 5e+04 600
## 16) Pavement < 6482.5 7 2e+04 700 *
## 17) Pavement > 6482.5 9 2e+04 600 *
## 9) Petrol.Tax > 7.75 11 8e+03 600 *
## 5) Avg.Inc > 4395 15 5e+04 500
## 10) Prop.DL < 0.5515 7 1e+04 400 *
## 11) Prop.DL > 0.5515 8 1e+04 500 *
## 3) Prop.DL > 0.646 6 9e+04 800 *
The first split is on the proportion of the population with driver's licences (Prop.DL); the second split is on average income (Avg.Inc).
# Prune the fitted tree back to a subtree with three terminal nodes
# (best = 3), then draw it with its split labels and print the node table.
tr1 <- prune.tree(tr, best = 3)
plot(tr1)
text(tr1)
print(tr1)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 48 6e+05 600
## 2) Prop.DL < 0.646 42 3e+05 600
## 4) Avg.Inc < 4395 27 9e+04 600 *
## 5) Avg.Inc > 4395 15 5e+04 500 *
## 3) Prop.DL > 0.646 6 9e+04 800 *
2 Use the data from Lecture 18 to model the probability of O-ring damage as a logistic regression using launch temperature as the explanatory variable. Is the temperature a significant predictor of damage? Is it adequate? What are the odds of damage when launch temperature is 60F relative to the odds of damage when the temperature is 75F? Use the model to predict the probability of damage given a launch temperature of 55F. (20)
# Launch temperature for each of the 23 observations.
temp <- c(
  66, 70, 69, 68, 67, 72, 73, 70, 57, 63, 70, 78, 67, 53, 67, 75, 70,
  81, 76, 79, 75, 76, 58
)
# Damage indicator for the same observations (1 = damage, 0 = none).
damage <- c(
  0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
  1
)
TD <- data.frame(temp, damage)

# Logistic regression of the damage indicator on launch temperature.
logrm <- glm(formula = damage ~ temp, family = binomial, data = TD)
print(summary(logrm))
##
## Call:
## glm(formula = damage ~ temp, family = binomial, data = TD)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.061 -0.761 -0.378 0.452 2.217
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.043 7.379 2.04 0.041 *
## temp -0.232 0.108 -2.14 0.032 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28.267 on 22 degrees of freedom
## Residual deviance: 20.315 on 21 degrees of freedom
## AIC: 24.32
##
## Number of Fisher Scoring iterations: 5
##
Temperature is a significant predictor of damage (p = 0.032 < 0.05).
# Goodness-of-fit check: upper-tail chi-squared probability of the residual
# deviance on its residual degrees of freedom. Both values are read from the
# fitted model instead of being retyped from the rounded summary printout, so
# they keep full precision and stay in sync if the data or the model change.
pchisq(deviance(logrm), df.residual(logrm), lower.tail = FALSE)
## [1] 0.5014
The p-value (0.50) is well above 0.05, so there is no evidence of lack of fit; the model is adequate.
# Odds of damage at 60F relative to 75F: exp(slope * (60 - 75)). The slope is
# taken from the fitted model instead of a hand-copied, rounded coefficient,
# so the ratio always matches the model that was actually fitted.
o <- exp(coef(logrm)[["temp"]] * (60 - 75))
paste("The odds of damage of launch temperature being 60F relative to the odds of damage of launch temperature being 75F is",
  round(o, digits = 0))
## [1] "The odds of damage of launch temperature being 60F relative to the odds of damage of launch temperature being 75F is 33"
# Predicted probability of damage (response scale) at a launch temperature
# of 55F.
predict(
  logrm,
  newdata = data.frame(temp = 55),
  type = "response"
)
## 1
## 0.9067