# Time stamp: print the current date and time at which the script is run.
date()
## [1] "Thu Nov 17 12:17:23 2016"
Due Date: November 15, 2016
Total Points: 55
1 Use the petrol consumption data set (http://myweb.fsu.edu/jelsner/temp/data/PetrolConsumption.txt) and build a regression tree to predict petrol consumption based on petrol tax, average income, amount of pavement and the proportion of the population with driver's licences. Plot the tree. Prune the tree leaving only three terminal nodes. Plot the final tree. (10)
library(rpart)
# Question 1: regression tree for petrol consumption.
# Read the data set; the first line of the file supplies the column names.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
petrol <- read.table(
  "http://myweb.fsu.edu/jelsner/temp/data/PetrolConsumption.txt",
  header = TRUE
)

# Grow the full tree, using every other column of `petrol` as a predictor.
tree1 <- rpart(Petrol.Consumption ~ ., data = petrol)
plot(tree1)
text(tree1)

# Prune to three terminal nodes, i.e. the subtree with exactly two splits.
# The row of the complexity table is selected by its `nsplit` value rather
# than by a fixed row number, because consecutive rows of the table are not
# guaranteed to differ by a single split.
cp <- tree1$cptable[tree1$cptable[, "nsplit"] == 2, "CP"]
if (length(cp) != 1L) {
  stop("tree1$cptable has no subtree with exactly two splits", call. = FALSE)
}
tree1.prune <- prune(tree1, cp = cp)
plot(tree1.prune)
text(tree1.prune)
2 Use the data below to model the probability of O-ring damage as a logistic regression using launch temperature as the explanatory variable. Is the temperature a significant predictor of damage? Is the model adequate? What are the odds of damage when launch temperature is 60F relative to the odds of damage when the temperature is 75F? Use the model to predict the probability of damage given a launch temperature of 55F. (20)
# Question 2: O-ring damage versus launch temperature.
# One entry per launch: temperature (degrees F) and a 0/1 damage indicator.
Temp <- c(
  66, 70, 69, 68, 67, 72, 73, 70, 57, 63, 70, 78,
  67, 53, 67, 75, 70, 81, 76, 79, 75, 76, 58
)
Damage <- c(
  0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
  0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1
)

# Raw data: damage indicator against temperature.
plot(x = Temp, y = Damage)

# Logistic regression of the damage indicator on temperature.
log.fit <- glm(formula = Damage ~ Temp, family = "binomial")
summary(log.fit)
##
## Call:
## glm(formula = Damage ~ Temp, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0611 -0.7613 -0.3783 0.4524 2.2175
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.0429 7.3786 2.039 0.0415 *
## Temp -0.2322 0.1082 -2.145 0.0320 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28.267 on 22 degrees of freedom
## Residual deviance: 20.315 on 21 degrees of freedom
## AIC: 24.315
##
## Number of Fisher Scoring iterations: 5
# Model adequacy check: classify a launch as damaged when its fitted
# probability is at least 0.5, then cross-tabulate the predictions against
# the observed damage indicator.
pi.hat <- fitted(log.fit)
table(pred = as.numeric(pi.hat >= 0.5), Damage)
## Damage
## pred 0 1
## 0 16 3
## 1 0 4
# Coefficients of the fitted logistic model, rounded to the four decimals
# shown in the printed summary of log.fit.
b0 <- 15.0429
b1 <- -.2322

# Logistic function: turns a linear predictor into a probability.
inv.logit <- function(eta) exp(eta) / (1 + exp(eta))

# Multiplicative change in the odds for a 15-degree rise in temperature,
# i.e. odds at 75F relative to odds at 60F (the reciprocal of the ratio the
# question asks for).
exp(b1 * (75 - 60))
## [1] 0.03071513
# Odds of damage at 60F relative to the odds at 75F, computed from the
# fitted probabilities at the two temperatures.
p1 <- inv.logit(b0 + b1 * 75)
p2 <- inv.logit(b0 + b1 * 60)
(p2 / (1 - p2)) / (p1 / (1 - p1))
## [1] 32.55725
# Predicted probability of damage at a launch temperature of 55F.
inv.logit(b0 + b1 * 55)
## [1] 0.9065229
3 Consider a set of medical records for 81 children undergoing a spinal operation. The data are in a data frame called kyphosis (rpart package). The variables are: Kyphosis: A binary variable indicating the presence/absence of a post-operative spinal deformity called Kyphosis. Age: The age of the child in months. Number: The number of vertebrae involved in the spinal operation. Start: The beginning of the range of the vertebrae involved in the operation. (25)
# Question 3: kyphosis data (data frame `kyphosis` from the rpart package).
# Columns are reached through the data frame itself instead of attach(), so
# nothing is added to the search path and no existing names can be masked.

# Mean age of the children.
mean(kyphosis$Age)
## [1] 83.65432
# Proportion of children with Kyphosis recorded as "present".
mean(kyphosis$Kyphosis == "present")
## [1] 0.2098765
# Distribution of age within each Kyphosis group.
boxplot(Age ~ Kyphosis, data = kyphosis)

# Logistic regression of Kyphosis on all remaining columns of the data frame.
log1 <- glm(Kyphosis ~ ., family = "binomial", data = kyphosis)
summary(log1)
##
## Call:
## glm(formula = Kyphosis ~ ., family = "binomial", data = kyphosis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3124 -0.5484 -0.3632 -0.1659 2.1613
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.036934 1.449575 -1.405 0.15996
## Age 0.010930 0.006446 1.696 0.08996 .
## Number 0.410601 0.224861 1.826 0.06785 .
## Start -0.206510 0.067699 -3.050 0.00229 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.234 on 80 degrees of freedom
## Residual deviance: 61.380 on 77 degrees of freedom
## AIC: 69.38
##
## Number of Fisher Scoring iterations: 5
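Based on the drop in deviance from the null model (83.2) in the single-predictor fits below, Start appears to be the most important predictor: on its own it lowers the deviance to 68.1, compared with 73.4 for Number and 81.9 for Age.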
# Single-predictor logistic model: Age only.
log.a <- glm(
  formula = Kyphosis ~ Age,
  family = "binomial",
  data = kyphosis
)
summary(log.a)
##
## Call:
## glm(formula = Kyphosis ~ Age, family = "binomial", data = kyphosis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9023 -0.7397 -0.6028 -0.5521 1.9449
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.809351 0.530353 -3.412 0.000646 ***
## Age 0.005442 0.004822 1.129 0.259068
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.234 on 80 degrees of freedom
## Residual deviance: 81.932 on 79 degrees of freedom
## AIC: 85.932
##
## Number of Fisher Scoring iterations: 4
### Dev.=81.9 (Null Dev.=83.2)
# Single-predictor logistic model: Number only.
log.n <- glm(
  formula = Kyphosis ~ Number,
  family = "binomial",
  data = kyphosis
)
summary(log.n)
##
## Call:
## glm(formula = Kyphosis ~ Number, family = "binomial", data = kyphosis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6813 -0.6278 -0.4908 -0.3808 2.0863
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6510 0.8986 -4.063 4.85e-05 ***
## Number 0.5317 0.1851 2.873 0.00406 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.234 on 80 degrees of freedom
## Residual deviance: 73.357 on 79 degrees of freedom
## AIC: 77.357
##
## Number of Fisher Scoring iterations: 4
## Dev.=73.4
# Single-predictor logistic model: Start only.
log.s <- glm(
  formula = Kyphosis ~ Start,
  family = "binomial",
  data = kyphosis
)
summary(log.s)
##
## Call:
## glm(formula = Kyphosis ~ Start, family = "binomial", data = kyphosis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4729 -0.5176 -0.4211 -0.3413 2.1305
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.89007 0.62996 1.413 0.157686
## Start -0.21789 0.06044 -3.605 0.000312 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.234 on 80 degrees of freedom
## Residual deviance: 68.072 on 79 degrees of freedom
## AIC: 72.072
##
## Number of Fisher Scoring iterations: 5
## Dev.=68.1
# Two-predictor logistic model: Number and Start together.
log.ns <- glm(
  formula = Kyphosis ~ Number + Start,
  family = "binomial",
  data = kyphosis
)
summary(log.ns)
##
## Call:
## glm(formula = Kyphosis ~ Number + Start, family = "binomial",
## data = kyphosis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9035 -0.5938 -0.3886 -0.2490 2.2141
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.02890 1.22567 -0.839 0.40121
## Number 0.35745 0.20025 1.785 0.07426 .
## Start -0.18495 0.06414 -2.883 0.00393 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.234 on 80 degrees of freedom
## Residual deviance: 64.536 on 78 degrees of freedom
## AIC: 70.536
##
## Number of Fisher Scoring iterations: 5
## Dev.=64.5