suppressPackageStartupMessages(library(maptools))
## Warning: package 'maptools' was built under R version 4.2.2
suppressPackageStartupMessages(library(tree))
## Warning: package 'tree' was built under R version 4.2.2
suppressPackageStartupMessages(library(randomForest))
## Warning: package 'randomForest' was built under R version 4.2.2
suppressPackageStartupMessages(library(sm))
## Warning: package 'sm' was built under R version 4.2.2
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(rpart))
suppressPackageStartupMessages(library(tidyr))
1 Use the petrol consumption data set (http://myweb.fsu.edu/jelsner/temp/data/PetrolConsumption.txt) to build a regression tree that predicts petrol consumption from petrol tax, average income, amount of pavement, and the proportion of the population with driver's licences. Plot the tree. Prune the tree leaving only three terminal nodes. Plot the final tree. (10)
petrol <- read.table("http://myweb.fsu.edu/jelsner/temp/data/PetrolConsumption.txt", header = TRUE)
head(petrol)
## Petrol.Tax Avg.Inc Pavement Prop.DL Petrol.Consumption
## 1 9.0 3571 1976 0.525 541
## 2 9.0 4092 1250 0.572 524
## 3 9.0 3865 1586 0.580 561
## 4 7.5 4870 2351 0.529 414
## 5 8.0 4399 431 0.544 410
## 6 10.0 5342 1333 0.571 457
ptr <- tree(Petrol.Consumption ~ ., data = petrol)  # regression tree on all four predictors
plot(ptr)
text(ptr)
pptr <- prune.tree(ptr, best = 3)  # prune to the best subtree with three terminal nodes
pptr
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 48 588400 576.8
## 2) Prop.DL < 0.646 42 267500 550.8
## 4) Avg.Inc < 4395 27 86400 592.5 *
## 5) Avg.Inc > 4395 15 49400 475.7 *
## 3) Prop.DL > 0.646 6 94030 758.7 *
plot(pptr)
text(pptr)
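An optional sketch, not asked for in the question: cv.tree() from the same tree package (cvptr is a name introduced here) can be used to check whether a three-leaf subtree is a reasonable size before pruning.
cvptr <- cv.tree(ptr)  # cross-validated deviance across the cost-complexity sequence
plot(cvptr)  # deviance against tree size; a flat region near size 3 supports the three-node prune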
2 Use the data below to model the probability of O-ring damage as a logistic regression using launch temperature as the explanatory variable. Is temperature a significant predictor of damage? Is the model adequate? What are the odds of damage when the launch temperature is 60F relative to the odds of damage when the temperature is 75F? Use the model to predict the probability of damage given a launch temperature of 55F. (20)
Temp <- c(66, 70, 69, 68, 67, 72, 73, 70, 57, 63, 70, 78, 67, 53, 67, 75, 70, 81, 76, 79, 75, 76, 58)
Damage <- c(0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1)
ord <- data.frame(Temp, Damage)
head(ord)
## Temp Damage
## 1 66 0
## 2 70 1
## 3 69 0
## 4 68 0
## 5 67 0
## 6 72 0
omod <- glm(Damage ~ Temp, data = ord, family = binomial)
summary(omod)
##
## Call:
## glm(formula = Damage ~ Temp, family = binomial, data = ord)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0611 -0.7613 -0.3783 0.4524 2.2175
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.0429 7.3786 2.039 0.0415 *
## Temp -0.2322 0.1082 -2.145 0.0320 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28.267 on 22 degrees of freedom
## Residual deviance: 20.315 on 21 degrees of freedom
## AIC: 24.315
##
## Number of Fisher Scoring iterations: 5
# Check linearity of the logit: smooth the binary response with loess, then take the
# log-odds of the smoothed values (smoothed values outside (0, 1) give the NaN warning below).
lfit <- loess(Damage ~ Temp, data = ord)
ord$lgpred <- log(predict(lfit) / (1 - predict(lfit)))
## Warning in log(predict(lfit)/(1 - predict(lfit))): NaNs produced
ggplot(ord, aes(x = Temp, y = lgpred)) +
geom_point() +
geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
pchisq(20.315, 21, lower.tail = FALSE)  # residual deviance against its df; a large p-value indicates no evidence of lack of fit
## [1] 0.5013948
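One part of the question is not computed above: the odds of damage at 60F relative to the odds at 75F. A short sketch using the fitted Temp coefficient from omod; from the printed coefficient this is roughly exp(0.2322 * 15), about 33 times the odds at 75F.
exp(coef(omod)["Temp"] * (60 - 75))  # odds ratio: odds of damage at 60F relative to 75F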
predict(omod, data.frame(Temp = 55), type = "response")
## 1
## 0.9066965
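As a check, the predicted probability at 55F can be reproduced by hand as the inverse logit of the fitted linear predictor.
plogis(coef(omod)[1] + coef(omod)[2] * 55)  # should match the predict() value above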
3 Consider a set of medical records for 81 children undergoing a spinal operation. The data are in a data frame called kyphosis (rpart package). The variables are Kyphosis, a binary variable indicating the presence or absence of a post-operative spinal deformity called kyphosis; Age, the age of the child in months; Number, the number of vertebrae involved in the spinal operation; and Start, the beginning of the range of the vertebrae involved in the operation. (25)
kyph <- kyphosis
head(kyph)
## Kyphosis Age Number Start
## 1 absent 71 3 5
## 2 absent 158 3 14
## 3 present 128 4 5
## 4 absent 2 5 1
## 5 absent 1 4 15
## 6 absent 1 2 16
age_mean_mo <- mean(kyph$Age)
age_mean_yrs <- age_mean_mo / 12
age_mean_yrs
## [1] 6.971193
age_mean_mo
## [1] 83.65432
summary(kyph)
## Kyphosis Age Number Start
## absent :64 Min. : 1.00 Min. : 2.000 Min. : 1.00
## present:17 1st Qu.: 26.00 1st Qu.: 3.000 1st Qu.: 9.00
## Median : 87.00 Median : 4.000 Median :13.00
## Mean : 83.65 Mean : 4.049 Mean :11.49
## 3rd Qu.:130.00 3rd Qu.: 5.000 3rd Qu.:16.00
## Max. :206.00 Max. :10.000 Max. :18.00
kyph_prop <- sum(kyph$Kyphosis == "absent") / nrow(kyph)  # proportion of children without post-operative kyphosis (64 of 81)
kyph_prop
## [1] 0.7901235
ggplot(kyph, aes(x = Kyphosis, y = Age)) +
geom_boxplot() +
ylab("Age (months)")+
xlab("Kyphosis (presence vs. absence)")
log_kyph <- glm(Kyphosis ~ Age + Number + Start, data = kyph, family = binomial)
summary(log_kyph)
##
## Call:
## glm(formula = Kyphosis ~ Age + Number + Start, family = binomial,
## data = kyph)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3124 -0.5484 -0.3632 -0.1659 2.1613
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.036934 1.449575 -1.405 0.15996
## Age 0.010930 0.006446 1.696 0.08996 .
## Number 0.410601 0.224861 1.826 0.06785 .
## Start -0.206510 0.067699 -3.050 0.00229 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.234 on 80 degrees of freedom
## Residual deviance: 61.380 on 77 degrees of freedom
## AIC: 69.38
##
## Number of Fisher Scoring iterations: 5
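Mirroring the adequacy check used for the O-ring model, and putting the coefficients on the odds scale; a sketch, output not shown.
pchisq(61.380, 77, lower.tail = FALSE)  # residual deviance against its df
exp(coef(log_kyph))  # e.g. each extra unit of Start multiplies the odds of kyphosis by about exp(-0.207), roughly 0.81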