2.4 The dataset prostate
comes from a study on 97 men with prostate cancer who ere due to receive a radical prostatectomy. Fit a model with lpsa
as the response and lcavol
as the predictor. Record the residual standard error and the \(R^2\). Now add lweight
,svi
,age
,lcp
,pgg45
, and gleason
. to the model one at a time. For each model record the residual standard error and the \(R^2\). Plot the trends in these two statistics.
require(faraway)
## Loading required package: faraway
data(prostate, package='faraway')
head(prostate)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.7695 50 -1.386294 0 -1.38629 6 0 -0.43078
## 2 -0.9942523 3.3196 58 -1.386294 0 -1.38629 6 0 -0.16252
## 3 -0.5108256 2.6912 74 -1.386294 0 -1.38629 7 20 -0.16252
## 4 -1.2039728 3.2828 58 -1.386294 0 -1.38629 6 0 -0.16252
## 5 0.7514161 3.4324 62 -1.386294 0 -1.38629 6 0 0.37156
## 6 -1.0498221 3.2288 50 -1.386294 0 -1.38629 6 0 0.76547
summary(prostate)
## lcavol lweight age lbph
## Min. :-1.3471 Min. :2.375 Min. :41.00 Min. :-1.3863
## 1st Qu.: 0.5128 1st Qu.:3.376 1st Qu.:60.00 1st Qu.:-1.3863
## Median : 1.4469 Median :3.623 Median :65.00 Median : 0.3001
## Mean : 1.3500 Mean :3.653 Mean :63.87 Mean : 0.1004
## 3rd Qu.: 2.1270 3rd Qu.:3.878 3rd Qu.:68.00 3rd Qu.: 1.5581
## Max. : 3.8210 Max. :6.108 Max. :79.00 Max. : 2.3263
## svi lcp gleason pgg45
## Min. :0.0000 Min. :-1.3863 Min. :6.000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:-1.3863 1st Qu.:6.000 1st Qu.: 0.00
## Median :0.0000 Median :-0.7985 Median :7.000 Median : 15.00
## Mean :0.2165 Mean :-0.1794 Mean :6.753 Mean : 24.38
## 3rd Qu.:0.0000 3rd Qu.: 1.1786 3rd Qu.:7.000 3rd Qu.: 40.00
## Max. :1.0000 Max. : 2.9042 Max. :9.000 Max. :100.00
## lpsa
## Min. :-0.4308
## 1st Qu.: 1.7317
## Median : 2.5915
## Mean : 2.4784
## 3rd Qu.: 3.0564
## Max. : 5.5829
The prostate
dataset contains the following columns:
lcavol
: log(cancer volume)lweight
: log(prostate weight)age
: agelbph
: log(benign prostatic hyperplasia amount)svi
: seminal vesicle invasionlcp
: log(capsular penetration)gleason
: Gleason scorepgg45
: percentage Gleason scores 4 or 5lpsa
: log(prostate specific antigen)lmod <- lm(lpsa ~ lcavol, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.539431908779019"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.787499423513711"
Now adding more independent variables one at a time.
r_squared_trend <- c()
se_trend <- c()
# Adding lweight
lmod <- lm(lpsa ~ lcavol + lweight, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.585934512070213"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.750646932552003"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
# Adding svi
lmod <- lm(lpsa ~ lcavol + lweight + svi, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.626440253553244"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.71680938995835"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
# Adding lbph
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.636603479801418"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.710823197727069"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
# Adding age
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.644102401261455"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.707305372441944"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
# Adding lcp
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age + lcp, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.645112974108872"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.710213512046953"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
# Adding pgg45
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age + lcp + pgg45, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.65443165616093"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.704753265042738"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
# Adding gleason
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age + lcp + pgg45 + gleason, prostate)
sum_lmod <- summary(lmod)
print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.654754085299708"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.708415511834863"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)
Above are the listed residual standard errors and r-squared’s as we add one independent variable at a time. Let’s plot these two trends.
plot(r_squared_trend, type="b", main="R-Squared Trend", ylab="R-squared")
plot(se_trend, type="b", main="Residual Standard Error Trend", ylab="Standard Error")
2.5 Using the prostate
data, plot lpsa
against lcavol
. Fit the regressions of lpsa
on lcavol
and lcavol
on lpsa
. Display both regression lines on the plot. At what points do the two lines intersect?
plot(x=prostate$lcavol,y=prostate$lpsa, type='p',col="blue", main="Cancer Volume (log) vs. PSA (log)", xlab="Cancer Volume (log)", ylab="Prostate Specific Antigen (log)")
lmod_1 <- lm(lpsa ~ lcavol, prostate)
abline(lmod_1, col='red')
lmod_2 <- lm(lcavol ~ lpsa, prostate)
abline(lmod_2, col='green')
I may need to re-approach this, but it doesn’t appear that the lines intersect (?).
Playing with ggplot:
require(ggplot2)
## Loading required package: ggplot2
ggplot(prostate, aes(x=lcavol, y=lpsa)) + geom_point() + geom_smooth(method='lm') + labs(title = "Cancer Volume (log) vs. PSA (log)", y = "Prostate Specific Antigen (log)", x = "Cancer Volume (log)")
2.1 The web site www.playbill.com provides weekly reports on the box office ticket sales for plays on Broadway in New York. We shall consider the data for the week October 11-17, 2004 (referred to below as the current week). The data are in the form of the gross box office results for the current week and the gross box office results for the previous week (i.e., October 3 - 10, 2004). The data, plotted in Figure 2.6, are available on the book web site in the file playbill.csv
url <- 'http://www.stat.tamu.edu/~sheather/book/docs/datasets/playbill.csv'
playbill <- read.csv(url, header=TRUE)
playbill
## Production CurrentWeek LastWeek
## 1 42nd Street 684966 695437
## 2 Avenue Q 502367 498969
## 3 Beauty and Beast 594474 598576
## 4 Bombay Dreams 529298 528994
## 5 Chicago 570254 562964
## 6 Dracula 319959 282778
## 7 Fiddler on the Roof 579126 583177
## 8 Forever Tango 134042 152833
## 9 Golda's Balcony 105853 105698
## 10 Hairspray 822775 836959
## 11 Mamma Mia! 949462 970190
## 12 Movin' Out 610007 651808
## 13 Rent 386797 378238
## 14 The Lion King 1133034 1113510
## 15 The Phantom of the Opera 627609 614246
## 16 The Producers 911727 933822
## 17 Wicked 1180266 1202536
## 18 Wonderful Town 479155 488624
plot(playbill$LastWeek, playbill$CurrentWeek, xlab="Gross Box Office Results Previous Week", ylab="Gross Box Office Results Current Week")
Fit the following model to the data: \(Y = \beta_0 + \beta_1x + e\) where Y is the gross box office results for the current week (in $) and x is the gross box office results for the previous week (in $). Complete the following tasks:
lmodel <- lm(CurrentWeek ~ LastWeek ,playbill)
sumary(lmodel)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8049e+03 9.9293e+03 0.6853 0.5029
## LastWeek 9.8208e-01 1.4427e-02 68.0714 <2e-16
##
## n = 18, p = 2, Residual SE = 18007.55711, R-Squared = 1
slope <- lmodel$coefficients["LastWeek"]
print(paste0("The slope of the regression model is: ", slope))
## [1] "The slope of the regression model is: 0.982081479070464"
The standard error for the slope is 1.4427e-02 or 0.014427.
library(ISwR)
confint(lmodel, 'LastWeek', level=0.95)
## 2.5 % 97.5 %
## LastWeek 0.9514971 1.012666
Above notes the 95% confidence interval. Given that 1 is within the 95% confidence interval, 1 can certainly be a plausible \(\beta_1\) value.
confint(lmodel, '(Intercept)', level=0.95)
## 2.5 % 97.5 %
## (Intercept) -14244.33 27854.1
h_0 <- 10000
h_obs <- coef(lmodel)[[1]]
h_obs_se <- summary(lmodel)$coef[1, 2]
tobs <- (h_obs - h_0) / h_obs_se
(pobs <- 2 * pt(abs(tobs), nrow(playbill) - 2, lower.tail = FALSE))
## [1] 0.7517807
Given that the intercept’s 95% confidence interval ranges from -14244.33 to 27854, we do NOT reject the null hypothesis.
(predict(lmodel, data.frame(LastWeek = 400000, interval="predict")))
## 1
## 399637.5
int_sd <- (summary(lmodel)$coef[1,2])
print(paste0("2 standard deviations above 400000 is: $", round(2 * int_sd + 400000,2)))
## [1] "2 standard deviations above 400000 is: $419858.64"
The prediction is approximately $400,000. The Upper Limit of the 95% Confidence Interval is $419858.64. $450,000 is very unlikely to occur.
par(mfrow = c(2, 2))
plot(lmodel)
Residuals for our linear fit to the playbill data.
The model is not a great model. It may not be the most useful prediction tool for the promoters.
2.2 A story by James R. Hagerty entitled ‘With Buyers Sidelined, Home Prices Slide’ published in the Thursday October 25, 2007 edition of the Wall Street Journal contained data on so-called fundamental housing indicators in major real estate markets across the US. The author argues that… ‘prices are generally falling and overdue loan payments are piling up’. Thus, we shall consider data presented in the article on:
Y = Percentage change in average price from July 2006 to July 2007 (based on the S&P/Case-Shiller national housing index); and
x = Percentage of mortgage loans 30 days or more overdue in latest quarter (based on data from Equifax and Moody’s).
The data are available on the book web site in the file indicators.txt
. Fit the following model to the data: \(Y = \beta_0 + \beta_1x + e\). Complete the following tasks:
indicators <- read.table('http://www.stat.tamu.edu/~sheather/book/docs/datasets/indicators.txt', header=TRUE)
head(indicators)
## MetroArea PriceChange LoanPaymentsOverdue
## 1 Atlanta 1.2 4.55
## 2 Boston -3.4 3.31
## 3 Chicago -0.9 2.99
## 4 Dallas 0.8 4.26
## 5 Denver -0.7 3.56
## 6 Detroit -9.7 4.71
summary(indicators)
## MetroArea PriceChange LoanPaymentsOverdue
## Atlanta: 1 Min. :-9.700 Min. :1.650
## Boston : 1 1st Qu.:-7.000 1st Qu.:3.020
## Chicago: 1 Median :-3.950 Median :3.300
## Dallas : 1 Mean :-3.428 Mean :3.532
## Denver : 1 3rd Qu.:-0.750 3rd Qu.:4.478
## Detroit: 1 Max. : 6.900 Max. :5.630
## (Other):12
plot(indicators$LoanPaymentsOverdue, indicators$PriceChange, xlab="Percentage of Loan Payments Overdue", ylab = "Percentage change in Housing Price 2006 to 2007")
indicators_lm <- lm(PriceChange ~ LoanPaymentsOverdue, indicators)
abline(indicators_lm)
summary(indicators_lm)
##
## Call:
## lm(formula = PriceChange ~ LoanPaymentsOverdue, data = indicators)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6541 -3.3419 -0.6944 2.5288 6.9163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5145 3.3240 1.358 0.1933
## LoanPaymentsOverdue -2.2485 0.9033 -2.489 0.0242 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.954 on 16 degrees of freedom
## Multiple R-squared: 0.2792, Adjusted R-squared: 0.2341
## F-statistic: 6.196 on 1 and 16 DF, p-value: 0.02419
sum_coef <- summary(indicators_lm)$coef
print(sum_coef)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.514494 3.3240193 1.358143 0.19326241
## LoanPaymentsOverdue -2.248520 0.9033113 -2.489197 0.02419411
print(paste0("Slope: ", sum_coef[2,1]))
## [1] "Slope: -2.24851978357815"
print(confint(indicators_lm, 'LoanPaymentsOverdue', level=0.95))
## 2.5 % 97.5 %
## LoanPaymentsOverdue -4.163454 -0.3335853
There is evidence fo significant linear association as the 95% confidence interval only span negative numbers.
(predict(indicators_lm, data.frame(LoanPaymentsOverdue = 4), interval="confidence"))
## fit lwr upr
## 1 -4.479585 -6.648849 -2.310322
0% is NOT a feasible value as that is outside the 95% confidence interval.