CUNY 621 Week 2 Suggested Exercises

Linear Models with R

2.4 The dataset prostate comes from a study on 97 men with prostate cancer who ere due to receive a radical prostatectomy. Fit a model with lpsa as the response and lcavol as the predictor. Record the residual standard error and the \(R^2\). Now add lweight,svi,age,lcp,pgg45, and gleason. to the model one at a time. For each model record the residual standard error and the \(R^2\). Plot the trends in these two statistics.

require(faraway)
## Loading required package: faraway
data(prostate, package='faraway')
head(prostate)
##       lcavol lweight age      lbph svi      lcp gleason pgg45     lpsa
## 1 -0.5798185  2.7695  50 -1.386294   0 -1.38629       6     0 -0.43078
## 2 -0.9942523  3.3196  58 -1.386294   0 -1.38629       6     0 -0.16252
## 3 -0.5108256  2.6912  74 -1.386294   0 -1.38629       7    20 -0.16252
## 4 -1.2039728  3.2828  58 -1.386294   0 -1.38629       6     0 -0.16252
## 5  0.7514161  3.4324  62 -1.386294   0 -1.38629       6     0  0.37156
## 6 -1.0498221  3.2288  50 -1.386294   0 -1.38629       6     0  0.76547
summary(prostate)
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.653   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.878   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :6.108   Max.   :79.00   Max.   : 2.3263  
##       svi              lcp             gleason          pgg45       
##  Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
##  Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
##  Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
##  3rd Qu.:0.0000   3rd Qu.: 1.1786   3rd Qu.:7.000   3rd Qu.: 40.00  
##  Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
##       lpsa        
##  Min.   :-0.4308  
##  1st Qu.: 1.7317  
##  Median : 2.5915  
##  Mean   : 2.4784  
##  3rd Qu.: 3.0564  
##  Max.   : 5.5829

The prostate dataset contains the following columns:

  1. lcavol: log(cancer volume)
  2. lweight: log(prostate weight)
  3. age: age
  4. lbph: log(benign prostatic hyperplasia amount)
  5. svi: seminal vesicle invasion
  6. lcp: log(capsular penetration)
  7. gleason: Gleason score
  8. pgg45: percentage Gleason scores 4 or 5
  9. lpsa: log(prostate specific antigen)
lmod <- lm(lpsa ~ lcavol, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.539431908779019"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.787499423513711"

Now adding more independent variables one at a time.

r_squared_trend <- c()
se_trend <- c()

# Adding lweight
lmod <- lm(lpsa ~ lcavol + lweight, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.585934512070213"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.750646932552003"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

# Adding svi
lmod <- lm(lpsa ~ lcavol + lweight + svi, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.626440253553244"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.71680938995835"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

# Adding lbph
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.636603479801418"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.710823197727069"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

# Adding age
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.644102401261455"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.707305372441944"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

# Adding lcp
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age + lcp, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.645112974108872"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.710213512046953"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

# Adding pgg45
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age + lcp + pgg45, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.65443165616093"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.704753265042738"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

# Adding gleason
lmod <- lm(lpsa ~ lcavol + lweight + svi + lbph + age + lcp + pgg45 + gleason, prostate)
sum_lmod <- summary(lmod)

print(paste0("R-squared: ", sum_lmod$r.squared))
## [1] "R-squared: 0.654754085299708"
print(paste0("Residual Standard Error: ", sum_lmod$sigma))
## [1] "Residual Standard Error: 0.708415511834863"
r_squared_trend <- c(r_squared_trend, sum_lmod$r.squared)
se_trend <- c(se_trend, sum_lmod$sigma)

Above are the listed residual standard errors and r-squared’s as we add one independent variable at a time. Let’s plot these two trends.

plot(r_squared_trend, type="b", main="R-Squared Trend", ylab="R-squared")

plot(se_trend, type="b", main="Residual Standard Error Trend", ylab="Standard Error")

2.5 Using the prostate data, plot lpsa against lcavol. Fit the regressions of lpsa on lcavol and lcavol on lpsa. Display both regression lines on the plot. At what points do the two lines intersect?

plot(x=prostate$lcavol,y=prostate$lpsa, type='p',col="blue", main="Cancer Volume (log) vs. PSA (log)", xlab="Cancer Volume (log)", ylab="Prostate Specific Antigen (log)")

lmod_1 <- lm(lpsa ~ lcavol, prostate)
abline(lmod_1, col='red')
lmod_2 <- lm(lcavol ~ lpsa, prostate)
abline(lmod_2, col='green')

I may need to re-approach this, but it doesn’t appear that the lines intersect (?).

Playing with ggplot:

require(ggplot2)
## Loading required package: ggplot2
ggplot(prostate, aes(x=lcavol, y=lpsa)) + geom_point() + geom_smooth(method='lm') + labs(title = "Cancer Volume (log) vs. PSA (log)", y = "Prostate Specific Antigen (log)", x = "Cancer Volume (log)")

A Modern Approach to Regression with R

2.1 The web site www.playbill.com provides weekly reports on the box office ticket sales for plays on Broadway in New York. We shall consider the data for the week October 11-17, 2004 (referred to below as the current week). The data are in the form of the gross box office results for the current week and the gross box office results for the previous week (i.e., October 3 - 10, 2004). The data, plotted in Figure 2.6, are available on the book web site in the file playbill.csv

url <- 'http://www.stat.tamu.edu/~sheather/book/docs/datasets/playbill.csv'
playbill <- read.csv(url, header=TRUE)
playbill
##                  Production CurrentWeek LastWeek
## 1               42nd Street      684966   695437
## 2                  Avenue Q      502367   498969
## 3          Beauty and Beast      594474   598576
## 4             Bombay Dreams      529298   528994
## 5                   Chicago      570254   562964
## 6                   Dracula      319959   282778
## 7       Fiddler on the Roof      579126   583177
## 8             Forever Tango      134042   152833
## 9           Golda's Balcony      105853   105698
## 10                Hairspray      822775   836959
## 11               Mamma Mia!      949462   970190
## 12               Movin' Out      610007   651808
## 13                     Rent      386797   378238
## 14            The Lion King     1133034  1113510
## 15 The Phantom of the Opera      627609   614246
## 16            The Producers      911727   933822
## 17                   Wicked     1180266  1202536
## 18           Wonderful Town      479155   488624
plot(playbill$LastWeek, playbill$CurrentWeek, xlab="Gross Box Office Results Previous Week", ylab="Gross Box Office Results Current Week")

Fit the following model to the data: \(Y = \beta_0 + \beta_1x + e\) where Y is the gross box office results for the current week (in $) and x is the gross box office results for the previous week (in $). Complete the following tasks:

  1. Find a 95% confidence interval for the slope of the regression model, \(\beta_1\). Is 1 a plausible value for \(\beta_1\)? Give a reason to support your answer.
lmodel <- lm(CurrentWeek ~ LastWeek ,playbill) 
sumary(lmodel)
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8049e+03 9.9293e+03  0.6853   0.5029
## LastWeek    9.8208e-01 1.4427e-02 68.0714   <2e-16
## 
## n = 18, p = 2, Residual SE = 18007.55711, R-Squared = 1
slope <- lmodel$coefficients["LastWeek"]
print(paste0("The slope of the regression model is: ", slope))
## [1] "The slope of the regression model is: 0.982081479070464"

The standard error for the slope is 1.4427e-02 or 0.014427.

library(ISwR)
confint(lmodel, 'LastWeek', level=0.95)
##              2.5 %   97.5 %
## LastWeek 0.9514971 1.012666

Above notes the 95% confidence interval. Given that 1 is within the 95% confidence interval, 1 can certainly be a plausible \(\beta_1\) value.

  1. Test the null hypothesis \(H_0: \beta_0 = 10000\) against a two-sided alternative. Interpret your result.
confint(lmodel, '(Intercept)', level=0.95)
##                 2.5 %  97.5 %
## (Intercept) -14244.33 27854.1
h_0 <- 10000
h_obs <- coef(lmodel)[[1]]
h_obs_se <- summary(lmodel)$coef[1, 2]

tobs <- (h_obs - h_0) / h_obs_se
(pobs <- 2 * pt(abs(tobs), nrow(playbill) - 2, lower.tail = FALSE))
## [1] 0.7517807

Given that the intercept’s 95% confidence interval ranges from -14244.33 to 27854, we do NOT reject the null hypothesis.

  1. Use the fitted regression model to estimate the gross box office results for the current week (in $) for a production with $400,000 in gross box office for the previous week. Find a 95% prediction interval for the gross box office results for the current week (in $) for a production with $400,000 in gross box office the previous week. Is $450,000 a feasible value for the gross box office results in the current week, for a production with $400,000 in gross box office the previous week? Give a reason to support your answer.
(predict(lmodel, data.frame(LastWeek = 400000, interval="predict")))
##        1 
## 399637.5
int_sd <- (summary(lmodel)$coef[1,2])
print(paste0("2 standard deviations above 400000 is: $", round(2 * int_sd + 400000,2)))
## [1] "2 standard deviations above 400000 is: $419858.64"

The prediction is approximately $400,000. The Upper Limit of the 95% Confidence Interval is $419858.64. $450,000 is very unlikely to occur.

  1. Some promoters of Broadway plays use the prediction rule that next week’s gross box office results will be equal to this week’s gross box office results. Comment on the appropriateness of this rule.
par(mfrow = c(2, 2))
plot(lmodel)
Residuals for our linear fit to the playbill data.

Residuals for our linear fit to the playbill data.

The model is not a great model. It may not be the most useful prediction tool for the promoters.

2.2 A story by James R. Hagerty entitled ‘With Buyers Sidelined, Home Prices Slide’ published in the Thursday October 25, 2007 edition of the Wall Street Journal contained data on so-called fundamental housing indicators in major real estate markets across the US. The author argues that… ‘prices are generally falling and overdue loan payments are piling up’. Thus, we shall consider data presented in the article on:

Y = Percentage change in average price from July 2006 to July 2007 (based on the S&P/Case-Shiller national housing index); and

x = Percentage of mortgage loans 30 days or more overdue in latest quarter (based on data from Equifax and Moody’s).

The data are available on the book web site in the file indicators.txt. Fit the following model to the data: \(Y = \beta_0 + \beta_1x + e\). Complete the following tasks:

indicators <- read.table('http://www.stat.tamu.edu/~sheather/book/docs/datasets/indicators.txt', header=TRUE)
head(indicators)
##   MetroArea PriceChange LoanPaymentsOverdue
## 1   Atlanta         1.2                4.55
## 2    Boston        -3.4                3.31
## 3   Chicago        -0.9                2.99
## 4    Dallas         0.8                4.26
## 5    Denver        -0.7                3.56
## 6   Detroit        -9.7                4.71
summary(indicators)
##    MetroArea   PriceChange     LoanPaymentsOverdue
##  Atlanta: 1   Min.   :-9.700   Min.   :1.650      
##  Boston : 1   1st Qu.:-7.000   1st Qu.:3.020      
##  Chicago: 1   Median :-3.950   Median :3.300      
##  Dallas : 1   Mean   :-3.428   Mean   :3.532      
##  Denver : 1   3rd Qu.:-0.750   3rd Qu.:4.478      
##  Detroit: 1   Max.   : 6.900   Max.   :5.630      
##  (Other):12
plot(indicators$LoanPaymentsOverdue, indicators$PriceChange, xlab="Percentage of Loan Payments Overdue", ylab = "Percentage change in Housing Price 2006 to 2007")

indicators_lm <- lm(PriceChange ~ LoanPaymentsOverdue, indicators)
abline(indicators_lm)

summary(indicators_lm)
## 
## Call:
## lm(formula = PriceChange ~ LoanPaymentsOverdue, data = indicators)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6541 -3.3419 -0.6944  2.5288  6.9163 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           4.5145     3.3240   1.358   0.1933  
## LoanPaymentsOverdue  -2.2485     0.9033  -2.489   0.0242 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.954 on 16 degrees of freedom
## Multiple R-squared:  0.2792, Adjusted R-squared:  0.2341 
## F-statistic: 6.196 on 1 and 16 DF,  p-value: 0.02419
  1. Find a 95% confidence interval for the slope of the regression model, \(\beta_1\). On the basis of the confidence interval, dedide whether there is evidence of a significant negative linear association.
sum_coef <- summary(indicators_lm)$coef
print(sum_coef)
##                      Estimate Std. Error   t value   Pr(>|t|)
## (Intercept)          4.514494  3.3240193  1.358143 0.19326241
## LoanPaymentsOverdue -2.248520  0.9033113 -2.489197 0.02419411
print(paste0("Slope: ", sum_coef[2,1]))
## [1] "Slope: -2.24851978357815"
print(confint(indicators_lm, 'LoanPaymentsOverdue', level=0.95))
##                         2.5 %     97.5 %
## LoanPaymentsOverdue -4.163454 -0.3335853

There is evidence fo significant linear association as the 95% confidence interval only span negative numbers.

  1. Use the fitted regression model to estimate \(E(Y|X = 4)\). Find a 95% confidence interval for \(E(Y|X = 4)\). Is 0% a feasible value for \(E(Y|X = 4)\)? Give a reason to support your answer.
(predict(indicators_lm, data.frame(LoanPaymentsOverdue = 4), interval="confidence"))
##         fit       lwr       upr
## 1 -4.479585 -6.648849 -2.310322

0% is NOT a feasible value as that is outside the 95% confidence interval.