#Question 1
#1. Exercise
#For the prostate data in faraway, fit a model with lpsa as the response and the other variables as predictors.
#Compute 90% and 95% CIs for the parameter associated with age
# Question 1: regress lpsa on every other column of prostate and report 90% and
# 95% confidence intervals for the coefficients (the exercise asks about age).
# library() stops immediately if faraway is not installed; require() would only
# return FALSE and the script would fail later when `prostate` is looked up.
library(faraway)
g <- lm(lpsa ~ ., data = prostate)
summary(g)
##
## Call:
## lm(formula = lpsa ~ ., data = prostate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7331 -0.3713 -0.0170 0.4141 1.6381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.669337 1.296387 0.516 0.60693
## lcavol 0.587022 0.087920 6.677 2.11e-09 ***
## lweight 0.454467 0.170012 2.673 0.00896 **
## age -0.019637 0.011173 -1.758 0.08229 .
## lbph 0.107054 0.058449 1.832 0.07040 .
## svi 0.766157 0.244309 3.136 0.00233 **
## lcp -0.105474 0.091013 -1.159 0.24964
## gleason 0.045142 0.157465 0.287 0.77503
## pgg45 0.004525 0.004421 1.024 0.30886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7084 on 88 degrees of freedom
## Multiple R-squared: 0.6548, Adjusted R-squared: 0.6234
## F-statistic: 20.86 on 8 and 88 DF, p-value: < 2.2e-16
# 90% confidence intervals for all coefficients (see the `age` row).
confint(g, level = 0.90)
## 5 % 95 %
## (Intercept) -1.485718237 2.824391633
## lcavol 0.440867156 0.733176497
## lweight 0.171846568 0.737088281
## age -0.038210200 -0.001064151
## lbph 0.009890745 0.204217317
## svi 0.360029029 1.172285623
## lcp -0.256770899 0.045822373
## gleason -0.216620186 0.306903382
## pgg45 -0.002824333 0.011874796
# 95% confidence intervals for all coefficients (see the `age` row).
confint(g, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -1.906960983 3.245634379
## lcavol 0.412298699 0.761744954
## lweight 0.116603435 0.792331414
## age -0.041840618 0.002566267
## lbph -0.009101499 0.223209561
## svi 0.280644232 1.251670420
## lcp -0.286344443 0.075395916
## gleason -0.267786053 0.358069248
## pgg45 -0.004260932 0.013311395
# Interpretation for age: the 90% interval (-0.0382, -0.0011) excludes 0 while
# the 95% interval (-0.0418, 0.0026) contains 0. This agrees with the p-value of
# 0.082 for age in the summary: significant at the 10% level but not at 5%.
#Question 2
#2. Exercise
#Compute and display a 95% joint confidence region for
#the parameters associated with age and lbph.
#Plot the origin and report the outcome of the appropriate hypothesis test. Affirm
#this conclusion with an appropriate partial F-test.
# Question 2: 95% joint confidence region for (beta_age, beta_lbph) from the
# full model g, followed by the matching partial F-test.
# library() fails loudly if ellipse is missing; require() would only return FALSE.
library(ellipse)
plot(ellipse(g, c("age", "lbph")),
     type = "l",
     main = "Joint Confidence Region")
# The origin represents the joint null hypothesis H0: beta_age = beta_lbph = 0.
points(0, 0)
# Point estimate (centre of the ellipse).
points(coef(g)["age"], coef(g)["lbph"], pch = 18)
# Dashed lines mark the individual (marginal) 95% confidence intervals.
abline(v = confint(g)["age", ], lty = 2)
abline(h = confint(g)["lbph", ], lty = 2)

# We do not reject H0: beta_age = beta_lbph = 0 at the 5% level because the
# origin lies inside the ellipse.
# The equivalent partial F-test compares the full model with the reduced model
# that drops ONLY age and lbph. Its p-value exceeds 0.05 exactly when the origin
# lies inside the 95% joint confidence region.
g2 <- lm(lpsa ~ . - age - lbph, data = prostate)
anova(g2, g)
# NOTE: the table previously pasted here compared g against lpsa ~ age + lbph.
# That comparison tests whether the OTHER six predictors can be dropped
# (F = 25.891 on 6 and 88 df, p < 2.2e-16) and says nothing about age and lbph.
# Re-run this section to obtain the table for the corrected comparison.
# Reading the result: a p-value above 0.05 means we fail to reject H0 (age and
# lbph can be dropped together); a p-value below 0.05 means we reject H0.
#Question 3
#Predict lpsa (95%) for a new patient with lcavol = 1.22692, lweight = 3.62301, age = 65, lbph = -0.3001, svi = 0.0,
#lcp = -0.79851, gleason = 7.0, pgg45 = 15.0.
#Do this again for the mean response. Using the exp() function, obtain the new prediction and mean response for psa.
# Question 3: new patient with age = 65. The exercise asks for BOTH a 95%
# prediction interval (a single future observation) and a 95% confidence
# interval for the mean response, each on the lpsa scale and, via exp(), on
# the psa scale.
x0 <- data.frame(
  lcavol = 1.22692,
  lweight = 3.62301,
  age = 65,
  lbph = -0.3001,
  svi = 0.0,
  lcp = -0.79851,
  gleason = 7.0,
  pgg45 = 15.0
)
# (a) 95% prediction interval for a new patient's lpsa.
predict(g, x0, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 2.195654 0.7708983 3.62041
# Same interval on the psa scale; exp() is monotone, so the endpoints map directly.
exp(predict(g, x0, interval = "prediction", level = 0.95))
## fit lwr upr
## 1 8.985877 2.161707 37.35288
# (b) 95% confidence interval for the mean response at x0. This call was
# missing, so no output is pasted here; re-run to obtain it. The interval has
# the same centre as (a) but is narrower, because it leaves out the
# variability of an individual observation.
predict(g, x0, interval = "confidence", level = 0.95)
# Back-transformed to the psa scale. exp() of the mean of lpsa estimates the
# median, not the arithmetic mean, of psa.
exp(predict(g, x0, interval = "confidence", level = 0.95))
#Question 4
#Repeat the above exercise with new patient age = 20
# Question 4: same patient as Question 3 but with age = 20. As there, both the
# prediction interval and the mean-response interval are required.
# NOTE(review): age = 20 is presumably far outside the ages observed in
# prostate, which would make this an extrapolation; confirm with
# range(prostate$age).
x0 <- data.frame(
  lcavol = 1.22692,
  lweight = 3.62301,
  age = 20,
  lbph = -0.3001,
  svi = 0.0,
  lcp = -0.79851,
  gleason = 7.0,
  pgg45 = 15.0
)
# (a) 95% prediction interval for a new patient's lpsa.
predict(g, x0, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 3.079327 1.357826 4.800828
# Same interval on the psa scale.
exp(predict(g, x0, interval = "prediction", level = 0.95))
## fit lwr upr
## 1 21.74376 3.887732 121.6111
# (b) 95% confidence interval for the mean response at x0. This call was
# missing, so no output is pasted here; re-run to obtain it.
predict(g, x0, interval = "confidence", level = 0.95)
# Back-transformed to the psa scale (estimates the median psa, not the mean).
exp(predict(g, x0, interval = "confidence", level = 0.95))
#Question 5
#For the model in exercise 1, remove all the predictors that are not significant at the 5% level.
#Recompute the predictions for exercises 3 and 4. Compare CIs. On the psa scale, which CIs do you prefer?
# Question 5: reduced model keeping only the predictors that are significant at
# the 5% level in the full model g. Those are lcavol, lweight and svi.
# age (p = 0.082) and lbph (p = 0.070) are NOT significant at 5%, so both are
# dropped. This is the same small model that Exercise 6 fits as fit2.
fm.1 <- lm(lpsa ~ lcavol + lweight + svi, data = prostate)
summary(fm.1)
# New patients from Questions 3 and 4 (lweight = 3.62301, as in those
# questions). The age column is kept only to identify the patient; fm.1 has no
# age term, so predict() does not use it.
newP.data0 <- data.frame(lcavol = 1.22692, lweight = 3.62301, age = 65, svi = 0)
predict(fm.1, newP.data0, interval = "prediction", level = 0.95)
exp(predict(fm.1, newP.data0, interval = "prediction", level = 0.95))
newP.data1 <- data.frame(lcavol = 1.22692, lweight = 3.62301, age = 20, svi = 0)
predict(fm.1, newP.data1, interval = "prediction", level = 0.95)
exp(predict(fm.1, newP.data1, interval = "prediction", level = 0.95))
# NOTE: the output previously pasted in this section came from a model that
# still contained age and from a mistyped lweight (3.62031), so it has been
# removed. Re-run this section to regenerate it.
#
# Because age is not in fm.1, the two patients have identical values for every
# predictor in the model, so their intervals are identical.
# For comparison, the full-model 95% prediction intervals on the lpsa scale were
# (0.771, 3.620) for age 65 (width 2.85) and (1.358, 4.801) for age 20
# (width 3.44); the full-model interval widens as age moves to 20.
# The reduced model drops the NON-significant predictors. Its intervals are
# expected to be no wider than the full model's at age 20; confirm this from
# the regenerated output.
# On the psa scale, the narrower prediction interval is preferred.
#Exercise 6
#Test the "small" model in exercise 5 against the "big" model in
#exercise 1 at type I error probability alpha = 0.05. Which model is preferred?
# Big model: all eight predictors, written out explicitly.
fit1 <- lm(
  lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
  data = prostate
)
# Small model: only lcavol, lweight and svi.
fit2 <- lm(lpsa ~ lcavol + lweight + svi, data = prostate)
# Partial F-test of the small model against the big one. The big model is passed
# first, which is why Df and Sum of Sq are negative in the table.
anova(fit1, fit2)
## Analysis of Variance Table
##
## Model 1: lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
## Model 2: lpsa ~ lcavol + lweight + svi
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 88 44.163
## 2 93 47.785 -5 -3.6218 1.4434 0.2167
# The p-value of the F statistic is 0.2167, which is above the 0.05 significance
# level, so we fail to reject the hypothesis that the five dropped coefficients
# are all zero.
# The smaller model is therefore preferred: it is simpler and fits adequately.