#HW10
#1
x = c(0,0,0,50,50,50,75,75,75,100,100,100,150,150,150,200,200,200)
y = c(1.72,1.68,1.74,2.04,2.11,2.17,2.40,2.32,2.33,2.91,3.00,2.89,4.47,4.51,4.43,6.67,6.66,6.57)
#a)
plot(x,y, xlab = "ppm", ylab = "mV")
abline(lm(y~x)$coefficients, lty = 2)

#b)
fit = lm(y ~ x + I(x^2))
summary(fit)
##
## Call:
## lm(formula = y ~ x + I(x^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.085354 -0.047679 -0.004113 0.035984 0.143329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.735e+00 3.442e-02 50.410 < 2e-16 ***
## x -3.772e-04 7.688e-04 -0.491 0.631
## I(x^2) 1.242e-04 3.605e-06 34.452 1.07e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06341 on 15 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9987
## F-statistic: 6500 on 2 and 15 DF, p-value: < 2.2e-16
#c)
xx = seq(0,200,by = 0.1)
yy = predict.lm(fit, data.frame(x = xx))
plot(x,y, xlab = "ppm", ylab = "mV")
lines(xx,yy,lwd = 2)
#2
library(faraway)

data(odor)
? odor
#a)
fit1 = lm(odor ~ temp+gas+pack+I(temp^2)+I(gas^2)+I(pack^2), data = odor)
summary(fit1)
##
## Call:
## lm(formula = odor ~ temp + gas + pack + I(temp^2) + I(gas^2) +
## I(pack^2), data = odor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.625 -9.625 -1.375 4.021 28.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.667 10.840 -2.829 0.0222 *
## temp -12.125 6.638 -1.827 0.1052
## gas -17.000 6.638 -2.561 0.0336 *
## pack -21.375 6.638 -3.220 0.0122 *
## I(temp^2) 32.083 9.771 3.284 0.0111 *
## I(gas^2) 47.833 9.771 4.896 0.0012 **
## I(pack^2) 6.083 9.771 0.623 0.5509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.77 on 8 degrees of freedom
## Multiple R-squared: 0.8683, Adjusted R-squared: 0.7695
## F-statistic: 8.789 on 6 and 8 DF, p-value: 0.003616
#b)
summary(fit1)$r.squared
## [1] 0.8682799
#c)
summary(fit1)$adj.r.squared
## [1] 0.7694898
extractAIC(fit1)
## [1] 7.00000 92.54619
#3
library(faraway)
#a)1.
fit_p = lm(lpsa ~ ., data = prostate)
fit_p_back_aic = step(fit_p, direction = "backward")
## Start: AIC=-58.32
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
##
## Df Sum of Sq RSS AIC
## - gleason 1 0.0412 44.204 -60.231
## - pgg45 1 0.5258 44.689 -59.174
## - lcp 1 0.6740 44.837 -58.853
## <none> 44.163 -58.322
## - age 1 1.5503 45.713 -56.975
## - lbph 1 1.6835 45.847 -56.693
## - lweight 1 3.5861 47.749 -52.749
## - svi 1 4.9355 49.099 -50.046
## - lcavol 1 22.3721 66.535 -20.567
##
## Step: AIC=-60.23
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
##
## Df Sum of Sq RSS AIC
## - lcp 1 0.6623 44.867 -60.789
## <none> 44.204 -60.231
## - pgg45 1 1.1920 45.396 -59.650
## - age 1 1.5166 45.721 -58.959
## - lbph 1 1.7053 45.910 -58.560
## - lweight 1 3.5462 47.750 -54.746
## - svi 1 4.8984 49.103 -52.037
## - lcavol 1 23.5039 67.708 -20.872
##
## Step: AIC=-60.79
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC
## - pgg45 1 0.6590 45.526 -61.374
## <none> 44.867 -60.789
## - age 1 1.2649 46.131 -60.092
## - lbph 1 1.6465 46.513 -59.293
## - lweight 1 3.5647 48.431 -55.373
## - svi 1 4.2503 49.117 -54.009
## - lcavol 1 25.4189 70.285 -19.248
##
## Step: AIC=-61.37
## lpsa ~ lcavol + lweight + age + lbph + svi
##
## Df Sum of Sq RSS AIC
## <none> 45.526 -61.374
## - age 1 0.9592 46.485 -61.352
## - lbph 1 1.8568 47.382 -59.497
## - lweight 1 3.2251 48.751 -56.735
## - svi 1 5.9517 51.477 -51.456
## - lcavol 1 28.7665 74.292 -15.871
#a)2.
n = length(resid(fit_p))
fit_p_back_bic = step(fit_p, direction = "backward", k = log(n))
## Start: AIC=-35.15
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason +
## pgg45
##
## Df Sum of Sq RSS AIC
## - gleason 1 0.0412 44.204 -39.634
## - pgg45 1 0.5258 44.689 -38.576
## - lcp 1 0.6740 44.837 -38.255
## - age 1 1.5503 45.713 -36.377
## - lbph 1 1.6835 45.847 -36.095
## <none> 44.163 -35.149
## - lweight 1 3.5861 47.749 -32.151
## - svi 1 4.9355 49.099 -29.448
## - lcavol 1 22.3721 66.535 0.030
##
## Step: AIC=-39.63
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
##
## Df Sum of Sq RSS AIC
## - lcp 1 0.6623 44.867 -42.766
## - pgg45 1 1.1920 45.396 -41.627
## - age 1 1.5166 45.721 -40.936
## - lbph 1 1.7053 45.910 -40.537
## <none> 44.204 -39.634
## - lweight 1 3.5462 47.750 -36.723
## - svi 1 4.8984 49.103 -34.014
## - lcavol 1 23.5039 67.708 -2.849
##
## Step: AIC=-42.77
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
##
## Df Sum of Sq RSS AIC
## - pgg45 1 0.6590 45.526 -45.926
## - age 1 1.2649 46.131 -44.644
## - lbph 1 1.6465 46.513 -43.844
## <none> 44.867 -42.766
## - lweight 1 3.5647 48.431 -39.925
## - svi 1 4.2503 49.117 -38.561
## - lcavol 1 25.4189 70.285 -3.800
##
## Step: AIC=-45.93
## lpsa ~ lcavol + lweight + age + lbph + svi
##
## Df Sum of Sq RSS AIC
## - age 1 0.9592 46.485 -48.478
## - lbph 1 1.8568 47.382 -46.623
## <none> 45.526 -45.926
## - lweight 1 3.2251 48.751 -43.862
## - svi 1 5.9517 51.477 -38.583
## - lcavol 1 28.7665 74.292 -2.997
##
## Step: AIC=-48.48
## lpsa ~ lcavol + lweight + lbph + svi
##
## Df Sum of Sq RSS AIC
## - lbph 1 1.3001 47.785 -50.377
## <none> 46.485 -48.478
## - lweight 1 2.8014 49.286 -47.377
## - svi 1 5.8063 52.291 -41.636
## - lcavol 1 27.8298 74.315 -7.542
##
## Step: AIC=-50.38
## lpsa ~ lcavol + lweight + svi
##
## Df Sum of Sq RSS AIC
## <none> 47.785 -50.377
## - svi 1 5.1814 52.966 -44.966
## - lweight 1 5.8924 53.677 -43.673
## - lcavol 1 28.0445 75.829 -10.160
#b)1.
fit_p_start = lm(lpsa ~ 1, data = prostate)
fit_p_fwd_aic = step(fit_p_start,lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, direction = "forward")
## Start: AIC=28.84
## lpsa ~ 1
##
## Df Sum of Sq RSS AIC
## + lcavol 1 69.003 58.915 -44.366
## + svi 1 41.011 86.907 -6.658
## + lcp 1 38.528 89.389 -3.926
## + pgg45 1 22.814 105.103 11.783
## + gleason 1 17.416 110.501 16.641
## + lweight 1 16.041 111.876 17.840
## + lbph 1 4.136 123.782 27.650
## + age 1 3.679 124.238 28.007
## <none> 127.918 28.837
##
## Step: AIC=-44.37
## lpsa ~ lcavol
##
## Df Sum of Sq RSS AIC
## + lweight 1 5.9485 52.966 -52.690
## + svi 1 5.2375 53.677 -51.397
## + lbph 1 3.2658 55.649 -47.898
## + pgg45 1 1.6980 57.217 -45.203
## <none> 58.915 -44.366
## + lcp 1 0.6562 58.259 -43.453
## + gleason 1 0.4156 58.499 -43.053
## + age 1 0.0025 58.912 -42.370
##
## Step: AIC=-52.69
## lpsa ~ lcavol + lweight
##
## Df Sum of Sq RSS AIC
## + svi 1 5.1814 47.785 -60.676
## + pgg45 1 1.9489 51.017 -54.327
## <none> 52.966 -52.690
## + lcp 1 0.8371 52.129 -52.236
## + gleason 1 0.7810 52.185 -52.131
## + lbph 1 0.6751 52.291 -51.935
## + age 1 0.4200 52.546 -51.463
##
## Step: AIC=-60.68
## lpsa ~ lcavol + lweight + svi
##
## Df Sum of Sq RSS AIC
## + lbph 1 1.30006 46.485 -61.352
## <none> 47.785 -60.676
## + pgg45 1 0.57347 47.211 -59.847
## + age 1 0.40251 47.382 -59.497
## + gleason 1 0.38901 47.396 -59.469
## + lcp 1 0.06412 47.721 -58.806
##
## Step: AIC=-61.35
## lpsa ~ lcavol + lweight + svi + lbph
##
## Df Sum of Sq RSS AIC
## + age 1 0.95924 45.526 -61.374
## <none> 46.485 -61.352
## + pgg45 1 0.35332 46.131 -60.092
## + gleason 1 0.21256 46.272 -59.796
## + lcp 1 0.10230 46.383 -59.565
##
## Step: AIC=-61.37
## lpsa ~ lcavol + lweight + svi + lbph + age
##
## Df Sum of Sq RSS AIC
## <none> 45.526 -61.374
## + pgg45 1 0.65896 44.867 -60.789
## + gleason 1 0.45601 45.070 -60.351
## + lcp 1 0.12927 45.396 -59.650
#b)2.
n = length(resid(fit_p))
fit_p_fwd_bic = step(fit_p_start,lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, direction = "forward", k = log(n))
## Start: AIC=31.41
## lpsa ~ 1
##
## Df Sum of Sq RSS AIC
## + lcavol 1 69.003 58.915 -39.217
## + svi 1 41.011 86.907 -1.508
## + lcp 1 38.528 89.389 1.224
## + pgg45 1 22.814 105.103 16.932
## + gleason 1 17.416 110.501 21.790
## + lweight 1 16.041 111.876 22.990
## <none> 127.918 31.412
## + lbph 1 4.136 123.782 32.799
## + age 1 3.679 124.238 33.156
##
## Step: AIC=-39.22
## lpsa ~ lcavol
##
## Df Sum of Sq RSS AIC
## + lweight 1 5.9485 52.966 -44.966
## + svi 1 5.2375 53.677 -43.673
## + lbph 1 3.2658 55.649 -40.174
## <none> 58.915 -39.217
## + pgg45 1 1.6980 57.217 -37.479
## + lcp 1 0.6562 58.259 -35.728
## + gleason 1 0.4156 58.499 -35.329
## + age 1 0.0025 58.912 -34.646
##
## Step: AIC=-44.97
## lpsa ~ lcavol + lweight
##
## Df Sum of Sq RSS AIC
## + svi 1 5.1814 47.785 -50.377
## <none> 52.966 -44.966
## + pgg45 1 1.9489 51.017 -44.028
## + lcp 1 0.8371 52.129 -41.937
## + gleason 1 0.7810 52.185 -41.833
## + lbph 1 0.6751 52.291 -41.636
## + age 1 0.4200 52.546 -41.164
##
## Step: AIC=-50.38
## lpsa ~ lcavol + lweight + svi
##
## Df Sum of Sq RSS AIC
## <none> 47.785 -50.377
## + lbph 1 1.30006 46.485 -48.478
## + pgg45 1 0.57347 47.211 -46.974
## + age 1 0.40251 47.382 -46.623
## + gleason 1 0.38901 47.396 -46.596
## + lcp 1 0.06412 47.721 -45.933
#c)
library(leaps)
all_fits = regsubsets(lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, data = prostate)
all_fits_sum = summary (all_fits)
all_fits_sum$which
## (Intercept) lcavol lweight age lbph svi lcp gleason pgg45
## 1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 4 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
## 5 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## 7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## 8 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#AIC
p = length(coef(fit_p))
n = length(resid(fit_p))
AIC = n*log(all_fits_sum$rss/n) + 2*(2:p)
which.min(AIC)
## [1] 5
#BIC
BIC = n*log(all_fits_sum$rss/n) + log(n)*(2:p)
which.min(BIC)
## [1] 3
#R2adj
R_adj = all_fits_sum$adjr2
which.max(R_adj)
## [1] 7
R_adj
## [1] 0.5345838 0.5771246 0.6143899 0.6208036 0.6245476 0.6258707 0.6272521
## [8] 0.6233681
#d)
#a)1
summary(lm(lpsa ~ lcavol + lweight + age + lbph + svi, data = prostate))$adj.r.squared
## [1] 0.6245476
#a)2
summary(lm(lpsa ~ lcavol + lweight + svi, data = prostate))$adj.r.squared
## [1] 0.6143899
#b)1
summary(lm(lpsa ~ lcavol + lweight + svi + lbph + age, data = prostate))$adj.r.squared
## [1] 0.6245476
#b)2
summary(lm(lpsa ~ lcavol + lweight + svi, data = prostate))$adj.r.squared
## [1] 0.6143899
#c)1
summary(lm(lpsa ~ lcavol + lweight + age + lbph + svi, data = prostate))$adj.r.squared
## [1] 0.6245476
#c)2
summary(lm(lpsa ~ lcavol + lweight + svi, data = prostate))$adj.r.squared
## [1] 0.6143899
#c)3
summary(lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45, data = prostate))$adj.r.squared
## [1] 0.6272521
#d) alternative
library(faraway)
fit_best_1 = lm (lpsa ~ lcavol + lweight + svi, data = prostate)
fit_best_2 = lm (lpsa ~ lcavol + lweight + age + lbph + svi, data = prostate)
anova(fit_best_1,fit_best_2)
## Analysis of Variance Table
##
## Model 1: lpsa ~ lcavol + lweight + svi
## Model 2: lpsa ~ lcavol + lweight + age + lbph + svi
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 93 47.785
## 2 91 45.526 2 2.2593 2.258 0.1104