#HW10
#1
x = c(0,0,0,50,50,50,75,75,75,100,100,100,150,150,150,200,200,200)
y = c(1.72,1.68,1.74,2.04,2.11,2.17,2.40,2.32,2.33,2.91,3.00,2.89,4.47,4.51,4.43,6.67,6.66,6.57)

#a)
plot(x,y, xlab = "ppm", ylab = "mV")
abline(lm(y~x)$coefficients, lty = 2)

#b)
fit = lm(y ~ x + I(x^2))
summary(fit)
## 
## Call:
## lm(formula = y ~ x + I(x^2))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.085354 -0.047679 -0.004113  0.035984  0.143329 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.735e+00  3.442e-02  50.410  < 2e-16 ***
## x           -3.772e-04  7.688e-04  -0.491    0.631    
## I(x^2)       1.242e-04  3.605e-06  34.452 1.07e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06341 on 15 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9987 
## F-statistic:  6500 on 2 and 15 DF,  p-value: < 2.2e-16
#c)
xx = seq(0,200,by = 0.1)
yy = predict.lm(fit, data.frame(x = xx))
plot(x,y, xlab = "ppm", ylab = "mV")
lines(xx,yy,lwd = 2)

#2
library(faraway)

data(odor)
? odor

#a)
fit1 = lm(odor ~ temp+gas+pack+I(temp^2)+I(gas^2)+I(pack^2), data = odor)
summary(fit1)
## 
## Call:
## lm(formula = odor ~ temp + gas + pack + I(temp^2) + I(gas^2) + 
##     I(pack^2), data = odor)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.625  -9.625  -1.375   4.021  28.875 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -30.667     10.840  -2.829   0.0222 * 
## temp         -12.125      6.638  -1.827   0.1052   
## gas          -17.000      6.638  -2.561   0.0336 * 
## pack         -21.375      6.638  -3.220   0.0122 * 
## I(temp^2)     32.083      9.771   3.284   0.0111 * 
## I(gas^2)      47.833      9.771   4.896   0.0012 **
## I(pack^2)      6.083      9.771   0.623   0.5509   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.77 on 8 degrees of freedom
## Multiple R-squared:  0.8683, Adjusted R-squared:  0.7695 
## F-statistic: 8.789 on 6 and 8 DF,  p-value: 0.003616
#b)
summary(fit1)$r.squared
## [1] 0.8682799
#c)
summary(fit1)$adj.r.squared
## [1] 0.7694898
extractAIC(fit1)
## [1]  7.00000 92.54619
#3
library(faraway)

#a)1.
fit_p = lm(lpsa ~ ., data = prostate)
fit_p_back_aic = step(fit_p, direction = "backward")
## Start:  AIC=-58.32
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - gleason  1    0.0412 44.204 -60.231
## - pgg45    1    0.5258 44.689 -59.174
## - lcp      1    0.6740 44.837 -58.853
## <none>                 44.163 -58.322
## - age      1    1.5503 45.713 -56.975
## - lbph     1    1.6835 45.847 -56.693
## - lweight  1    3.5861 47.749 -52.749
## - svi      1    4.9355 49.099 -50.046
## - lcavol   1   22.3721 66.535 -20.567
## 
## Step:  AIC=-60.23
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - lcp      1    0.6623 44.867 -60.789
## <none>                 44.204 -60.231
## - pgg45    1    1.1920 45.396 -59.650
## - age      1    1.5166 45.721 -58.959
## - lbph     1    1.7053 45.910 -58.560
## - lweight  1    3.5462 47.750 -54.746
## - svi      1    4.8984 49.103 -52.037
## - lcavol   1   23.5039 67.708 -20.872
## 
## Step:  AIC=-60.79
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - pgg45    1    0.6590 45.526 -61.374
## <none>                 44.867 -60.789
## - age      1    1.2649 46.131 -60.092
## - lbph     1    1.6465 46.513 -59.293
## - lweight  1    3.5647 48.431 -55.373
## - svi      1    4.2503 49.117 -54.009
## - lcavol   1   25.4189 70.285 -19.248
## 
## Step:  AIC=-61.37
## lpsa ~ lcavol + lweight + age + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 45.526 -61.374
## - age      1    0.9592 46.485 -61.352
## - lbph     1    1.8568 47.382 -59.497
## - lweight  1    3.2251 48.751 -56.735
## - svi      1    5.9517 51.477 -51.456
## - lcavol   1   28.7665 74.292 -15.871
#a)2.
n = length(resid(fit_p))
fit_p_back_bic = step(fit_p, direction = "backward", k = log(n))
## Start:  AIC=-35.15
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - gleason  1    0.0412 44.204 -39.634
## - pgg45    1    0.5258 44.689 -38.576
## - lcp      1    0.6740 44.837 -38.255
## - age      1    1.5503 45.713 -36.377
## - lbph     1    1.6835 45.847 -36.095
## <none>                 44.163 -35.149
## - lweight  1    3.5861 47.749 -32.151
## - svi      1    4.9355 49.099 -29.448
## - lcavol   1   22.3721 66.535   0.030
## 
## Step:  AIC=-39.63
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - lcp      1    0.6623 44.867 -42.766
## - pgg45    1    1.1920 45.396 -41.627
## - age      1    1.5166 45.721 -40.936
## - lbph     1    1.7053 45.910 -40.537
## <none>                 44.204 -39.634
## - lweight  1    3.5462 47.750 -36.723
## - svi      1    4.8984 49.103 -34.014
## - lcavol   1   23.5039 67.708  -2.849
## 
## Step:  AIC=-42.77
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - pgg45    1    0.6590 45.526 -45.926
## - age      1    1.2649 46.131 -44.644
## - lbph     1    1.6465 46.513 -43.844
## <none>                 44.867 -42.766
## - lweight  1    3.5647 48.431 -39.925
## - svi      1    4.2503 49.117 -38.561
## - lcavol   1   25.4189 70.285  -3.800
## 
## Step:  AIC=-45.93
## lpsa ~ lcavol + lweight + age + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC
## - age      1    0.9592 46.485 -48.478
## - lbph     1    1.8568 47.382 -46.623
## <none>                 45.526 -45.926
## - lweight  1    3.2251 48.751 -43.862
## - svi      1    5.9517 51.477 -38.583
## - lcavol   1   28.7665 74.292  -2.997
## 
## Step:  AIC=-48.48
## lpsa ~ lcavol + lweight + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC
## - lbph     1    1.3001 47.785 -50.377
## <none>                 46.485 -48.478
## - lweight  1    2.8014 49.286 -47.377
## - svi      1    5.8063 52.291 -41.636
## - lcavol   1   27.8298 74.315  -7.542
## 
## Step:  AIC=-50.38
## lpsa ~ lcavol + lweight + svi
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 47.785 -50.377
## - svi      1    5.1814 52.966 -44.966
## - lweight  1    5.8924 53.677 -43.673
## - lcavol   1   28.0445 75.829 -10.160
#b)1.
fit_p_start = lm(lpsa ~ 1, data = prostate)
fit_p_fwd_aic = step(fit_p_start,lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, direction = "forward")
## Start:  AIC=28.84
## lpsa ~ 1
## 
##           Df Sum of Sq     RSS     AIC
## + lcavol   1    69.003  58.915 -44.366
## + svi      1    41.011  86.907  -6.658
## + lcp      1    38.528  89.389  -3.926
## + pgg45    1    22.814 105.103  11.783
## + gleason  1    17.416 110.501  16.641
## + lweight  1    16.041 111.876  17.840
## + lbph     1     4.136 123.782  27.650
## + age      1     3.679 124.238  28.007
## <none>                 127.918  28.837
## 
## Step:  AIC=-44.37
## lpsa ~ lcavol
## 
##           Df Sum of Sq    RSS     AIC
## + lweight  1    5.9485 52.966 -52.690
## + svi      1    5.2375 53.677 -51.397
## + lbph     1    3.2658 55.649 -47.898
## + pgg45    1    1.6980 57.217 -45.203
## <none>                 58.915 -44.366
## + lcp      1    0.6562 58.259 -43.453
## + gleason  1    0.4156 58.499 -43.053
## + age      1    0.0025 58.912 -42.370
## 
## Step:  AIC=-52.69
## lpsa ~ lcavol + lweight
## 
##           Df Sum of Sq    RSS     AIC
## + svi      1    5.1814 47.785 -60.676
## + pgg45    1    1.9489 51.017 -54.327
## <none>                 52.966 -52.690
## + lcp      1    0.8371 52.129 -52.236
## + gleason  1    0.7810 52.185 -52.131
## + lbph     1    0.6751 52.291 -51.935
## + age      1    0.4200 52.546 -51.463
## 
## Step:  AIC=-60.68
## lpsa ~ lcavol + lweight + svi
## 
##           Df Sum of Sq    RSS     AIC
## + lbph     1   1.30006 46.485 -61.352
## <none>                 47.785 -60.676
## + pgg45    1   0.57347 47.211 -59.847
## + age      1   0.40251 47.382 -59.497
## + gleason  1   0.38901 47.396 -59.469
## + lcp      1   0.06412 47.721 -58.806
## 
## Step:  AIC=-61.35
## lpsa ~ lcavol + lweight + svi + lbph
## 
##           Df Sum of Sq    RSS     AIC
## + age      1   0.95924 45.526 -61.374
## <none>                 46.485 -61.352
## + pgg45    1   0.35332 46.131 -60.092
## + gleason  1   0.21256 46.272 -59.796
## + lcp      1   0.10230 46.383 -59.565
## 
## Step:  AIC=-61.37
## lpsa ~ lcavol + lweight + svi + lbph + age
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 45.526 -61.374
## + pgg45    1   0.65896 44.867 -60.789
## + gleason  1   0.45601 45.070 -60.351
## + lcp      1   0.12927 45.396 -59.650
#b)2.
n = length(resid(fit_p))
fit_p_fwd_bic = step(fit_p_start,lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, direction = "forward", k = log(n))
## Start:  AIC=31.41
## lpsa ~ 1
## 
##           Df Sum of Sq     RSS     AIC
## + lcavol   1    69.003  58.915 -39.217
## + svi      1    41.011  86.907  -1.508
## + lcp      1    38.528  89.389   1.224
## + pgg45    1    22.814 105.103  16.932
## + gleason  1    17.416 110.501  21.790
## + lweight  1    16.041 111.876  22.990
## <none>                 127.918  31.412
## + lbph     1     4.136 123.782  32.799
## + age      1     3.679 124.238  33.156
## 
## Step:  AIC=-39.22
## lpsa ~ lcavol
## 
##           Df Sum of Sq    RSS     AIC
## + lweight  1    5.9485 52.966 -44.966
## + svi      1    5.2375 53.677 -43.673
## + lbph     1    3.2658 55.649 -40.174
## <none>                 58.915 -39.217
## + pgg45    1    1.6980 57.217 -37.479
## + lcp      1    0.6562 58.259 -35.728
## + gleason  1    0.4156 58.499 -35.329
## + age      1    0.0025 58.912 -34.646
## 
## Step:  AIC=-44.97
## lpsa ~ lcavol + lweight
## 
##           Df Sum of Sq    RSS     AIC
## + svi      1    5.1814 47.785 -50.377
## <none>                 52.966 -44.966
## + pgg45    1    1.9489 51.017 -44.028
## + lcp      1    0.8371 52.129 -41.937
## + gleason  1    0.7810 52.185 -41.833
## + lbph     1    0.6751 52.291 -41.636
## + age      1    0.4200 52.546 -41.164
## 
## Step:  AIC=-50.38
## lpsa ~ lcavol + lweight + svi
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 47.785 -50.377
## + lbph     1   1.30006 46.485 -48.478
## + pgg45    1   0.57347 47.211 -46.974
## + age      1   0.40251 47.382 -46.623
## + gleason  1   0.38901 47.396 -46.596
## + lcp      1   0.06412 47.721 -45.933
#c)
library(leaps)
all_fits = regsubsets(lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, data = prostate)
all_fits_sum = summary (all_fits)
all_fits_sum$which
##   (Intercept) lcavol lweight   age  lbph   svi   lcp gleason pgg45
## 1        TRUE   TRUE   FALSE FALSE FALSE FALSE FALSE   FALSE FALSE
## 2        TRUE   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE FALSE
## 3        TRUE   TRUE    TRUE FALSE FALSE  TRUE FALSE   FALSE FALSE
## 4        TRUE   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE FALSE
## 5        TRUE   TRUE    TRUE  TRUE  TRUE  TRUE FALSE   FALSE FALSE
## 6        TRUE   TRUE    TRUE  TRUE  TRUE  TRUE FALSE   FALSE  TRUE
## 7        TRUE   TRUE    TRUE  TRUE  TRUE  TRUE  TRUE   FALSE  TRUE
## 8        TRUE   TRUE    TRUE  TRUE  TRUE  TRUE  TRUE    TRUE  TRUE
#AIC
p = length(coef(fit_p))
n = length(resid(fit_p))
AIC = n*log(all_fits_sum$rss/n) + 2*(2:p)
which.min(AIC)
## [1] 5
#BIC
BIC = n*log(all_fits_sum$rss/n) + log(n)*(2:p)
which.min(BIC)
## [1] 3
#R2adj
R_adj = all_fits_sum$adjr2
which.max(R_adj)
## [1] 7
R_adj
## [1] 0.5345838 0.5771246 0.6143899 0.6208036 0.6245476 0.6258707 0.6272521
## [8] 0.6233681
#d)
#a)1
summary(lm(lpsa ~ lcavol + lweight + age + lbph + svi, data = prostate))$adj.r.squared
## [1] 0.6245476
#a)2
summary(lm(lpsa ~ lcavol + lweight + svi, data = prostate))$adj.r.squared
## [1] 0.6143899
#b)1
summary(lm(lpsa ~ lcavol + lweight + svi + lbph + age, data = prostate))$adj.r.squared
## [1] 0.6245476
#b)2
summary(lm(lpsa ~ lcavol + lweight + svi, data = prostate))$adj.r.squared
## [1] 0.6143899
#c)1
summary(lm(lpsa ~ lcavol + lweight + age + lbph + svi, data = prostate))$adj.r.squared
## [1] 0.6245476
#c)2
summary(lm(lpsa ~ lcavol + lweight + svi, data = prostate))$adj.r.squared
## [1] 0.6143899
#c)3
summary(lm(lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45, data = prostate))$adj.r.squared
## [1] 0.6272521
#d) alternative
library(faraway)
fit_best_1 = lm (lpsa ~ lcavol + lweight + svi, data = prostate)
fit_best_2 = lm (lpsa ~ lcavol + lweight + age + lbph + svi, data = prostate)
anova(fit_best_1,fit_best_2)
## Analysis of Variance Table
## 
## Model 1: lpsa ~ lcavol + lweight + svi
## Model 2: lpsa ~ lcavol + lweight + age + lbph + svi
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     93 47.785                          
## 2     91 45.526  2    2.2593 2.258 0.1104