# Load the ex2.5 data set.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists -- confirm before sharing the script.
data2.5 <- read.csv("/Users/nicolechen/Desktop/R_proj/For postgrad/Dataset2024forpost/ex2.5.csv")

# Fit the full linear model of y on x1..x6. The fit is stored as `lm2.5`
# rather than `lm` so that the object does not mask the name of stats::lm().
lm2.5 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6, data = data2.5)

# Print the model summary (coefficient estimates with their t-tests,
# residual standard error, R-squared and the overall F-test).
summary(lm2.5)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, data = data2.5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1696.0 -986.5 -270.6 1032.5 2502.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.288e+04 7.501e+04 -0.172 0.86633
## x1 1.597e+00 2.208e-01 7.233 6.62e-06 ***
## x2 2.760e-02 9.133e-02 0.302 0.76730
## x3 5.127e-01 5.942e-01 0.863 0.40391
## x4 -8.074e-02 6.027e-01 -0.134 0.89548
## x5 1.659e-01 5.319e-02 3.119 0.00814 **
## x6 5.194e-01 7.314e-01 0.710 0.49014
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1435 on 13 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9994
## F-statistic: 5449 on 6 and 13 DF, p-value: < 2.2e-16
从以上输出结果可以看出，回归方程的F统计量为5449，对应的p值小于2.2e-16，说明回归方程整体显著。但各系数的t检验显示，只有x1和x5显著（p值分别为6.62e-06和0.00814），其余变量x2、x3、x4、x6均不显著。
# Stepwise regression to deal with the non-significant predictors.
# The full model is refitted here so this section runs on its own; it is
# stored as `lm2.5` rather than `lm` to avoid masking stats::lm().
lm2.5 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6, data = data2.5)

# Run the stepwise search in both directions: at each step a term may be
# dropped or a previously dropped term re-added, and candidates are compared
# by AIC. The selected model is kept in `lm.step`.
lm.step <- step(lm2.5, direction = "both")
## Start: AIC=296.15
## y ~ x1 + x2 + x3 + x4 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x4 1 36974 26816409 294.18
## - x2 1 188095 26967530 294.29
## - x6 1 1038959 27818394 294.91
## - x3 1 1533308 28312742 295.26
## <none> 26779435 296.15
## - x5 1 20042169 46821604 305.32
## - x1 1 107776958 134556393 326.44
##
## Step: AIC=294.18
## y ~ x1 + x2 + x3 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x2 1 210582 27026991 292.33
## - x6 1 1045984 27862392 292.94
## <none> 26816409 294.18
## - x3 1 3909502 30725911 294.90
## + x4 1 36974 26779435 296.15
## - x5 1 31758103 58574512 307.80
## - x1 1 107802170 134618578 324.44
##
## Step: AIC=292.33
## y ~ x1 + x3 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x6 1 1019260 28046251 291.07
## <none> 27026991 292.33
## + x2 1 210582 26816409 294.18
## + x4 1 59461 26967530 294.29
## - x3 1 10260882 37287873 296.77
## - x5 1 45929206 72956197 310.19
## - x1 1 181638367 208665357 331.21
##
## Step: AIC=291.07
## y ~ x1 + x3 + x5
##
## Df Sum of Sq RSS AIC
## <none> 28046251 291.07
## + x6 1 1019260 27026991 292.33
## + x2 1 183858 27862392 292.94
## + x4 1 43891 28002360 293.04
## - x3 1 11965198 40011449 296.18
## - x5 1 44956437 73002687 308.21
## - x1 1 199336940 227383190 330.93
# Print the summary of the model selected by the stepwise search.
summary(object = lm.step)
##
## Call:
## lm(formula = y ~ x1 + x3 + x5, data = data2.5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2239.2 -1002.2 -227.8 890.5 2277.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.966e+04 1.804e+03 -10.898 8.20e-09 ***
## x1 1.593e+00 1.494e-01 10.664 1.12e-08 ***
## x3 7.018e-01 2.686e-01 2.613 0.018850 *
## x5 1.492e-01 2.946e-02 5.064 0.000115 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1324 on 16 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9995
## F-statistic: 1.281e+04 on 3 and 16 DF, p-value: < 2.2e-16
以上结果表明，逐步回归最终只保留变量x1、x3和x5，此时模型的AIC最小，为291.07。x1、x3和x5的系数均显著，回归方程整体也显著，因此得到如下“最优”回归方程：y = -19660 + 1.593·x1 + 0.7018·x3 + 0.1492·x5
# Load the ex2.6 data set.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists -- confirm before sharing the script.
data2.6 <- read.csv("/Users/nicolechen/Desktop/R_proj/For postgrad/Dataset2024forpost/ex2.6.csv")

# Fit the full linear model of y on x1..x5. The fit is stored as `lm2.6`
# (the name the stepwise section below also uses) rather than `lm`, so that
# the object does not mask the name of stats::lm().
lm2.6 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = data2.6)

# Print the model summary (coefficient estimates with their t-tests,
# residual standard error, R-squared and the overall F-test).
summary(lm2.6)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = data2.6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127473 -30072 1477 27211 178017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 925551.025 72652.402 12.739 1.25e-11 ***
## x1 -2.957 8.675 -0.341 0.7364
## x2 54.304 22.461 2.418 0.0244 *
## x3 -4.872 2.466 -1.976 0.0609 .
## x4 -214.738 301.481 -0.712 0.4838
## x5 -40.325 25.190 -1.601 0.1237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61730 on 22 degrees of freedom
## Multiple R-squared: 0.9983, Adjusted R-squared: 0.9979
## F-statistic: 2610 on 5 and 22 DF, p-value: < 2.2e-16
# Stepwise regression to deal with the non-significant predictors.
# Start from the full model with all five predictors.
lm2.6 <- lm(
  formula = y ~ x1 + x2 + x3 + x4 + x5,
  data = data2.6
)

# Search in both directions (terms may be dropped or re-added at each step);
# the selected model is kept in `lm.step`.
lm.step <- step(object = lm2.6, direction = "both")
## Start: AIC=622.95
## y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x1 1 4.4283e+08 8.4267e+10 621.10
## - x4 1 1.9331e+09 8.5757e+10 621.59
## <none> 8.3824e+10 622.95
## - x5 1 9.7646e+09 9.3589e+10 624.04
## - x3 1 1.4871e+10 9.8696e+10 625.53
## - x2 1 2.2271e+10 1.0610e+11 627.55
##
## Step: AIC=621.1
## y ~ x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x4 1 4.8556e+09 8.9123e+10 620.67
## <none> 8.4267e+10 621.10
## - x5 1 1.0533e+10 9.4800e+10 622.40
## + x1 1 4.4283e+08 8.3824e+10 622.95
## - x3 1 1.7568e+10 1.0184e+11 624.40
## - x2 1 2.7194e+10 1.1146e+11 626.93
##
## Step: AIC=620.67
## y ~ x2 + x3 + x5
##
## Df Sum of Sq RSS AIC
## <none> 8.9123e+10 620.67
## + x4 1 4.8556e+09 8.4267e+10 621.10
## + x1 1 3.3653e+09 8.5757e+10 621.59
## - x5 1 6.4854e+10 1.5398e+11 633.98
## - x3 1 1.0085e+11 1.8997e+11 639.86
## - x2 1 1.0533e+11 1.9446e+11 640.52
# Print the summary of the model selected by the stepwise search.
summary(object = lm.step)
##
## Call:
## lm(formula = y ~ x2 + x3 + x5, data = data2.6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145386 -28621 1109 29154 185370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 846790.577 23036.008 36.759 < 2e-16 ***
## x2 65.840 12.362 5.326 1.82e-05 ***
## x3 -6.269 1.203 -5.211 2.44e-05 ***
## x5 -55.068 13.177 -4.179 0.000335 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60940 on 24 degrees of freedom
## Multiple R-squared: 0.9982, Adjusted R-squared: 0.998
## F-statistic: 4463 on 3 and 24 DF, p-value: < 2.2e-16
以上结果表明，逐步回归最终只保留变量x2、x3和x5，此时模型的AIC最小，为620.67。x2、x3和x5的系数均显著，回归方程整体也显著，因此得到如下“最优”回归方程：y = 846790.577 + 65.840·x2 - 6.269·x3 - 55.068·x5