setwd("D:\\Rdownload\\lianxi\\second") #set the working directory
d2.5<-read.csv("ex2.5.csv", header = T) #read ex2.5.csv into d2.5
lm.exam<-lm(y~x1+x2+x3+x4+x5+x6,data=d2.5) #fit the linear regression of y on x1-x6 using d2.5
summary(lm.exam) #coefficient estimates, significance tests, etc.
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, data = d2.5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3078.2 -713.3 -118.6 674.8 2852.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.716e+04 1.585e+04 2.345 0.028937 *
## x1 -7.792e-01 3.351e-01 -2.326 0.030138 *
## x2 2.308e-01 5.888e-02 3.920 0.000786 ***
## x3 5.425e-01 8.940e-01 0.607 0.550460
## x4 -3.059e-01 1.636e-01 -1.869 0.075580 .
## x5 4.600e-01 1.527e-01 3.012 0.006636 **
## x6 -5.757e-01 6.274e-01 -0.918 0.369255
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1428 on 21 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9981
## F-statistic: 2338 on 6 and 21 DF, p-value: < 2.2e-16
##The F statistic of the regression is 2338 with a p-value below 2.2e-16, so the equation as a whole is significant; the individual t-tests, however, show that the intercept, x1, x2, x4, and x5 are significant while x3 and x6 are not.
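##As a quick check (a minimal sketch using the fitted lm.exam), the p-values can be pulled straight from the coefficient table:
coef(summary(lm.exam))[, "Pr(>|t|)"] < 0.05 #TRUE for terms significant at the 5% level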
lm.step<-step(lm.exam,direction="both") #stepwise regression (both directions)
## Start: AIC=412.73
## y ~ x1 + x2 + x3 + x4 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x3 1 750912 43571799 411.22
## - x6 1 1716840 44537727 411.83
## <none> 42820887 412.73
## - x4 1 7125754 49946641 415.04
## - x1 1 11028364 53849251 417.15
## - x5 1 18499558 61320445 420.78
## - x2 1 31333958 74154845 426.10
##
## Step: AIC=411.22
## y ~ x1 + x2 + x4 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x6 1 1114248 44686047 409.92
## <none> 43571799 411.22
## + x3 1 750912 42820887 412.73
## - x1 1 10574111 54145911 415.30
## - x4 1 21835301 65407100 420.59
## - x2 1 32092087 75663886 424.67
## - x5 1 112640001 156211801 444.97
##
## Step: AIC=409.92
## y ~ x1 + x2 + x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 44686047 409.92
## + x6 1 1114248 43571799 411.22
## + x3 1 148320 44537727 411.83
## - x1 1 9603822 54289869 413.37
## - x4 1 31433342 76119389 422.84
## - x2 1 32250373 76936420 423.14
## - x5 1 111626871 156312918 442.98
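##step() compares models by extractAIC(), so the AIC reported in the trace (409.92 for the selected model) can be reproduced directly; a minimal check on the fitted lm.step:
extractAIC(lm.step) #returns the equivalent degrees of freedom and the AIC used by step()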
summary(lm.step) #summary of the selected regression model
##
## Call:
## lm(formula = y ~ x1 + x2 + x4 + x5, data = d2.5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2972.2 -736.2 -232.6 1003.9 2551.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.556e+04 1.088e+04 4.189 0.000352 ***
## x1 -6.482e-01 2.916e-01 -2.223 0.036302 *
## x2 2.337e-01 5.735e-02 4.074 0.000468 ***
## x4 -4.115e-01 1.023e-01 -4.022 0.000532 ***
## x5 5.376e-01 7.093e-02 7.580 1.07e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1394 on 23 degrees of freedom
## Multiple R-squared: 0.9984, Adjusted R-squared: 0.9982
## F-statistic: 3681 on 4 and 23 DF, p-value: < 2.2e-16
##The selected equation is therefore y = 45560 - 0.65 x1 + 0.23 x2 - 0.41 x4 + 0.54 x5
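##The rounded coefficients can also be read off programmatically (a minimal sketch):
round(coef(lm.step), 2) #intercept and slopes of the selected model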
y.res<-residuals(lm.step) #ordinary residuals of the selected model lm.step
y.rst<-rstandard(lm.step) #standardized residuals of lm.step
print(y.rst) #print the standardized residuals y.rst
## 1 2 3 4 5 6
## -1.21699581 -0.78721946 -0.38233142 -0.27703068 0.28527306 1.22685046
## 7 8 9 10 11 12
## 1.32333522 1.22932548 0.81626125 0.45556993 -0.10743258 -0.69287852
## 13 14 15 16 17 18
## -0.92329244 -0.49363842 -0.24834446 -0.53198827 0.18421338 -0.03401075
## 19 20 21 22 23 24
## -0.37768467 0.74526662 -0.27973593 -0.82441646 2.20553032 0.07556743
## 25 26 27 28
## -1.73291937 -2.37088887 1.36683593 1.96225224
##All standardized residuals lie within the range -3 to 3, so no outliers are indicated.
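##The claim can be verified programmatically; a minimal check (integer(0) means no observation exceeds the threshold):
which(abs(y.rst) > 3) #observations with |standardized residual| greater than 3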
y.fit<-predict(lm.step) #fitted values of lm.step
plot(y.res~ y.fit) #scatter plot of ordinary residuals against fitted values
plot(y.rst~ y.fit) #scatter plot of standardized residuals against fitted values
lm.step_new<-update(lm.step,log(.)~.) #refit the model with the log-transformed response
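##A quick sanity check on the update() idiom: log(.) on the left-hand side replaces the response y with log(y).
formula(lm.step_new) #should print log(y) ~ x1 + x2 + x4 + x5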
y.rst<-rstandard(lm.step_new) #standardized residuals of lm.step_new
print(y.rst) #print the standardized residuals y.rst
## 1 2 3 4 5 6
## 2.51787590 1.56615602 0.45133799 -0.28549680 -0.53753827 -0.82298091
## 7 8 9 10 11 12
## -1.49016805 -1.90146940 -1.35819005 -0.96837445 -0.61462117 -0.27027163
## 13 14 15 16 17 18
## -0.31083961 -0.02743086 0.20256063 0.21598119 1.01248118 1.39020702
## 19 20 21 22 23 24
## 0.88950979 1.27815003 0.56988711 -0.03388515 -0.04297635 -0.43131791
## 25 26 27 28
## 0.75781547 -0.41845494 -1.28720561 0.31465673
y.fit<-predict(lm.step_new) #fitted values of lm.step_new
plot(y.rst~ y.fit) #scatter plot of standardized residuals against fitted values
##After the log transformation the standardized residuals again lie within -3 to 3, so no outliers are indicated.
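##The commentary refers to studentized deleted residuals; those come from rstudent() rather than rstandard() and can be checked the same way (a minimal sketch):
which(abs(rstudent(lm.step_new)) > 3) #observations with |studentized deleted residual| greater than 3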
par(mfrow=c(2,2)) #2 x 2 grid of plotting panels
plot(lm.step_new) #regression diagnostic plots
influence.measures(lm.step_new) #influence diagnostics for each observation
## Influence measures of
## lm(formula = log(y) ~ x1 + x2 + x4 + x5, data = d2.5) :
##
## dfb.1_ dfb.x1 dfb.x2 dfb.x4 dfb.x5 dffit cov.r cook.d hat inf
## 1 1.15382 0.286464 0.41922 -1.05942 -0.47200 1.7243 0.338 4.50e-01 0.2621 *
## 2 0.43166 0.010515 0.19295 -0.38173 -0.11994 0.8120 0.889 1.23e-01 0.2007
## 3 0.07350 -0.021235 0.03598 -0.06118 -0.00288 0.1892 1.412 7.42e-03 0.1541
## 4 -0.02365 0.024050 -0.01058 0.01673 -0.01484 -0.1050 1.400 2.30e-03 0.1235
## 5 -0.00129 0.070870 -0.00907 -0.01074 -0.05444 -0.1841 1.314 7.00e-03 0.1080
## 6 -0.00723 0.058363 0.01813 -0.00627 -0.06241 -0.2415 1.170 1.18e-02 0.0804
## 7 0.14099 0.237992 0.04155 -0.16674 -0.22168 -0.4805 0.826 4.36e-02 0.0894
## 8 0.30990 0.410269 0.03243 -0.34326 -0.34676 -0.6703 0.589 7.92e-02 0.0987
## 9 0.23236 0.273760 0.01793 -0.25224 -0.22485 -0.4431 0.906 3.78e-02 0.0928
## 10 -0.04352 -0.084235 0.05584 0.04187 0.03713 -0.2447 1.079 1.20e-02 0.0602
## 11 -0.14727 -0.217264 0.07202 0.15347 0.13077 -0.2638 1.367 1.43e-02 0.1593
## 12 -0.09018 -0.137104 0.04975 0.09514 0.07819 -0.1546 1.648 4.98e-03 0.2543
## 13 -0.06317 -0.111748 0.04456 0.06765 0.06251 -0.1379 1.474 3.96e-03 0.1701
## 14 -0.00287 -0.007665 0.00511 0.00327 0.00286 -0.0111 1.462 2.57e-05 0.1461
## 15 -0.01138 0.020994 -0.03597 0.00941 0.00643 0.0672 1.380 9.44e-04 0.1032
## 16 -0.04387 -0.016402 -0.02617 0.04311 0.02909 0.0703 1.373 1.03e-03 0.0996
## 17 -0.26479 -0.124161 -0.14521 0.26117 0.19147 0.3740 1.130 2.79e-02 0.1200
## 18 -0.53212 -0.328033 -0.26385 0.52967 0.44139 0.6534 0.976 8.18e-02 0.1746
## 19 -0.36864 -0.287610 -0.03444 0.37206 0.24286 0.4171 1.281 3.51e-02 0.1816
## 20 0.04531 0.245424 -0.02369 -0.06273 -0.20886 0.4672 0.976 4.24e-02 0.1149
## 21 -0.06594 -0.036022 0.09503 0.06554 -0.05316 0.1926 1.300 7.65e-03 0.1054
## 22 0.00776 0.009361 -0.01258 -0.00813 0.00246 -0.0175 1.598 6.43e-05 0.2187
## 23 -0.00415 -0.000772 -0.02397 0.00391 0.01939 -0.0283 1.813 1.67e-04 0.3113 *
## 24 -0.11065 -0.078031 -0.20547 0.11049 0.21931 -0.2820 1.731 1.65e-02 0.3072 *
## 25 -0.14216 -0.162992 -0.00164 0.14625 0.14918 0.3127 1.292 1.99e-02 0.1479
## 26 0.03618 0.075839 -0.02872 -0.03965 -0.05413 -0.1997 1.486 8.27e-03 0.1911
## 27 -0.31795 -0.162936 -0.10055 0.31635 0.14073 -0.8176 1.196 1.30e-01 0.2813
## 28 0.04990 0.059612 -0.25822 -0.05455 0.18255 0.4145 3.430 3.58e-02 0.6437 *
##Observations 1, 23, 24, and 28 are flagged as influential points.
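##The flagged rows can also be extracted programmatically from the object returned by influence.measures() (a minimal sketch using lm.step_new):
infl<-influence.measures(lm.step_new) #list with components infmat and is.inf
which(apply(infl$is.inf, 1, any)) #rows flagged by at least one influence measure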
lm.exam<-lm(log(y)~x1+x2+x3+x4+x5+x6, data= d2.5[-c(1,23,24,28),]) #drop observations 1, 23, 24, and 28 and refit the full regression on log(y)
lm.step<-step(lm.exam, direction="both") #stepwise regression (both directions) on the reduced data
## Start: AIC=-119.8
## log(y) ~ x1 + x2 + x3 + x4 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x6 1 0.004268 0.095264 -120.700
## - x1 1 0.005701 0.096698 -120.341
## <none> 0.090997 -119.800
## - x5 1 0.059096 0.150093 -109.789
## - x2 1 0.061673 0.152669 -109.381
## - x4 1 0.126097 0.217094 -100.932
## - x3 1 0.153170 0.244166 -98.111
##
## Step: AIC=-120.7
## log(y) ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x1 1 0.003048 0.09831 -121.944
## <none> 0.09526 -120.700
## + x6 1 0.004268 0.09100 -119.800
## - x2 1 0.060287 0.15555 -110.932
## - x5 1 0.060886 0.15615 -110.840
## - x3 1 0.154947 0.25021 -99.524
## - x4 1 0.222711 0.31798 -93.772
##
## Step: AIC=-121.94
## log(y) ~ x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 0.09831 -121.944
## + x1 1 0.00305 0.09526 -120.700
## + x6 1 0.00162 0.09670 -120.341
## - x5 1 0.05785 0.15617 -112.837
## - x2 1 0.06698 0.16529 -111.474
## - x3 1 0.17944 0.27775 -99.018
## - x4 1 0.89202 0.99033 -68.506
summary(lm.step) #summary of the selected regression model
##
## Call:
## lm(formula = log(y) ~ x2 + x3 + x4 + x5, data = d2.5[-c(1, 23,
## 24, 28), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.127705 -0.034211 0.002612 0.031899 0.151760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.631e-01 5.646e-01 0.466 0.64652
## x2 2.202e-05 6.122e-06 3.598 0.00192 **
## x3 -2.615e-04 4.441e-05 -5.889 1.14e-05 ***
## x4 6.620e-05 5.042e-06 13.130 5.59e-11 ***
## x5 3.608e-05 1.079e-05 3.344 0.00341 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07193 on 19 degrees of freedom
## Multiple R-squared: 0.997, Adjusted R-squared: 0.9964
## F-statistic: 1592 on 4 and 19 DF, p-value: < 2.2e-16
y.rst<-rstandard(lm.step) #standardized residuals of lm.step
y.fit<-predict(lm.step) #fitted values of lm.step
plot(y.rst~ y.fit) #scatter plot of standardized residuals against fitted values
##The selected equation is now log(y) = 0.26 + 0.000022 x2 - 0.00026 x3 + 0.000066 x4 + 0.000036 x5, and the standardized residuals fall almost entirely within [-2, 2].
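##The [-2, 2] claim can be checked numerically, and reference lines make it visible in the plot just drawn (a minimal sketch):
range(y.rst) #extremes of the standardized residuals
abline(h=c(-2,0,2), lty=2) #add reference lines to the residual plot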
preds<-data.frame(x2= 100000, x3=1500, x4= 110000, x5= 90000) #values of the explanatory variables x2, x3, x4, x5
predict(lm.step,newdata=preds,interval="prediction",
level=0.95) #point prediction and 95% prediction interval
## fit lwr upr
## 1 12.60188 11.21075 13.993
##The point prediction of log(y) is 12.60 with a 95% prediction interval of [11.21, 13.99].
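##Because the model is fitted on log(y), these values are on the log scale; exponentiating gives a prediction on the original scale of y (a minimal sketch):
exp(predict(lm.step,newdata=preds,interval="prediction",level=0.95)) #back-transform the point prediction and interval to the scale of y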