setwd("D:\\Rdownload\\lianxi\\second") #设定工作路径
d2.3<-read.csv("ex2.3.csv", header = T) #将ex2.3.csv数据读入到d2.3中
lm.exam<-lm(y~x1+x2+x3+x4+x5,data=d2.3) #建立y关于x1,x2,x3,x4和x5的线性回归方程,数据为d2.3
summary(lm.exam) #给出回归系数的估计和显著性检验等
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = d2.3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -252.31 -48.18 -12.79 52.81 193.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.012e+02 1.431e+02 -2.803 0.01870 *
## x1 1.444e-02 2.402e-02 0.601 0.56109
## x2 -2.087e-02 8.657e-02 -0.241 0.81440
## x3 5.810e-05 1.464e-03 0.040 0.96912
## x4 3.044e+01 8.298e+00 3.668 0.00433 **
## x5 2.004e-01 1.115e-01 1.798 0.10235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129.6 on 10 degrees of freedom
## Multiple R-squared: 0.9879, Adjusted R-squared: 0.9818
## F-statistic: 162.8 on 5 and 10 DF, p-value: 3.043e-09
##The F statistic of the regression is 162.8 with a p-value of 3.043e-09, so the overall regression is significant; the individual t-tests, however, show that only the intercept and x4 are significant, while x1, x2, x3 and x5 are not.
lm.step<-step(lm.exam,direction="both") # stepwise regression in both directions
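##A significant overall F-test combined with mostly insignificant t-tests often indicates multicollinearity among the predictors. As a rough supplementary check (not part of the original analysis, and assuming the car package is available), the variance inflation factors and the condition number of the predictor correlation matrix can be inspected:
library(car) # assumed to be installed; provides vif()
vif(lm.exam) # VIF values well above 10 suggest serious multicollinearity
kappa(cor(d2.3[,c("x1","x2","x3","x4","x5")]), exact=TRUE) # condition number of the predictor correlation matrix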
## Start: AIC=160.15
## y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x3 1 26 168042 158.15
## - x2 1 976 168992 158.24
## - x1 1 6073 174088 158.72
## <none> 168015 160.15
## - x5 1 54329 222344 162.63
## - x4 1 226106 394122 171.79
##
## Step: AIC=158.15
## y ~ x1 + x2 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x2 1 957 168999 156.24
## - x1 1 6050 174092 156.72
## <none> 168042 158.15
## + x3 1 26 168015 160.15
## - x5 1 55493 223535 160.72
## - x4 1 228011 396052 169.87
##
## Step: AIC=156.24
## y ~ x1 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x1 1 5903 174902 154.79
## <none> 168999 156.24
## + x2 1 957 168042 158.15
## + x3 1 8 168992 158.24
## - x5 1 155567 324566 164.68
## - x4 1 501855 670854 176.30
##
## Step: AIC=154.79
## y ~ x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 174902 154.79
## + x1 1 5903 168999 156.24
## + x2 1 811 174092 156.72
## + x3 1 1 174901 156.79
## - x5 1 180176 355078 164.12
## - x4 1 1843682 2018584 191.93
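##In the trace above, x3, x2 and x1 are dropped in turn because each removal lowers the AIC. As an added comparison (not in the original code), the full and selected models can also be compared directly; note that AIC() differs from the extractAIC() values reported by step() by an additive constant:
AIC(lm.exam,lm.step) # compare the full model and the stepwise-selected model by AIC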
summary(lm.step) # summary of the selected model
##
## Call:
## lm(formula = y ~ x4 + x5, data = d2.3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -249.75 -42.72 -13.87 54.34 205.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -410.07401 57.16828 -7.173 7.23e-06 ***
## x4 31.47130 2.68842 11.706 2.81e-08 ***
## x5 0.18680 0.05105 3.660 0.00288 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 116 on 13 degrees of freedom
## Multiple R-squared: 0.9874, Adjusted R-squared: 0.9854
## F-statistic: 508 on 2 and 13 DF, p-value: 4.572e-13
##So the equation after variable selection is y = -410.07401 + 31.47130 x4 + 0.18680 x5.
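##To supplement the point estimates, confidence intervals for the retained coefficients can be obtained directly (an optional addition, not shown in the original output):
confint(lm.step, level=0.95) # 95% confidence intervals for the coefficients of the selected model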
y.res<-residuals(lm.step) # ordinary residuals of the model lm.step
y.rst<-rstandard(lm.step) # standardized residuals of the model lm.step
print(y.rst) # print the standardized residuals y.rst of lm.step
## 1 2 3 4 5 6
## 1.34714761 1.18901283 0.29852998 -0.19086128 -0.23331509 -0.88348308
## 7 8 9 10 11 12
## -0.85374520 -0.46733123 -0.37082828 -0.06628483 0.91759182 -2.23768758
## 13 14 15 16
## -0.34282279 1.94017167 0.49964531 -0.07834811
##Since the standardized residuals all lie within [-3, 3], no observation is flagged as an outlier.
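##As a complementary check (an addition to the original analysis), the studentized deleted residuals from rstudent() can be screened in the same way; y.rstu is a name introduced here for illustration:
y.rstu<-rstudent(lm.step) # studentized deleted residuals of lm.step
which(abs(y.rstu)>3) # indices of observations flagged as potential outliers, if any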
y.fit<-predict(lm.step) # fitted values of the model lm.step
plot(y.res~ y.fit) # scatter plot of the ordinary residuals against the fitted values
plot(y.rst~ y.fit) # scatter plot of the standardized residuals against the fitted values
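##Reference lines make the standardized residual plot easier to read; the lines at 0 and ±2 below are an added convention, not part of the original code:
abline(h=0, lty=1) # zero line for the residuals
abline(h=c(-2,2), lty=2) # dashed bands at ±2 standardized residuals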
lm.step_new<-update(lm.step,log(.)~.) # refit the model with a log-transformed response
y.rst<-rstandard(lm.step_new) # standardized residuals of lm.step_new
print(y.rst) # print the standardized residuals y.rst of lm.step_new
## 1 2 3 4 5 6
## -1.40262851 -0.31793577 -0.01338569 0.21238633 0.98803532 -0.94042409
## 7 8 9 10 11 12
## 0.29102351 0.32690756 0.16223542 0.16788788 -1.56157864 0.67577508
## 13 14 15 16
## 1.28633606 0.46626492 1.75873749 -2.83694083
y.fit<-predict(lm.step_new) # fitted values of lm.step_new
plot(y.rst~ y.fit) # scatter plot of the standardized residuals against the fitted values
##After the log transformation, observation 16 stands out as an outlier.
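##The flagged observation can also be located programmatically (an added check, not in the original code):
which(abs(y.rst)>2) # observation 16 is the only one whose standardized residual exceeds 2 in absolute value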
lm.exam<-lm(log(y)~x1+x2+x3+x4+x5, data= d2.3[-c(16),]) # drop observation 16 and refit the full model for log(y)
lm.step<-step(lm.exam, direction="both") # stepwise regression in both directions
## Start: AIC=-69.76
## log(y) ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x2 1 0.00020 0.06459 -71.717
## - x3 1 0.00173 0.06612 -71.366
## - x1 1 0.00256 0.06695 -71.178
## <none> 0.06439 -69.763
## - x4 1 0.03715 0.10153 -64.931
## - x5 1 0.36463 0.42901 -43.315
##
## Step: AIC=-71.72
## log(y) ~ x1 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x3 1 0.00165 0.06623 -73.339
## - x1 1 0.00308 0.06767 -73.018
## <none> 0.06459 -71.717
## + x2 1 0.00020 0.06439 -69.763
## - x4 1 0.09443 0.15901 -60.202
## - x5 1 1.14543 1.21002 -29.761
##
## Step: AIC=-73.34
## log(y) ~ x1 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x1 1 0.00286 0.06910 -74.704
## <none> 0.06623 -73.339
## + x3 1 0.00165 0.06459 -71.717
## + x2 1 0.00012 0.06612 -71.366
## - x4 1 0.09672 0.16295 -61.835
## - x5 1 1.25571 1.32195 -30.434
##
## Step: AIC=-74.7
## log(y) ~ x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 0.06910 -74.704
## + x1 1 0.00286 0.06623 -73.339
## + x3 1 0.00143 0.06767 -73.018
## + x2 1 0.00077 0.06833 -72.872
## - x4 1 0.18166 0.25076 -57.370
## - x5 1 1.29071 1.35980 -32.011
summary(lm.step) # summary of the selected model
##
## Call:
## lm(formula = log(y) ~ x4 + x5, data = d2.3[-c(16), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12746 -0.02987 0.01767 0.03739 0.11059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.299e+00 4.256e-02 124.511 < 2e-16 ***
## x4 1.190e-02 2.119e-03 5.617 0.000113 ***
## x5 5.198e-04 3.472e-05 14.972 3.97e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07588 on 12 degrees of freedom
## Multiple R-squared: 0.9921, Adjusted R-squared: 0.9908
## F-statistic: 751.1 on 2 and 12 DF, p-value: 2.478e-13
y.rst<-rstandard(lm.step) # standardized residuals of the model lm.step
y.fit<-predict(lm.step) # fitted values of the model lm.step
plot(y.rst~ y.fit) # scatter plot of the standardized residuals against the fitted values
##The equation after variable selection is now log(y) = 5.299 + 0.0119 x4 + 0.00052 x5, and nearly all standardized residuals fall within [-2, 2].
par(mfrow=c(2,2)) # arrange four plots in a 2-by-2 grid
plot(lm.step_new) # regression diagnostic plots
influence.measures(lm.step_new) # influence diagnostics for each observation
## Influence measures of
## lm(formula = log(y) ~ x4 + x5, data = d2.3) :
##
## dfb.1_ dfb.x4 dfb.x5 dffit cov.r cook.d hat inf
## 1 -0.63272 -0.22122 0.47744 -0.75975 0.987 1.77e-01 0.2124
## 2 -0.12171 -0.02249 0.06946 -0.13646 1.488 6.67e-03 0.1653
## 3 -0.00461 -0.00127 0.00297 -0.00541 1.497 1.06e-05 0.1505
## 4 0.06627 0.01389 -0.03693 0.07688 1.436 2.13e-03 0.1239
## 5 0.30940 0.09625 -0.20258 0.37587 1.152 4.72e-02 0.1266
## 6 -0.28320 -0.02228 0.11485 -0.31998 1.149 3.45e-02 0.1047
## 7 0.07207 -0.01065 -0.00912 0.08372 1.358 2.51e-03 0.0818
## 8 0.06848 -0.06323 0.05096 0.10491 1.378 3.94e-03 0.0996
## 9 0.02460 -0.04629 0.04661 0.06379 1.475 1.47e-03 0.1432
## 10 0.01405 -0.05005 0.05729 0.07239 1.517 1.89e-03 0.1673
## 11 -0.13082 1.06851 -1.19911 -1.31112 1.105 5.04e-01 0.3829 *
## 12 0.02647 0.00154 0.03355 0.18701 1.234 1.22e-02 0.0741
## 13 -0.00239 -0.04409 0.14969 0.42146 0.931 5.60e-02 0.0921
## 14 -0.03490 -0.06894 0.12885 0.20221 1.451 1.45e-02 0.1669
## 15 -0.63582 1.29132 -0.79868 1.60711 0.951 7.11e-01 0.4081 *
## 16 2.39755 -2.96628 1.25665 -4.42069 0.141 2.69e+00 0.5005 *
##Observations 11, 15 and 16 are flagged as influential points.
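##The flagged rows can also be extracted programmatically from the influence.measures() object (an added convenience; infl is a name introduced here for illustration):
infl<-influence.measures(lm.step_new) # full set of influence diagnostics
which(apply(infl$is.inf, 1, any)) # indices of observations flagged by at least one diagnostic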
lm.exam<-lm(log(y)~x1+x2+x3+x4+x5, data= d2.3[-c(11,15,16),]) # drop observations 11, 15 and 16 and refit the full model for log(y)
lm.step<-step(lm.exam, direction="both") # stepwise regression in both directions
## Start: AIC=-63.64
## log(y) ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x3 1 0.000222 0.038846 -65.570
## - x1 1 0.000692 0.039317 -65.414
## - x2 1 0.004341 0.042965 -64.260
## <none> 0.038624 -63.645
## - x4 1 0.015324 0.053949 -61.301
## - x5 1 0.202208 0.240833 -41.852
##
## Step: AIC=-65.57
## log(y) ~ x1 + x2 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x1 1 0.000612 0.039458 -67.367
## - x2 1 0.004602 0.043448 -66.115
## <none> 0.038846 -65.570
## + x3 1 0.000222 0.038624 -63.645
## - x4 1 0.015627 0.054473 -63.175
## - x5 1 0.202244 0.241090 -43.838
##
## Step: AIC=-67.37
## log(y) ~ x2 + x4 + x5
##
## Df Sum of Sq RSS AIC
## - x2 1 0.00577 0.04523 -67.594
## <none> 0.03946 -67.367
## + x1 1 0.00061 0.03885 -65.570
## + x3 1 0.00014 0.03932 -65.414
## - x4 1 0.01510 0.05455 -65.155
## - x5 1 0.32849 0.36794 -40.342
##
## Step: AIC=-67.59
## log(y) ~ x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 0.04523 -67.594
## + x2 1 0.00577 0.03946 -67.367
## + x1 1 0.00178 0.04345 -66.115
## + x3 1 0.00032 0.04491 -65.686
## - x4 1 0.02706 0.07228 -63.497
## - x5 1 0.34370 0.38893 -41.621
summary(lm.step) # summary of the selected model
##
## Call:
## lm(formula = log(y) ~ x4 + x5, data = d2.3[-c(11, 15, 16), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.111097 -0.019653 0.004984 0.035724 0.114282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.281e+00 6.416e-02 82.305 1.71e-15 ***
## x4 1.174e-02 4.800e-03 2.446 0.0345 *
## x5 5.428e-04 6.226e-05 8.718 5.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06725 on 10 degrees of freedom
## Multiple R-squared: 0.9928, Adjusted R-squared: 0.9913
## F-statistic: 685.2 on 2 and 10 DF, p-value: 1.995e-11
y.rst<-rstandard(lm.step) # standardized residuals of the model lm.step
y.fit<-predict(lm.step) # fitted values of the model lm.step
plot(y.rst~ y.fit) # scatter plot of the standardized residuals against the fitted values
##The equation after variable selection is now log(y) = 5.281 + 0.01174 x4 + 0.000543 x5, and nearly all standardized residuals fall within [-2, 2].
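##A formal normality check on the residuals can supplement the residual plot (an added check, not part of the original analysis):
shapiro.test(residuals(lm.step)) # Shapiro-Wilk test; a large p-value is consistent with normally distributed errors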
preds<-data.frame(x4=60,x5=3300) # specify values of the predictors x4 and x5
predict(lm.step,newdata=preds,interval="prediction",
level=0.95) # point prediction and 95% prediction interval
## fit lwr upr
## 1 7.775986 7.584898 7.967075
##Because the response of the final model is log(y), the point prediction of log(y) is 7.78 with a 95% prediction interval of [7.58, 7.97].
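##To report the prediction on the original scale of y, the log-scale results can be exponentiated (an added step; since exp() is monotone the interval back-transforms directly, and strictly speaking this targets the median of y rather than its mean):
exp(predict(lm.step,newdata=preds,interval="prediction",
level=0.95)) # roughly a point prediction of 2383 for y, with prediction interval about [1968, 2884]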