回归分析：全变量回归

setwd("D:\\Rdownload\\lianxi\\second")  #设定工作路径
d2.3<-read.csv("ex2.3.csv", header = T)  #将ex2.3.csv数据读入到d2.3中
lm.exam<-lm(y~x1+x2+x3+x4+x5,data=d2.3)  #建立y关于x1,x2,x3,x4和x5的线性回归方程,数据为d2.3
summary(lm.exam)  #给出回归系数的估计和显著性检验等

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = d2.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -252.31  -48.18  -12.79   52.81  193.02 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -4.012e+02  1.431e+02  -2.803  0.01870 * 
## x1           1.444e-02  2.402e-02   0.601  0.56109   
## x2          -2.087e-02  8.657e-02  -0.241  0.81440   
## x3           5.810e-05  1.464e-03   0.040  0.96912   
## x4           3.044e+01  8.298e+00   3.668  0.00433 **
## x5           2.004e-01  1.115e-01   1.798  0.10235   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.6 on 10 degrees of freedom
## Multiple R-squared:  0.9879, Adjusted R-squared:  0.9818 
## F-statistic: 162.8 on 5 and 10 DF,  p-value: 3.043e-09

##回归方程的F值为162.8，相应的p值为3.043e-09，说明回归方程是显著的；但t检验对应的p值则显示：常数项和x4是显著的，而x1，x2，x3和x5是不显著的。

回归分析：逐步回归

lm.step<-step(lm.exam,direction="both")  #进行逐步回归

## Start:  AIC=160.15
## y ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1        26 168042 158.15
## - x2    1       976 168992 158.24
## - x1    1      6073 174088 158.72
## <none>              168015 160.15
## - x5    1     54329 222344 162.63
## - x4    1    226106 394122 171.79
## 
## Step:  AIC=158.15
## y ~ x1 + x2 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x2    1       957 168999 156.24
## - x1    1      6050 174092 156.72
## <none>              168042 158.15
## + x3    1        26 168015 160.15
## - x5    1     55493 223535 160.72
## - x4    1    228011 396052 169.87
## 
## Step:  AIC=156.24
## y ~ x1 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x1    1      5903 174902 154.79
## <none>              168999 156.24
## + x2    1       957 168042 158.15
## + x3    1         8 168992 158.24
## - x5    1    155567 324566 164.68
## - x4    1    501855 670854 176.30
## 
## Step:  AIC=154.79
## y ~ x4 + x5
## 
##        Df Sum of Sq     RSS    AIC
## <none>               174902 154.79
## + x1    1      5903  168999 156.24
## + x2    1       811  174092 156.72
## + x3    1         1  174901 156.79
## - x5    1    180176  355078 164.12
## - x4    1   1843682 2018584 191.93

summary(lm.step)  #回归模型汇总信息

## 
## Call:
## lm(formula = y ~ x4 + x5, data = d2.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -249.75  -42.72  -13.87   54.34  205.41 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -410.07401   57.16828  -7.173 7.23e-06 ***
## x4            31.47130    2.68842  11.706 2.81e-08 ***
## x5             0.18680    0.05105   3.660  0.00288 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 116 on 13 degrees of freedom
## Multiple R-squared:  0.9874, Adjusted R-squared:  0.9854 
## F-statistic:   508 on 2 and 13 DF,  p-value: 4.572e-13

##所以，变量选择后的方程为 y= -410.07401 + 31.47130 x4 + 0.18680 x5

回归诊断

标准化残差表

y.res<-residuals (lm.exam)  #计算模型lm.exam的普通残差
y.rst<-rstandard(lm.step)  #计算回归模型lm.step的标准化残差
print(y.rst)  #输出回归模型lm.step的标准化残差y.rst

##           1           2           3           4           5           6 
##  1.34714761  1.18901283  0.29852998 -0.19086128 -0.23331509 -0.88348308 
##           7           8           9          10          11          12 
## -0.85374520 -0.46733123 -0.37082828 -0.06628483  0.91759182 -2.23768758 
##          13          14          15          16 
## -0.34282279  1.94017167  0.49964531 -0.07834811

##因为学生化删除残差的值都在-3至3的范围内，所以没有异常点。

普通残差图和标准化残差图

y.fit<-predict(lm.step)  #计算回归模型lm.step的预测值
plot(y.res~ y.fit)  #绘制以普通残差为纵坐标，预测值为横坐标的散点图

plot(y.rst~ y.fit)  #绘制以标准化残差为纵坐标，预测值为横坐标的散点图

对数变换

lm.step_new<-update(lm.step,log(.)~.)  #对模型进行对数变换
y.rst<-rstandard(lm.step_new)  #计算lm.step_new的标准化残差
print(y.rst)  #输出回归模型lm.step_new的标准化残差y.rst

##           1           2           3           4           5           6 
## -1.40262851 -0.31793577 -0.01338569  0.21238633  0.98803532 -0.94042409 
##           7           8           9          10          11          12 
##  0.29102351  0.32690756  0.16223542  0.16788788 -1.56157864  0.67577508 
##          13          14          15          16 
##  1.28633606  0.46626492  1.75873749 -2.83694083

y.fit<-predict(lm.step_new)  #计算lm.step_new的预测值
plot(y.rst~ y.fit)  #绘制以标准化残差为纵坐标,预测值为横坐标的散点图

##对数变换后，第16号点为异常点。

去掉第16号观测值再回归

lm.exam<-lm(log(y)~x1+x2+x3+x4+x5, data= d2.3[-c(16),])  #去掉第16号观测值再建立全变量回归方程
lm.step<-step(lm.exam, direction="both") #用一切子集回归法来进行逐步回归

## Start:  AIC=-69.76
## log(y) ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## - x2    1   0.00020 0.06459 -71.717
## - x3    1   0.00173 0.06612 -71.366
## - x1    1   0.00256 0.06695 -71.178
## <none>              0.06439 -69.763
## - x4    1   0.03715 0.10153 -64.931
## - x5    1   0.36463 0.42901 -43.315
## 
## Step:  AIC=-71.72
## log(y) ~ x1 + x3 + x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## - x3    1   0.00165 0.06623 -73.339
## - x1    1   0.00308 0.06767 -73.018
## <none>              0.06459 -71.717
## + x2    1   0.00020 0.06439 -69.763
## - x4    1   0.09443 0.15901 -60.202
## - x5    1   1.14543 1.21002 -29.761
## 
## Step:  AIC=-73.34
## log(y) ~ x1 + x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## - x1    1   0.00286 0.06910 -74.704
## <none>              0.06623 -73.339
## + x3    1   0.00165 0.06459 -71.717
## + x2    1   0.00012 0.06612 -71.366
## - x4    1   0.09672 0.16295 -61.835
## - x5    1   1.25571 1.32195 -30.434
## 
## Step:  AIC=-74.7
## log(y) ~ x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## <none>              0.06910 -74.704
## + x1    1   0.00286 0.06623 -73.339
## + x3    1   0.00143 0.06767 -73.018
## + x2    1   0.00077 0.06833 -72.872
## - x4    1   0.18166 0.25076 -57.370
## - x5    1   1.29071 1.35980 -32.011

summary(lm.step) #回归模型汇总信息

## 
## Call:
## lm(formula = log(y) ~ x4 + x5, data = d2.3[-c(16), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12746 -0.02987  0.01767  0.03739  0.11059 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.299e+00  4.256e-02 124.511  < 2e-16 ***
## x4          1.190e-02  2.119e-03   5.617 0.000113 ***
## x5          5.198e-04  3.472e-05  14.972 3.97e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07588 on 12 degrees of freedom
## Multiple R-squared:  0.9921, Adjusted R-squared:  0.9908 
## F-statistic: 751.1 on 2 and 12 DF,  p-value: 2.478e-13

y.rst<-rstandard(lm.step)  #计算回归模型lm.step的标准化残差
y.fit<-predict(lm.step)  #计算回归模型lm.step的预测值
plot(y.rst~ y.fit)  #绘制以标准化残差为纵坐标，预测值为横坐标的散点图

##此时，变量选择后的方程为 y= 5.299 + 0.0119 x4 + 0.00052 x5，残差几乎全部落在[-2，2]区域内

回归诊断分析

par(mfrow=c(2,2))  #在一个2×2网格中创建4个绘图区
plot(lm.step_new) #绘制模型诊断图

influence.measures(lm.step_new) #计算各个观测值的诊断统计量

## Influence measures of
##   lm(formula = log(y) ~ x4 + x5, data = d2.3) :
## 
##      dfb.1_   dfb.x4   dfb.x5    dffit cov.r   cook.d    hat inf
## 1  -0.63272 -0.22122  0.47744 -0.75975 0.987 1.77e-01 0.2124    
## 2  -0.12171 -0.02249  0.06946 -0.13646 1.488 6.67e-03 0.1653    
## 3  -0.00461 -0.00127  0.00297 -0.00541 1.497 1.06e-05 0.1505    
## 4   0.06627  0.01389 -0.03693  0.07688 1.436 2.13e-03 0.1239    
## 5   0.30940  0.09625 -0.20258  0.37587 1.152 4.72e-02 0.1266    
## 6  -0.28320 -0.02228  0.11485 -0.31998 1.149 3.45e-02 0.1047    
## 7   0.07207 -0.01065 -0.00912  0.08372 1.358 2.51e-03 0.0818    
## 8   0.06848 -0.06323  0.05096  0.10491 1.378 3.94e-03 0.0996    
## 9   0.02460 -0.04629  0.04661  0.06379 1.475 1.47e-03 0.1432    
## 10  0.01405 -0.05005  0.05729  0.07239 1.517 1.89e-03 0.1673    
## 11 -0.13082  1.06851 -1.19911 -1.31112 1.105 5.04e-01 0.3829   *
## 12  0.02647  0.00154  0.03355  0.18701 1.234 1.22e-02 0.0741    
## 13 -0.00239 -0.04409  0.14969  0.42146 0.931 5.60e-02 0.0921    
## 14 -0.03490 -0.06894  0.12885  0.20221 1.451 1.45e-02 0.1669    
## 15 -0.63582  1.29132 -0.79868  1.60711 0.951 7.11e-01 0.4081   *
## 16  2.39755 -2.96628  1.25665 -4.42069 0.141 2.69e+00 0.5005   *

##第11号、15号、16号观测值被诊断为强影响点

去掉11、15、16号观测值再回归

lm.exam<-lm(log(y)~x1+x2+x3+x4+x5, data= d2.3[-c(11,15,16),])  #去掉第11、15、16号观测值再建立全变量回归方程
lm.step<-step(lm.exam, direction="both") #用一切子集回归法来进行逐步回归

## Start:  AIC=-63.64
## log(y) ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq      RSS     AIC
## - x3    1  0.000222 0.038846 -65.570
## - x1    1  0.000692 0.039317 -65.414
## - x2    1  0.004341 0.042965 -64.260
## <none>              0.038624 -63.645
## - x4    1  0.015324 0.053949 -61.301
## - x5    1  0.202208 0.240833 -41.852
## 
## Step:  AIC=-65.57
## log(y) ~ x1 + x2 + x4 + x5
## 
##        Df Sum of Sq      RSS     AIC
## - x1    1  0.000612 0.039458 -67.367
## - x2    1  0.004602 0.043448 -66.115
## <none>              0.038846 -65.570
## + x3    1  0.000222 0.038624 -63.645
## - x4    1  0.015627 0.054473 -63.175
## - x5    1  0.202244 0.241090 -43.838
## 
## Step:  AIC=-67.37
## log(y) ~ x2 + x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## - x2    1   0.00577 0.04523 -67.594
## <none>              0.03946 -67.367
## + x1    1   0.00061 0.03885 -65.570
## + x3    1   0.00014 0.03932 -65.414
## - x4    1   0.01510 0.05455 -65.155
## - x5    1   0.32849 0.36794 -40.342
## 
## Step:  AIC=-67.59
## log(y) ~ x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## <none>              0.04523 -67.594
## + x2    1   0.00577 0.03946 -67.367
## + x1    1   0.00178 0.04345 -66.115
## + x3    1   0.00032 0.04491 -65.686
## - x4    1   0.02706 0.07228 -63.497
## - x5    1   0.34370 0.38893 -41.621

summary(lm.step)  #回归模型汇总信息

## 
## Call:
## lm(formula = log(y) ~ x4 + x5, data = d2.3[-c(11, 15, 16), ])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.111097 -0.019653  0.004984  0.035724  0.114282 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.281e+00  6.416e-02  82.305 1.71e-15 ***
## x4          1.174e-02  4.800e-03   2.446   0.0345 *  
## x5          5.428e-04  6.226e-05   8.718 5.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06725 on 10 degrees of freedom
## Multiple R-squared:  0.9928, Adjusted R-squared:  0.9913 
## F-statistic: 685.2 on 2 and 10 DF,  p-value: 1.995e-11

y.rst<-rstandard(lm.step)  #计算回归模型lm.step的标准化残差
y.fit<-predict(lm.step)  #计算回归模型lm.step的预测值
plot(y.rst~ y.fit)  #绘制以标准化残差为纵坐标，预测值为横坐标的散点图

##此时，变量选择后的方程为 y= 5.281 + 0.01174 x4 + 0.0005 x5，残差几乎全部落在[-2，2]区域内

回归预测（点预测和区间预测）

preds<-data.frame(x4=60,x5=3300)  #给定解释变量x4和x5的值
predict(lm.step,newdata=preds,interval="prediction", 
level=0.95)  #进行点预测和区间预测

##        fit      lwr      upr
## 1 7.775986 7.584898 7.967075

##计算结果y的点预测为7.78,预测区间为[7.58, 7.97].