Regression analysis: full-model regression

setwd("D:\\Rdownload\\lianxi\\second") # set the working directory
d2.4<-read.csv("ex2.4.csv", header = T) # read ex2.4.csv into the data frame d2.4
lm.exam<-lm(y~x1+x2+x3+x4,data=d2.4) # fit the linear regression of y on x1, x2, x3, x4 using the data in d2.4
summary(lm.exam) # coefficient estimates, significance tests, and other summary output
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d2.4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -709.52 -271.18  -35.46  212.92 1026.99 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.593    155.914  -0.010   0.9919    
## x1           190.588     33.514   5.687 9.08e-07 ***
## x2             1.588      2.674   0.594   0.5555    
## x3             1.013      1.315   0.770   0.4453    
## x4           304.716    139.869   2.179   0.0346 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 422.4 on 45 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7485, Adjusted R-squared:  0.7261 
## F-statistic: 33.48 on 4 and 45 DF,  p-value: 5.8e-13

## The F statistic of the regression is 33.48 with a p-value of 5.8e-13, so the regression as a whole is significant; the t-test p-values, however, show that x1 and x4 are significant while x2 and x3 are not.
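
## As a quick cross-check, the t-test p-values can also be extracted directly from the summary object; the line below is a minimal base-R sketch (its output is not reproduced here).

coef(summary(lm.exam))[, "Pr(>|t|)"]  # extract the p-value column of the coefficient table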

Regression analysis: stepwise regression

lm.step<-step(lm.exam,direction="both") # stepwise regression in both directions (AIC-based)
## Start:  AIC=609.33
## y ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq      RSS    AIC
## - x2    1     62949  8092032 607.72
## - x3    1    105799  8134882 607.98
## <none>               8029083 609.33
## - x4    1    846846  8875929 612.34
## - x1    1   5770382 13799465 634.41
## 
## Step:  AIC=607.72
## y ~ x1 + x3 + x4
## 
##        Df Sum of Sq      RSS    AIC
## - x3    1    141084  8233116 606.58
## <none>               8092032 607.72
## + x2    1     62949  8029083 609.33
## - x4    1    867198  8959230 610.81
## - x1    1  14810227 22902258 657.74
## 
## Step:  AIC=606.58
## y ~ x1 + x4
## 
##        Df Sum of Sq      RSS    AIC
## <none>               8233116 606.58
## + x3    1    141084  8092032 607.72
## + x2    1     98234  8134882 607.98
## - x4    1    862282  9095398 609.56
## - x1    1  15298607 23531723 657.09
summary(lm.step) # summary of the selected regression model
## 
## Call:
## lm(formula = y ~ x1 + x4, data = d2.4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -815.95 -279.23  -59.84  245.14  993.05 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -45.58     130.66  -0.349   0.7288    
## x1            207.47      22.20   9.345 2.72e-12 ***
## x4            307.22     138.47   2.219   0.0314 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 418.5 on 47 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7421, Adjusted R-squared:  0.7311 
## F-statistic: 67.62 on 2 and 47 DF,  p-value: 1.474e-14

## The equation after variable selection is therefore y = -45.58 + 207.47 x1 + 307.22 x4.
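
## The selected model's coefficients and formula can also be read off programmatically; the snippet below is a minimal sketch that rounds the estimates to two decimals (output not shown).

round(coef(lm.step), 2)  # coefficients of the selected model, rounded to 2 decimals
formula(lm.step)  # formula of the selected model: y ~ x1 + x4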

Regression diagnostics

Table of standardized residuals

y.res<-residuals(lm.step)  # ordinary residuals of lm.step (matching the fitted values plotted below)
y.rst<-rstandard(lm.step)  # standardized residuals of lm.step
print(y.rst)  # print the standardized residuals y.rst of lm.step
##            1            2            3            4            5            6 
## -0.446568862 -0.945218013  0.062004702 -0.816593881 -0.281164377  0.997634957 
##            7            8            9           10           11           12 
## -0.171591006 -0.016449598  0.743372084  0.189530905  2.410737483  0.446008336 
##           13           14           15           16           17           18 
## -0.123934084  1.909831850 -0.852775518 -1.980823416 -0.705232906 -0.295270768 
##           19           20           21           22           23           24 
##  0.725379919 -0.975727136 -1.672437645 -0.268752285 -0.398016503 -0.394519084 
##           25           26           27           28           29           30 
## -0.611646884 -0.626212592  0.152394172 -1.406352767  0.745391559  2.312642857 
##           31           32           33           34           35           36 
## -0.043006286  1.686007110 -0.229683682 -0.977540717 -0.004239526 -0.295270768 
##           37           38           39           40           41           42 
## -1.413701844 -0.799868777 -0.040006782  2.394244916  1.308598927  0.753432011 
##           43           44           45           46           47           48 
## -0.093586794  0.491488133  1.085409594 -0.908800697  0.629820894 -0.059490423 
##           49           50 
## -0.345530197 -0.815410137

Ordinary residual plot and standardized residual plot

y.fit<-predict(lm.step)  # fitted values of lm.step
plot(y.res~ y.fit)  # scatter plot of ordinary residuals against fitted values

plot(y.rst~ y.fit)  # scatter plot of standardized residuals against fitted values

## All standardized residuals lie within the range -3 to 3, so no observation is flagged as an outlier.
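
## The same check can be done programmatically; the line below is a minimal sketch that flags any observation whose standardized residual exceeds 3 in absolute value (here it should return an empty result).

which(abs(y.rst) > 3)  # indices of observations with |standardized residual| > 3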

Log transformation

lm.step_new<-update(lm.step,log(.)~.)  # refit the selected model with the response log-transformed
y.rst<-rstandard(lm.step_new)  # standardized residuals of lm.step_new
print(y.rst) # print the standardized residuals y.rst of lm.step_new
##           1           2           3           4           5           6 
## -0.04471551 -0.34415887  0.74355672 -0.20394133  0.57301238  0.18113628 
##           7           8           9          10          11          12 
## -0.73620399  0.73941595  0.03170872 -0.63513815  1.68163931  0.32563570 
##          13          14          15          16          17          18 
##  0.68978579  1.68471837 -0.27522362 -1.65694184 -0.50106002 -1.29829282 
##          19          20          21          22          23          24 
##  1.63047812 -0.35298218 -2.10318077  0.58722439 -0.00542081  0.41357307 
##          25          26          27          28          29          30 
## -0.18226873 -0.19471145  0.08939477 -1.17358192  0.79739752  1.21111929 
##          31          32          33          34          35          36 
## -1.08621993 -0.41851563  0.50216360 -0.98702425 -0.89481012 -1.29829282 
##          37          38          39          40          41          42 
## -1.03137583 -0.57686715 -0.03394335  2.47666079  1.12619554  0.08146429 
##          43          44          45          46          47          48 
## -0.06920203  1.20491416  1.39327690 -1.63179278  1.04137544 -0.04671735 
##          49          50 
##  0.41552559 -2.00547881
y.fit<-predict(lm.step_new)  # fitted values of lm.step_new
plot(y.rst~ y.fit)  # scatter plot of standardized residuals against fitted values

## After the log transformation the standardized residuals also all lie within -3 to 3, so again no outliers are flagged.
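
## A stricter variant of this check uses studentized deleted residuals, which rstudent() computes (rstandard() gives the internally studentized ones); the sketch below applies the same ±3 cutoff to them (output not shown).

y.rstu<-rstudent(lm.step_new)  # studentized deleted (externally studentized) residuals
which(abs(y.rstu) > 3)  # observations exceeding the ±3 cutoff, if any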

Regression diagnostic analysis

par(mfrow=c(2,2)) # arrange four plots in a 2×2 grid
plot(lm.step_new) # standard diagnostic plots for the model

influence.measures(lm.step_new) # influence diagnostics for each observation
## Influence measures of
##   lm(formula = log(y) ~ x1 + x4, data = d2.4) :
## 
##      dfb.1_    dfb.x1    dfb.x4     dffit cov.r   cook.d    hat inf
## 1   0.00091 -0.001445 -0.003494 -0.007957 1.101 2.16e-05 0.0313    
## 2  -0.00456  0.007236 -0.034835 -0.060711 1.092 1.25e-03 0.0307    
## 3   0.16182 -0.028581 -0.123398  0.187323 1.095 1.18e-02 0.0602    
## 4  -0.00270  0.004284 -0.020625 -0.035947 1.098 4.40e-04 0.0307    
## 5   0.06608  0.071924 -0.136086  0.160383 1.127 8.70e-03 0.0736    
## 6  -0.01600  0.025416  0.005887  0.040870 1.120 5.69e-04 0.0494    
## 7  -0.18611  0.068627  0.105269 -0.196328 1.104 1.30e-02 0.0670    
## 8   0.11286 -0.179246  0.147692  0.223723 1.125 1.68e-02 0.0846    
## 9   0.01031 -0.006491 -0.003074  0.010309 1.182 3.62e-05 0.0975    
## 10  0.07868 -0.124961 -0.005982 -0.168961 1.114 9.64e-03 0.0669    
## 11 -0.03529  0.056054  0.135546  0.308657 0.914 3.05e-02 0.0313    
## 12  0.09371 -0.048247 -0.038973  0.094733 1.151 3.05e-03 0.0794    
## 13  0.12628  0.011031 -0.130474  0.171849 1.100 9.96e-03 0.0591    
## 14  0.08205  0.415308 -0.503170  0.606661 0.995 1.18e-01 0.1107    
## 15 -0.02231  0.035436 -0.040831 -0.060128 1.113 1.23e-03 0.0464    
## 16  0.03474 -0.055179 -0.133431 -0.303841 0.919 2.96e-02 0.0313    
## 17 -0.09151 -0.007994  0.094548 -0.124530 1.116 5.25e-03 0.0591    
## 18 -0.33232  0.122541  0.187970 -0.350566 1.025 4.04e-02 0.0670    
## 19  0.13598 -0.215954  0.248838  0.366439 0.939 4.31e-02 0.0464    
## 20 -0.01658  0.026330 -0.043964 -0.067349 1.097 1.54e-03 0.0358    
## 21 -0.17900  0.284278 -0.327565 -0.482373 0.832 7.18e-02 0.0464    
## 22  0.06773  0.073721 -0.139486  0.164389 1.126 9.14e-03 0.0736    
## 23  0.00011 -0.000175 -0.000424 -0.000965 1.101 3.17e-07 0.0313    
## 24  0.06150  0.029076 -0.087842  0.106767 1.127 3.87e-03 0.0635    
## 25  0.00371 -0.005892 -0.014248 -0.032444 1.099 3.58e-04 0.0313    
## 26  0.00396 -0.006295 -0.015221 -0.034661 1.098 4.09e-04 0.0313    
## 27 -0.00484  0.007681  0.004948  0.017473 1.108 1.04e-04 0.0376    
## 28  0.02820 -0.431926  0.416171 -0.533866 1.176 9.42e-02 0.1703    
## 29 -0.01634  0.025943  0.062735  0.142857 1.057 6.86e-03 0.0313    
## 30 -0.06656  0.105709  0.068094  0.240482 1.008 1.91e-02 0.0376    
## 31 -0.25141  0.399277 -0.279907 -0.448799 1.156 6.69e-02 0.1453    
## 32  0.06690 -0.106242  0.005959 -0.130450 1.159 5.77e-03 0.0900    
## 33  0.04092  0.091210 -0.132120  0.156037 1.153 8.25e-03 0.0894    
## 34  0.08809 -0.139904 -0.032403 -0.224971 1.054 1.69e-02 0.0494    
## 35 -0.20625  0.327566 -0.229635 -0.368193 1.185 4.54e-02 0.1453    
## 36 -0.33232  0.122541  0.187970 -0.350566 1.025 4.04e-02 0.0670    
## 37 -0.04893  0.077716 -0.129764 -0.198787 1.033 1.32e-02 0.0358    
## 38  0.03132 -0.049735 -0.032037 -0.113143 1.085 4.33e-03 0.0376    
## 39  0.00184 -0.002916 -0.001878 -0.006634 1.108 1.50e-05 0.0376    
## 40  0.12458 -0.197858  0.330369  0.506097 0.727 7.58e-02 0.0358   *
## 41 -0.02323  0.036894  0.089214  0.203153 1.014 1.37e-02 0.0313    
## 42  0.02649 -0.016677 -0.007897  0.026486 1.181 2.39e-04 0.0975    
## 43  0.00374 -0.005945 -0.003830 -0.013525 1.108 6.23e-05 0.0376    
## 44  0.05741 -0.091178  0.152242  0.233222 1.007 1.80e-02 0.0358    
## 45  0.46261 -0.291288 -0.137927  0.462615 1.041 6.99e-02 0.0975    
## 46  0.26804 -0.425700  0.023878 -0.522700 0.984 8.78e-02 0.0900    
## 47  0.01393 -0.022123  0.106508  0.185625 1.026 1.15e-02 0.0307    
## 48  0.00253 -0.004014 -0.002585 -0.009131 1.108 2.84e-05 0.0376    
## 49  0.01953 -0.031011  0.051780  0.079323 1.094 2.14e-03 0.0358    
## 50 -0.45372  0.080137  0.345995 -0.525233 0.868 8.59e-02 0.0602

## Observation 40 is flagged as a highly influential point.
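
## The flagged observations can also be extracted programmatically; the sketch below assumes the influence.measures() result is stored in an object and then inspects its is.inf component (output not shown).

infl<-influence.measures(lm.step_new)  # store the influence diagnostics
summary(infl)  # print only the potentially influential observations
which(apply(infl$is.inf, 1, any))  # row numbers flagged by at least one measure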

Refit the regression with observation 40 removed

lm.exam<-lm(log(y)~x1+x2+x3+x4, data= d2.4[-c(40),])  # refit the full model for log(y) with observation 40 removed
lm.step<-step(lm.exam, direction="both") # stepwise regression in both directions (AIC-based)
## Start:  AIC=-107.57
## log(y) ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq     RSS      AIC
## - x3    1    0.0097  4.4580 -109.459
## - x2    1    0.1328  4.5810 -108.125
## <none>               4.4483 -107.566
## - x4    1    0.7488  5.1971 -101.942
## - x1    1    6.1169 10.5652  -67.179
## 
## Step:  AIC=-109.46
## log(y) ~ x1 + x2 + x4
## 
##        Df Sum of Sq     RSS      AIC
## - x2    1    0.1505  4.6084 -109.833
## <none>               4.4580 -109.459
## + x3    1    0.0097  4.4483 -107.566
## - x4    1    0.7439  5.2019 -103.897
## - x1    1    6.1079 10.5659  -69.175
## 
## Step:  AIC=-109.83
## log(y) ~ x1 + x4
## 
##        Df Sum of Sq     RSS      AIC
## <none>               4.6084 -109.833
## + x2    1    0.1505  4.4580 -109.459
## + x3    1    0.0274  4.5810 -108.125
## - x4    1    0.7697  5.3781 -104.265
## - x1    1   16.9403 21.5487  -36.254
summary(lm.step) # summary of the selected regression model
## 
## Call:
## lm(formula = log(y) ~ x1 + x4, data = d2.4[-c(40), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65606 -0.19234  0.00887  0.18218  0.57945 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.49903    0.09893  55.587  < 2e-16 ***
## x1           0.21894    0.01684  13.004  < 2e-16 ***
## x4           0.29254    0.10554   2.772  0.00802 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3165 on 46 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8482, Adjusted R-squared:  0.8416 
## F-statistic: 128.5 on 2 and 46 DF,  p-value: < 2.2e-16
y.rst<-rstandard(lm.step)  # standardized residuals of lm.step
y.fit<-predict(lm.step)  # fitted values of lm.step
plot(y.rst~ y.fit)  # scatter plot of standardized residuals against fitted values

## The equation after variable selection is now log(y) = 5.50 + 0.22 x1 + 0.29 x4, and the standardized residuals fall almost entirely within [-2, 2].

## CEO salary is therefore most strongly related to the number of years in the current position and to whether the CEO holds an MBA, and only weakly related to the previous year's change in stock price and the previous year's change in company sales.

Regression prediction (point prediction and interval prediction)

preds<-data.frame(x1= 5, x4= 1)  # values of the explanatory variables x1 and x4
predict(lm.step,newdata=preds,interval="prediction", level=0.95)  # point prediction and 95% prediction interval
##       fit      lwr      upr
## 1 6.88627 6.237443 7.535096

## The point prediction of log(y) is 6.89, with a 95% prediction interval of [6.24, 7.54].
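
## Because the response of lm.step is log(y), exponentiating the result puts the prediction and its interval back on the original salary scale; the line below is a minimal sketch of that back-transformation.

exp(predict(lm.step,newdata=preds,interval="prediction", level=0.95))  # back-transform fit, lwr and upr to the y scale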