setwd("D:\\Rdownload\\lianxi\\second") #设定工作路径
d2.4<-read.csv("ex2.4.csv", header = T) #将ex2.4.csv数据读入到d2.4中
lm.exam<-lm(y~x1+x2+x3+x4,data=d2.4) #建立y关于x1,x2,x3,x4的线性回归方程,数据为d2.4
summary(lm.exam)#给出回归系数的估计和显著性检验等
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d2.4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -709.52 -271.18 -35.46 212.92 1026.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.593 155.914 -0.010 0.9919
## x1 190.588 33.514 5.687 9.08e-07 ***
## x2 1.588 2.674 0.594 0.5555
## x3 1.013 1.315 0.770 0.4453
## x4 304.716 139.869 2.179 0.0346 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 422.4 on 45 degrees of freedom
## (因为不存在,1个观察量被删除了)
## Multiple R-squared: 0.7485, Adjusted R-squared: 0.7261
## F-statistic: 33.48 on 4 and 45 DF, p-value: 5.8e-13
##回归方程的F值为33.48,相应的p值为5.8×(10的-13次方),说明回归方程是显著的;但t检验对应的p值则显示:x1和x4是显著的,而x2和x3是不显著的。
lm.step<-step(lm.exam,direction="both") #进行逐步回归
## Start: AIC=609.33
## y ~ x1 + x2 + x3 + x4
##
## Df Sum of Sq RSS AIC
## - x2 1 62949 8092032 607.72
## - x3 1 105799 8134882 607.98
## <none> 8029083 609.33
## - x4 1 846846 8875929 612.34
## - x1 1 5770382 13799465 634.41
##
## Step: AIC=607.72
## y ~ x1 + x3 + x4
##
## Df Sum of Sq RSS AIC
## - x3 1 141084 8233116 606.58
## <none> 8092032 607.72
## + x2 1 62949 8029083 609.33
## - x4 1 867198 8959230 610.81
## - x1 1 14810227 22902258 657.74
##
## Step: AIC=606.58
## y ~ x1 + x4
##
## Df Sum of Sq RSS AIC
## <none> 8233116 606.58
## + x3 1 141084 8092032 607.72
## + x2 1 98234 8134882 607.98
## - x4 1 862282 9095398 609.56
## - x1 1 15298607 23531723 657.09
summary(lm.step) #回归模型汇总信息
##
## Call:
## lm(formula = y ~ x1 + x4, data = d2.4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -815.95 -279.23 -59.84 245.14 993.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -45.58 130.66 -0.349 0.7288
## x1 207.47 22.20 9.345 2.72e-12 ***
## x4 307.22 138.47 2.219 0.0314 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 418.5 on 47 degrees of freedom
## (因为不存在,1个观察量被删除了)
## Multiple R-squared: 0.7421, Adjusted R-squared: 0.7311
## F-statistic: 67.62 on 2 and 47 DF, p-value: 1.474e-14
##所以,变量选择后的方程为 y= -45.58 + 207.47 x1 + 307.22 x4
y.res<-residuals (lm.exam) #计算模型lm.exam的普通残差
y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
print(y.rst) #输出回归模型lm.step的标准化残差y.rst
## 1 2 3 4 5 6
## -0.446568862 -0.945218013 0.062004702 -0.816593881 -0.281164377 0.997634957
## 7 8 9 10 11 12
## -0.171591006 -0.016449598 0.743372084 0.189530905 2.410737483 0.446008336
## 13 14 15 16 17 18
## -0.123934084 1.909831850 -0.852775518 -1.980823416 -0.705232906 -0.295270768
## 19 20 21 22 23 24
## 0.725379919 -0.975727136 -1.672437645 -0.268752285 -0.398016503 -0.394519084
## 25 26 27 28 29 30
## -0.611646884 -0.626212592 0.152394172 -1.406352767 0.745391559 2.312642857
## 31 32 33 34 35 36
## -0.043006286 1.686007110 -0.229683682 -0.977540717 -0.004239526 -0.295270768
## 37 38 39 40 41 42
## -1.413701844 -0.799868777 -0.040006782 2.394244916 1.308598927 0.753432011
## 43 44 45 46 47 48
## -0.093586794 0.491488133 1.085409594 -0.908800697 0.629820894 -0.059490423
## 49 50
## -0.345530197 -0.815410137
y.fit<-predict(lm.step) #计算回归模型lm.step的预测值
plot(y.res~ y.fit) #绘制以普通残差为纵坐标,预测值为横坐标的散点图
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标,预测值为横坐标的散点图
##因为学生化删除残差的值都在-3至3的范围内,所以没有异常点。
lm.step_new<-update(lm.step,log(.)~.) #对模型进行对数变换
y.rst<-rstandard(lm.step_new) #计算lm.step_new的标准化残差
print(y.rst) #输出回归模型lm.step_new的标准化残差y.rst
## 1 2 3 4 5 6
## -0.04471551 -0.34415887 0.74355672 -0.20394133 0.57301238 0.18113628
## 7 8 9 10 11 12
## -0.73620399 0.73941595 0.03170872 -0.63513815 1.68163931 0.32563570
## 13 14 15 16 17 18
## 0.68978579 1.68471837 -0.27522362 -1.65694184 -0.50106002 -1.29829282
## 19 20 21 22 23 24
## 1.63047812 -0.35298218 -2.10318077 0.58722439 -0.00542081 0.41357307
## 25 26 27 28 29 30
## -0.18226873 -0.19471145 0.08939477 -1.17358192 0.79739752 1.21111929
## 31 32 33 34 35 36
## -1.08621993 -0.41851563 0.50216360 -0.98702425 -0.89481012 -1.29829282
## 37 38 39 40 41 42
## -1.03137583 -0.57686715 -0.03394335 2.47666079 1.12619554 0.08146429
## 43 44 45 46 47 48
## -0.06920203 1.20491416 1.39327690 -1.63179278 1.04137544 -0.04671735
## 49 50
## 0.41552559 -2.00547881
y.fit<-predict(lm.step_new) #计算lm.step_new的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标,预测值为横坐标的散点图
##对数变换后,学生化删除残差的值也都在-3至3的范围内,所以没有异常点。
par(mfrow=c(2,2)) #在一个2×2网格中创建4个绘图区
plot(lm.step_new) #绘制模型诊断图
influence.measures(lm.step_new) #计算各个观测值的诊断统计量
## Influence measures of
## lm(formula = log(y) ~ x1 + x4, data = d2.4) :
##
## dfb.1_ dfb.x1 dfb.x4 dffit cov.r cook.d hat inf
## 1 0.00091 -0.001445 -0.003494 -0.007957 1.101 2.16e-05 0.0313
## 2 -0.00456 0.007236 -0.034835 -0.060711 1.092 1.25e-03 0.0307
## 3 0.16182 -0.028581 -0.123398 0.187323 1.095 1.18e-02 0.0602
## 4 -0.00270 0.004284 -0.020625 -0.035947 1.098 4.40e-04 0.0307
## 5 0.06608 0.071924 -0.136086 0.160383 1.127 8.70e-03 0.0736
## 6 -0.01600 0.025416 0.005887 0.040870 1.120 5.69e-04 0.0494
## 7 -0.18611 0.068627 0.105269 -0.196328 1.104 1.30e-02 0.0670
## 8 0.11286 -0.179246 0.147692 0.223723 1.125 1.68e-02 0.0846
## 9 0.01031 -0.006491 -0.003074 0.010309 1.182 3.62e-05 0.0975
## 10 0.07868 -0.124961 -0.005982 -0.168961 1.114 9.64e-03 0.0669
## 11 -0.03529 0.056054 0.135546 0.308657 0.914 3.05e-02 0.0313
## 12 0.09371 -0.048247 -0.038973 0.094733 1.151 3.05e-03 0.0794
## 13 0.12628 0.011031 -0.130474 0.171849 1.100 9.96e-03 0.0591
## 14 0.08205 0.415308 -0.503170 0.606661 0.995 1.18e-01 0.1107
## 15 -0.02231 0.035436 -0.040831 -0.060128 1.113 1.23e-03 0.0464
## 16 0.03474 -0.055179 -0.133431 -0.303841 0.919 2.96e-02 0.0313
## 17 -0.09151 -0.007994 0.094548 -0.124530 1.116 5.25e-03 0.0591
## 18 -0.33232 0.122541 0.187970 -0.350566 1.025 4.04e-02 0.0670
## 19 0.13598 -0.215954 0.248838 0.366439 0.939 4.31e-02 0.0464
## 20 -0.01658 0.026330 -0.043964 -0.067349 1.097 1.54e-03 0.0358
## 21 -0.17900 0.284278 -0.327565 -0.482373 0.832 7.18e-02 0.0464
## 22 0.06773 0.073721 -0.139486 0.164389 1.126 9.14e-03 0.0736
## 23 0.00011 -0.000175 -0.000424 -0.000965 1.101 3.17e-07 0.0313
## 24 0.06150 0.029076 -0.087842 0.106767 1.127 3.87e-03 0.0635
## 25 0.00371 -0.005892 -0.014248 -0.032444 1.099 3.58e-04 0.0313
## 26 0.00396 -0.006295 -0.015221 -0.034661 1.098 4.09e-04 0.0313
## 27 -0.00484 0.007681 0.004948 0.017473 1.108 1.04e-04 0.0376
## 28 0.02820 -0.431926 0.416171 -0.533866 1.176 9.42e-02 0.1703
## 29 -0.01634 0.025943 0.062735 0.142857 1.057 6.86e-03 0.0313
## 30 -0.06656 0.105709 0.068094 0.240482 1.008 1.91e-02 0.0376
## 31 -0.25141 0.399277 -0.279907 -0.448799 1.156 6.69e-02 0.1453
## 32 0.06690 -0.106242 0.005959 -0.130450 1.159 5.77e-03 0.0900
## 33 0.04092 0.091210 -0.132120 0.156037 1.153 8.25e-03 0.0894
## 34 0.08809 -0.139904 -0.032403 -0.224971 1.054 1.69e-02 0.0494
## 35 -0.20625 0.327566 -0.229635 -0.368193 1.185 4.54e-02 0.1453
## 36 -0.33232 0.122541 0.187970 -0.350566 1.025 4.04e-02 0.0670
## 37 -0.04893 0.077716 -0.129764 -0.198787 1.033 1.32e-02 0.0358
## 38 0.03132 -0.049735 -0.032037 -0.113143 1.085 4.33e-03 0.0376
## 39 0.00184 -0.002916 -0.001878 -0.006634 1.108 1.50e-05 0.0376
## 40 0.12458 -0.197858 0.330369 0.506097 0.727 7.58e-02 0.0358 *
## 41 -0.02323 0.036894 0.089214 0.203153 1.014 1.37e-02 0.0313
## 42 0.02649 -0.016677 -0.007897 0.026486 1.181 2.39e-04 0.0975
## 43 0.00374 -0.005945 -0.003830 -0.013525 1.108 6.23e-05 0.0376
## 44 0.05741 -0.091178 0.152242 0.233222 1.007 1.80e-02 0.0358
## 45 0.46261 -0.291288 -0.137927 0.462615 1.041 6.99e-02 0.0975
## 46 0.26804 -0.425700 0.023878 -0.522700 0.984 8.78e-02 0.0900
## 47 0.01393 -0.022123 0.106508 0.185625 1.026 1.15e-02 0.0307
## 48 0.00253 -0.004014 -0.002585 -0.009131 1.108 2.84e-05 0.0376
## 49 0.01953 -0.031011 0.051780 0.079323 1.094 2.14e-03 0.0358
## 50 -0.45372 0.080137 0.345995 -0.525233 0.868 8.59e-02 0.0602
##第40号观测值被诊断为强影响点
lm.exam<-lm(log(y)~x1+x2+x3+x4, data= d2.4[-c(40),]) #去掉第40号观测值再建立全变量回归方程
lm.step<-step(lm.exam, direction="both") #用一切子集回归法来进行逐步回归
## Start: AIC=-107.57
## log(y) ~ x1 + x2 + x3 + x4
##
## Df Sum of Sq RSS AIC
## - x3 1 0.0097 4.4580 -109.459
## - x2 1 0.1328 4.5810 -108.125
## <none> 4.4483 -107.566
## - x4 1 0.7488 5.1971 -101.942
## - x1 1 6.1169 10.5652 -67.179
##
## Step: AIC=-109.46
## log(y) ~ x1 + x2 + x4
##
## Df Sum of Sq RSS AIC
## - x2 1 0.1505 4.6084 -109.833
## <none> 4.4580 -109.459
## + x3 1 0.0097 4.4483 -107.566
## - x4 1 0.7439 5.2019 -103.897
## - x1 1 6.1079 10.5659 -69.175
##
## Step: AIC=-109.83
## log(y) ~ x1 + x4
##
## Df Sum of Sq RSS AIC
## <none> 4.6084 -109.833
## + x2 1 0.1505 4.4580 -109.459
## + x3 1 0.0274 4.5810 -108.125
## - x4 1 0.7697 5.3781 -104.265
## - x1 1 16.9403 21.5487 -36.254
summary(lm.step) #回归模型汇总信息
##
## Call:
## lm(formula = log(y) ~ x1 + x4, data = d2.4[-c(40), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65606 -0.19234 0.00887 0.18218 0.57945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.49903 0.09893 55.587 < 2e-16 ***
## x1 0.21894 0.01684 13.004 < 2e-16 ***
## x4 0.29254 0.10554 2.772 0.00802 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3165 on 46 degrees of freedom
## (因为不存在,1个观察量被删除了)
## Multiple R-squared: 0.8482, Adjusted R-squared: 0.8416
## F-statistic: 128.5 on 2 and 46 DF, p-value: < 2.2e-16
y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
y.fit<-predict(lm.step) #计算回归模型lm.step的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标,预测值为横坐标的散点图
##此时,变量选择后的方程为 y= 5.5 + 0.22 x1 + 0.29 x4,残差几乎全部落在[-2,2]区域内
##所以,CEO年薪与在目前职位年数和是否有MBA学位的关系更显著,与前一年股票价格的变化和前一年公司销售额的变化关系较弱。
preds<-data.frame(x1= 5, x4= 1) #给定解释变量x1, x4的值
predict(lm.step,newdata=preds,interval="prediction",
level=0.95) #进行点预测和区间预测
## fit lwr upr
## 1 6.88627 6.237443 7.535096
##计算结果y的点预测为6.89, 预测区间为[6.24, 7.54].