.2.3.knit

setwd("d:/data")  
d2.3 <- read.csv("C:\\Users\\86167\\Desktop\\ex2.3.csv", header = T)  
# 建立全变量回归方程
lm.air <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = d2.3)  
# 进行逐步回归
summary(lm.air)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = d2.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -252.31  -48.18  -12.79   52.81  193.02 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -4.012e+02  1.431e+02  -2.803  0.01870 * 
## x1           1.444e-02  2.402e-02   0.601  0.56109   
## x2          -2.087e-02  8.657e-02  -0.241  0.81440   
## x3           5.810e-05  1.464e-03   0.040  0.96912   
## x4           3.044e+01  8.298e+00   3.668  0.00433 **
## x5           2.004e-01  1.115e-01   1.798  0.10235   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.6 on 10 degrees of freedom
## Multiple R-squared:  0.9879, Adjusted R-squared:  0.9818 
## F-statistic: 162.8 on 5 and 10 DF,  p-value: 3.043e-09

##最终模型为y ~ x4 + x5，回归系数显示x4的系数为31.47130，x5的系数为0.18680。这意味着在其他条件不变的情况下，民航航线里程每增加1万公里，民航客运量预计增加31.47130万人；来华旅游入境人数每增加1万人，民航客运量预计增加 0.18680 万人。 ##系数的显著性检验结果表明，两个变量的系数均显著不为零（x4的p值为2.81e-08，x5的p值为0.00288），说明这两个变量对民航客运量有显著的影响。 ##决定系数（Multiple R-squared）为0.9874，调整后的决定系数（AdjustedR-squared）为0.9854。这表明模型能够解释约98.74%的民航客运量的变异，调整后的决定系数也很高，说明模型在考虑变量个数的情况下拟合效果非常好，变量选择比较合理，没有过度拟合数据。 ##残差标准误差为 116，在13个自由度下，这是衡量模型预测误差的一个指标，相对较小的值表示模型的预测精度较高，即实际观测值与预测值之间的差异较小。 ## F- 统计量为 508，对应的p值为4.572e-13，远小于显著性水平（如0.05），这表明整个回归模型是显著的，即x4和x5联合起来对民航客运量有显著的线性关系，模型整体具有统计学意义。 ##从最终模型来看，民航航线里程（x4）和来华旅游入境人数（x5）是影响民航客运量的两个关键因素。民航航线里程的系数较大，说明其对民航客运量的影响更为显著，可能是因为航线里程的增加直接拓展了民航的服务范围，能够吸引更多的旅客选择民航出行。 ##自己拓展

lm.air.step<-step(lm.air,direction="both")

## Start:  AIC=160.15
## y ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1        26 168042 158.15
## - x2    1       976 168992 158.24
## - x1    1      6073 174088 158.72
## <none>              168015 160.15
## - x5    1     54329 222344 162.63
## - x4    1    226106 394122 171.79
## 
## Step:  AIC=158.15
## y ~ x1 + x2 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x2    1       957 168999 156.24
## - x1    1      6050 174092 156.72
## <none>              168042 158.15
## + x3    1        26 168015 160.15
## - x5    1     55493 223535 160.72
## - x4    1    228011 396052 169.87
## 
## Step:  AIC=156.24
## y ~ x1 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x1    1      5903 174902 154.79
## <none>              168999 156.24
## + x2    1       957 168042 158.15
## + x3    1         8 168992 158.24
## - x5    1    155567 324566 164.68
## - x4    1    501855 670854 176.30
## 
## Step:  AIC=154.79
## y ~ x4 + x5
## 
##        Df Sum of Sq     RSS    AIC
## <none>               174902 154.79
## + x1    1      5903  168999 156.24
## + x2    1       811  174092 156.72
## + x3    1         1  174901 156.79
## - x5    1    180176  355078 164.12
## - x4    1   1843682 2018584 191.93

# 计算普通残差
y.res<-residuals(lm.air)
# 计算标准化残差
y.rst<-rstandard(lm.air.step)
print(y.rst)

##           1           2           3           4           5           6 
##  1.34714761  1.18901283  0.29852998 -0.19086128 -0.23331509 -0.88348308 
##           7           8           9          10          11          12 
## -0.85374520 -0.46733123 -0.37082828 -0.06628483  0.91759182 -2.23768758 
##          13          14          15          16 
## -0.34282279  1.94017167  0.49964531 -0.07834811

# 计算预测值
y.fit<-predict(lm.air.step)
# 绘制普通残差与预测值的散点图
plot(y.res~y.fit)

# 绘制标准化残差与预测值的散点图
plot(y.rst~y.fit)

##第12号为异常点

# 第12号点为异常点
d2.3_no_outlier <- d2.3[-c(12), ]  
# 重新建立回归模型
lm.air_no_outlier <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = d2.3_no_outlier)  
summary(lm.air_no_outlier)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = d2.3_no_outlier)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -128.63  -41.01   14.88   50.66  114.44 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -4.073e+02  1.024e+02  -3.976  0.00323 **
## x1           6.351e-03  1.737e-02   0.366  0.72308   
## x2           7.490e-02  6.861e-02   1.092  0.30333   
## x3           6.089e-04  1.061e-03   0.574  0.58016   
## x4           2.128e+01  6.574e+00   3.236  0.01022 * 
## x5           8.893e-02  8.683e-02   1.024  0.33246   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 92.74 on 9 degrees of freedom
## Multiple R-squared:  0.9944, Adjusted R-squared:  0.9913 
## F-statistic: 319.7 on 5 and 9 DF,  p-value: 7.554e-10

lm.step<-step(lm.air_no_outlier,direction="both") #用一切子集回归法来进行逐步回归

## Start:  AIC=140.23
## y ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x1    1      1150  78560 138.45
## - x3    1      2832  80242 138.77
## - x5    1      9023  86433 139.89
## - x2    1     10251  87661 140.10
## <none>               77410 140.23
## - x4    1     90094 167504 149.81
## 
## Step:  AIC=138.45
## y ~ x2 + x3 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1      2614  81174 136.94
## - x5    1      8029  86589 137.91
## <none>               78560 138.45
## + x1    1      1150  77410 140.23
## - x2    1     27237 105797 140.92
## - x4    1     89346 167905 147.85
## 
## Step:  AIC=136.94
## y ~ x2 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x5    1     10939  92112 136.84
## <none>               81174 136.94
## + x3    1      2614  78560 138.45
## + x1    1       932  80242 138.77
## - x2    1     26361 107535 139.16
## - x4    1     89005 170178 146.05
## 
## Step:  AIC=136.84
## y ~ x2 + x4
## 
##        Df Sum of Sq    RSS    AIC
## <none>               92112 136.84
## + x5    1     10939  81174 136.94
## + x3    1      5523  86589 137.91
## + x1    1       626  91487 138.74
## - x4    1     89282 181394 145.01
## - x2    1    206465 298578 152.48

y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
y.fit<-predict(lm.step) #计算回归模型lm.step的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标，预测值为横坐标的散点

##残差几乎全落在[-2.1]区域内 ##对数

lm.step_new<-update(lm.step,log(.)~.) #对模型进行对数变换
y.rst<-rstandard(lm.step_new) #计算lm.step_new的标准化残差
y.fit<-predict(lm.step_new) #计算lm.step_new的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标,预测值为横坐标

##对数变换后，第12号为异常点 ##比较不同模型的拟合优度指标

# 全变量模型的拟合优度指标
summary(lm.air)$r.squared

## [1] 0.9878631

summary(lm.air)$adj.r.squared

## [1] 0.9817947

# 逐步回归模型的拟合优度指标
summary(lm.air.step)$r.squared

## [1] 0.9873656

summary(lm.air.step)$adj.r.squared

## [1] 0.9854219

##回归诊断

par(mfrow=c(2,2)) #在一个2×2网格中创建4个绘图区
plot(lm.step_new) #绘制模型诊断图

influence.measures(lm.step_new)

## Influence measures of
##   lm(formula = log(y) ~ x2 + x4, data = d2.3_no_outlier) :
## 
##     dfb.1_  dfb.x2  dfb.x4   dffit cov.r   cook.d    hat inf
## 1  -0.6588  0.2386 -0.1285 -0.7818 0.642 0.166894 0.1434    
## 2  -0.3076  0.0791 -0.0303 -0.3519 1.203 0.041897 0.1307    
## 3  -0.1146  0.0718 -0.0528 -0.1580 1.437 0.008947 0.1348    
## 4   0.0312 -0.0272  0.0220  0.0493 1.504 0.000881 0.1404    
## 5   0.0869 -0.0802  0.0660  0.1425 1.449 0.007293 0.1349    
## 6  -0.1642  0.0624 -0.0394 -0.2103 1.304 0.015529 0.1025    
## 7   0.0945 -0.0401  0.0281  0.1294 1.368 0.005998 0.0923    
## 8   0.1347  0.0606 -0.0715  0.1639 1.330 0.009536 0.0915    
## 9   0.2788  0.0938 -0.1089  0.3882 0.876 0.046828 0.0745    
## 10  0.2821 -0.0598  0.0644  0.6194 0.427 0.094121 0.0674    
## 11  0.0127  0.0282 -0.0277  0.0311 2.090 0.000351 0.3791   *
## 13 -0.0275 -0.1099  0.0985 -0.1443 1.600 0.007520 0.2062    
## 14 -0.0318 -0.2340  0.2090 -0.2858 1.828 0.029251 0.3222   *
## 15 -0.5297 -0.5752  0.7104  0.9995 1.878 0.330606 0.4789   *
## 16  1.2374  0.4517 -0.8140 -2.0329 1.003 1.093714 0.5011   *

##11,15,16号可能为强影响点 ##进行点预测和区间预测

# 给定新的解释变量值
new_data <- data.frame(x1 = 10000, x2 = 8000, x3 = 200000, x4 = 15, x5 = 500)  

# 进行点预测和区间预测（95%置信区间）
predict(lm.air.step, newdata = new_data, interval = "prediction", level = 0.95)

##        fit       lwr      upr
## 1 155.3967 -113.4272 424.2205

##计算结果y的点预测为155.40,预测区间为[-113.43, 424.22] ##查看回归系数的大小和显著性

summary(lm.air.step)$coefficients

##                 Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept) -410.0740118 57.16828159 -7.173104 7.227824e-06
## x4            31.4712963  2.68842378 11.706226 2.811650e-08
## x5             0.1868025  0.05104582  3.659506 2.884496e-03