Chapter10 多元线性回归

0.1 多元线性回归前述准备
- 0.1.1 绘制多个变量的相关系数矩阵图
0.2 多元线性回归模型拟合
0.3 相对重要性、模型比较
- 0.3.1 （自变量的）相对重要性：哪些变量对预测比较重要：标准化回归系数
- 0.3.2 模型比较：所建立的模型是否包含建模所必需的自变量
0.4 利用回归方程做预测
- 0.4.1 均值的置信区间、个别值的预测区间

0.1 多元线性回归前述准备

0.1.1 绘制多个变量的相关系数矩阵图

load("D:\\New_Folder\\Study_Programming\\R_Programme\\Applied Statistics\\datas - Copy\\example\\ch10\\example10_1.RData")
library(corrgram)
corrgram(example10_1[2:7],order=TRUE,lower.panel = panel.shade,upper.panel = panel.pie,text.panel = panel.txt)

0.2 多元线性回归模型拟合

model_1=lm(y~x1+x2+x3+x4+x5,data = example10_1)
summary(model_1)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = example10_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.7204  -6.0600   0.7152   3.2144  21.4805 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  4.2604768 10.4679833   0.407  0.68856   
## x1           0.1273254  0.0959790   1.327  0.20037   
## x2           0.1605660  0.0556834   2.884  0.00952 **
## x3           0.0007636  0.0013556   0.563  0.57982   
## x4          -0.3331990  0.3986248  -0.836  0.41362   
## x5          -0.5746462  0.3087506  -1.861  0.07826 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.65 on 19 degrees of freedom
## Multiple R-squared:  0.8518, Adjusted R-squared:  0.8128 
## F-statistic: 21.84 on 5 and 19 DF,  p-value: 2.835e-07

confint(model_1,level = 0.95)#计算回归系数的置信区间

##                     2.5 %       97.5 %
## (Intercept) -17.649264072 26.170217667
## x1           -0.073561002  0.328211809
## x2            0.044019355  0.277112598
## x3           -0.002073719  0.003600932
## x4           -1.167530271  0.501132297
## x5           -1.220868586  0.071576251

anova(model_1)#方差分析表

## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x1         1 10508.9 10508.9 92.7389 9.625e-09 ***
## x2         1  1347.1  1347.1 11.8878  0.002696 ** 
## x3         1    85.4    85.4  0.7539  0.396074    
## x4         1    40.5    40.5  0.3573  0.557082    
## x5         1   392.5   392.5  3.4641  0.078262 .  
## Residuals 19  2153.0   113.3                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

0.2.1 拟合优度检验 & 显著性检验

    #模型诊断：残差图
par(mfrow=c(1,2),mai=c(0.8,0.8,0.4,0.1),cex=0.8,cex.main=0.7)
plot(model_1,which = 1:2)

    #考虑到第2、4个点的残差较大（见Q-Q图），剔除后继续分析
new_model_1=lm(y~x1+x2+x3+x4+x5,data=example10_1[-c(2,4),])
summary(new_model_1)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = example10_1[-c(2, 
##     4), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7788  -4.4404   0.7701   4.5470  13.0666 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  6.935e+00  8.211e+00   0.845  0.41000   
## x1           2.588e-01  7.370e-02   3.511  0.00268 **
## x2           2.314e-02  5.768e-02   0.401  0.69331   
## x3           6.481e-05  9.743e-04   0.067  0.94774   
## x4          -2.572e-01  2.849e-01  -0.903  0.37922   
## x5          -7.479e-01  2.171e-01  -3.446  0.00309 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.252 on 17 degrees of freedom
## Multiple R-squared:  0.9184, Adjusted R-squared:  0.8944 
## F-statistic: 38.28 on 5 and 17 DF,  p-value: 1.143e-08

    #此时的回归诊断
par(mfrow=c(1,2),mai=c(0.8,0.8,0.4,0.1),cex=0.8)
plot(new_model_1,which = 1:2)

## 多重共线性：放宽回归模型假定1

0.2.2 识别多重共线性

0.2.2.1 检验自变量之间相关系数

library(psych)
corr.test(example10_1[3:7],use='complete')

## Call:corr.test(x = example10_1[3:7], use = "complete")
## Correlation matrix 
##       x1    x2    x3    x4    x5
## x1  1.00  0.74  0.88 -0.62 -0.28
## x2  0.74  1.00  0.55 -0.54 -0.32
## x3  0.88  0.55  1.00 -0.52 -0.29
## x4 -0.62 -0.54 -0.52  1.00  0.10
## x5 -0.28 -0.32 -0.29  0.10  1.00
## Sample Size 
## [1] 25
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##      x1   x2   x3   x4   x5
## x1 0.00 0.00 0.00 0.01 0.47
## x2 0.00 0.00 0.03 0.03 0.46
## x3 0.00 0.00 0.00 0.04 0.47
## x4 0.00 0.01 0.01 0.00 0.65
## x5 0.18 0.12 0.16 0.65 0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

0.2.2.2 方差膨胀因子：VIF

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

vif(model_1)

##       x1       x2       x3       x4       x5 
## 8.233159 2.629940 5.184365 1.702361 1.174053

tolerance=1/vif(model_1)#容忍度：VIF的倒数
tolerance

##        x1        x2        x3        x4        x5 
## 0.1214601 0.3802368 0.1928877 0.5874195 0.8517500

0.2.3 处理多重共线性

0.2.3.1 逐步回归：Stepwise Regression

model_2=step(model_1)#变量选择

## Start:  AIC=123.39
## y ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1     35.96 2189.0 121.81
## - x4    1     79.17 2232.2 122.30
## <none>              2153.0 123.39
## - x1    1    199.42 2352.4 123.61
## - x5    1    392.54 2545.6 125.58
## - x2    1    942.22 3095.2 130.47
## 
## Step:  AIC=121.81
## y ~ x1 + x2 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x4    1     78.22 2267.2 120.69
## <none>              2189.0 121.81
## - x5    1    445.69 2634.7 124.44
## - x2    1    925.88 3114.9 128.63
## - x1    1   1133.27 3322.3 130.24
## 
## Step:  AIC=120.69
## y ~ x1 + x2 + x5
## 
##        Df Sum of Sq    RSS    AIC
## <none>              2267.2 120.69
## - x5    1    404.28 2671.5 122.79
## - x2    1   1050.90 3318.1 128.21
## - x1    1   1661.83 3929.0 132.43

new_model_2=lm(y~x1+x2+x5,data = example10_1)#拟合逐步回归模型
summary(new_model_2)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x5, data = example10_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.027  -5.361  -1.560   2.304  23.001 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.68928    6.25242  -0.270  0.78966    
## x1           0.19022    0.04848   3.923  0.00078 ***
## x2           0.15763    0.05052   3.120  0.00518 ** 
## x5          -0.56979    0.29445  -1.935  0.06656 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.39 on 21 degrees of freedom
## Multiple R-squared:  0.8439, Adjusted R-squared:  0.8216 
## F-statistic: 37.85 on 3 and 21 DF,  p-value: 1.187e-08

anova(new_model_2)#方差分析表

## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## x1         1 10508.9 10508.9 97.3392 2.452e-09 ***
## x2         1  1347.1  1347.1 12.4775  0.001976 ** 
## x5         1   404.3   404.3  3.7447  0.066558 .  
## Residuals 21  2267.2   108.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

    #逐步回归模型的诊断
par(mfrow=c(1,2),mai=c(0.8,0.8,0.4,0.1),cex=0.8)
plot(new_model_2,which=1:2)

0.3 相对重要性、模型比较

0.3.1 （自变量的）相对重要性：哪些变量对预测比较重要：标准化回归系数

library(lm.beta)
mode1_1.beta=lm.beta(model_1)
summary(mode1_1.beta)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = example10_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.7204  -6.0600   0.7152   3.2144  21.4805 
## 
## Coefficients:
##               Estimate Standardized Std. Error t value Pr(>|t|)   
## (Intercept)  4.2604768    0.0000000 10.4679833   0.407  0.68856   
## x1           0.1273254    0.3361822  0.0959790   1.327  0.20037   
## x2           0.1605660    0.4130034  0.0556834   2.884  0.00952 **
## x3           0.0007636    0.1132753  0.0013556   0.563  0.57982   
## x4          -0.3331990   -0.0963203  0.3986248  -0.836  0.41362   
## x5          -0.5746462   -0.1781104  0.3087506  -1.861  0.07826 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.65 on 19 degrees of freedom
## Multiple R-squared:  0.8518, Adjusted R-squared:  0.8128 
## F-statistic: 21.84 on 5 and 19 DF,  p-value: 2.835e-07

0.3.2 模型比较：所建立的模型是否包含建模所必需的自变量

0.3.2.1 ANOVA(要求嵌套)

anova(new_model_2,model_1)

## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2 + x5
## Model 2: y ~ x1 + x2 + x3 + x4 + x5
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     21 2267.2                           
## 2     19 2153.0  2    114.17 0.5038 0.6121

0.3.2.2 AIC(不要求嵌套)

AIC(new_model_2,model_1)

##             df      AIC
## new_model_2  5 193.6325
## model_1      7 196.3408

0.4 利用回归方程做预测

0.4.1 均值的置信区间、个别值的预测区间

x_0=example10_1[,c(3,4,7)]
pre=predict(new_model_2)
res=residuals(new_model_2)
zre=rstandard(new_model_2)
con_int=predict(new_model_2,x_0,interval = 'confidence',level = 0.95)
pre_int=predict(new_model_2,x_0,interval = 'prediction',level = 0.95)
my_summary=data.frame(营业额=example10_1$y,点预测值=pre,残差=res,标准化残差=zre,置信下限=con_int[,2],置信上限=con_int[,3],预测下限=pre_int[,2],预测上限=pre_int[,3])
round(my_summary,3)#表示输出my_summary，保留小数点3位

##    营业额 点预测值    残差 标准化残差 置信下限 置信上限 预测下限 预测上限
## 1    53.2   52.189   1.011      0.102   45.457   58.921   29.557   74.822
## 2    18.5   -4.501  23.001      2.359  -11.969    2.967  -27.363   18.361
## 3    11.3   21.963 -10.663     -1.072   15.685   28.240   -0.539   44.464
## 4    84.7   65.114  19.586      2.766   49.300   80.928   38.337   91.891
## 5     7.3    6.128   1.172      0.122   -2.283   14.540  -17.059   29.316
## 6    17.9   22.408  -4.508     -0.466   14.581   30.235   -0.574   45.390
## 7     2.5    1.278   1.222      0.125   -6.113    8.670  -21.559   24.116
## 8    27.3   34.719  -7.419     -0.747   28.400   41.038   12.206   57.232
## 9     5.9   10.628  -4.728     -0.472    4.904   16.351  -11.726   32.981
## 10   23.9   37.843 -13.943     -1.390   32.193   43.493   15.509   60.178
## 11   69.4   62.852   6.548      0.709   52.939   72.766   39.079   86.626
## 12   20.6   18.296   2.304      0.229   13.036   23.556   -3.943   40.535
## 13    1.9   -5.510   7.410      0.771  -13.694    2.674  -28.616   17.596
## 14    3.0   14.956 -11.956     -1.202    8.687   21.224   -7.543   37.455
## 15    7.3    8.860  -1.560     -0.169   -1.050   18.770  -14.913   32.632
## 16   46.2   29.831  16.369      1.699   21.745   37.917    6.759   52.903
## 17   78.8   78.016   0.784      0.112   62.105   93.927   51.182  104.850
## 18   11.1   13.167  -2.067     -0.209    6.420   19.915   -9.470   35.805
## 19    8.6   15.847  -7.247     -0.815    4.670   27.024   -8.481   40.175
## 20   48.9   50.727  -1.827     -0.184   44.205   57.250   28.156   73.299
## 21   22.1   27.461  -5.361     -0.536   21.671   33.251    5.090   49.831
## 22   11.1    0.233  10.867      1.354  -13.486   13.953  -25.362   25.829
## 23    8.6   22.627 -14.027     -1.395   17.181   28.073    0.344   44.911
## 24   48.9   48.961  -0.061     -0.006   42.722   55.200   26.470   71.452
## 25   22.1   27.005  -4.905     -0.490   21.220   32.790    4.636   49.374

     #新值预测
x_1=data.frame(x1=50,x2=100,x5=10)
predict(new_model_2,newdata = x_1)

##        1 
## 17.88685

predict(new_model_2,data.frame(x1=50,x2=10,x5=10),interval = 'confidence',level = 0.95)

##        fit       lwr      upr
## 1 3.700067 -4.684078 12.08421

predict(new_model_2,data.frame(x1=50,x2=10,x5=10),interval = 'prediction',level = 0.95)

##        fit       lwr      upr
## 1 3.700067 -19.47764 26.87777