Chapter9 一元线性回归

0.1 一元线性回归前述准备
0.2 一元线性回归模型拟合
0.3 利用回归方程做预测
- 0.3.1 均值的置信区间、个别值的预测区间
0.4 回归模型的诊断：F检验（方差分析表），残差分析
- 0.4.1 残差分析：残差图
- 0.4.2 检验模型假定

0.1 一元线性回归前述准备

0.1.1 绘制散点图

load("D:\\New_Folder\\Study_Programming\\R_Programme\\Applied Statistics\\datas - Copy\\example\\ch9\\example9_1.RData")
library(car)

## Loading required package: carData

scatterplot(example9_1$销售收入~example9_1$广告支出,pch=19,xlab='广告支出',ylab='销售收入',cex.lab=0.8)

    #cex.lab: The magnification to be used for x and y labels relative to the current setting of cex.

0.1.2 相关系数的计算

cor(example9_1$销售收入,example9_1$广告支出)#也可以cor(example9_1[,2],example9_1[,3])

## [1] 0.937114

0.1.3 相关系数的检验

\[H_0:\rho=0 \quad v.s. \quad H_1: \rho \neq 0\]

cor.test(example9_1$销售收入,example9_1$广告支出)

## 
##  Pearson's product-moment correlation
## 
## data:  example9_1$销售收入 and example9_1$广告支出
## t = 11.391, df = 18, p-value = 1.161e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8450142 0.9752189
## sample estimates:
##      cor 
## 0.937114

0.2 一元线性回归模型拟合

model=lm(example9_1$销售收入~example9_1$广告支出)
summary(model)

## 
## Call:
## lm(formula = example9_1$销售收入 ~ example9_1$广告支出)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -766.30 -273.85  -26.79  174.73  900.66 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         2343.8916   274.4825   8.539 9.56e-08 ***
## example9_1$\u5e7f\u544a\u652f\u51fa    5.6735     0.4981  11.391 1.16e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 394 on 18 degrees of freedom
## Multiple R-squared:  0.8782, Adjusted R-squared:  0.8714 
## F-statistic: 129.8 on 1 and 18 DF,  p-value: 1.161e-09

confint(model,level = 0.95)#置信区间

##                                           2.5 %      97.5 %
## (Intercept)                         1767.225152 2920.558006
## example9_1$\u5e7f\u544a\u652f\u51fa    4.627092    6.719825

    #Actually, the level is 0.95 by default.

anova(model)#方差分析表

## Analysis of Variance Table
## 
## Response: example9_1$销售收入
##                                     Df   Sum Sq  Mean Sq F value    Pr(>F)    
## example9_1$\u5e7f\u544a\u652f\u51fa  1 20139304 20139304  129.76 1.161e-09 ***
## Residuals                           18  2793629   155202                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

    #绘制拟合图
plot(example9_1$销售收入~example9_1$广告支出)
text(example9_1$销售收入~example9_1$广告支出,labels=example9_1$企业编号,cex=0.6,adj=c(-0.6,0.25),col=4)
abline(model,col=2,lwd=2)
n=nrow(example9_1)
for (i in 1:n) {
  segments(example9_1[i,3],example9_1[i,2],example9_1[i,3],model$fitted.values[i])
}
mtext(expression(hat(y)==2343.8916+5.6735%*%example9_1$广告支出),cex=0.7,side=1,line=-6,adj=0.75)
arrows(600,4900,550,5350,code = 2,angle = 15,length = 0.08)

0.3 利用回归方程做预测

0.3.1 均值的置信区间、个别值的预测区间

pre_model=predict(model)
con_int=predict(model,data.frame(广告支出=example9_1$广告支出),interval='confidence',level=0.95)
pre_int=predict(model,data.frame(广告支出=example9_1$广告支出),interval='prediction',level=0.95)
data.frame(销售收入=example9_1$销售收入,点预测值=pre_model,置信下限=con_int[,2],置信上限=con_int[,3],预测下限=pre_int[,2],预测上限=pre_int[,3])

##    \u9500\u552e\u6536\u5165 \u70b9\u9884\u6d4b\u503c \u7f6e\u4fe1\u4e0b\u9650
## 1                    4597.5                 4264.925                 3998.348
## 2                    6611.0                 6945.066                 6590.492
## 3                    7349.3                 6448.639                 6168.060
## 4                    5525.7                 5260.049                 5074.789
## 5                    4675.9                 4763.054                 4552.697
## 6                    4418.6                 4762.487                 4552.080
## 7                    5845.4                 6196.170                 5948.675
## 8                    7313.0                 7151.013                 6763.533
## 9                    5035.4                 5015.523                 4822.893
## 10                   4322.6                 4578.100                 4349.549
## 11                   6389.5                 6320.986                 6057.644
## 12                   4152.2                 4011.888                 3709.980
## 13                   5544.8                 4854.964                 4652.116
## 14                   6095.1                 5946.538                 5726.896
## 15                   3626.2                 3821.828                 3491.525
## 16                   3745.4                 4074.296                 3781.397
## 17                   5121.8                 5888.101                 5674.071
## 18                   5674.5                 5747.967                 5545.680
## 19                   4256.6                 4043.660                 3746.360
## 20                   5803.7                 6008.946                 5782.898
##    \u7f6e\u4fe1\u4e0a\u9650 \u9884\u6d4b\u4e0b\u9650 \u9884\u6d4b\u4e0a\u9650
## 1                  4531.501                 3395.383                 5134.467
## 2                  7299.641                 6044.643                 7845.490
## 3                  6729.217                 5574.703                 7322.575
## 4                  5445.310                 4411.897                 6108.201
## 5                  4973.412                 3909.069                 5617.039
## 6                  4972.894                 3908.490                 5616.484
## 7                  6443.664                 5332.287                 7060.053
## 8                  7538.493                 6237.130                 8064.896
## 9                  5208.154                 4165.731                 5865.315
## 10                 4806.650                 3719.452                 5436.747
## 11                 6584.328                 5452.430                 7189.542
## 12                 4313.796                 3130.873                 4892.904
## 13                 5057.813                 4002.798                 5707.131
## 14                 6166.179                 5090.218                 6802.857
## 15                 4152.130                 2930.682                 4712.973
## 16                 4367.196                 3196.327                 4952.266
## 17                 6102.132                 5033.204                 6742.998
## 18                 5950.254                 4895.934                 6600.000
## 19                 4340.960                 3164.212                 4923.107
## 20                 6234.994                 5150.961                 6866.931

    #绘制置信区间、预测区间图
model_1=lm(销售收入~广告支出,data=example9_1)
    #所以下面都是对应起来的，区分下上述的model，原因详见predict函数，data.frame这部分还不能改变量
    #所以杜绝中文名变量，从你我做起 XD
x_0=seq(min(example9_1$广告支出),max(example9_1$广告支出))
con_int_1=predict(model_1,data.frame(广告支出=x_0),interval='confidence',level=0.95)
pre_int_1=predict(model_1,data.frame(广告支出=x_0),interval='prediction',level=0.95)
par(cex=0.8,mai=c(0.7,0.7,0.1,0.1))
n=nrow(example9_1)
plot(example9_1$销售收入~example9_1$广告支出)
abline(model,lwd=2)
for (i in 1:n) {
  segments(example9_1[i,3],example9_1[i,2],example9_1[i,3],model$fitted.values[i])
}
lines(x_0,y=con_int_1[,2],lty=5,lwd=2,col='blue')#如果使用之前的model,就会报错 Error in xy.coords(x, y) : 'x' and 'y' lengths differ 
lines(x_0,y=con_int_1[,3],lty=5,lwd=2,col='blue')
lines(x_0,y=pre_int_1[,2],lty=6,lwd=2,col='red')
lines(x_0,y=pre_int_1[,3],lty=6,lwd=2,col='red')
# lty: The line type. Line types can either be specified as an integer
# (0=blank, 1=solid (default), 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash) 
# or as one of the character strings "blank", "solid", "dashed", "dotted", "dotdash", "longdash", or "twodash", where "blank" uses ‘invisible lines’ (i.e., does not draw them).
legend(x='topleft',legend = c('回归线','置信区间','预测区间'),lty=c(1,5,6),col=c(1,4,2),lwd=2,cex=0.8)

   #计算x某点的点预测值、置信区间、预测区间
x_1=data.frame(广告支出 =500)
predict(model_1,newdata = x_1)

##        1 
## 5180.621

predict(model_1,data.frame(广告支出=500),interval='confidence',level=0.95)

##        fit      lwr      upr
## 1 5180.621 4994.127 5367.115

predict(model_1,data.frame(广告支出=500),interval='prediction',level=0.95)

##        fit      lwr      upr
## 1 5180.621 4332.199 6029.043

0.4 回归模型的诊断：F检验（方差分析表），残差分析

0.4.1 残差分析：残差图

pre=fitted(model)
res=residuals(model)
zre=model$residuals/sqrt(deviance(model)/df.residual(model))
data.frame(销售收入=example9_1$销售收入,点预测值=pre,残差=res,标准化残差=zre)

##    \u9500\u552e\u6536\u5165 \u70b9\u9884\u6d4b\u503c \u6b8b\u5dee
## 1                    4597.5                 4264.925    332.57536
## 2                    6611.0                 6945.066   -334.06646
## 3                    7349.3                 6448.639    900.66117
## 4                    5525.7                 5260.049    265.65073
## 5                    4675.9                 4763.054    -87.15430
## 6                    4418.6                 4762.487   -343.88696
## 7                    5845.4                 6196.170   -350.76993
## 8                    7313.0                 7151.013    161.98700
## 9                    5035.4                 5015.523     19.87679
## 10                   4322.6                 4578.100   -255.49955
## 11                   6389.5                 6320.986     68.51398
## 12                   4152.2                 4011.888    140.31161
## 13                   5544.8                 4854.964    689.83567
## 14                   6095.1                 5946.538    148.56225
## 15                   3626.2                 3821.828   -195.62753
## 16                   3745.4                 4074.296   -328.89643
## 17                   5121.8                 5888.101   -766.30113
## 18                   5674.5                 5747.967    -73.46670
## 19                   4256.6                 4043.660    212.94024
## 20                   5803.7                 6008.946   -205.24580
##    \u6807\u51c6\u5316\u6b8b\u5dee
## 1                       0.8441934
## 2                      -0.8479784
## 3                       2.2861954
## 4                       0.6743151
## 5                      -0.2212283
## 6                      -0.8729063
## 7                      -0.8903777
## 8                       0.4111801
## 9                       0.0504543
## 10                     -0.6485479
## 11                      0.1739126
## 12                      0.3561603
## 13                      1.7510460
## 14                      0.3771033
## 15                     -0.4965716
## 16                     -0.8348550
## 17                     -1.9451423
## 18                     -0.1864844
## 19                      0.5405174
## 20                     -0.5209861

0.4.2 检验模型假定

0.4.2.1 线性关系：成分残差图

load("D:\\New_Folder\\Study_Programming\\R_Programme\\Applied Statistics\\datas - Copy\\example\\ch9\\example9_1.RData")
model=lm(example9_1$销售收入~example9_1$广告支出)
library(car)
crPlots(model)

0.4.2.2 正态性检验

par(mfrow=c(2,2),cex=0.8,cex.main=0.7)
plot(model)

0.4.2.3 检验方差齐性

load("D:\\New_Folder\\Study_Programming\\R_Programme\\Applied Statistics\\datas - Copy\\example\\ch9\\example9_1.RData")
model=lm(example9_1$销售收入~example9_1$广告支出)
library(car)
ncvTest(model)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.126441, Df = 1, p = 0.28854

\(p = 0.28854 > 0.05\) ，故不拒绝原假设，可认为模型符合方差齐性。

    #绘制散布-水平图
spreadLevelPlot(model)

## 
## Suggested power transformation:  0.8730215

0.4.2.4 检验独立性：Durbin-Watson检验

load("D:\\New_Folder\\Study_Programming\\R_Programme\\Applied Statistics\\datas - Copy\\example\\ch9\\example9_1.RData")
model=lm(example9_1$销售收入~example9_1$广告支出)
library(car)
durbinWatsonTest(model)

##  lag Autocorrelation D-W Statistic p-value
##    1       0.1330482      1.679232   0.506
##  Alternative hypothesis: rho != 0