线性回归.knit

setwd("d:/data")  
d2 <- read.csv("C:\\Users\\86167\\Desktop\\第一产业和一级指标.csv", header = T)  
# 建立全变量回归方程
lm.one <- lm(y ~ x1 + x2 + x3, data = d2)  
# 进行逐步回归
summary(lm.one)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = d2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -311.40 -130.26   -9.64  140.37  299.16 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3682.15792  244.04269  15.088 3.68e-07 ***
## x1             8.79002    3.36199   2.615   0.0309 *  
## x2            -3.02661    2.96981  -1.019   0.3380    
## x3            -0.07138    1.55629  -0.046   0.9645    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 245.8 on 8 degrees of freedom
## Multiple R-squared:  0.9168, Adjusted R-squared:  0.8855 
## F-statistic: 29.37 on 3 and 8 DF,  p-value: 0.0001142

lm.one.step<-step(lm.one,direction="both")

## Start:  AIC=135.24
## y ~ x1 + x2 + x3
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1       127 483365 133.24
## - x2    1     62737 545976 134.71
## <none>              483238 135.24
## - x1    1    412913 896151 140.65
## 
## Step:  AIC=133.24
## y ~ x1 + x2
## 
##        Df Sum of Sq     RSS    AIC
## - x2    1     63542  546907 132.73
## <none>               483365 133.24
## + x3    1       127  483238 135.24
## - x1    1    650578 1133944 141.48
## 
## Step:  AIC=132.73
## y ~ x1
## 
##        Df Sum of Sq     RSS    AIC
## <none>               546907 132.73
## + x2    1     63542  483365 133.24
## + x3    1       932  545976 134.71
## - x1    1   5258185 5805092 159.07

# 计算普通残差
y.res<-residuals(lm.one)
# 计算标准化残差
y.rst<-rstandard(lm.one.step)
print(y.rst)

##           1           2           3           4           5           6 
##  0.06325835 -0.10394070  0.75730091  0.36486586  0.83474324  0.02767989 
##           7           8           9          10          11          12 
## -0.97477742 -1.40480704 -1.44027614 -0.92393516  1.20320283  1.83997295

# 计算预测值
y.fit<-predict(lm.one.step)
# 绘制普通残差与预测值的散点图
plot(y.res~y.fit)

# 绘制标准化残差与预测值的散点图
plot(y.rst~y.fit)

# 第7、12号点为异常点
d2_no_outlier <- d2[-c(7),]  
# 重新建立回归模型
lm.one_no_outlier <- lm(y ~ x1 + x2 + x3 , data = d2_no_outlier)  
summary(lm.one_no_outlier)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = d2_no_outlier)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -324.80 -122.85    2.14  180.08  270.99 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3662.10399  258.59401  14.162 2.08e-06 ***
## x1             8.19420    3.69908   2.215   0.0623 .  
## x2            -2.29754    3.40342  -0.675   0.5213    
## x3            -0.09299    1.63184  -0.057   0.9562    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 257.6 on 7 degrees of freedom
## Multiple R-squared:  0.9198, Adjusted R-squared:  0.8854 
## F-statistic: 26.76 on 3 and 7 DF,  p-value: 0.0003292

lm.step<-step(lm.one_no_outlier,direction="both") #用一切子集回归法来进行逐步回归

## Start:  AIC=125.16
## y ~ x1 + x2 + x3
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1       216 464811 123.17
## - x2    1     30246 494842 123.86
## <none>              464596 125.16
## - x1    1    325689 790285 129.00
## 
## Step:  AIC=123.17
## y ~ x1 + x2
## 
##        Df Sum of Sq    RSS    AIC
## - x2    1     30129 494941 121.86
## <none>              464811 123.17
## + x3    1       216 464596 125.16
## - x1    1    475204 940015 128.91
## 
## Step:  AIC=121.86
## y ~ x1
## 
##        Df Sum of Sq     RSS    AIC
## <none>               494941 121.86
## + x2    1     30129  464811 123.17
## + x3    1        99  494842 123.86
## - x1    1   5298591 5793532 146.92

y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
y.fit<-predict(lm.step) #计算回归模型lm.step的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标，预测值为横坐标的散点

lm.step_new<-update(lm.step,log(.)~.) #对模型进行对数变换
y.rst<-rstandard(lm.step_new) #计算lm.step_new的标准化残差
y.fit<-predict(lm.step_new) #计算lm.step_new的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标,预测值为横坐标

# 全变量模型的拟合优度指标
summary(lm.one)$r.squared

## [1] 0.9167561

summary(lm.one)$adj.r.squared

## [1] 0.8855397

# 逐步回归模型的拟合优度指标
summary(lm.one.step)$r.squared

## [1] 0.9057883

summary(lm.one.step)$adj.r.squared

## [1] 0.8963672

par(mfrow=c(2,2)) #在一个2×2网格中创建4个绘图区
plot(lm.step_new) #绘制模型诊断图

influence.measures(lm.step_new)

## Influence measures of
##   lm(formula = log(y) ~ x1, data = d2_no_outlier) :
## 
##      dfb.1_   dfb.x1  dffit cov.r  cook.d    hat inf
## 1  -0.44599  0.38486 -0.447 1.783 0.10745 0.3513   *
## 2  -0.21885  0.17604 -0.223 1.604 0.02746 0.2409    
## 3   0.38820 -0.28063  0.415 1.240 0.08758 0.1674    
## 4   0.15379 -0.08554  0.192 1.330 0.02007 0.1133    
## 5   0.25982 -0.10905  0.377 1.041 0.06899 0.0992    
## 6   0.03443 -0.00869  0.060 1.383 0.00201 0.0929    
## 8  -0.00696 -0.24033 -0.549 0.848 0.13059 0.1125    
## 9   0.10909 -0.38938 -0.659 0.827 0.18299 0.1397    
## 10  0.12419 -0.30542 -0.450 1.202 0.10135 0.1684    
## 11 -0.20291  0.38244  0.492 1.357 0.12344 0.2304    
## 12 -0.41282  0.70564  0.856 1.167 0.33470 0.2840

preds<-data.frame(x1=407.76)
predict(lm.step,newdata=preds,interval="prediction", level=0.95)

##        fit      lwr      upr
## 1 6049.366 5439.628 6659.104