2.3

本题逐年递增

## 数据处理
setwd("D:/桌面/多元统计分析数据data") #设定工作路径
d2.3<-read.csv("ex2.3.csv",header=T) #将ex2.3.csv数据读入到d2.3中
lm.exam<-lm(y~x1+x2+x3+x4+x5,data=d2.3) #建立y关于x1,x2,x3,x4和x5的线性回归方程,数据为d2.3
summary(lm.exam) #给出回归系数的估计和显著性检

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = d2.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -252.31  -48.18  -12.79   52.81  193.02 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -4.012e+02  1.431e+02  -2.803  0.01870 * 
## x1           1.444e-02  2.402e-02   0.601  0.56109   
## x2          -2.087e-02  8.657e-02  -0.241  0.81440   
## x3           5.810e-05  1.464e-03   0.040  0.96912   
## x4           3.044e+01  8.298e+00   3.668  0.00433 **
## x5           2.004e-01  1.115e-01   1.798  0.10235   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.6 on 10 degrees of freedom
## Multiple R-squared:  0.9879, Adjusted R-squared:  0.9818 
## F-statistic: 162.8 on 5 and 10 DF,  p-value: 3.043e-09

### 分析：回归方程的F统计量为162.8，相应的p值为3.043e-09，这表明回归方程是高度显著。p值远小于常用的显著性水平0.05或0.01。只有常数项和x4在统计上是显著的，其他不显著，这可能表明模型中包含一些不必要的变量，或者这些变量对因变量的影响较。

## 变量选择
lm.exam<-lm(y~x1+x2+x3+x4+x5,data=d2.3) #建立全变量回归方程
lm.step<-step(lm.exam,direction="both") #进行逐步回归

## Start:  AIC=160.15
## y ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x3    1        26 168042 158.15
## - x2    1       976 168992 158.24
## - x1    1      6073 174088 158.72
## <none>              168015 160.15
## - x5    1     54329 222344 162.63
## - x4    1    226106 394122 171.79
## 
## Step:  AIC=158.15
## y ~ x1 + x2 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x2    1       957 168999 156.24
## - x1    1      6050 174092 156.72
## <none>              168042 158.15
## + x3    1        26 168015 160.15
## - x5    1     55493 223535 160.72
## - x4    1    228011 396052 169.87
## 
## Step:  AIC=156.24
## y ~ x1 + x4 + x5
## 
##        Df Sum of Sq    RSS    AIC
## - x1    1      5903 174902 154.79
## <none>              168999 156.24
## + x2    1       957 168042 158.15
## + x3    1         8 168992 158.24
## - x5    1    155567 324566 164.68
## - x4    1    501855 670854 176.30
## 
## Step:  AIC=154.79
## y ~ x4 + x5
## 
##        Df Sum of Sq     RSS    AIC
## <none>               174902 154.79
## + x1    1      5903  168999 156.24
## + x2    1       811  174092 156.72
## + x3    1         1  174901 156.79
## - x5    1    180176  355078 164.12
## - x4    1   1843682 2018584 191.93

summary(lm.step)

## 
## Call:
## lm(formula = y ~ x4 + x5, data = d2.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -249.75  -42.72  -13.87   54.34  205.41 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -410.07401   57.16828  -7.173 7.23e-06 ***
## x4            31.47130    2.68842  11.706 2.81e-08 ***
## x5             0.18680    0.05105   3.660  0.00288 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 116 on 13 degrees of freedom
## Multiple R-squared:  0.9874, Adjusted R-squared:  0.9854 
## F-statistic:   508 on 2 and 13 DF,  p-value: 4.572e-13

### 分析：在所分析的线性回归模型中，我们发现只有变量x4和x5对因变量𝑦有显著的正向影响。y=−410.07+31.47x4+0.1868x5

## 回归诊断
y.res<-residuals (lm.exam) #计算模型lm.exam的普通残差
y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
print(y.rst) #输出回归模型lm.step的标准化残差y.rst

##           1           2           3           4           5           6 
##  1.34714761  1.18901283  0.29852998 -0.19086128 -0.23331509 -0.88348308 
##           7           8           9          10          11          12 
## -0.85374520 -0.46733123 -0.37082828 -0.06628483  0.91759182 -2.23768758 
##          13          14          15          16 
## -0.34282279  1.94017167  0.49964531 -0.07834811

y.fit<-predict(lm.step) #计算回归模型lm.step的预测值
plot(y.res~ y.fit) #绘制以普通残差为纵坐标，预测值为横坐标的散点图

plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标，预测值为横坐标的散点图

#假设由例2.1已经得到逐步回归模型lm.step
lm.step_new <- lm(log(y) ~ x1 + x2 + x3 + x4 + x5, data=d2.3)#对模型进行对变换
y.rst<-rstandard(lm.step_new) #计算lm.step_new的标准化残差
y.fit<-predict(lm.step_new) #计算lm.step_new的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标,预测值为横坐标的散点图

### 分析：由图可知 12号是异常点

## 去掉异常点
lm.exam<-lm(log(y)~x1+x2+x3+x4,data= d2.3[-c(15),]) #去掉第15号观测值再建立全变量回归方程
lm.step<-step(lm.exam,direction="both") #用一切子集回归法来进行逐步回归

## Start:  AIC=-40.32
## log(y) ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq     RSS     AIC
## - x3    1   0.00024 0.52409 -42.312
## <none>              0.52385 -40.319
## - x4    1   0.14343 0.66728 -38.689
## - x1    1   0.15567 0.67952 -38.416
## - x2    1   1.03711 1.56096 -25.941
## 
## Step:  AIC=-42.31
## log(y) ~ x1 + x2 + x4
## 
##        Df Sum of Sq     RSS     AIC
## <none>              0.52409 -42.312
## - x4    1   0.15170 0.67578 -40.499
## + x3    1   0.00024 0.52385 -40.319
## - x1    1   0.16228 0.68636 -40.266
## - x2    1   1.16484 1.68893 -26.759

y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
y.fit<-predict(lm.step) #计算回归模型lm.step的预测值
plot(y.rst~ y.fit) #绘制以标准化残差为纵坐标，预测值为横坐标的散点图

par(mfrow=c(2,2)) #在一个2×2网格中创建4个绘图区
plot(lm.step_new) #绘制模型诊断图

## Warning in sqrt(crit * p * (1 - hh)/hh): 产生了NaNs
## Warning in sqrt(crit * p * (1 - hh)/hh): 产生了NaNs

influence.measures(lm.step_new) #计算各个观测值的诊断统计量

## Influence measures of
##   lm(formula = log(y) ~ x1 + x2 + x3 + x4 + x5, data = d2.3) :
## 
##      dfb.1_   dfb.x1    dfb.x2   dfb.x3   dfb.x4   dfb.x5    dffit    cov.r
## 1  -0.32584  0.01379 -0.225537  0.00246  0.19629  0.43797  -0.7588  0.87191
## 2  -0.04670 -0.00674 -0.022237 -0.00687  0.02525  0.05252  -0.1170  2.21526
## 3   0.00255  0.00106 -0.000745  0.00294  0.00150 -0.00372   0.0136  2.23460
## 4   0.00542  0.01577 -0.035236  0.02691  0.04036  0.00384   0.1008  2.14247
## 5  -0.02588  0.02997 -0.097022  0.16394  0.14520 -0.04493   0.4188  1.25297
## 6   0.03748 -0.03654  0.038480 -0.16120 -0.03986  0.04696  -0.3085  1.53415
## 7  -0.60828 -0.02279  0.075818  0.63215 -0.02375 -0.17328  -0.6750 23.23947
## 8  -0.02457  0.07120 -0.067878  0.07483  0.00865  0.07381   0.1727  1.99326
## 9   1.24390 18.04165 -7.382376  0.27628 -1.53712  1.02555 -20.7415 11.99284
## 10 -0.09275  0.22767 -0.397462  0.06947  0.27199  0.43328   0.4785  3.97895
## 11  0.06626 -0.47483  0.194070 -0.22212  0.41908 -0.60288  -1.2680  1.15509
## 12 -0.01272 -0.10172  0.334057  0.11361 -0.33347 -0.30206   0.4563  1.94636
## 13  0.22222 -0.09355  0.458617 -0.13011 -0.52815 -0.31991   0.7031  0.95986
## 14  0.09171  0.01305  0.121087 -0.07791 -0.18981 -0.03793   0.2778  2.47082
## 15 -0.52838  0.07617 -0.572450  0.09599  1.15011  0.09061   1.8200  0.49062
## 16  1.10406 -0.02054  0.197926 -0.10318 -1.20321  0.41108  -3.9809  0.00891
##      cook.d   hat inf
## 1  8.94e-02 0.249    
## 2  2.52e-03 0.184    
## 3  3.41e-05 0.158    
## 4  1.87e-03 0.154    
## 5  2.95e-02 0.161    
## 6  1.66e-02 0.145    
## 7  8.40e-02 0.921   *
## 8  5.42e-03 0.153    
## 9  5.14e+01 0.989   *
## 10 4.16e-02 0.577   *
## 11 2.47e-01 0.466    
## 12 3.66e-02 0.297    
## 13 7.81e-02 0.242    
## 14 1.40e-02 0.317    
## 15 4.39e-01 0.482   *
## 16 1.07e+00 0.503   *

## 根据历史数据，得出相应解释变量，利用前面得到的回归模型对y进行点预测和区间预测
summary(lm.step)## 在去掉第15号观测值再新建立的回归方程中，模型保留了x1,x2,x4

## 
## Call:
## lm(formula = log(y) ~ x1 + x2 + x4, data = d2.3[-c(15), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33698 -0.11992 -0.01300  0.09149  0.51390 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.553e+00  1.283e-01  43.281 1.22e-13 ***
## x1          -6.545e-05  3.547e-05  -1.846 0.092024 .  
## x2           4.226e-04  8.547e-05   4.945 0.000439 ***
## x4          -2.514e-02  1.409e-02  -1.784 0.101939    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2183 on 11 degrees of freedom
## Multiple R-squared:  0.9428, Adjusted R-squared:  0.9272 
## F-statistic: 60.46 on 3 and 11 DF,  p-value: 4.025e-07

preds <- data.frame(x1 =29000,x2=18000,x4 = 94)#给定解释变量x4和x5的值
predict(lm.step,newdata=preds,interval="prediction",level=0.95)#进行点预测和区间预测

##        fit      lwr      upr
## 1 8.898514 8.246997 9.550032

### 分析：根据模型 lm.step 和给定的解释变量值，因变量y 的预测值是8.898514，有95%的信心认为真实的 y 值会落在【8.246997，9.550032】之间。
### 总结：在分析数据集d2.3后，发现x4和x5对因变量y有显著的正向影响，而模型的拟合度非常高，R平方值接近0.99，显示模型能够解释大部分的变异。然而，在去掉异常点后，x1,x2,x4变为更有影响力的方程。残差分析揭示了模型预测存在误差，且对数变换后的模型分析表明原始y值可能不满足线性回归的基本假设。实际预测值9.386563远低于预期的逐年递增趋势，即模型可能未能捕捉数据的关键信息。这个问题可能源于数据的量纲和单位处理不当，异常值的影响，或者变量选择的不准确。此外，逐步回归可能排除了一些重要变量，因此我们可能需要基于领域知识重新选择变量，并考虑变换数据或使用非线性模型来满足线性回归的基本假设。

2.4

本题有一个0-1变量

## 数据处理
setwd("D:/桌面/多元统计分析数据data") #设定工作路径
d2.4<-read.csv("ex2.4.csv",header=T) #将ex2.3.csv数据读入到d2.4中
lm.exam<-lm(y~x1+x2+x3+x4,data=d2.4)#建立y关于x1,x2,x3,x4的线性回归方程,数据为d2.4
sapply(d2.4, function(x) sum(is.na(x)))

## No  y x1 x2 x3 x4 
##  1  1  1  1  1  1

missing_rows <- which(is.na(d2.4$y) | is.na(d2.4$x1) | is.na(d2.4$x2) |is.na(d2.4$x3) | is.na(d2.4$x4))# 寻找缺失值
d2.4[missing_rows, ]

##    No  y x1 x2 x3 x4
## 51 NA NA NA NA NA NA

summary(lm.exam) #给出回归系数的估计和显著性检验

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d2.4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -709.52 -271.18  -35.46  212.92 1026.99 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.593    155.914  -0.010   0.9919    
## x1           190.588     33.514   5.687 9.08e-07 ***
## x2             1.588      2.674   0.594   0.5555    
## x3             1.013      1.315   0.770   0.4453    
## x4           304.716    139.869   2.179   0.0346 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 422.4 on 45 degrees of freedom
##   (因为不存在，1个观察量被删除了)
## Multiple R-squared:  0.7485, Adjusted R-squared:  0.7261 
## F-statistic: 33.48 on 4 and 45 DF,  p-value: 5.8e-13

### 报错：由于存在一个观察量被删除（因为不存在），这可能会影响模型的精确度和可靠性。可能需要进一步调查这个缺失值的原因，并考虑如何处理它以确保模型的有效性。经过检验发现是51号（？），因为被自动删除，所以不再考虑。
### 分析：𝑦=−1.593+190.588𝑥1+1.588𝑥2+1.013+𝑥3+304.716𝑥4。这个线性回归模型的R平方值为0.7485，表明模型能够解释超过74%的数据变异，显示出较好的拟合效果。调整后的R平方值为0.7261，考虑了模型中变量的数量，进一步确认了模型的有效性。统计上显著的变量是x1和x4，它们的系数显著不为零，意味着这两个变量对因变量y有显著的影响，而x2和x3的影响则不显著。整体模型的显著性得到了F统计量（33.48）和p值（小于0.001）的支持，远低于常用的显著性水平0.05。模型的残差标准误差为422.4，基于45个自由度，这为我们提供了对模型预测误差的估计。

## 变量选择
lm.exam<-lm(y~x1+x2+x3+x4,data=d2.4) #建立全变量回归方程
lm.step<-step(lm.exam,direction="both") #进行逐步回归

## Start:  AIC=609.33
## y ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq      RSS    AIC
## - x2    1     62949  8092032 607.72
## - x3    1    105799  8134882 607.98
## <none>               8029083 609.33
## - x4    1    846846  8875929 612.34
## - x1    1   5770382 13799465 634.41
## 
## Step:  AIC=607.72
## y ~ x1 + x3 + x4
## 
##        Df Sum of Sq      RSS    AIC
## - x3    1    141084  8233116 606.58
## <none>               8092032 607.72
## + x2    1     62949  8029083 609.33
## - x4    1    867198  8959230 610.81
## - x1    1  14810227 22902258 657.74
## 
## Step:  AIC=606.58
## y ~ x1 + x4
## 
##        Df Sum of Sq      RSS    AIC
## <none>               8233116 606.58
## + x3    1    141084  8092032 607.72
## + x2    1     98234  8134882 607.98
## - x4    1    862282  9095398 609.56
## - x1    1  15298607 23531723 657.09

summary(lm.step)

## 
## Call:
## lm(formula = y ~ x1 + x4, data = d2.4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -815.95 -279.23  -59.84  245.14  993.05 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -45.58     130.66  -0.349   0.7288    
## x1            207.47      22.20   9.345 2.72e-12 ***
## x4            307.22     138.47   2.219   0.0314 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 418.5 on 47 degrees of freedom
##   (因为不存在，1个观察量被删除了)
## Multiple R-squared:  0.7421, Adjusted R-squared:  0.7311 
## F-statistic: 67.62 on 2 and 47 DF,  p-value: 1.474e-14

### 分析：在所分析的线性回归模型中，我们发现只有变量x1和x4对因变量𝑦有显著的正向影响。𝑦=−1.593+190.588𝑥1+304.716𝑥4

## 回归诊断
y.res<-residuals (lm.exam) #计算模型lm.exam的普通残差
y.rst<-rstandard(lm.step) #计算回归模型lm.step的标准化残差
print(y.rst) #输出回归模型lm.step的标准化残差y.rst

##            1            2            3            4            5            6 
## -0.446568862 -0.945218013  0.062004702 -0.816593881 -0.281164377  0.997634957 
##            7            8            9           10           11           12 
## -0.171591006 -0.016449598  0.743372084  0.189530905  2.410737483  0.446008336 
##           13           14           15           16           17           18 
## -0.123934084  1.909831850 -0.852775518 -1.980823416 -0.705232906 -0.295270768 
##           19           20           21           22           23           24 
##  0.725379919 -0.975727136 -1.672437645 -0.268752285 -0.398016503 -0.394519084 
##           25           26           27           28           29           30 
## -0.611646884 -0.626212592  0.152394172 -1.406352767  0.745391559  2.312642857 
##           31           32           33           34           35           36 
## -0.043006286  1.686007110 -0.229683682 -0.977540717 -0.004239526 -0.295270768 
##           37           38           39           40           41           42 
## -1.413701844 -0.799868777 -0.040006782  2.394244916  1.308598927  0.753432011 
##           43           44           45           46           47           48 
## -0.093586794  0.491488133  1.085409594 -0.908800697  0.629820894 -0.059490423 
##           49           50 
## -0.345530197 -0.815410137

outliers <- which(abs(y.rst) > 2)
plot(y.rst ~ predict(lm.step), main = "Standardized Residuals vs Fitted Values", xlab = "Fitted Values", ylab = "Standardized Residuals")
points(predict(lm.step)[outliers], y.rst[outliers], col = "red", pch = 19)# 绘制标准化残差与预测值的散点图，并标记异常点

### 分析：由图可知 12号是异常点

## 去掉异常点
lm.exam <- lm(log(y) ~ x1 + x2 + x3 + x4, data = d2.3[-c(11,30,40),])# 去掉异常点并拟合全变量回归方程
lm.step <- step(lm.exam, direction = "both")  # 进行逐步回归

## Start:  AIC=-38.84
## log(y) ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq     RSS     AIC
## - x3    1   0.00008 0.57808 -40.842
## - x4    1   0.03745 0.61544 -39.902
## <none>              0.57799 -38.844
## - x1    1   0.16026 0.73825 -37.173
## - x2    1   0.82544 1.40343 -27.537
## 
## Step:  AIC=-40.84
## log(y) ~ x1 + x2 + x4
## 
##        Df Sum of Sq     RSS     AIC
## - x4    1   0.03788 0.61596 -41.889
## <none>              0.57808 -40.842
## - x1    1   0.16778 0.74585 -39.019
## + x3    1   0.00008 0.57799 -38.844
## - x2    1   0.87584 1.45392 -29.007
## 
## Step:  AIC=-41.89
## log(y) ~ x1 + x2
## 
##        Df Sum of Sq     RSS     AIC
## <none>              0.61596 -41.889
## + x4    1   0.03788 0.57808 -40.842
## + x3    1   0.00052 0.61544 -39.902
## - x1    1   0.21522 0.83118 -39.394
## - x2    1   1.39633 2.01229 -26.132

y.rst <- rstandard(lm.step)
y.fit <- predict(lm.step)# 计算回归模型的标准化残差和预测值
plot(y.rst ~ y.fit, main = "Standardized Residuals vs Fitted Values")# 绘制以标准化残差为纵坐标，预测值为横坐标的散点图

par(mfrow = c(2, 2))# 创建一个2×2网格中的4个绘图区
# 绘制模型诊断图
plot(lm.step)  # 确保使用正确的模型名称

## Warning in sqrt(crit * p * (1 - hh)/hh): 产生了NaNs
## Warning in sqrt(crit * p * (1 - hh)/hh): 产生了NaNs

# 计算各个观测值的诊断统计量
inf.measures <- influence.measures(lm.step)
### 分析：由图可知，第16号观测值可能是异常点和强影响点

## 回归预测
summary(lm.step)## 根据均值.所以，回归预测给定解释变量x1=5,x2=23，利用前面得到的回归模型对y进行点预测和区间预测

## 
## Call:
## lm(formula = log(y) ~ x1 + x2, data = d2.3[-c(11, 30, 40), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34244 -0.10831  0.00698  0.10684  0.48860 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.420e+00  1.109e-01  48.878 3.52e-15 ***
## x1          -7.402e-05  3.615e-05  -2.048 0.063132 .  
## x2           3.066e-04  5.879e-05   5.216 0.000216 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2266 on 12 degrees of freedom
## Multiple R-squared:  0.9412, Adjusted R-squared:  0.9315 
## F-statistic: 96.12 on 2 and 12 DF,  p-value: 4.113e-08

preds <- data.frame(x1 = 5, x2=23)#给定解释变量x1和x2的值
predict(lm.step,newdata=preds,interval="prediction",level=0.95)#进行点预测和区间预

##        fit      lwr      upr
## 1 5.427152 4.878127 5.976176

### 分析：根据模型 lm.step 和给定的解释变量值x1 = 5, x2=23），因变量y 的预测值是5.427152，有95%的信心认为真实的 y 值会落在【4.878127 5.976176】之间。
### 总结：在对数据集d2.4进行线性回归分析后，发现只有x1和x4对因变量y有显著的正向影响，而x2和x3的影响不显著。模型整体拟合良好，R平方值显示模型能解释大部分因变量的变异，F统计量和对应的p值也表明模型是高度显著的。在逐步回归过程中只保留了x1和x4。残差分析显示模型预测值与实际值之间存在一定差异，但残差的标准误差在可接受范围内。对数变换后的模型分析表明，原始的y值可能不满足线性回归的一些基本假设。最终，模型预测值为5.427152，95%的预测区间为[4.878127, 5.976176]，提供了关于y值不确定性的估计。

2.5

## 数据处理
setwd("D:/桌面/多元统计分析数据data") #设定工作路径
d2.5<-read.csv("ex2.5.csv",header=T) #将ex2.5.csv数据读入到d2.3中
lm.exam<-lm(y~x1+x2+x3+x4+x5+x6,data=d2.5) #建立y关于x1,x2,x3,x4,x5和x6的线性回归方程,数据为d2.5
summary(lm.exam) #给出回归系数的估计和显著性检

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, data = d2.5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3078.2  -713.3  -118.6   674.8  2852.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.716e+04  1.585e+04   2.345 0.028937 *  
## x1          -7.792e-01  3.351e-01  -2.326 0.030138 *  
## x2           2.308e-01  5.888e-02   3.920 0.000786 ***
## x3           5.425e-01  8.940e-01   0.607 0.550460    
## x4          -3.059e-01  1.636e-01  -1.869 0.075580 .  
## x5           4.600e-01  1.527e-01   3.012 0.006636 ** 
## x6          -5.757e-01  6.274e-01  -0.918 0.369255    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1428 on 21 degrees of freedom
## Multiple R-squared:  0.9985, Adjusted R-squared:  0.9981 
## F-statistic:  2338 on 6 and 21 DF,  p-value: < 2.2e-16

### 分析：该模型R平方值高达0.9985，调整后的R平方值也达到了0.9981，显示了模型在考虑变量数量后依然保持了极高的解释能力。模型中，变量x1、x2、x4和x5对因变量y有显著影响，而x3和x6的影响则不显著。整体而言，F统计量为2338，对应的p值小于2.2e-16，远小于0.05，证实了模型整体的显著性。

d2.5$y_pow <- d2.5$y^0.5  # 建立幂变换后的全变量回归方程
lm.pow_exam <- lm(y_pow ~ x1 + x2 + x3 + x4 + x5 + x6, data = d2.5)# 例如使用0.5作为幂，可以根据需要调整
summary(lm.pow_exam)

## 
## Call:
## lm(formula = y_pow ~ x1 + x2 + x3 + x4 + x5 + x6, data = d2.5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8571 -2.1264  0.2287  1.0132  7.5583 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.875e+01  3.433e+01  -2.586   0.0173 *  
## x1          -2.282e-03  7.258e-04  -3.144   0.0049 ** 
## x2           1.875e-03  1.275e-04  14.697 1.59e-12 ***
## x3          -1.058e-02  1.936e-03  -5.462 2.03e-05 ***
## x4           1.170e-03  3.544e-04   3.302   0.0034 ** 
## x5           1.875e-03  3.308e-04   5.667 1.26e-05 ***
## x6           3.978e-04  1.359e-03   0.293   0.7726    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.093 on 21 degrees of freedom
## Multiple R-squared:  0.9991, Adjusted R-squared:  0.9988 
## F-statistic:  3838 on 6 and 21 DF,  p-value: < 2.2e-16

lm.pow_step <- step(lm.pow_exam, direction = "both")#对幂变换后的模型进行逐步回归

## Start:  AIC=69.18
## y_pow ~ x1 + x2 + x3 + x4 + x5 + x6
## 
##        Df Sum of Sq     RSS     AIC
## - x6    1      0.82  201.74  67.294
## <none>               200.92  69.180
## - x1    1     94.58  295.50  77.981
## - x4    1    104.29  305.21  78.886
## - x3    1    285.44  486.37  91.933
## - x5    1    307.26  508.18  93.162
## - x2    1   2066.78 2267.70 135.041
## 
## Step:  AIC=67.29
## y_pow ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## <none>               201.74  67.294
## + x6    1      0.82  200.92  69.180
## - x1    1    113.52  315.26  77.793
## - x4    1    162.95  364.69  81.872
## - x3    1    322.93  524.67  92.056
## - x5    1    329.69  531.43  92.414
## - x2    1   2066.55 2268.30 133.048

summary(lm.pow_step)

## 
## Call:
## lm(formula = y_pow ~ x1 + x2 + x3 + x4 + x5, data = d2.5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7664 -2.0493  0.1769  0.9529  7.7468 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.284e+01  3.071e+01  -3.023 0.006245 ** 
## x1          -2.354e-03  6.690e-04  -3.518 0.001937 ** 
## x2           1.873e-03  1.248e-04  15.012 4.83e-13 ***
## x3          -1.036e-02  1.745e-03  -5.934 5.68e-06 ***
## x4           1.227e-03  2.910e-04   4.215 0.000357 ***
## x5           1.844e-03  3.076e-04   5.996 4.92e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.028 on 22 degrees of freedom
## Multiple R-squared:  0.9991, Adjusted R-squared:  0.9989 
## F-statistic:  4806 on 5 and 22 DF,  p-value: < 2.2e-16

y_pow_res <- residuals(lm.pow_exam)
y_pow_rst <- rstandard(lm.pow_step)
print(y_pow_rst)

##           1           2           3           4           5           6 
##  1.49588065  0.92939276  0.30536070 -0.65515301 -0.91302808 -0.18879126 
##           7           8           9          10          11          12 
## -0.88061559 -1.04741169  0.14437007  0.16568720  0.06242963 -0.09914015 
##          13          14          15          16          17          18 
## -1.17687416  0.14684448  0.32466948 -0.57007516  0.65143063  0.39253518 
##          19          20          21          22          23          24 
##  0.14519465  1.63392016 -0.08666217 -1.28922313  1.38180593 -1.95877484 
##          25          26          27          28 
##  2.87130943 -0.92710170  0.07778902 -1.19170466

y_pow_fit <- predict(lm.pow_step)
plot(y_pow_res ~ y_pow_fit)# 对幂变换后的模型进行诊断

lm.pow_exam_no_outliers <- lm(y_pow ~ x1 + x2 + x3 + x4 + x5 + x6, data = d2.5[-c(25),])
lm.pow_step_no_outliers <- step(lm.pow_exam_no_outliers, direction = "both")

## Start:  AIC=55.33
## y_pow ~ x1 + x2 + x3 + x4 + x5 + x6
## 
##        Df Sum of Sq     RSS     AIC
## - x6    1      1.34  126.14  53.622
## <none>               124.80  55.334
## - x1    1     59.72  184.52  63.892
## - x4    1     77.01  201.81  66.310
## - x5    1    322.82  447.61  87.819
## - x3    1    326.76  451.56  88.056
## - x2    1   2073.42 2198.22 130.788
## 
## Step:  AIC=53.62
## y_pow ~ x1 + x2 + x3 + x4 + x5
## 
##        Df Sum of Sq     RSS     AIC
## <none>               126.14  53.622
## + x6    1      1.34  124.80  55.334
## - x1    1     59.36  185.50  62.035
## - x4    1     88.50  214.64  65.974
## - x5    1    366.15  492.29  88.387
## - x3    1    384.90  511.04  89.397
## - x2    1   2079.36 2205.50 128.878

y_pow_rst_no_outliers <- rstandard(lm.pow_step_no_outliers)
y_pow_fit_no_outliers <- predict(lm.pow_step_no_outliers)
plot(y_pow_rst_no_outliers ~ y_pow_fit_no_outliers)# 去掉异常点后重新建立幂变换的全变量回归方程

plot(lm.pow_step_no_outliers)  # 绘制幂变换后的模型诊断图

influence.measures(lm.pow_step_no_outliers)  # 计算幂变换后的模型各个观测值的诊断统计量

## Influence measures of
##   lm(formula = y_pow ~ x1 + x2 + x3 + x4 + x5, data = d2.5[-c(25),      ]) :
## 
##       dfb.1_    dfb.x1    dfb.x2   dfb.x3    dfb.x4   dfb.x5   dffit cov.r
## 1   0.757393  0.319322  0.246356 -0.45476 -0.720501  0.26208  1.0178 1.094
## 2   0.292115  0.069651  0.119650 -0.16270 -0.270852  0.10436  0.4932 1.354
## 3   0.049726 -0.004507  0.024818 -0.02133 -0.043680  0.01734  0.1262 1.553
## 4  -0.088654  0.045518 -0.034767  0.05701  0.073544 -0.07198 -0.3190 1.251
## 5  -0.040533  0.116468 -0.022532  0.06304  0.022531 -0.11365 -0.3926 1.054
## 6   0.000314  0.012870  0.004143 -0.00183 -0.002525 -0.00553 -0.0539 1.443
## 7   0.114389  0.160489  0.028713 -0.07439 -0.125414 -0.00355 -0.2940 1.169
## 8   0.258694  0.264516  0.027728 -0.21973 -0.270646  0.10458 -0.3858 1.225
## 9  -0.391487 -0.324890 -0.035145  0.40986  0.402806 -0.27550  0.5053 1.522
## 10 -0.089515 -0.019292 -0.037569  0.15996  0.091306 -0.14797  0.2032 1.492
## 11  0.001904  0.009256 -0.004551  0.00698 -0.002070 -0.01001  0.0169 1.667
## 12 -0.075010 -0.165250  0.069392 -0.02960  0.079004  0.08184 -0.2156 1.756
## 13 -0.285426 -0.603299  0.253603  0.01038  0.301991  0.17860 -0.7953 0.696
## 14  0.004412  0.030153 -0.024495  0.00911 -0.005623 -0.01498  0.0531 1.572
## 15 -0.035517  0.035375 -0.085665  0.02317  0.032159 -0.01212  0.1597 1.404
## 16  0.081347  0.040912  0.065612  0.00493 -0.079274 -0.04330 -0.1793 1.372
## 17 -0.096076 -0.064693 -0.122743 -0.12888  0.091489  0.20229  0.3641 1.232
## 18 -0.046121 -0.064151 -0.094749 -0.17426  0.043043  0.24278  0.3173 1.668
## 19 -0.123718 -0.136910 -0.016293 -0.05040  0.123580  0.11592  0.2432 1.566
## 20  0.469136  0.562607  0.000827 -0.63176 -0.491494  0.39225  0.9361 0.684
## 21 -0.000725  0.000437 -0.008586  0.00792  0.000831 -0.00485 -0.0186 1.544
## 22  0.219054  0.338872 -0.471770  0.02096 -0.227540  0.01347 -0.6735 1.119
## 23  0.063993 -0.028962  1.199319  0.10800 -0.052856 -0.56638  1.4333 0.585
## 24 -1.071867 -0.762752 -1.501760  0.77392  1.070560  0.09274 -2.1539 0.251
## 26  0.067176  0.078518 -0.014785 -0.06446 -0.069212  0.03317 -0.1531 1.797
## 27 -0.188844 -0.112934  0.069702  0.60049  0.196731 -0.56898  1.0619 1.581
## 28 -0.266604 -0.204879  0.638269  0.31201  0.275723 -0.52404 -1.1266 4.429
##      cook.d    hat inf
## 1  1.64e-01 0.3292    
## 2  4.09e-02 0.2255    
## 3  2.77e-03 0.1586    
## 4  1.72e-02 0.1276    
## 5  2.54e-02 0.1113    
## 6  5.07e-04 0.0806    
## 7  1.45e-02 0.0965    
## 8  2.50e-02 0.1486    
## 9  4.32e-02 0.2769    
## 10 7.15e-03 0.1586    
## 11 5.02e-05 0.1962    
## 12 8.09e-03 0.2657    
## 13 9.61e-02 0.1729    
## 14 4.94e-04 0.1516    
## 15 4.41e-03 0.1054    
## 16 5.55e-03 0.1020    
## 17 2.23e-02 0.1415    
## 18 1.74e-02 0.2617    
## 19 1.02e-02 0.2017    
## 20 1.32e-01 0.2110    
## 21 6.03e-05 0.1329    
## 22 7.38e-02 0.2294    
## 23 2.94e-01 0.3154   *
## 24 5.71e-01 0.3546   *
## 26 4.09e-03 0.2683    
## 27 1.84e-01 0.4404    
## 28 2.17e-01 0.7357   *

### 23,24为异常点货强影响点

## 回归预测
summary(lm.step)# 根据历史数据,回归预测给定解释变量x2=210000,x3=40000,x4=94，x5=5000，利用前面得到的回归模型对y进行点预测和区间预测

## 
## Call:
## lm(formula = log(y) ~ x1 + x2, data = d2.3[-c(11, 30, 40), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34244 -0.10831  0.00698  0.10684  0.48860 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.420e+00  1.109e-01  48.878 3.52e-15 ***
## x1          -7.402e-05  3.615e-05  -2.048 0.063132 .  
## x2           3.066e-04  5.879e-05   5.216 0.000216 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2266 on 12 degrees of freedom
## Multiple R-squared:  0.9412, Adjusted R-squared:  0.9315 
## F-statistic: 96.12 on 2 and 12 DF,  p-value: 4.113e-08

preds <- data.frame( x1=58000,x2=210000)#给定解释变量x1和x2的值
predict(lm.step,newdata=preds,interval="prediction",level=0.95)#进行点预测和区间预

##        fit      lwr      upr
## 1 65.51582 43.15142 87.88023

### 分析：根据模型 lm.step 和给定的解释变量值x1=58000,x2=210000），因变量y 的预测值是65.51582，有95%的信心认为真实的 y 值会落在【43.15142,87.88023】之间。
### 结论：根据线性回归分析，我们得到了一个以x1、x2、x3、x4、x5和x6作为自变量。模型的R平方值高达0.9985，而F统计量和对应的p值也显示模型整体是极其显著的。在逐步回归过程中，模型保留了x1、x2、x4和x5，它们的p值远小于0.05，显示了它们对y的显著影响。最终，模型的预测值为65.51582。考虑到y值的预期增长特性，对数变换可能不适用，因为它可能会压缩数据的范围并改变数据的分布。因此，我尝试使用别的方法，即幂变换，但结果仍然不佳

总结：在分析数据集时，虽然结果都存在一定问题，但我掌握了多元线性回归的流程，包括结果解读和图形分析。除了对数变换，我还在2.5中尝试了幂变换来处理数据，结果有一定改善，以期捕捉非线性关系并提升模型的预测能力。逐步回归帮助识别了对因变量y有显著影响的变量，而残差分析和异常值检测则进一步验证了模型的稳健性。

作业2

2024-11-21

2.3

本题逐年递增

2.4

本题有一个0-1变量

2.5