4.1 变量之间的关系分析

4.1.1 简单相关分析的R计算

x1=c(171,175,159,155,152,158,154,164,168,166,159,164)
x2=c(57,64,41,38,35,44,41,51,57,49,47,46)
plot(x1,x2)

lxy<-function(x,y){sum(x*y)-sum(x)*sum(y)/length(x)}
lxy(x1,x1)
## [1] 556.9167
lxy(x2,x2)
## [1] 813
lxy(x1,x2)
## [1] 645.5
cor(x1,x2)
## [1] 0.9593031
r=lxy(x1,x2)/sqrt(lxy(x1,x1)*lxy(x2,x2))
n=length(x1)
tr=r/sqrt((1-r^2)/(n-2))
tr
## [1] 10.74298
cor.test(x1,x2)
## 
##  Pearson's product-moment correlation
## 
## data:  x1 and x2
## t = 10.743, df = 10, p-value = 8.21e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8574875 0.9888163
## sample estimates:
##       cor 
## 0.9593031

4.1.2 简单回归分析的R计算

x=x1
y=x2
b=lxy(x,y)/lxy(x,x)
a=mean(y)-b*mean(x)
c(a=a,b=b)
##          a          b 
## -140.36436    1.15906
plot(x,y)
lines(x,a+b*x)

SST=lxy(y,y)
SSR=b*lxy(x,y)
SSE=SST-SSR
MSR=SSR/1
MSE=SSE/(n-2)
F= MSR/MSE
c(SST=SST,SSR=SSR,SSE=SSE,MSR=MSR,MSE=MSE,F=F) 
##        SST        SSR        SSE        MSR        MSE          F 
## 813.000000 748.173425  64.826575 748.173425   6.482657 115.411531
sy.x=sqrt(MSE)
sb=sy.x/sqrt(lxy(x,x))
t=b/sb
ta=qt(1-0.05/2,n-2)
c(sy.x=sy.x,sb=sb,t=t,ta=ta) 
##       sy.x         sb          t         ta 
##  2.5461063  0.1078901 10.7429759  2.2281389
d4.3 <- read.table("clipboard",header = T)
d4.3=as.data.frame(d4.3) 
rownames(d4.3 ) = as.matrix(d4.3[,1])
d4.3 <- d4.3[,-1]
head(d4.3)
##           y     x
## 1978 11.326 5.193
## 1979 11.464 5.378
## 1980 11.599 5.717
## 1981 11.758 6.299
## 1982 12.123 7.000
## 1983 18.670 7.556
fm=lm(y~x,data=d4.3)
fm
## 
## Call:
## lm(formula = y ~ x, data = d4.3)
## 
## Coefficients:
## (Intercept)            x  
##      -1.197        1.116
summary(lm(y~x))
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.721 -1.699  0.210  1.807  3.074 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -140.3644    17.5026   -8.02 1.15e-05 ***
## x              1.1591     0.1079   10.74 8.21e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.546 on 10 degrees of freedom
## Multiple R-squared:  0.9203, Adjusted R-squared:  0.9123 
## F-statistic: 115.4 on 1 and 10 DF,  p-value: 8.21e-07
plot(y~x,data=d4.3)
abline(fm)

anova(fm)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## x          1 712077  712077   27427 < 2.2e-16 ***
## Residuals 29    753      26                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fm)
## 
## Call:
## lm(formula = y ~ x, data = d4.3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.631 -3.692 -1.535  5.338 11.432 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.19660    1.16126   -1.03    0.311    
## x            1.11623    0.00674  165.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.095 on 29 degrees of freedom
## Multiple R-squared:  0.9989, Adjusted R-squared:  0.9989 
## F-statistic: 2.743e+04 on 1 and 29 DF,  p-value: < 2.2e-16

4.2 多元线性回归分析

4.2.1 多元线性回归模型的建立

library(readxl)
d4.4=read_excel('C:/Users/12290/Desktop/多元统计表格/mvstats5.xlsx','d4.4')
plot(d4.4,gap=0)
pairs(d4.4,gap=0) 

fm=lm(y~x1+x2+x3+x4,data=d4.4)
fm
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d4.4)
## 
## Coefficients:
## (Intercept)           x1           x2           x3           x4  
##  23.5321088   -0.0033866    1.1641150    0.0002919   -0.0437416
summary(fm) 
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d4.4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0229 -2.1354  0.3297  1.2639  6.9690 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.5321088  4.5990714   5.117 2.47e-05 ***
## x1          -0.0033866  0.0080749  -0.419    0.678    
## x2           1.1641150  0.0404889  28.751  < 2e-16 ***
## x3           0.0002919  0.0085527   0.034    0.973    
## x4          -0.0437416  0.0092638  -4.722 7.00e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.79 on 26 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 2.289e+04 on 4 and 26 DF,  p-value: < 2.2e-16
summary(fm)$fstat
## value numdf dendf 
## 22893     4    26

案例4

Case4=read_excel('C:/Users/12290/Desktop/多元统计表格/mvcase5.xlsx','Case4');
Case4=as.data.frame(Case4)
summary(Case4)
##        y               x1              x2               x3       
##  Min.   : 1146   Min.   : 4038   Min.   :  7668   Min.   :41024  
##  1st Qu.: 1643   1st Qu.: 7207   1st Qu.: 66040   1st Qu.:48197  
##  Median : 2665   Median :16918   Median : 96934   Median :55329  
##  Mean   : 3896   Mean   :28471   Mean   : 93109   Mean   :57401  
##  3rd Qu.: 5218   3rd Qu.:46670   3rd Qu.:122000   3rd Qu.:67199  
##  Max.   :11444   Max.   :80423   Max.   :138948   Max.   :70586  
##        x4                x5               x6              x7      
##  Min.   :  849.4   Min.   : 19.81   Min.   :  281   Min.   : 197  
##  1st Qu.: 1832.9   1st Qu.: 31.14   1st Qu.: 1215   1st Qu.: 327  
##  Median : 4517.0   Median :102.26   Median : 5196   Median : 762  
##  Mean   : 9535.6   Mean   :218.84   Mean   :14871   Mean   :1149  
##  3rd Qu.:17042.1   3rd Qu.:432.13   3rd Qu.:21519   3rd Qu.:1746  
##  Max.   :29854.7   Max.   :644.08   Max.   :59622   Max.   :3143  
##        x8              x9       
##  Min.   : 1800   Min.   :102.0  
##  1st Qu.: 3376   1st Qu.:117.7  
##  Median : 8101   Median :203.4  
##  Mean   :11244   Mean   :214.7  
##  3rd Qu.:16265   3rd Qu.:310.2  
##  Max.   :31135   Max.   :380.8
cor(Case4)
##            y        x1        x2        x3        x4        x5        x6
## y  1.0000000 0.9852472 0.7718054 0.8336692 0.9866719 0.9383222 0.9954148
## x1 0.9852472 1.0000000 0.8254191 0.8645965 0.9969160 0.9782518 0.9848198
## x2 0.7718054 0.8254191 1.0000000 0.8733881 0.8077384 0.8392165 0.7607780
## x3 0.8336692 0.8645965 0.8733881 1.0000000 0.8452713 0.8555522 0.7999185
## x4 0.9866719 0.9969160 0.8077384 0.8452713 1.0000000 0.9775889 0.9858089
## x5 0.9383222 0.9782518 0.8392165 0.8555522 0.9775889 1.0000000 0.9398166
## x6 0.9954148 0.9848198 0.7607780 0.7999185 0.9858089 0.9398166 1.0000000
## x7 0.9865880 0.9994515 0.8231476 0.8641402 0.9953453 0.9738288 0.9856809
## x8 0.9909723 0.9971702 0.8232053 0.8688257 0.9924830 0.9641020 0.9879218
## x9 0.9340695 0.9751552 0.8862737 0.9263941 0.9622229 0.9720657 0.9259696
##           x7        x8        x9
## y  0.9865880 0.9909723 0.9340695
## x1 0.9994515 0.9971702 0.9751552
## x2 0.8231476 0.8232053 0.8862737
## x3 0.8641402 0.8688257 0.9263941
## x4 0.9953453 0.9924830 0.9622229
## x5 0.9738288 0.9641020 0.9720657
## x6 0.9856809 0.9879218 0.9259696
## x7 1.0000000 0.9986034 0.9744839
## x8 0.9986034 1.0000000 0.9697254
## x9 0.9744839 0.9697254 1.0000000
plot(Case4,gap=0)

fm=lm(y~.,data=Case4);summary(fm)
## 
## Call:
## lm(formula = y ~ ., data = Case4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -281.99  -63.28   26.90   90.62  200.81 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -1.432e+02  6.283e+02  -0.228   0.8239  
## x1          -1.139e-01  1.032e-01  -1.104   0.2933  
## x2          -4.423e-03  2.848e-03  -1.553   0.1487  
## x3           3.037e-02  1.585e-02   1.916   0.0817 .
## x4           2.292e-01  7.381e-02   3.105   0.0100 *
## x5          -7.918e-01  1.460e+00  -0.542   0.5983  
## x6           1.164e-01  4.350e-02   2.676   0.0216 *
## x7          -1.494e+00  2.708e+00  -0.552   0.5922  
## x8           3.007e-01  1.591e-01   1.890   0.0853 .
## x9           2.524e+00  6.836e+00   0.369   0.7189  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 161.4 on 11 degrees of freedom
## Multiple R-squared:  0.9985, Adjusted R-squared:  0.9973 
## F-statistic: 815.3 on 9 and 11 DF,  p-value: 3.136e-14
sfm=step(fm);summary(sfm)
## Start:  AIC=219.94
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9
## 
##        Df Sum of Sq    RSS    AIC
## - x9    1      3551 290025 218.20
## - x5    1      7662 294137 218.49
## - x7    1      7928 294402 218.51
## <none>              286474 219.94
## - x1    1     31719 318193 220.14
## - x2    1     62797 349271 222.10
## - x8    1     93076 379550 223.85
## - x3    1     95639 382113 223.99
## - x6    1    186459 472933 228.47
## - x4    1    251116 537590 231.16
## 
## Step:  AIC=218.2
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8
## 
##        Df Sum of Sq    RSS    AIC
## - x7    1      6328 296353 216.65
## - x5    1     10169 300194 216.92
## <none>              290025 218.20
## - x1    1     29332 319357 218.22
## - x2    1     61049 351074 220.21
## - x8    1     89933 379958 221.87
## - x3    1    130982 421007 224.02
## - x4    1    323379 613405 231.93
## - x6    1    325265 615290 231.99
## 
## Step:  AIC=216.65
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x8
## 
##        Df Sum of Sq    RSS    AIC
## - x5    1      8475 304829 215.24
## <none>              296353 216.65
## - x2    1     54730 351084 218.21
## - x8    1    187692 484045 224.95
## - x1    1    259293 555646 227.85
## - x3    1    290475 586828 229.00
## - x4    1    403048 699402 232.68
## - x6    1    618651 915004 238.32
## 
## Step:  AIC=215.24
## y ~ x1 + x2 + x3 + x4 + x6 + x8
## 
##        Df Sum of Sq     RSS    AIC
## <none>               304829 215.24
## - x2    1     64961  369789 217.30
## - x8    1    277592  582421 226.84
## - x3    1    317487  622316 228.23
## - x4    1    399680  704509 230.84
## - x1    1    557191  862020 235.07
## - x6    1    876510 1181338 241.69
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x6 + x8, data = Case4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -256.896  -66.598   -0.668   82.982  213.679 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.378e+02  3.767e+02  -0.897 0.384958    
## x1          -1.492e-01  2.950e-02  -5.059 0.000174 ***
## x2          -3.834e-03  2.220e-03  -1.727 0.106104    
## x3           3.839e-02  1.005e-02   3.819 0.001881 ** 
## x4           2.157e-01  5.034e-02   4.284 0.000756 ***
## x6           1.198e-01  1.888e-02   6.345 1.81e-05 ***
## x8           2.487e-01  6.966e-02   3.571 0.003073 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 147.6 on 14 degrees of freedom
## Multiple R-squared:  0.9984, Adjusted R-squared:  0.9977 
## F-statistic:  1463 on 6 and 14 DF,  p-value: < 2.2e-16
plot(Case4$y);lines(sfm$fitted)