4.1 变量之间的关系分析
4.1.1 简单相关分析的R计算
x1=c(171,175,159,155,152,158,154,164,168,166,159,164)
x2=c(57,64,41,38,35,44,41,51,57,49,47,46)
plot(x1,x2)

lxy<-function(x,y){sum(x*y)-sum(x)*sum(y)/length(x)}
lxy(x1,x1)
## [1] 556.9167
lxy(x2,x2)
## [1] 813
lxy(x1,x2)
## [1] 645.5
cor(x1,x2)
## [1] 0.9593031
r=lxy(x1,x2)/sqrt(lxy(x1,x1)*lxy(x2,x2))
n=length(x1)
tr=r/sqrt((1-r^2)/(n-2))
tr
## [1] 10.74298
cor.test(x1,x2)
##
## Pearson's product-moment correlation
##
## data: x1 and x2
## t = 10.743, df = 10, p-value = 8.21e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8574875 0.9888163
## sample estimates:
## cor
## 0.9593031
4.1.2 简单回归分析的R计算
x=x1
y=x2
b=lxy(x,y)/lxy(x,x)
a=mean(y)-b*mean(x)
c(a=a,b=b)
## a b
## -140.36436 1.15906
plot(x,y)
lines(x,a+b*x)

SST=lxy(y,y)
SSR=b*lxy(x,y)
SSE=SST-SSR
MSR=SSR/1
MSE=SSE/(n-2)
F= MSR/MSE
c(SST=SST,SSR=SSR,SSE=SSE,MSR=MSR,MSE=MSE,F=F)
## SST SSR SSE MSR MSE F
## 813.000000 748.173425 64.826575 748.173425 6.482657 115.411531
sy.x=sqrt(MSE)
sb=sy.x/sqrt(lxy(x,x))
t=b/sb
ta=qt(1-0.05/2,n-2)
c(sy.x=sy.x,sb=sb,t=t,ta=ta)
## sy.x sb t ta
## 2.5461063 0.1078901 10.7429759 2.2281389
d4.3 <- read.table("clipboard",header = T)
d4.3=as.data.frame(d4.3)
rownames(d4.3 ) = as.matrix(d4.3[,1])
d4.3 <- d4.3[,-1]
head(d4.3)
## y x
## 1978 11.326 5.193
## 1979 11.464 5.378
## 1980 11.599 5.717
## 1981 11.758 6.299
## 1982 12.123 7.000
## 1983 18.670 7.556
fm=lm(y~x,data=d4.3)
fm
##
## Call:
## lm(formula = y ~ x, data = d4.3)
##
## Coefficients:
## (Intercept) x
## -1.197 1.116
summary(lm(y~x))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.721 -1.699 0.210 1.807 3.074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -140.3644 17.5026 -8.02 1.15e-05 ***
## x 1.1591 0.1079 10.74 8.21e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.546 on 10 degrees of freedom
## Multiple R-squared: 0.9203, Adjusted R-squared: 0.9123
## F-statistic: 115.4 on 1 and 10 DF, p-value: 8.21e-07
plot(y~x,data=d4.3)
abline(fm)

anova(fm)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 712077 712077 27427 < 2.2e-16 ***
## Residuals 29 753 26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fm)
##
## Call:
## lm(formula = y ~ x, data = d4.3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.631 -3.692 -1.535 5.338 11.432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.19660 1.16126 -1.03 0.311
## x 1.11623 0.00674 165.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.095 on 29 degrees of freedom
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9989
## F-statistic: 2.743e+04 on 1 and 29 DF, p-value: < 2.2e-16
4.2 多元线性回归分析
4.2.1 多元线性回归模型的建立
library(readxl)
d4.4=read_excel('C:/Users/12290/Desktop/多元统计表格/mvstats5.xlsx','d4.4')
plot(d4.4,gap=0)
pairs(d4.4,gap=0)

fm=lm(y~x1+x2+x3+x4,data=d4.4)
fm
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d4.4)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4
## 23.5321088 -0.0033866 1.1641150 0.0002919 -0.0437416
summary(fm)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = d4.4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0229 -2.1354 0.3297 1.2639 6.9690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.5321088 4.5990714 5.117 2.47e-05 ***
## x1 -0.0033866 0.0080749 -0.419 0.678
## x2 1.1641150 0.0404889 28.751 < 2e-16 ***
## x3 0.0002919 0.0085527 0.034 0.973
## x4 -0.0437416 0.0092638 -4.722 7.00e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.79 on 26 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 2.289e+04 on 4 and 26 DF, p-value: < 2.2e-16
summary(fm)$fstat
## value numdf dendf
## 22893 4 26
案例4
Case4=read_excel('C:/Users/12290/Desktop/多元统计表格/mvcase5.xlsx','Case4');
Case4=as.data.frame(Case4)
summary(Case4)
## y x1 x2 x3
## Min. : 1146 Min. : 4038 Min. : 7668 Min. :41024
## 1st Qu.: 1643 1st Qu.: 7207 1st Qu.: 66040 1st Qu.:48197
## Median : 2665 Median :16918 Median : 96934 Median :55329
## Mean : 3896 Mean :28471 Mean : 93109 Mean :57401
## 3rd Qu.: 5218 3rd Qu.:46670 3rd Qu.:122000 3rd Qu.:67199
## Max. :11444 Max. :80423 Max. :138948 Max. :70586
## x4 x5 x6 x7
## Min. : 849.4 Min. : 19.81 Min. : 281 Min. : 197
## 1st Qu.: 1832.9 1st Qu.: 31.14 1st Qu.: 1215 1st Qu.: 327
## Median : 4517.0 Median :102.26 Median : 5196 Median : 762
## Mean : 9535.6 Mean :218.84 Mean :14871 Mean :1149
## 3rd Qu.:17042.1 3rd Qu.:432.13 3rd Qu.:21519 3rd Qu.:1746
## Max. :29854.7 Max. :644.08 Max. :59622 Max. :3143
## x8 x9
## Min. : 1800 Min. :102.0
## 1st Qu.: 3376 1st Qu.:117.7
## Median : 8101 Median :203.4
## Mean :11244 Mean :214.7
## 3rd Qu.:16265 3rd Qu.:310.2
## Max. :31135 Max. :380.8
cor(Case4)
## y x1 x2 x3 x4 x5 x6
## y 1.0000000 0.9852472 0.7718054 0.8336692 0.9866719 0.9383222 0.9954148
## x1 0.9852472 1.0000000 0.8254191 0.8645965 0.9969160 0.9782518 0.9848198
## x2 0.7718054 0.8254191 1.0000000 0.8733881 0.8077384 0.8392165 0.7607780
## x3 0.8336692 0.8645965 0.8733881 1.0000000 0.8452713 0.8555522 0.7999185
## x4 0.9866719 0.9969160 0.8077384 0.8452713 1.0000000 0.9775889 0.9858089
## x5 0.9383222 0.9782518 0.8392165 0.8555522 0.9775889 1.0000000 0.9398166
## x6 0.9954148 0.9848198 0.7607780 0.7999185 0.9858089 0.9398166 1.0000000
## x7 0.9865880 0.9994515 0.8231476 0.8641402 0.9953453 0.9738288 0.9856809
## x8 0.9909723 0.9971702 0.8232053 0.8688257 0.9924830 0.9641020 0.9879218
## x9 0.9340695 0.9751552 0.8862737 0.9263941 0.9622229 0.9720657 0.9259696
## x7 x8 x9
## y 0.9865880 0.9909723 0.9340695
## x1 0.9994515 0.9971702 0.9751552
## x2 0.8231476 0.8232053 0.8862737
## x3 0.8641402 0.8688257 0.9263941
## x4 0.9953453 0.9924830 0.9622229
## x5 0.9738288 0.9641020 0.9720657
## x6 0.9856809 0.9879218 0.9259696
## x7 1.0000000 0.9986034 0.9744839
## x8 0.9986034 1.0000000 0.9697254
## x9 0.9744839 0.9697254 1.0000000
plot(Case4,gap=0)

fm=lm(y~.,data=Case4);summary(fm)
##
## Call:
## lm(formula = y ~ ., data = Case4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -281.99 -63.28 26.90 90.62 200.81
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.432e+02 6.283e+02 -0.228 0.8239
## x1 -1.139e-01 1.032e-01 -1.104 0.2933
## x2 -4.423e-03 2.848e-03 -1.553 0.1487
## x3 3.037e-02 1.585e-02 1.916 0.0817 .
## x4 2.292e-01 7.381e-02 3.105 0.0100 *
## x5 -7.918e-01 1.460e+00 -0.542 0.5983
## x6 1.164e-01 4.350e-02 2.676 0.0216 *
## x7 -1.494e+00 2.708e+00 -0.552 0.5922
## x8 3.007e-01 1.591e-01 1.890 0.0853 .
## x9 2.524e+00 6.836e+00 0.369 0.7189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 161.4 on 11 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9973
## F-statistic: 815.3 on 9 and 11 DF, p-value: 3.136e-14
sfm=step(fm);summary(sfm)
## Start: AIC=219.94
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9
##
## Df Sum of Sq RSS AIC
## - x9 1 3551 290025 218.20
## - x5 1 7662 294137 218.49
## - x7 1 7928 294402 218.51
## <none> 286474 219.94
## - x1 1 31719 318193 220.14
## - x2 1 62797 349271 222.10
## - x8 1 93076 379550 223.85
## - x3 1 95639 382113 223.99
## - x6 1 186459 472933 228.47
## - x4 1 251116 537590 231.16
##
## Step: AIC=218.2
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8
##
## Df Sum of Sq RSS AIC
## - x7 1 6328 296353 216.65
## - x5 1 10169 300194 216.92
## <none> 290025 218.20
## - x1 1 29332 319357 218.22
## - x2 1 61049 351074 220.21
## - x8 1 89933 379958 221.87
## - x3 1 130982 421007 224.02
## - x4 1 323379 613405 231.93
## - x6 1 325265 615290 231.99
##
## Step: AIC=216.65
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x8
##
## Df Sum of Sq RSS AIC
## - x5 1 8475 304829 215.24
## <none> 296353 216.65
## - x2 1 54730 351084 218.21
## - x8 1 187692 484045 224.95
## - x1 1 259293 555646 227.85
## - x3 1 290475 586828 229.00
## - x4 1 403048 699402 232.68
## - x6 1 618651 915004 238.32
##
## Step: AIC=215.24
## y ~ x1 + x2 + x3 + x4 + x6 + x8
##
## Df Sum of Sq RSS AIC
## <none> 304829 215.24
## - x2 1 64961 369789 217.30
## - x8 1 277592 582421 226.84
## - x3 1 317487 622316 228.23
## - x4 1 399680 704509 230.84
## - x1 1 557191 862020 235.07
## - x6 1 876510 1181338 241.69
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x6 + x8, data = Case4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -256.896 -66.598 -0.668 82.982 213.679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.378e+02 3.767e+02 -0.897 0.384958
## x1 -1.492e-01 2.950e-02 -5.059 0.000174 ***
## x2 -3.834e-03 2.220e-03 -1.727 0.106104
## x3 3.839e-02 1.005e-02 3.819 0.001881 **
## x4 2.157e-01 5.034e-02 4.284 0.000756 ***
## x6 1.198e-01 1.888e-02 6.345 1.81e-05 ***
## x8 2.487e-01 6.966e-02 3.571 0.003073 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 147.6 on 14 degrees of freedom
## Multiple R-squared: 0.9984, Adjusted R-squared: 0.9977
## F-statistic: 1463 on 6 and 14 DF, p-value: < 2.2e-16
plot(Case4$y);lines(sfm$fitted)
