#更改存储路径
setwd(dir="/Users/lujingni/Desktop/RData/")
library(readstata13)

#读取数据
hh <- read.dta13("hh2017_20191120_所有_version14副本.dta")

#将家庭总消费、总收入、总资产、总负债提取到一个数据框 顺序为(总消费 总支出 总资产 总负债)
total <- hh[,c(2365,2316,2315,2366)]

#以3:1的比例将总体数据随机分割为训练集与测试集
ince <- sample(x=1:40011,size=40011*0.75,replace=FALSE)
train_total <- total[ince,]
test_total <- total[-ince,]

#描述性分析
summary(train_total)
##  total_consump      total_income          asset               debt        
##  Min.   :    760   Min.   :-1000000   Min.   :       0   Min.   :      0  
##  1st Qu.:  23980   1st Qu.:   20135   1st Qu.:  101000   1st Qu.:      0  
##  Median :  42802   Median :   53612   Median :  381310   Median :      0  
##  Mean   :  59275   Mean   :   89277   Mean   : 1128325   Mean   :  56734  
##  3rd Qu.:  70910   3rd Qu.:  101776   3rd Qu.: 1094596   3rd Qu.:  13000  
##  Max.   :1000000   Max.   : 5000000   Max.   :30000000   Max.   :5000000
#描述性分析
library(psych)
describe(train_total)
##               vars     n       mean         sd median   trimmed       mad
## total_consump    1 30008   59274.91   67931.44  42802  47652.87  32363.68
## total_income     2 30008   89277.27  197079.86  53612  61778.94  57077.13
## asset            3 30008 1128325.03 2304562.63 381310 617325.05 507766.04
## debt             4 30008   56734.11  240222.35      0  10249.18      0.00
##                    min   max    range  skew kurtosis       se
## total_consump      760 1e+06   999240  5.52    50.51   392.15
## total_income  -1000000 5e+06  6000000 13.34   270.49  1137.69
## asset                0 3e+07 30000000  5.53    45.09 13303.62
## debt                 0 5e+06  5000000 10.47   152.40  1386.74
#描述性分析(相关性矩阵)
cor(train_total)
##               total_consump total_income     asset      debt
## total_consump     1.0000000    0.4050874 0.4504116 0.2387699
## total_income      0.4050874    1.0000000 0.4598174 0.2593655
## asset             0.4504116    0.4598174 1.0000000 0.2852109
## debt              0.2387699    0.2593655 0.2852109 1.0000000
corr.test(train_total,method = "spearman")
## Call:corr.test(x = train_total, method = "spearman")
## Correlation matrix 
##               total_consump total_income asset debt
## total_consump          1.00         0.57  0.53 0.14
## total_income           0.57         1.00  0.58 0.05
## asset                  0.53         0.58  1.00 0.09
## debt                   0.14         0.05  0.09 1.00
## Sample Size 
## [1] 30008
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##               total_consump total_income asset debt
## total_consump             0            0     0    0
## total_income              0            0     0    0
## asset                     0            0     0    0
## debt                      0            0     0    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
#散点图
#单变量散点图
plot(train_total$total_consump)
plot of chunk unnamed-chunk-1
plot(train_total$total_income)
plot of chunk unnamed-chunk-1
plot(train_total$asset)
plot of chunk unnamed-chunk-1
plot(train_total$debt)
plot of chunk unnamed-chunk-1
plot(sort(train_total$total_consump))
plot of chunk unnamed-chunk-1
plot(sort(train_total$total_income))
plot of chunk unnamed-chunk-1
plot(sort(train_total$asset))
plot of chunk unnamed-chunk-1
plot(sort(train_total$debt))
plot of chunk unnamed-chunk-1
#双变量散点矩阵图 如(1行,2列)的图表总支出x与总消费y的散点图
plot(train_total)
plot of chunk unnamed-chunk-1
#直方图
#income直方图
i <- train_total$total_income
hi <- hist(i,breaks = 22,col="darkred",xlab="total_income",main="Histogram with normal curve and box")
ifit <- seq(min(i),max(i),length=30000)
iyfit <- dnorm(ifit,mean=mean(i),sd=sd(i))
iyfit <- iyfit*diff(hi$mids[1:2])*length(i)
lines(ifit,iyfit,col="darkblue",lwd=2)
box()
plot of chunk unnamed-chunk-1
#consump直方图
c <- train_total$total_consump
hc <- hist(c,breaks = 80,col="darkred",xlab="total_consump",main="Histogram with normal curve and box")
cfit <- seq(min(c),max(c),length=10000000)
cyfit <- dnorm(cfit,mean=mean(c),sd=sd(c))
cyfit <- cyfit*diff(hc$mids[1:2])*length(c)
lines(cfit,cyfit,col="darkblue",lwd=2)
box()
plot of chunk unnamed-chunk-1
#asset直方图
a <- train_total$asset
ha <- hist(a,breaks = 16,col="darkred",xlab="asset",main="Histogram with normal curve and box")
afit <- seq(min(a),max(a),length=1000)
ayfit <- dnorm(afit,mean=mean(a),sd=sd(a))
ayfit <- ayfit*diff(ha$mids[1:2])*length(a)
lines(afit,ayfit,col="darkblue",lwd=2)
box()
plot of chunk unnamed-chunk-1
#debt直方图
d <- train_total$debt
hd <- hist(d,breaks = 8,col="darkred",xlab="debt",main="Histogram with normal curve and box")
dfit <- seq(min(d),max(d),length=20)
dyfit <- dnorm(dfit,mean=mean(d),sd=sd(d))
dyfit <- dyfit*diff(hd$mids[1:2])*length(d)
lines(dfit,dyfit,col="darkblue",lwd=2)
box()
plot of chunk unnamed-chunk-1
#qq图
qqnorm(train_total$total_consump)
qqline(train_total$total_consump, col=2, lwd=2)
abline(mean(train_total$total_consump), sd(train_total$total_consump), lwd=2, col="red")
plot of chunk unnamed-chunk-1
qqnorm(train_total$total_income)
plot of chunk unnamed-chunk-1
qqnorm(train_total$asset)
plot of chunk unnamed-chunk-1
qqnorm(train_total$debt)
plot of chunk unnamed-chunk-1
#检验数据是否服从正态分布
shapiro.test(train_total$total_consump)
## Error in shapiro.test(train_total$total_consump): sample size must be between 3 and 5000
#建立回归方程
#模型一:总消费=总收入+总资产+总负债
fit1 <- lm(total_consump~total_income+asset+debt,train_total)
summary(fit1)
## 
## Call:
## lm(formula = total_consump ~ total_income + asset + debt, data = train_total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -599397  -26491  -11640   11110  957468 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.006e+04  3.862e+02  103.72   <2e-16 ***
## total_income 8.166e-02  1.951e-03   41.86   <2e-16 ***
## asset        9.334e-03  1.681e-04   55.53   <2e-16 ***
## debt         2.460e-02  1.483e-03   16.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58460 on 30004 degrees of freedom
## Multiple R-squared:  0.2594,	Adjusted R-squared:  0.2593 
## F-statistic:  3503 on 3 and 30004 DF,  p-value: < 2.2e-16
#模型二:加入交互项总收入、总资产、总负债的二次项:总消费=总收入+总资产+总负债+总收入*总资产+总收入*总负债+总资产*总负债
fit2 <- lm(total_consump~total_income*asset*debt,train_total)
summary(fit2)
## 
## Call:
## lm(formula = total_consump ~ total_income * asset * debt, data = train_total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -553934  -25055  -10887   10900  960414 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.586e+04  4.229e+02  84.799   <2e-16 ***
## total_income             1.239e-01  2.805e-03  44.168   <2e-16 ***
## asset                    1.043e-02  1.821e-04  57.238   <2e-16 ***
## debt                     4.489e-02  2.220e-03  20.218   <2e-16 ***
## total_income:asset      -3.535e-09  1.890e-10 -18.708   <2e-16 ***
## total_income:debt       -3.539e-08  2.390e-09 -14.803   <2e-16 ***
## asset:debt              -2.592e-09  2.825e-10  -9.174   <2e-16 ***
## total_income:asset:debt  2.054e-15  1.365e-16  15.050   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57920 on 30000 degrees of freedom
## Multiple R-squared:  0.2732,	Adjusted R-squared:  0.2731 
## F-statistic:  1611 on 7 and 30000 DF,  p-value: < 2.2e-16
#模型三:加入交互项总收入、总资产、总负债的交互项与二次项:总消费=总收入+总资产+总负债+总收入*总资产+总收入*总负债+总资产*总负债+总收入*总资产*总负债+(总收入)^2+(总资产)^2+(总负债)^2
fit3 <- lm(total_consump~total_income*asset*debt+I(total_income^2)+I(asset^2)+I(debt^2),train_total)
summary(fit3)
## 
## Call:
## lm(formula = total_consump ~ total_income * asset * debt + I(total_income^2) + 
##     I(asset^2) + I(debt^2), data = train_total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -396576  -23854  -10192   10877  963157 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.230e+04  4.352e+02  74.212  < 2e-16 ***
## total_income             1.518e-01  3.583e-03  42.375  < 2e-16 ***
## asset                    1.243e-02  3.049e-04  40.778  < 2e-16 ***
## debt                     5.831e-02  2.993e-03  19.482  < 2e-16 ***
## I(total_income^2)       -2.426e-08  1.108e-09 -21.894  < 2e-16 ***
## I(asset^2)              -2.918e-10  1.874e-11 -15.576  < 2e-16 ***
## I(debt^2)               -9.400e-09  9.127e-10 -10.298  < 2e-16 ***
## total_income:asset       8.313e-10  2.485e-10   3.345 0.000824 ***
## total_income:debt       -2.458e-08  2.387e-09 -10.298  < 2e-16 ***
## asset:debt              -1.777e-09  2.913e-10  -6.100 1.07e-09 ***
## total_income:asset:debt  1.716e-15  1.356e-16  12.656  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57100 on 29997 degrees of freedom
## Multiple R-squared:  0.2937,	Adjusted R-squared:  0.2935 
## F-statistic:  1248 on 10 and 29997 DF,  p-value: < 2.2e-16
#模型四:加入所有项(包括二次项)的交互项
fit4 <- lm(total_consump~total_income*asset*debt*I(total_income^2)*I(asset^2)*I(debt^2),train_total)
summary(fit4)
## 
## Call:
## lm(formula = total_consump ~ total_income * asset * debt * I(total_income^2) * 
##     I(asset^2) * I(debt^2), data = train_total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -563999  -22132   -9253   10412  966919 
## 
## Coefficients:
##                                                                  Estimate
## (Intercept)                                                     2.452e+04
## total_income                                                    2.936e-01
## asset                                                           1.436e-02
## debt                                                            1.652e-01
## I(total_income^2)                                              -2.042e-07
## I(asset^2)                                                     -3.868e-10
## I(debt^2)                                                      -1.083e-07
## total_income:asset                                             -2.809e-08
## total_income:debt                                              -8.842e-07
## asset:debt                                                     -3.997e-08
## total_income:I(total_income^2)                                  3.323e-14
## asset:I(total_income^2)                                         3.378e-14
## debt:I(total_income^2)                                          7.381e-13
## total_income:I(asset^2)                                         4.487e-16
## asset:I(asset^2)                                                2.189e-19
## debt:I(asset^2)                                                 4.388e-15
## I(total_income^2):I(asset^2)                                   -3.787e-22
## total_income:I(debt^2)                                          6.935e-13
## asset:I(debt^2)                                                 4.465e-14
## debt:I(debt^2)                                                  1.566e-14
## I(total_income^2):I(debt^2)                                    -4.525e-19
## I(asset^2):I(debt^2)                                           -5.065e-21
## total_income:asset:debt                                         2.074e-13
## total_income:asset:I(total_income^2)                           -6.508e-21
## total_income:debt:I(total_income^2)                            -2.156e-19
## asset:debt:I(total_income^2)                                   -6.434e-20
## total_income:asset:I(asset^2)                                   1.844e-23
## total_income:debt:I(asset^2)                                   -1.296e-20
## asset:debt:I(asset^2)                                          -1.014e-22
## total_income:I(total_income^2):I(asset^2)                       8.707e-29
## asset:I(total_income^2):I(asset^2)                             -2.352e-29
## debt:I(total_income^2):I(asset^2)                              -6.022e-27
## total_income:asset:I(debt^2)                                   -2.249e-19
## total_income:debt:I(debt^2)                                    -1.091e-19
## asset:debt:I(debt^2)                                           -7.134e-21
## total_income:I(total_income^2):I(debt^2)                        1.086e-25
## asset:I(total_income^2):I(debt^2)                               5.387e-26
## debt:I(total_income^2):I(debt^2)                                5.900e-26
## total_income:I(asset^2):I(debt^2)                               1.795e-26
## asset:I(asset^2):I(debt^2)                                      1.106e-28
## debt:I(asset^2):I(debt^2)                                       7.385e-28
## I(total_income^2):I(asset^2):I(debt^2)                          1.914e-33
## total_income:asset:debt:I(total_income^2)                       2.474e-26
## total_income:asset:debt:I(asset^2)                              2.868e-28
## total_income:asset:I(total_income^2):I(asset^2)                 4.027e-36
## total_income:debt:I(total_income^2):I(asset^2)                  9.995e-34
## asset:debt:I(total_income^2):I(asset^2)                         1.971e-34
## total_income:asset:debt:I(debt^2)                               3.693e-26
## total_income:asset:I(total_income^2):I(debt^2)                 -6.824e-33
## total_income:debt:I(total_income^2):I(debt^2)                  -1.206e-32
## asset:debt:I(total_income^2):I(debt^2)                         -6.094e-33
## total_income:asset:I(asset^2):I(debt^2)                        -4.187e-34
## total_income:debt:I(asset^2):I(debt^2)                         -2.779e-33
## asset:debt:I(asset^2):I(debt^2)                                -1.108e-35
## total_income:I(total_income^2):I(asset^2):I(debt^2)            -1.484e-39
## asset:I(total_income^2):I(asset^2):I(debt^2)                   -4.280e-41
## debt:I(total_income^2):I(asset^2):I(debt^2)                    -5.096e-40
## total_income:asset:debt:I(total_income^2):I(asset^2)           -4.552e-41
## total_income:asset:debt:I(total_income^2):I(debt^2)            -2.591e-40
## total_income:asset:debt:I(asset^2):I(debt^2)                    5.528e-41
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2)       4.197e-47
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2)        3.504e-46
## asset:debt:I(total_income^2):I(asset^2):I(debt^2)               9.602e-48
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2) -8.555e-54
##                                                                Std. Error
## (Intercept)                                                     6.099e+02
## total_income                                                    9.042e-03
## asset                                                           7.250e-04
## debt                                                            1.053e-02
## I(total_income^2)                                               1.241e-08
## I(asset^2)                                                      1.106e-10
## I(debt^2)                                                       1.154e-08
## total_income:asset                                              4.855e-09
## total_income:debt                                               7.795e-08
## asset:debt                                                      7.989e-09
## total_income:I(total_income^2)                                  2.345e-15
## asset:I(total_income^2)                                         5.163e-15
## debt:I(total_income^2)                                          9.346e-14
## total_income:I(asset^2)                                         5.392e-16
## asset:I(asset^2)                                                3.482e-18
## debt:I(asset^2)                                                 1.151e-15
## I(total_income^2):I(asset^2)                                    4.797e-22
## total_income:I(debt^2)                                          7.918e-14
## asset:I(debt^2)                                                 7.048e-15
## debt:I(debt^2)                                                  2.133e-15
## I(total_income^2):I(debt^2)                                     8.662e-20
## I(asset^2):I(debt^2)                                            1.057e-21
## total_income:asset:debt                                         3.664e-14
## total_income:asset:I(total_income^2)                            9.629e-22
## total_income:debt:I(total_income^2)                             2.509e-20
## asset:debt:I(total_income^2)                                    3.658e-20
## total_income:asset:I(asset^2)                                   1.450e-23
## total_income:debt:I(asset^2)                                    4.195e-21
## asset:debt:I(asset^2)                                           3.387e-23
## total_income:I(total_income^2):I(asset^2)                       8.533e-29
## asset:I(total_income^2):I(asset^2)                              1.154e-29
## debt:I(total_income^2):I(asset^2)                               3.715e-27
## total_income:asset:I(debt^2)                                    3.119e-20
## total_income:debt:I(debt^2)                                     1.425e-20
## asset:debt:I(debt^2)                                            1.397e-21
## total_income:I(total_income^2):I(debt^2)                        2.346e-26
## asset:I(total_income^2):I(debt^2)                               3.132e-26
## debt:I(total_income^2):I(debt^2)                                1.562e-26
## total_income:I(asset^2):I(debt^2)                               3.327e-27
## asset:I(asset^2):I(debt^2)                                      3.223e-29
## debt:I(asset^2):I(debt^2)                                       2.473e-28
## I(total_income^2):I(asset^2):I(debt^2)                          2.945e-33
## total_income:asset:debt:I(total_income^2)                       7.387e-27
## total_income:asset:debt:I(asset^2)                              1.120e-28
## total_income:asset:I(total_income^2):I(asset^2)                 1.976e-36
## total_income:debt:I(total_income^2):I(asset^2)                  6.496e-34
## asset:debt:I(total_income^2):I(asset^2)                         9.039e-35
## total_income:asset:debt:I(debt^2)                               5.362e-27
## total_income:asset:I(total_income^2):I(debt^2)                  6.860e-33
## total_income:debt:I(total_income^2):I(debt^2)                   4.465e-33
## asset:debt:I(total_income^2):I(debt^2)                          5.168e-33
## total_income:asset:I(asset^2):I(debt^2)                         9.249e-35
## total_income:debt:I(asset^2):I(debt^2)                          6.527e-34
## asset:debt:I(asset^2):I(debt^2)                                 8.937e-36
## total_income:I(total_income^2):I(asset^2):I(debt^2)             5.871e-40
## asset:I(total_income^2):I(asset^2):I(debt^2)                    7.246e-41
## debt:I(total_income^2):I(asset^2):I(debt^2)                     4.771e-40
## total_income:asset:debt:I(total_income^2):I(asset^2)            1.523e-41
## total_income:asset:debt:I(total_income^2):I(debt^2)             1.209e-39
## total_income:asset:debt:I(asset^2):I(debt^2)                    2.281e-41
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2)       1.385e-47
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2)        1.046e-46
## asset:debt:I(total_income^2):I(asset^2):I(debt^2)               1.353e-47
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2)  2.599e-54
##                                                                t value Pr(>|t|)
## (Intercept)                                                     40.199  < 2e-16
## total_income                                                    32.473  < 2e-16
## asset                                                           19.804  < 2e-16
## debt                                                            15.684  < 2e-16
## I(total_income^2)                                              -16.458  < 2e-16
## I(asset^2)                                                      -3.497 0.000472
## I(debt^2)                                                       -9.382  < 2e-16
## total_income:asset                                              -5.786 7.29e-09
## total_income:debt                                              -11.344  < 2e-16
## asset:debt                                                      -5.003 5.69e-07
## total_income:I(total_income^2)                                  14.167  < 2e-16
## asset:I(total_income^2)                                          6.541 6.20e-11
## debt:I(total_income^2)                                           7.898 2.93e-15
## total_income:I(asset^2)                                          0.832 0.405302
## asset:I(asset^2)                                                 0.063 0.949861
## debt:I(asset^2)                                                  3.814 0.000137
## I(total_income^2):I(asset^2)                                    -0.789 0.429858
## total_income:I(debt^2)                                           8.760  < 2e-16
## asset:I(debt^2)                                                  6.336 2.40e-10
## debt:I(debt^2)                                                   7.345 2.11e-13
## I(total_income^2):I(debt^2)                                     -5.224 1.76e-07
## I(asset^2):I(debt^2)                                            -4.790 1.67e-06
## total_income:asset:debt                                          5.662 1.51e-08
## total_income:asset:I(total_income^2)                            -6.759 1.42e-11
## total_income:debt:I(total_income^2)                             -8.594  < 2e-16
## asset:debt:I(total_income^2)                                    -1.759 0.078595
## total_income:asset:I(asset^2)                                    1.271 0.203664
## total_income:debt:I(asset^2)                                    -3.089 0.002013
## asset:debt:I(asset^2)                                           -2.992 0.002773
## total_income:I(total_income^2):I(asset^2)                        1.020 0.307557
## asset:I(total_income^2):I(asset^2)                              -2.038 0.041522
## debt:I(total_income^2):I(asset^2)                               -1.621 0.105041
## total_income:asset:I(debt^2)                                    -7.212 5.66e-13
## total_income:debt:I(debt^2)                                     -7.657 1.96e-14
## asset:debt:I(debt^2)                                            -5.106 3.30e-07
## total_income:I(total_income^2):I(debt^2)                         4.627 3.73e-06
## asset:I(total_income^2):I(debt^2)                                1.720 0.085474
## debt:I(total_income^2):I(debt^2)                                 3.778 0.000158
## total_income:I(asset^2):I(debt^2)                                5.395 6.91e-08
## asset:I(asset^2):I(debt^2)                                       3.430 0.000604
## debt:I(asset^2):I(debt^2)                                        2.986 0.002830
## I(total_income^2):I(asset^2):I(debt^2)                           0.650 0.515572
## total_income:asset:debt:I(total_income^2)                        3.349 0.000812
## total_income:asset:debt:I(asset^2)                               2.560 0.010466
## total_income:asset:I(total_income^2):I(asset^2)                  2.038 0.041572
## total_income:debt:I(total_income^2):I(asset^2)                   1.539 0.123911
## asset:debt:I(total_income^2):I(asset^2)                          2.181 0.029210
## total_income:asset:debt:I(debt^2)                                6.887 5.83e-12
## total_income:asset:I(total_income^2):I(debt^2)                  -0.995 0.319851
## total_income:debt:I(total_income^2):I(debt^2)                   -2.702 0.006892
## asset:debt:I(total_income^2):I(debt^2)                          -1.179 0.238352
## total_income:asset:I(asset^2):I(debt^2)                         -4.527 6.01e-06
## total_income:debt:I(asset^2):I(debt^2)                          -4.258 2.07e-05
## asset:debt:I(asset^2):I(debt^2)                                 -1.240 0.215012
## total_income:I(total_income^2):I(asset^2):I(debt^2)             -2.528 0.011487
## asset:I(total_income^2):I(asset^2):I(debt^2)                    -0.591 0.554729
## debt:I(total_income^2):I(asset^2):I(debt^2)                     -1.068 0.285462
## total_income:asset:debt:I(total_income^2):I(asset^2)            -2.990 0.002796
## total_income:asset:debt:I(total_income^2):I(debt^2)             -0.214 0.830300
## total_income:asset:debt:I(asset^2):I(debt^2)                     2.423 0.015393
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2)        3.031 0.002443
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2)         3.350 0.000808
## asset:debt:I(total_income^2):I(asset^2):I(debt^2)                0.710 0.477795
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2)  -3.291 0.000998
##                                                                   
## (Intercept)                                                    ***
## total_income                                                   ***
## asset                                                          ***
## debt                                                           ***
## I(total_income^2)                                              ***
## I(asset^2)                                                     ***
## I(debt^2)                                                      ***
## total_income:asset                                             ***
## total_income:debt                                              ***
## asset:debt                                                     ***
## total_income:I(total_income^2)                                 ***
## asset:I(total_income^2)                                        ***
## debt:I(total_income^2)                                         ***
## total_income:I(asset^2)                                           
## asset:I(asset^2)                                                  
## debt:I(asset^2)                                                ***
## I(total_income^2):I(asset^2)                                      
## total_income:I(debt^2)                                         ***
## asset:I(debt^2)                                                ***
## debt:I(debt^2)                                                 ***
## I(total_income^2):I(debt^2)                                    ***
## I(asset^2):I(debt^2)                                           ***
## total_income:asset:debt                                        ***
## total_income:asset:I(total_income^2)                           ***
## total_income:debt:I(total_income^2)                            ***
## asset:debt:I(total_income^2)                                   .  
## total_income:asset:I(asset^2)                                     
## total_income:debt:I(asset^2)                                   ** 
## asset:debt:I(asset^2)                                          ** 
## total_income:I(total_income^2):I(asset^2)                         
## asset:I(total_income^2):I(asset^2)                             *  
## debt:I(total_income^2):I(asset^2)                                 
## total_income:asset:I(debt^2)                                   ***
## total_income:debt:I(debt^2)                                    ***
## asset:debt:I(debt^2)                                           ***
## total_income:I(total_income^2):I(debt^2)                       ***
## asset:I(total_income^2):I(debt^2)                              .  
## debt:I(total_income^2):I(debt^2)                               ***
## total_income:I(asset^2):I(debt^2)                              ***
## asset:I(asset^2):I(debt^2)                                     ***
## debt:I(asset^2):I(debt^2)                                      ** 
## I(total_income^2):I(asset^2):I(debt^2)                            
## total_income:asset:debt:I(total_income^2)                      ***
## total_income:asset:debt:I(asset^2)                             *  
## total_income:asset:I(total_income^2):I(asset^2)                *  
## total_income:debt:I(total_income^2):I(asset^2)                    
## asset:debt:I(total_income^2):I(asset^2)                        *  
## total_income:asset:debt:I(debt^2)                              ***
## total_income:asset:I(total_income^2):I(debt^2)                    
## total_income:debt:I(total_income^2):I(debt^2)                  ** 
## asset:debt:I(total_income^2):I(debt^2)                            
## total_income:asset:I(asset^2):I(debt^2)                        ***
## total_income:debt:I(asset^2):I(debt^2)                         ***
## asset:debt:I(asset^2):I(debt^2)                                   
## total_income:I(total_income^2):I(asset^2):I(debt^2)            *  
## asset:I(total_income^2):I(asset^2):I(debt^2)                      
## debt:I(total_income^2):I(asset^2):I(debt^2)                       
## total_income:asset:debt:I(total_income^2):I(asset^2)           ** 
## total_income:asset:debt:I(total_income^2):I(debt^2)               
## total_income:asset:debt:I(asset^2):I(debt^2)                   *  
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2)      ** 
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2)       ***
## asset:debt:I(total_income^2):I(asset^2):I(debt^2)                 
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2) ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55910 on 29944 degrees of freedom
## Multiple R-squared:  0.3241,	Adjusted R-squared:  0.3227 
## F-statistic: 227.9 on 63 and 29944 DF,  p-value: < 2.2e-16
#可以看到调整后的R方随着解释变量的增加而增加,即模型的解释能力随着解释变量的增加而增加


#运用赤池信息准则AIC来进行模型比较并选择较优模型(若AIC下降则认为增加解释变量是合理的)
AIC(fit1,fit2)
##      df      AIC
## fit1  5 743911.2
## fit2  9 743351.6
AIC(fit2,fit3)
##      df      AIC
## fit2  9 743351.6
## fit3 12 742499.0
#由对比可知,包含解释变量越多的模型AIC越小,意味着增加解释变量是合理的,因此认为AIC更小的模型更优。由下结果可得模型四最优,但通过观察模型四的F统计量通过显著性检验(方程的总体解释能力显著),但存在许多变量的t统计量未通过显著性检验(个别变量的解释能力不显著),初步判断模型四具有严重的多重共线性,且修正困难,因此排除模型四,选择模型三fit3

#多重共线性
#方差膨胀因子 一般方差膨胀因子^(1/2)>2 就代表存在多重共线性问题
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
vif(fit3)
##            total_income                   asset                    debt 
##                4.589650                4.545677                4.758288 
##       I(total_income^2)              I(asset^2)               I(debt^2) 
##                5.071612                5.365887                4.200702 
##      total_income:asset       total_income:debt              asset:debt 
##                5.386369                3.819041                4.243002 
## total_income:asset:debt 
##                4.372066
#若返回值为F 则不存在多重共线性
sqrt(vif(fit3))>2
##            total_income                   asset                    debt 
##                    TRUE                    TRUE                    TRUE 
##       I(total_income^2)              I(asset^2)               I(debt^2) 
##                    TRUE                    TRUE                    TRUE 
##      total_income:asset       total_income:debt              asset:debt 
##                    TRUE                   FALSE                    TRUE 
## total_income:asset:debt 
##                    TRUE
#全子集回归法消除多重共线性 图中最上面一行为消除共线性后的最优模...很难描述 宝你看不懂这个图你就来问我 我语音跟你说
library(leaps)
leaps3 <- regsubsets(total_consump~total_income*asset*debt+I(total_income^2)+I(asset^2)+I(debt^2),data=train_total,nbest=4)
plot(leaps3,scale="adjr2")
plot of chunk unnamed-chunk-1
#建立消除共线性后的模型
fit33 <- lm(total_consump~total_income+asset+debt+total_income:debt+total_income:asset:debt+I(total_income^2)+I(asset^2)+I(debt^2),train_total)

#同方差
#原假设为同方差 若p值<0.05,则认为存在异方差
ncvTest(fit33)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 41999.89, Df = 1, p = < 2.22e-16
#若散点在线的上下随机分布 则认为不存在异方差
spreadLevelPlot(fit33)
## Warning in spreadLevelPlot.lm(fit33): 
## 16 negative fitted values removed
plot of chunk unnamed-chunk-1
## 
## Suggested power transformation:  0.1442683
#怀特检验 若p值<0.05 则认为存在异方差
library(whitestrap)
## 
## Please cite as:
## Lopez, J. (2020), White's test and Bootstrapped White's test under the methodology of Jeong, J., Lee, K. (1999) package version 0.0.1
white_test(fit33)
## White's test results
## 
## Null hypothesis: Homoskedasticity of the residuals
## Alternative hypothesis: Heteroskedasticity of the residuals
## Test Statistic: 1590
## P-value: 0
#加权最小二乘构建新模型
fit333 <- lm(total_consump~total_income+asset+debt+total_income:debt+total_income:asset:debt+I(total_income^2)+I(asset^2)+I(debt^2),train_total,weights=1/abs(residuals(fit33)))
#再次怀特检验 还是存在异方差 但p值变大 说明异方差有所消除
white_test(fit333)
## White's test results
## 
## Null hypothesis: Homoskedasticity of the residuals
## Alternative hypothesis: Heteroskedasticity of the residuals
## Test Statistic: 1465.23
## P-value: 0
#综上 得到较优模型fit333
#再次观察模型 可发现模型已通过方程和变量的显著性检验 且拟合优度较好
summary(fit333)
## 
## Call:
## lm(formula = total_consump ~ total_income + asset + debt + total_income:debt + 
##     total_income:asset:debt + I(total_income^2) + I(asset^2) + 
##     I(debt^2), data = train_total, weights = 1/abs(residuals(fit33)))
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -616.58 -148.30  -92.13  114.68 1403.65 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.157e+04  7.546e+01  418.43   <2e-16 ***
## total_income             1.569e-01  8.702e-04  180.26   <2e-16 ***
## asset                    1.218e-02  7.729e-05  157.55   <2e-16 ***
## debt                     4.706e-02  9.796e-04   48.04   <2e-16 ***
## I(total_income^2)       -2.359e-08  5.847e-10  -40.35   <2e-16 ***
## I(asset^2)              -2.902e-10  6.830e-12  -42.48   <2e-16 ***
## I(debt^2)               -9.282e-09  3.226e-10  -28.77   <2e-16 ***
## total_income:debt       -3.548e-08  2.075e-09  -17.10   <2e-16 ***
## total_income:asset:debt  1.950e-15  1.052e-16   18.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 174.5 on 29999 degrees of freedom
## Multiple R-squared:  0.8819,	Adjusted R-squared:  0.8818 
## F-statistic: 2.799e+04 on 8 and 29999 DF,  p-value: < 2.2e-16
#模型分析
#密度图与轴须图  scatterplotMatrix(train_total,spread=F,lty.smooth=2,main="Scatter Plot Matrix")
#正态性(右上 Q-Q图)、独立性()、线性(左上 残差与拟合图)、同方差(左下 位置尺度图)性诊断
par(mfrow=c(2,2))
plot(fit333)
plot of chunk unnamed-chunk-1
library(car)
par(mfrow=c(1,1))
qqPlot(fit333,labels=row.names(train_total),id.method="identify",simulate=T,main="Q-Q Plot")
plot of chunk unnamed-chunk-1
## 21131 15659 
##  4861 23116
residplot333 <- function(fit333,nbreaks=10){
  z1 <- rstudent(fit333)
  hist(z1,breaks=nbreaks,freq = F,
       xlab="Studentized Residual",
       main="Distribution of Errors")
  rug(jitter(z1),col="brown")
  curve(dnorm(x,mean=mean(z1),sd=sd(z1)),
        add=T,col="darkblue",lwd=2)
  lines(density(z1)$x,density(z1)$y,
        col="darkred",lwd=2,lty=2)
  legend("topright",
         legend=c("Normal Curve","Kernel Density Curve"),
         lty=1:2,col=c("darkblue","darkred"),cex=.7)
}
residplot(fit333)
## Error in residplot(fit333): 没有"residplot"这个函数
qqnorm(fit1$residuals)
plot of chunk unnamed-chunk-1
#交互项 (effects包)  plot(effect("hp:wt",fit,list(wt=c(2.2,3.2,4.2))),multiline=T)

#异常值观测
#离群点 <0.05 离群
outlierTest(fit333)
##        rstudent unadjusted p-value Bonferroni p
## 15659 10.607549         3.0594e-26   9.1807e-22
## 21131  5.630268         1.8154e-08   5.4475e-04
## 37160  5.615966         1.9720e-08   5.9174e-04
## 1743   5.492668         3.9912e-08   1.1977e-03
## 37962  5.456029         4.9075e-08   1.4726e-03
## 36662  5.440502         5.3546e-08   1.6068e-03
## 13168  5.437496         5.4456e-08   1.6341e-03
## 4390   5.434268         5.5450e-08   1.6639e-03
## 23063  5.373722         7.7702e-08   2.3317e-03
## 10245  5.339421         9.3921e-08   2.8184e-03
#强影响点 以下Cook图反应强影响点 即去掉这些点有可能会对模型的斜率和方程造成明显变化
cutoff <- 4/(nrow(train_total)-length(fit333$coefficients)-2)
plot(fit333,which=4,cook.levels = cutoff)
abline(h=cutoff,lty=2,col="darkred")
plot of chunk unnamed-chunk-1
#拟合与预测检验
#拟合 fitted(fit333,train_total)
fittedfit333 <- data.frame(predict(fit333,train_total)-train_total)
plot(fittedfit333$total_consump,col='darkred')
plot of chunk unnamed-chunk-1
#预测 predict(fit333,test_total)
predictfit333 <- data.frame(predict(fit333,test_total)-test_total)
plot(predictfit333$total_consump,col='darkred')
plot of chunk unnamed-chunk-1