#更改存储路径
setwd(dir="/Users/lujingni/Desktop/RData/")
library(readstata13)
#读取数据
hh <- read.dta13("hh2017_20191120_所有_version14副本.dta")
#将家庭总消费、总收入、总资产、总负债提取到一个数据框 顺序为(总消费 总支出 总资产 总负债)
total <- hh[,c(2365,2316,2315,2366)]
#以3:1的比例将总体数据随机分割为训练集与测试集
ince <- sample(x=1:40011,size=40011*0.75,replace=FALSE)
train_total <- total[ince,]
test_total <- total[-ince,]
#描述性分析
summary(train_total)
## total_consump total_income asset debt
## Min. : 760 Min. :-1000000 Min. : 0 Min. : 0
## 1st Qu.: 23980 1st Qu.: 20135 1st Qu.: 101000 1st Qu.: 0
## Median : 42802 Median : 53612 Median : 381310 Median : 0
## Mean : 59275 Mean : 89277 Mean : 1128325 Mean : 56734
## 3rd Qu.: 70910 3rd Qu.: 101776 3rd Qu.: 1094596 3rd Qu.: 13000
## Max. :1000000 Max. : 5000000 Max. :30000000 Max. :5000000
#描述性分析
library(psych)
describe(train_total)
## vars n mean sd median trimmed mad
## total_consump 1 30008 59274.91 67931.44 42802 47652.87 32363.68
## total_income 2 30008 89277.27 197079.86 53612 61778.94 57077.13
## asset 3 30008 1128325.03 2304562.63 381310 617325.05 507766.04
## debt 4 30008 56734.11 240222.35 0 10249.18 0.00
## min max range skew kurtosis se
## total_consump 760 1e+06 999240 5.52 50.51 392.15
## total_income -1000000 5e+06 6000000 13.34 270.49 1137.69
## asset 0 3e+07 30000000 5.53 45.09 13303.62
## debt 0 5e+06 5000000 10.47 152.40 1386.74
#描述性分析(相关性矩阵)
cor(train_total)
## total_consump total_income asset debt
## total_consump 1.0000000 0.4050874 0.4504116 0.2387699
## total_income 0.4050874 1.0000000 0.4598174 0.2593655
## asset 0.4504116 0.4598174 1.0000000 0.2852109
## debt 0.2387699 0.2593655 0.2852109 1.0000000
corr.test(train_total,method = "spearman")
## Call:corr.test(x = train_total, method = "spearman")
## Correlation matrix
## total_consump total_income asset debt
## total_consump 1.00 0.57 0.53 0.14
## total_income 0.57 1.00 0.58 0.05
## asset 0.53 0.58 1.00 0.09
## debt 0.14 0.05 0.09 1.00
## Sample Size
## [1] 30008
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## total_consump total_income asset debt
## total_consump 0 0 0 0
## total_income 0 0 0 0
## asset 0 0 0 0
## debt 0 0 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
#散点图
#单变量散点图
plot(train_total$total_consump)
#直方图
#income直方图
i <- train_total$total_income
hi <- hist(i,breaks = 22,col="darkred",xlab="total_income",main="Histogram with normal curve and box")
ifit <- seq(min(i),max(i),length=30000)
iyfit <- dnorm(ifit,mean=mean(i),sd=sd(i))
iyfit <- iyfit*diff(hi$mids[1:2])*length(i)
lines(ifit,iyfit,col="darkblue",lwd=2)
box()
#consump直方图
c <- train_total$total_consump
hc <- hist(c,breaks = 80,col="darkred",xlab="total_consump",main="Histogram with normal curve and box")
cfit <- seq(min(c),max(c),length=10000000)
cyfit <- dnorm(cfit,mean=mean(c),sd=sd(c))
cyfit <- cyfit*diff(hc$mids[1:2])*length(c)
lines(cfit,cyfit,col="darkblue",lwd=2)
box()
#asset直方图
a <- train_total$asset
ha <- hist(a,breaks = 16,col="darkred",xlab="asset",main="Histogram with normal curve and box")
afit <- seq(min(a),max(a),length=1000)
ayfit <- dnorm(afit,mean=mean(a),sd=sd(a))
ayfit <- ayfit*diff(ha$mids[1:2])*length(a)
lines(afit,ayfit,col="darkblue",lwd=2)
box()
#debt直方图
d <- train_total$debt
hd <- hist(d,breaks = 8,col="darkred",xlab="debt",main="Histogram with normal curve and box")
dfit <- seq(min(d),max(d),length=20)
dyfit <- dnorm(dfit,mean=mean(d),sd=sd(d))
dyfit <- dyfit*diff(hd$mids[1:2])*length(d)
lines(dfit,dyfit,col="darkblue",lwd=2)
box()
#检验数据是否服从正态分布
shapiro.test(train_total$total_consump)
## Error in shapiro.test(train_total$total_consump): sample size must be between 3 and 5000
#建立回归方程
#模型一:总消费=总收入+总资产+总负债
fit1 <- lm(total_consump~total_income+asset+debt,train_total)
summary(fit1)
##
## Call:
## lm(formula = total_consump ~ total_income + asset + debt, data = train_total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -599397 -26491 -11640 11110 957468
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.006e+04 3.862e+02 103.72 <2e-16 ***
## total_income 8.166e-02 1.951e-03 41.86 <2e-16 ***
## asset 9.334e-03 1.681e-04 55.53 <2e-16 ***
## debt 2.460e-02 1.483e-03 16.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58460 on 30004 degrees of freedom
## Multiple R-squared: 0.2594, Adjusted R-squared: 0.2593
## F-statistic: 3503 on 3 and 30004 DF, p-value: < 2.2e-16
#模型二:加入交互项总收入、总资产、总负债的二次项:总消费=总收入+总资产+总负债+总收入*总资产+总收入*总负债+总资产*总负债
fit2 <- lm(total_consump~total_income*asset*debt,train_total)
summary(fit2)
##
## Call:
## lm(formula = total_consump ~ total_income * asset * debt, data = train_total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -553934 -25055 -10887 10900 960414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.586e+04 4.229e+02 84.799 <2e-16 ***
## total_income 1.239e-01 2.805e-03 44.168 <2e-16 ***
## asset 1.043e-02 1.821e-04 57.238 <2e-16 ***
## debt 4.489e-02 2.220e-03 20.218 <2e-16 ***
## total_income:asset -3.535e-09 1.890e-10 -18.708 <2e-16 ***
## total_income:debt -3.539e-08 2.390e-09 -14.803 <2e-16 ***
## asset:debt -2.592e-09 2.825e-10 -9.174 <2e-16 ***
## total_income:asset:debt 2.054e-15 1.365e-16 15.050 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57920 on 30000 degrees of freedom
## Multiple R-squared: 0.2732, Adjusted R-squared: 0.2731
## F-statistic: 1611 on 7 and 30000 DF, p-value: < 2.2e-16
#模型三:加入交互项总收入、总资产、总负债的交互项与二次项:总消费=总收入+总资产+总负债+总收入*总资产+总收入*总负债+总资产*总负债+总收入*总资产*总负债+(总收入)^2+(总资产)^2+(总负债)^2
fit3 <- lm(total_consump~total_income*asset*debt+I(total_income^2)+I(asset^2)+I(debt^2),train_total)
summary(fit3)
##
## Call:
## lm(formula = total_consump ~ total_income * asset * debt + I(total_income^2) +
## I(asset^2) + I(debt^2), data = train_total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -396576 -23854 -10192 10877 963157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.230e+04 4.352e+02 74.212 < 2e-16 ***
## total_income 1.518e-01 3.583e-03 42.375 < 2e-16 ***
## asset 1.243e-02 3.049e-04 40.778 < 2e-16 ***
## debt 5.831e-02 2.993e-03 19.482 < 2e-16 ***
## I(total_income^2) -2.426e-08 1.108e-09 -21.894 < 2e-16 ***
## I(asset^2) -2.918e-10 1.874e-11 -15.576 < 2e-16 ***
## I(debt^2) -9.400e-09 9.127e-10 -10.298 < 2e-16 ***
## total_income:asset 8.313e-10 2.485e-10 3.345 0.000824 ***
## total_income:debt -2.458e-08 2.387e-09 -10.298 < 2e-16 ***
## asset:debt -1.777e-09 2.913e-10 -6.100 1.07e-09 ***
## total_income:asset:debt 1.716e-15 1.356e-16 12.656 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57100 on 29997 degrees of freedom
## Multiple R-squared: 0.2937, Adjusted R-squared: 0.2935
## F-statistic: 1248 on 10 and 29997 DF, p-value: < 2.2e-16
#模型四:加入所有项(包括二次项)的交互项
fit4 <- lm(total_consump~total_income*asset*debt*I(total_income^2)*I(asset^2)*I(debt^2),train_total)
summary(fit4)
##
## Call:
## lm(formula = total_consump ~ total_income * asset * debt * I(total_income^2) *
## I(asset^2) * I(debt^2), data = train_total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -563999 -22132 -9253 10412 966919
##
## Coefficients:
## Estimate
## (Intercept) 2.452e+04
## total_income 2.936e-01
## asset 1.436e-02
## debt 1.652e-01
## I(total_income^2) -2.042e-07
## I(asset^2) -3.868e-10
## I(debt^2) -1.083e-07
## total_income:asset -2.809e-08
## total_income:debt -8.842e-07
## asset:debt -3.997e-08
## total_income:I(total_income^2) 3.323e-14
## asset:I(total_income^2) 3.378e-14
## debt:I(total_income^2) 7.381e-13
## total_income:I(asset^2) 4.487e-16
## asset:I(asset^2) 2.189e-19
## debt:I(asset^2) 4.388e-15
## I(total_income^2):I(asset^2) -3.787e-22
## total_income:I(debt^2) 6.935e-13
## asset:I(debt^2) 4.465e-14
## debt:I(debt^2) 1.566e-14
## I(total_income^2):I(debt^2) -4.525e-19
## I(asset^2):I(debt^2) -5.065e-21
## total_income:asset:debt 2.074e-13
## total_income:asset:I(total_income^2) -6.508e-21
## total_income:debt:I(total_income^2) -2.156e-19
## asset:debt:I(total_income^2) -6.434e-20
## total_income:asset:I(asset^2) 1.844e-23
## total_income:debt:I(asset^2) -1.296e-20
## asset:debt:I(asset^2) -1.014e-22
## total_income:I(total_income^2):I(asset^2) 8.707e-29
## asset:I(total_income^2):I(asset^2) -2.352e-29
## debt:I(total_income^2):I(asset^2) -6.022e-27
## total_income:asset:I(debt^2) -2.249e-19
## total_income:debt:I(debt^2) -1.091e-19
## asset:debt:I(debt^2) -7.134e-21
## total_income:I(total_income^2):I(debt^2) 1.086e-25
## asset:I(total_income^2):I(debt^2) 5.387e-26
## debt:I(total_income^2):I(debt^2) 5.900e-26
## total_income:I(asset^2):I(debt^2) 1.795e-26
## asset:I(asset^2):I(debt^2) 1.106e-28
## debt:I(asset^2):I(debt^2) 7.385e-28
## I(total_income^2):I(asset^2):I(debt^2) 1.914e-33
## total_income:asset:debt:I(total_income^2) 2.474e-26
## total_income:asset:debt:I(asset^2) 2.868e-28
## total_income:asset:I(total_income^2):I(asset^2) 4.027e-36
## total_income:debt:I(total_income^2):I(asset^2) 9.995e-34
## asset:debt:I(total_income^2):I(asset^2) 1.971e-34
## total_income:asset:debt:I(debt^2) 3.693e-26
## total_income:asset:I(total_income^2):I(debt^2) -6.824e-33
## total_income:debt:I(total_income^2):I(debt^2) -1.206e-32
## asset:debt:I(total_income^2):I(debt^2) -6.094e-33
## total_income:asset:I(asset^2):I(debt^2) -4.187e-34
## total_income:debt:I(asset^2):I(debt^2) -2.779e-33
## asset:debt:I(asset^2):I(debt^2) -1.108e-35
## total_income:I(total_income^2):I(asset^2):I(debt^2) -1.484e-39
## asset:I(total_income^2):I(asset^2):I(debt^2) -4.280e-41
## debt:I(total_income^2):I(asset^2):I(debt^2) -5.096e-40
## total_income:asset:debt:I(total_income^2):I(asset^2) -4.552e-41
## total_income:asset:debt:I(total_income^2):I(debt^2) -2.591e-40
## total_income:asset:debt:I(asset^2):I(debt^2) 5.528e-41
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2) 4.197e-47
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2) 3.504e-46
## asset:debt:I(total_income^2):I(asset^2):I(debt^2) 9.602e-48
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2) -8.555e-54
## Std. Error
## (Intercept) 6.099e+02
## total_income 9.042e-03
## asset 7.250e-04
## debt 1.053e-02
## I(total_income^2) 1.241e-08
## I(asset^2) 1.106e-10
## I(debt^2) 1.154e-08
## total_income:asset 4.855e-09
## total_income:debt 7.795e-08
## asset:debt 7.989e-09
## total_income:I(total_income^2) 2.345e-15
## asset:I(total_income^2) 5.163e-15
## debt:I(total_income^2) 9.346e-14
## total_income:I(asset^2) 5.392e-16
## asset:I(asset^2) 3.482e-18
## debt:I(asset^2) 1.151e-15
## I(total_income^2):I(asset^2) 4.797e-22
## total_income:I(debt^2) 7.918e-14
## asset:I(debt^2) 7.048e-15
## debt:I(debt^2) 2.133e-15
## I(total_income^2):I(debt^2) 8.662e-20
## I(asset^2):I(debt^2) 1.057e-21
## total_income:asset:debt 3.664e-14
## total_income:asset:I(total_income^2) 9.629e-22
## total_income:debt:I(total_income^2) 2.509e-20
## asset:debt:I(total_income^2) 3.658e-20
## total_income:asset:I(asset^2) 1.450e-23
## total_income:debt:I(asset^2) 4.195e-21
## asset:debt:I(asset^2) 3.387e-23
## total_income:I(total_income^2):I(asset^2) 8.533e-29
## asset:I(total_income^2):I(asset^2) 1.154e-29
## debt:I(total_income^2):I(asset^2) 3.715e-27
## total_income:asset:I(debt^2) 3.119e-20
## total_income:debt:I(debt^2) 1.425e-20
## asset:debt:I(debt^2) 1.397e-21
## total_income:I(total_income^2):I(debt^2) 2.346e-26
## asset:I(total_income^2):I(debt^2) 3.132e-26
## debt:I(total_income^2):I(debt^2) 1.562e-26
## total_income:I(asset^2):I(debt^2) 3.327e-27
## asset:I(asset^2):I(debt^2) 3.223e-29
## debt:I(asset^2):I(debt^2) 2.473e-28
## I(total_income^2):I(asset^2):I(debt^2) 2.945e-33
## total_income:asset:debt:I(total_income^2) 7.387e-27
## total_income:asset:debt:I(asset^2) 1.120e-28
## total_income:asset:I(total_income^2):I(asset^2) 1.976e-36
## total_income:debt:I(total_income^2):I(asset^2) 6.496e-34
## asset:debt:I(total_income^2):I(asset^2) 9.039e-35
## total_income:asset:debt:I(debt^2) 5.362e-27
## total_income:asset:I(total_income^2):I(debt^2) 6.860e-33
## total_income:debt:I(total_income^2):I(debt^2) 4.465e-33
## asset:debt:I(total_income^2):I(debt^2) 5.168e-33
## total_income:asset:I(asset^2):I(debt^2) 9.249e-35
## total_income:debt:I(asset^2):I(debt^2) 6.527e-34
## asset:debt:I(asset^2):I(debt^2) 8.937e-36
## total_income:I(total_income^2):I(asset^2):I(debt^2) 5.871e-40
## asset:I(total_income^2):I(asset^2):I(debt^2) 7.246e-41
## debt:I(total_income^2):I(asset^2):I(debt^2) 4.771e-40
## total_income:asset:debt:I(total_income^2):I(asset^2) 1.523e-41
## total_income:asset:debt:I(total_income^2):I(debt^2) 1.209e-39
## total_income:asset:debt:I(asset^2):I(debt^2) 2.281e-41
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2) 1.385e-47
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2) 1.046e-46
## asset:debt:I(total_income^2):I(asset^2):I(debt^2) 1.353e-47
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2) 2.599e-54
## t value Pr(>|t|)
## (Intercept) 40.199 < 2e-16
## total_income 32.473 < 2e-16
## asset 19.804 < 2e-16
## debt 15.684 < 2e-16
## I(total_income^2) -16.458 < 2e-16
## I(asset^2) -3.497 0.000472
## I(debt^2) -9.382 < 2e-16
## total_income:asset -5.786 7.29e-09
## total_income:debt -11.344 < 2e-16
## asset:debt -5.003 5.69e-07
## total_income:I(total_income^2) 14.167 < 2e-16
## asset:I(total_income^2) 6.541 6.20e-11
## debt:I(total_income^2) 7.898 2.93e-15
## total_income:I(asset^2) 0.832 0.405302
## asset:I(asset^2) 0.063 0.949861
## debt:I(asset^2) 3.814 0.000137
## I(total_income^2):I(asset^2) -0.789 0.429858
## total_income:I(debt^2) 8.760 < 2e-16
## asset:I(debt^2) 6.336 2.40e-10
## debt:I(debt^2) 7.345 2.11e-13
## I(total_income^2):I(debt^2) -5.224 1.76e-07
## I(asset^2):I(debt^2) -4.790 1.67e-06
## total_income:asset:debt 5.662 1.51e-08
## total_income:asset:I(total_income^2) -6.759 1.42e-11
## total_income:debt:I(total_income^2) -8.594 < 2e-16
## asset:debt:I(total_income^2) -1.759 0.078595
## total_income:asset:I(asset^2) 1.271 0.203664
## total_income:debt:I(asset^2) -3.089 0.002013
## asset:debt:I(asset^2) -2.992 0.002773
## total_income:I(total_income^2):I(asset^2) 1.020 0.307557
## asset:I(total_income^2):I(asset^2) -2.038 0.041522
## debt:I(total_income^2):I(asset^2) -1.621 0.105041
## total_income:asset:I(debt^2) -7.212 5.66e-13
## total_income:debt:I(debt^2) -7.657 1.96e-14
## asset:debt:I(debt^2) -5.106 3.30e-07
## total_income:I(total_income^2):I(debt^2) 4.627 3.73e-06
## asset:I(total_income^2):I(debt^2) 1.720 0.085474
## debt:I(total_income^2):I(debt^2) 3.778 0.000158
## total_income:I(asset^2):I(debt^2) 5.395 6.91e-08
## asset:I(asset^2):I(debt^2) 3.430 0.000604
## debt:I(asset^2):I(debt^2) 2.986 0.002830
## I(total_income^2):I(asset^2):I(debt^2) 0.650 0.515572
## total_income:asset:debt:I(total_income^2) 3.349 0.000812
## total_income:asset:debt:I(asset^2) 2.560 0.010466
## total_income:asset:I(total_income^2):I(asset^2) 2.038 0.041572
## total_income:debt:I(total_income^2):I(asset^2) 1.539 0.123911
## asset:debt:I(total_income^2):I(asset^2) 2.181 0.029210
## total_income:asset:debt:I(debt^2) 6.887 5.83e-12
## total_income:asset:I(total_income^2):I(debt^2) -0.995 0.319851
## total_income:debt:I(total_income^2):I(debt^2) -2.702 0.006892
## asset:debt:I(total_income^2):I(debt^2) -1.179 0.238352
## total_income:asset:I(asset^2):I(debt^2) -4.527 6.01e-06
## total_income:debt:I(asset^2):I(debt^2) -4.258 2.07e-05
## asset:debt:I(asset^2):I(debt^2) -1.240 0.215012
## total_income:I(total_income^2):I(asset^2):I(debt^2) -2.528 0.011487
## asset:I(total_income^2):I(asset^2):I(debt^2) -0.591 0.554729
## debt:I(total_income^2):I(asset^2):I(debt^2) -1.068 0.285462
## total_income:asset:debt:I(total_income^2):I(asset^2) -2.990 0.002796
## total_income:asset:debt:I(total_income^2):I(debt^2) -0.214 0.830300
## total_income:asset:debt:I(asset^2):I(debt^2) 2.423 0.015393
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2) 3.031 0.002443
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2) 3.350 0.000808
## asset:debt:I(total_income^2):I(asset^2):I(debt^2) 0.710 0.477795
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2) -3.291 0.000998
##
## (Intercept) ***
## total_income ***
## asset ***
## debt ***
## I(total_income^2) ***
## I(asset^2) ***
## I(debt^2) ***
## total_income:asset ***
## total_income:debt ***
## asset:debt ***
## total_income:I(total_income^2) ***
## asset:I(total_income^2) ***
## debt:I(total_income^2) ***
## total_income:I(asset^2)
## asset:I(asset^2)
## debt:I(asset^2) ***
## I(total_income^2):I(asset^2)
## total_income:I(debt^2) ***
## asset:I(debt^2) ***
## debt:I(debt^2) ***
## I(total_income^2):I(debt^2) ***
## I(asset^2):I(debt^2) ***
## total_income:asset:debt ***
## total_income:asset:I(total_income^2) ***
## total_income:debt:I(total_income^2) ***
## asset:debt:I(total_income^2) .
## total_income:asset:I(asset^2)
## total_income:debt:I(asset^2) **
## asset:debt:I(asset^2) **
## total_income:I(total_income^2):I(asset^2)
## asset:I(total_income^2):I(asset^2) *
## debt:I(total_income^2):I(asset^2)
## total_income:asset:I(debt^2) ***
## total_income:debt:I(debt^2) ***
## asset:debt:I(debt^2) ***
## total_income:I(total_income^2):I(debt^2) ***
## asset:I(total_income^2):I(debt^2) .
## debt:I(total_income^2):I(debt^2) ***
## total_income:I(asset^2):I(debt^2) ***
## asset:I(asset^2):I(debt^2) ***
## debt:I(asset^2):I(debt^2) **
## I(total_income^2):I(asset^2):I(debt^2)
## total_income:asset:debt:I(total_income^2) ***
## total_income:asset:debt:I(asset^2) *
## total_income:asset:I(total_income^2):I(asset^2) *
## total_income:debt:I(total_income^2):I(asset^2)
## asset:debt:I(total_income^2):I(asset^2) *
## total_income:asset:debt:I(debt^2) ***
## total_income:asset:I(total_income^2):I(debt^2)
## total_income:debt:I(total_income^2):I(debt^2) **
## asset:debt:I(total_income^2):I(debt^2)
## total_income:asset:I(asset^2):I(debt^2) ***
## total_income:debt:I(asset^2):I(debt^2) ***
## asset:debt:I(asset^2):I(debt^2)
## total_income:I(total_income^2):I(asset^2):I(debt^2) *
## asset:I(total_income^2):I(asset^2):I(debt^2)
## debt:I(total_income^2):I(asset^2):I(debt^2)
## total_income:asset:debt:I(total_income^2):I(asset^2) **
## total_income:asset:debt:I(total_income^2):I(debt^2)
## total_income:asset:debt:I(asset^2):I(debt^2) *
## total_income:asset:I(total_income^2):I(asset^2):I(debt^2) **
## total_income:debt:I(total_income^2):I(asset^2):I(debt^2) ***
## asset:debt:I(total_income^2):I(asset^2):I(debt^2)
## total_income:asset:debt:I(total_income^2):I(asset^2):I(debt^2) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55910 on 29944 degrees of freedom
## Multiple R-squared: 0.3241, Adjusted R-squared: 0.3227
## F-statistic: 227.9 on 63 and 29944 DF, p-value: < 2.2e-16
#可以看到调整后的R方随着解释变量的增加而增加,即模型的解释能力随着解释变量的增加而增加
#运用赤池信息准则AIC来进行模型比较并选择较优模型(若AIC下降则认为增加解释变量是合理的)
AIC(fit1,fit2)
## df AIC
## fit1 5 743911.2
## fit2 9 743351.6
## df AIC
## fit2 9 743351.6
## fit3 12 742499.0
#由对比可知,包含解释变量越多的模型AIC越小,意味着增加解释变量是合理的,因此认为AIC更小的模型更优。由下结果可得模型四最优,但通过观察模型四的F统计量通过显著性检验(方程的总体解释能力显著),但存在许多变量的t统计量未通过显著性检验(个别变量的解释能力不显著),初步判断模型四具有严重的多重共线性,且修正困难,因此排除模型四,选择模型三fit3
#多重共线性
#方差膨胀因子 一般方差膨胀因子^(1/2)>2 就代表存在多重共线性问题
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## total_income asset debt
## 4.589650 4.545677 4.758288
## I(total_income^2) I(asset^2) I(debt^2)
## 5.071612 5.365887 4.200702
## total_income:asset total_income:debt asset:debt
## 5.386369 3.819041 4.243002
## total_income:asset:debt
## 4.372066
#若返回值为F 则不存在多重共线性
sqrt(vif(fit3))>2
## total_income asset debt
## TRUE TRUE TRUE
## I(total_income^2) I(asset^2) I(debt^2)
## TRUE TRUE TRUE
## total_income:asset total_income:debt asset:debt
## TRUE FALSE TRUE
## total_income:asset:debt
## TRUE
#全子集回归法消除多重共线性 图中最上面一行为消除共线性后的最优模...很难描述 宝你看不懂这个图你就来问我 我语音跟你说
library(leaps)
leaps3 <- regsubsets(total_consump~total_income*asset*debt+I(total_income^2)+I(asset^2)+I(debt^2),data=train_total,nbest=4)
plot(leaps3,scale="adjr2")
#建立消除共线性后的模型
fit33 <- lm(total_consump~total_income+asset+debt+total_income:debt+total_income:asset:debt+I(total_income^2)+I(asset^2)+I(debt^2),train_total)
#同方差
#原假设为同方差 若p值<0.05,则认为存在异方差
ncvTest(fit33)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 41999.89, Df = 1, p = < 2.22e-16
#若散点在线的上下随机分布 则认为不存在异方差
spreadLevelPlot(fit33)
## Warning in spreadLevelPlot.lm(fit33):
## 16 negative fitted values removed
##
## Suggested power transformation: 0.1442683
#怀特检验 若p值<0.05 则认为存在异方差
library(whitestrap)
## Lopez, J. (2020), White's test and Bootstrapped White's test under the methodology of Jeong, J., Lee, K. (1999) package version 0.0.1
## White's test results
##
## Null hypothesis: Homoskedasticity of the residuals
## Alternative hypothesis: Heteroskedasticity of the residuals
## Test Statistic: 1590
## P-value: 0
#加权最小二乘构建新模型
fit333 <- lm(total_consump~total_income+asset+debt+total_income:debt+total_income:asset:debt+I(total_income^2)+I(asset^2)+I(debt^2),train_total,weights=1/abs(residuals(fit33)))
#再次怀特检验 还是存在异方差 但p值变大 说明异方差有所消除
white_test(fit333)
## White's test results
##
## Null hypothesis: Homoskedasticity of the residuals
## Alternative hypothesis: Heteroskedasticity of the residuals
## Test Statistic: 1465.23
## P-value: 0
#综上 得到较优模型fit333
#再次观察模型 可发现模型已通过方程和变量的显著性检验 且拟合优度较好
summary(fit333)
##
## Call:
## lm(formula = total_consump ~ total_income + asset + debt + total_income:debt +
## total_income:asset:debt + I(total_income^2) + I(asset^2) +
## I(debt^2), data = train_total, weights = 1/abs(residuals(fit33)))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -616.58 -148.30 -92.13 114.68 1403.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.157e+04 7.546e+01 418.43 <2e-16 ***
## total_income 1.569e-01 8.702e-04 180.26 <2e-16 ***
## asset 1.218e-02 7.729e-05 157.55 <2e-16 ***
## debt 4.706e-02 9.796e-04 48.04 <2e-16 ***
## I(total_income^2) -2.359e-08 5.847e-10 -40.35 <2e-16 ***
## I(asset^2) -2.902e-10 6.830e-12 -42.48 <2e-16 ***
## I(debt^2) -9.282e-09 3.226e-10 -28.77 <2e-16 ***
## total_income:debt -3.548e-08 2.075e-09 -17.10 <2e-16 ***
## total_income:asset:debt 1.950e-15 1.052e-16 18.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 174.5 on 29999 degrees of freedom
## Multiple R-squared: 0.8819, Adjusted R-squared: 0.8818
## F-statistic: 2.799e+04 on 8 and 29999 DF, p-value: < 2.2e-16
#模型分析
#密度图与轴须图 scatterplotMatrix(train_total,spread=F,lty.smooth=2,main="Scatter Plot Matrix")
#正态性(右上 Q-Q图)、独立性()、线性(左上 残差与拟合图)、同方差(左下 位置尺度图)性诊断
par(mfrow=c(2,2))
plot(fit333)
## 21131 15659
## 4861 23116
residplot333 <- function(fit333,nbreaks=10){
z1 <- rstudent(fit333)
hist(z1,breaks=nbreaks,freq = F,
xlab="Studentized Residual",
main="Distribution of Errors")
rug(jitter(z1),col="brown")
curve(dnorm(x,mean=mean(z1),sd=sd(z1)),
add=T,col="darkblue",lwd=2)
lines(density(z1)$x,density(z1)$y,
col="darkred",lwd=2,lty=2)
legend("topright",
legend=c("Normal Curve","Kernel Density Curve"),
lty=1:2,col=c("darkblue","darkred"),cex=.7)
}
residplot(fit333)
## Error in residplot(fit333): 没有"residplot"这个函数
#交互项 (effects包) plot(effect("hp:wt",fit,list(wt=c(2.2,3.2,4.2))),multiline=T)
#异常值观测
#离群点 <0.05 离群
outlierTest(fit333)
## rstudent unadjusted p-value Bonferroni p
## 15659 10.607549 3.0594e-26 9.1807e-22
## 21131 5.630268 1.8154e-08 5.4475e-04
## 37160 5.615966 1.9720e-08 5.9174e-04
## 1743 5.492668 3.9912e-08 1.1977e-03
## 37962 5.456029 4.9075e-08 1.4726e-03
## 36662 5.440502 5.3546e-08 1.6068e-03
## 13168 5.437496 5.4456e-08 1.6341e-03
## 4390 5.434268 5.5450e-08 1.6639e-03
## 23063 5.373722 7.7702e-08 2.3317e-03
## 10245 5.339421 9.3921e-08 2.8184e-03
#强影响点 以下Cook图反应强影响点 即去掉这些点有可能会对模型的斜率和方程造成明显变化
cutoff <- 4/(nrow(train_total)-length(fit333$coefficients)-2)
plot(fit333,which=4,cook.levels = cutoff)
abline(h=cutoff,lty=2,col="darkred")