#package library
library(corrgram)
## Warning: 程辑包'corrgram'是用R版本4.1.3 来建造的
library(PerformanceAnalytics)
## Warning: 程辑包'PerformanceAnalytics'是用R版本4.1.3 来建造的
## 载入需要的程辑包:xts
## Warning: 程辑包'xts'是用R版本4.1.3 来建造的
## 载入需要的程辑包:zoo
## Warning: 程辑包'zoo'是用R版本4.1.3 来建造的
##
## 载入程辑包:'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## 载入程辑包:'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(car)
## Warning: 程辑包'car'是用R版本4.1.3 来建造的
## 载入需要的程辑包:carData
## Warning: 程辑包'carData'是用R版本4.1.3 来建造的
#导入数据
#导入数据
data <- read.delim("C:/Users/1/Desktop/Xing's Paper/data.txt", encoding="UTF-8", header=FALSE)
data <- data[-1,-5]
colnames(data) <- c("year","inc_tot","tour_dom","tour_ove","inc_urban","inc_rural","popul","highway","railway","prod_regi","inve_fix","highway_lev","hot_star")
#数据类型变换
for(i in 2:13){
data[,i] <- as.numeric(data[,i])
}
data1 <- data
colnames(data1) <- c("年份","旅游总收入","国内游客人数","海外游客人数","城镇居民人均可支配收入","农村居民人均可支配收入","人口数量","公路里程","铁路里程","地区生产总值","固定资产投资","等级公路","星级宾馆数")
data4 <- data1[,-11]
data4 <- data4[,-11]
data4 <- data4[,-1]
由于本文所选取的变量较多,而且变量之间的单位不统一,量纲不同,无法直接进行数据分析,因此需要进行标准化处理,转化成无量纲数据。数据标准化处理公式见下式(1): \[Z X_{i}=\frac{\left(X_{i}-\bar{X}\right)}{\sqrt{\operatorname{Var}\left(X_{i}\right)}}\] 式中,\(ZX_{i}\)是变量\(X_{i}\)标准化后的形式,\(\bar{X}\)代表变量\(X_{i}\)的数据均值。\(\operatorname{Var}\)代表变量\(X_{i}\)的方差。本文将所有变量的数据按照公式(1)进行标准化处理,消除量纲影响,同时变量的数 据经过标准化处理后符合正态分布规律,可以进行后面的回归分析。
for(i in 1:10){
data4[,i] <- (data4[,i]-mean(data4[,i]))/sd(data4[,i])
}
head(data4)
## 旅游总收入 国内游客人数 海外游客人数 城镇居民人均可支配收入
## 2 -0.8129251 -0.9302084 -0.7358325 -1.1529057
## 3 -0.7915752 -0.9169570 -0.6932889 -1.1013615
## 4 -0.7792470 -0.8886678 -0.6414966 -1.0850332
## 5 -0.7625761 -0.8786077 -0.9881516 -1.0193899
## 6 -0.7357085 -0.8597711 -0.6522587 -0.9328592
## 7 -0.7072951 -0.8329567 -0.5123523 -0.8497043
## 农村居民人均可支配收入 人口数量 公路里程 铁路里程 地区生产总值
## 2 -1.0956789 -1.4823721 -1.665858 -1.4682613 -1.1455024
## 3 -1.0685705 -1.3603901 -1.584103 -0.9323995 -1.1225131
## 4 -1.0421572 -1.2425584 -1.324892 -0.9323995 -1.0963074
## 5 -1.0064760 -1.1187719 -1.318844 -0.7180548 -1.0546222
## 6 -0.9293212 -0.9928200 -1.281247 -0.9323995 -0.9741171
## 7 -0.8574954 -0.8686726 -1.276622 -0.8252272 -0.8958904
## 星级宾馆数
## 2 -2.0684169
## 3 -1.5754308
## 4 -1.1419429
## 5 -0.8189520
## 6 -0.3259658
## 7 -0.5724589
#查看数据
stargazer(data4,type = "text")
##
## ===========================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------
## 旅游总收入 20 0.000 1.000 -0.813 2.519
## 国内游客人数 20 0.000 1.000 -0.930 2.156
## 海外游客人数 20 0.000 1.000 -0.997 2.808
## 城镇居民人均可支配收入 20 0.000 1.000 -1.153 1.931
## 农村居民人均可支配收入 20 0.000 1.000 -1.096 1.954
## 人口数量 20 -0.000 1.000 -1.482 1.277
## 公路里程 20 -0.000 1.000 -1.666 0.892
## 铁路里程 20 0.000 1.000 -1.468 2.068
## 地区生产总值 20 -0.000 1.000 -1.146 1.834
## 星级宾馆数 20 -0.000 1.000 -2.068 1.119
## -------------------------------------------
“年份”,“旅游总收入”,“国内游客人数”,“海外游客人数”,“城镇居民人均可支配收入”,“农村居民人均可支配收入”,“人口数量”,“公路里程”,“铁路里程”,“地区生产总值”,“固定资产投资”,“等级公路”,“星级宾馆数”
#分别做散点图
data5 <- data4
colnames(data5) <- c("inc_tot","tour_dom","tour_ove","inc_urban","inc_rural","popul","highway","railway","prod_regi","hot_star")
# head(data5)
plot(data5[,1],data5[,2],xlab = "X2",ylab = "Y")
plot(data5[,1],data5[,3],xlab = "X3",ylab = "Y")
plot(data5[,1],data5[,4],xlab = "X4",ylab = "Y")
plot(data5[,1],data5[,5],xlab = "X5",ylab = "Y")
plot(data5[,1],data5[,6],xlab = "X6",ylab = "Y")
plot(data5[,1],data5[,7],xlab = "X7",ylab = "Y")
plot(data5[,1],data5[,8],xlab = "X8",ylab = "Y")
plot(data5[,1],data5[,9],xlab = "X9",ylab = "Y")
plot(data5[,1],data5[,10],xlab = "X10",ylab = "Y")
#多元线性回归及统计检验
myline1 <- lm(inc_tot~tour_dom+tour_ove+inc_urban+inc_rural+popul+highway+railway+prod_regi+hot_star,data5)
stargazer(myline1,type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## inc_tot
## -----------------------------------------------
## tour_dom 1.576***
## (0.289)
##
## tour_ove 0.211***
## (0.031)
##
## inc_urban -1.288
## (0.969)
##
## inc_rural 0.280
## (1.385)
##
## popul -0.293**
## (0.107)
##
## highway -0.162*
## (0.074)
##
## railway 0.047
## (0.055)
##
## prod_regi 0.579
## (0.556)
##
## hot_star 0.167**
## (0.062)
##
## Constant -0.000
## (0.011)
##
## -----------------------------------------------
## Observations 20
## R2 0.999
## Adjusted R2 0.998
## Residual Std. Error 0.048 (df = 10)
## F Statistic 907.251*** (df = 9; 10)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# star(myline1)
#多重共线性检验 ##Pearson相关系数矩阵:
cor <- cor(data5)
stargazer(cor,type = "text")
##
## ================================================================================================
## inc_tot tour_dom tour_ove inc_urban inc_rural popul highway railway prod_regi hot_star
## ------------------------------------------------------------------------------------------------
## inc_tot 1 0.981 0.885 0.945 0.948 0.818 0.663 0.921 0.925 -0.091
## tour_dom 0.981 1 0.807 0.987 0.990 0.900 0.755 0.964 0.978 0.024
## tour_ove 0.885 0.807 1 0.755 0.754 0.627 0.542 0.726 0.725 -0.113
## inc_urban 0.945 0.987 0.755 1 1.000 0.950 0.838 0.977 0.997 0.170
## inc_rural 0.948 0.990 0.754 1.000 1 0.947 0.828 0.978 0.997 0.148
## popul 0.818 0.900 0.627 0.950 0.947 1 0.939 0.936 0.967 0.402
## highway 0.663 0.755 0.542 0.838 0.828 0.939 1 0.817 0.860 0.639
## railway 0.921 0.964 0.726 0.977 0.978 0.936 0.817 1 0.976 0.162
## prod_regi 0.925 0.978 0.725 0.997 0.997 0.967 0.860 0.976 1 0.206
## hot_star -0.091 0.024 -0.113 0.170 0.148 0.402 0.639 0.162 0.206 1
## ------------------------------------------------------------------------------------------------
由上图可知,有较多解释变量两两之间的相关系数较大,甚至超过了0.95,因此怀疑存在多重共线性。 PS. ##VIF(方差膨胀因子):
vif(myline1)
## tour_dom tour_ove inc_urban inc_rural popul highway
## 680.619611 8.082585 7677.587603 15680.167189 93.929534 44.578795
## railway prod_regi hot_star
## 24.466363 2531.735828 31.722930
做逐步回归,以克服多重共线性问题
myline2 <- step(myline1)
## Start: AIC=-115.15
## inc_tot ~ tour_dom + tour_ove + inc_urban + inc_rural + popul +
## highway + railway + prod_regi + hot_star
##
## Df Sum of Sq RSS AIC
## - inc_rural 1 0.000095 0.023336 -117.070
## - railway 1 0.001723 0.024964 -115.721
## <none> 0.023241 -115.152
## - prod_regi 1 0.002519 0.025760 -115.093
## - inc_urban 1 0.004108 0.027348 -113.897
## - highway 1 0.011143 0.034384 -109.318
## - hot_star 1 0.016694 0.039935 -106.325
## - popul 1 0.017408 0.040648 -105.971
## - tour_dom 1 0.069372 0.092613 -89.501
## - tour_ove 1 0.104745 0.127986 -83.031
##
## Step: AIC=-117.07
## inc_tot ~ tour_dom + tour_ove + inc_urban + popul + highway +
## railway + prod_regi + hot_star
##
## Df Sum of Sq RSS AIC
## - railway 1 0.001870 0.025205 -117.529
## <none> 0.023336 -117.070
## - prod_regi 1 0.007069 0.030404 -113.778
## - highway 1 0.011130 0.034466 -111.270
## - inc_urban 1 0.014439 0.037775 -109.437
## - popul 1 0.017325 0.040660 -107.965
## - hot_star 1 0.021184 0.044520 -106.151
## - tour_dom 1 0.086551 0.109887 -88.081
## - tour_ove 1 0.113205 0.136540 -83.737
##
## Step: AIC=-117.53
## inc_tot ~ tour_dom + tour_ove + inc_urban + popul + highway +
## prod_regi + hot_star
##
## Df Sum of Sq RSS AIC
## <none> 0.025205 -117.529
## - prod_regi 1 0.006406 0.031612 -114.999
## - highway 1 0.011690 0.036895 -111.908
## - inc_urban 1 0.012972 0.038177 -111.225
## - popul 1 0.015653 0.040858 -109.868
## - hot_star 1 0.020227 0.045432 -107.745
## - tour_dom 1 0.085440 0.110645 -89.943
## - tour_ove 1 0.112755 0.137960 -85.530
myline2
##
## Call:
## lm(formula = inc_tot ~ tour_dom + tour_ove + inc_urban + popul +
## highway + prod_regi + hot_star, data = data5)
##
## Coefficients:
## (Intercept) tour_dom tour_ove inc_urban popul highway
## -6.760e-17 1.588e+00 2.087e-01 -1.036e+00 -2.708e-01 -1.653e-01
## prod_regi hot_star
## 6.265e-01 1.561e-01
逐步回归后的方程: \[旅游总收入=国内游客人数+海外游客人数+城镇居民人均可支配收入+人口总数+公路里程+地区生产总值+星级宾馆数\]
vif(myline2)
## tour_dom tour_ove inc_urban popul highway prod_regi
## 560.852304 7.341054 1572.012719 88.997894 44.434095 1164.113020
## hot_star
## 22.880793
#异方差检验 ##BP检验
#step后的模型
myline3 <- lm(inc_tot ~ tour_dom + tour_ove + inc_urban + popul + highway + prod_regi + hot_star,data5)
#BP检验
# bptest(myline3)
e2 <- (myline3$residuals)^2
myline4 <- lm(e2~tour_dom + tour_ove + inc_urban + popul + highway + prod_regi + hot_star,data5)
stargazer(myline4,type="text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## e2
## -----------------------------------------------
## tour_dom 0.002
## (0.007)
##
## tour_ove -0.002**
## (0.001)
##
## inc_urban 0.008
## (0.011)
##
## popul -0.001
## (0.003)
##
## highway 0.001
## (0.002)
##
## prod_regi -0.009
## (0.009)
##
## hot_star -0.0001
## (0.001)
##
## Constant 0.001***
## (0.0003)
##
## -----------------------------------------------
## Observations 20
## R2 0.549
## Adjusted R2 0.286
## Residual Std. Error 0.001 (df = 12)
## F Statistic 2.086 (df = 7; 12)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
ys1 <- c("tour_dom=0","tour_ove=0","inc_urban=0","popul =0","highway=0","prod_regi=0","hot_star=0")
myline5 <- linearHypothesis(myline4,ys1)
myline5
## Linear hypothesis test
##
## Hypothesis:
## tour_dom = 0
## tour_ove = 0
## inc_urban = 0
## popul = 0
## highway = 0
## prod_regi = 0
## hot_star = 0
##
## Model 1: restricted model
## Model 2: e2 ~ tour_dom + tour_ove + inc_urban + popul + highway + prod_regi +
## hot_star
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19 3.9115e-05
## 2 12 1.7643e-05 7 2.1472e-05 2.0863 0.1257