The scope of this project is to analyze income in households in USA on county level and as distance from cities to state capitals. At the beggining the ESDA will be conducted. After that Moran’s I test and interactions models will be performed. To enrich the analysis, additional features wil be added. At the end the models will be compared and quality assessment checked.
For this project the data from https://www.kaggle.com/goldenoakresearch/us-household-income-stats-geo-locations is used.
The point data consists of Income information as Mean, Median and Standrad deviation values. We expect that median information inform about the income of the “middle” person living in the area where as standard deviation will inform about inequality in area.
head(kaggle_income)
Distributions of mean, median and standard deviation looks quite similar, But it is visible that in some counties median is quite high. It can indicate that there are observations with inequalities in income.
Below in tabs are visualisations of points on map with the values
variable <- kaggle_income$Median
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
col=cols[findInterval(variable, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
variable <- kaggle_income$Mean
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
col=cols[findInterval(variable, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
variable <- kaggle_income$Stdev
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
col=cols[findInterval(variable, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
As a spatial weights matrix we decided to use the knn matrix with k=1.
inc.knn<-knearneigh(coords.m, k=1) # knn class (k=1)
inc.knn.nb<-knn2nb(inc.knn)
inc.knn.nb<-make.sym.nb(inc.knn.nb)
inc.knn.listw<-nb2listw(inc.knn.nb)
We conducted Moran’s I test for each variable. All tests are significant, with a low p-value that allows us to reject the null hypothesis and assume there is a spatial autocorrelation. The value of the Morans I statistic is positive signifying a positive correlation which means that similar values touch more often than random. Below in tabs are plots for Moran’s test of each variable.
moran.test(kaggle_income$Median, inc.knn.listw)
##
## Moran I test under randomisation
##
## data: kaggle_income$Median
## weights: inc.knn.listw
##
## Moran I statistic standard deviate = 24.726, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 1.693882e-01 -3.134600e-05 4.694837e-05
x<-kaggle_income$Median
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="coral3")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)
moran.test(kaggle_income$Mean, inc.knn.listw)
##
## Moran I test under randomisation
##
## data: kaggle_income$Mean
## weights: inc.knn.listw
##
## Moran I statistic standard deviate = 85.088, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 5.829837e-01 -3.134600e-05 4.694889e-05
x<-kaggle_income$Mean
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="darkcyan")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)
moran.test(kaggle_income$Stdev, inc.knn.listw)
##
## Moran I test under randomisation
##
## data: kaggle_income$Stdev
## weights: inc.knn.listw
##
## Moran I statistic standard deviate = 67.374, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 4.616191e-01 -3.134600e-05 4.695085e-05
x<-kaggle_income$Stdev
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="chartreuse4")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)
Selecting state capital cities, calculating distances from cities and plotting distances on map.
state.capital.cities <- us.cities[us.cities$capital==2,c("name", "pop", "lat", "long")]
state.capital.cities <- state.capital.cities[state.capital.cities$name != "Honolulu HI",]
coords <- kaggle_income[,c("Lon", "Lat")]
coords <- as.matrix(coords)
for (city.name in state.capital.cities$name) {
col.city.name = paste0("dist_",city.name)
lat = state.capital.cities[state.capital.cities$name==city.name,]$lat
long = state.capital.cities[state.capital.cities$name==city.name,]$long
kaggle_income[col.city.name]<-spDistsN1(coords, c(long, lat), longlat=TRUE)
}
kaggle_income$mindist <- NULL
kaggle_income$mindist = apply(kaggle_income[,20:length(colnames(kaggle_income))], 1, min)
cols = rev(brewer.pal(9, "Blues"))
mindist <- kaggle_income$mindist
brks <- stats::quantile(mindist, probs=seq(0, 1, 0.13))
plot(usmap)
points(kaggle_income[,c("Lon", "Lat")], col=cols[findInterval(mindist, brks)], pch=21, cex=0.7,
bg=cols[findInterval(mindist, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
title(main="Distances from closest state capital")
Below we predict the income values for each variable using a sample of 3000 points. Modelling on the whole dataset was impossible due to computational limitations.
kaggle_income_s = dplyr::sample_n(kaggle_income, 3000)
coords.s = kaggle_income_s[,c("Lon", "Lat")]
coords.m.s = as.matrix(coords.s)
inc.knn<-knearneigh(coords.m.s, k=1) # knn class (k=1)
inc.knn.nb<-knn2nb(inc.knn)
inc.knn.nb<-make.sym.nb(inc.knn.nb)
# knn as listw class
inc.knn.listw.s<-nb2listw(inc.knn.nb)
kaggle_income_s$Median[kaggle_income_s$Median==0]<-2 # eliminating -inf for log(0)
#multinomial
eq.m = kaggle_income_s$Median~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.med<-glm(eq.m)
spatial.multi.med<-spatialreg::errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Registered S3 methods overwritten by 'spatialreg':
## method from
## residuals.stsls spdep
## deviance.stsls spdep
## coef.stsls spdep
## print.stsls spdep
## summary.stsls spdep
## print.summary.stsls spdep
## residuals.gmsar spdep
## deviance.gmsar spdep
## coef.gmsar spdep
## fitted.gmsar spdep
## print.gmsar spdep
## summary.gmsar spdep
## print.summary.gmsar spdep
## print.lagmess spdep
## summary.lagmess spdep
## print.summary.lagmess spdep
## residuals.lagmess spdep
## deviance.lagmess spdep
## coef.lagmess spdep
## fitted.lagmess spdep
## logLik.lagmess spdep
## fitted.SFResult spdep
## print.SFResult spdep
## fitted.ME_res spdep
## print.ME_res spdep
## print.lagImpact spdep
## plot.lagImpact spdep
## summary.lagImpact spdep
## HPDinterval.lagImpact spdep
## print.summary.lagImpact spdep
## print.sarlm spdep
## summary.sarlm spdep
## residuals.sarlm spdep
## deviance.sarlm spdep
## coef.sarlm spdep
## vcov.sarlm spdep
## fitted.sarlm spdep
## logLik.sarlm spdep
## anova.sarlm spdep
## predict.sarlm spdep
## print.summary.sarlm spdep
## print.sarlm.pred spdep
## as.data.frame.sarlm.pred spdep
## residuals.spautolm spdep
## deviance.spautolm spdep
## coef.spautolm spdep
## fitted.spautolm spdep
## print.spautolm spdep
## summary.spautolm spdep
## logLik.spautolm spdep
## print.summary.spautolm spdep
## print.WXImpact spdep
## summary.WXImpact spdep
## print.summary.WXImpact spdep
## predict.SLX spdep
# power
eq.p = log(kaggle_income_s$Median)~log(kaggle_income_s$mindist)
OLS.power.med<-lm(eq.p)
spatial.power.med<-spatialreg::errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
# exponential
eq.e = log(kaggle_income_s$Median)~kaggle_income_s$mindist
OLS.exp.med<-glm(eq.e)
spatial.exp.med<-spatialreg::errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
screenreg(list(OLS.multi.med, spatial.multi.med, OLS.power.med, spatial.power.med, OLS.exp.med, spatial.exp.med))
##
## =========================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## -------------------------------------------------------------------------------------------------------------------------
## (Intercept) 92914.72 *** 92795.39 *** 11.16 *** 11.17 *** 10.91 *** 10.92 ***
## (5616.45) (5904.51) (0.11) (0.12) (0.04) (0.04)
## kaggle_income_s$mindist 239.32 242.63 -0.00 -0.00
## (137.23) (144.10) (0.00) (0.00)
## kaggle_income_s$mindist^2 -3.57 *** -3.58 ***
## (0.99) (1.04)
## kaggle_income_s$mindist^3 0.01 *** 0.01 ***
## (0.00) (0.00)
## kaggle_income_s$mindist^4 -0.00 *** -0.00 ***
## (0.00) (0.00)
## lambda 0.06 *** 0.08 *** 0.08 ***
## (0.01) (0.01) (0.01)
## log(kaggle_income_s$mindist) -0.06 ** -0.06 *
## (0.02) (0.02)
## -------------------------------------------------------------------------------------------------------------------------
## AIC 76839.07 10467.95
## BIC 76875.11 10485.97
## Log Likelihood -38413.54 -38406.58 -5214.52 -5230.98 -5216.83
## Deviance 23254375025620.02 5743.41
## Num. obs. 3000 3000 3000 3000 3000 3000
## Parameters 7 4 4
## AIC (Linear model) 76839.07 10462.60 10467.95
## AIC (Spatial model) 76827.15 10437.03 10441.65
## LR test: statistic 13.92 27.57 28.30
## LR test: p-value 0.00 0.00 0.00
## R^2 0.00
## Adj. R^2 0.00
## =========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Spatial models are doing better than a-spatial
AIC.all<-matrix(0)
models<-c("spatial.multi.med","spatial.power.med","spatial.exp.med")
for(i in 1:3){
AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 76839.07 76827.15 10462.60 10437.03 10467.95 10441.65
Best SRMSE is for multinomial model
SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
models<-c("OLS.multi.med", "spatial.multi.med", "OLS.power.med", "spatial.power.med", "OLS.exp.med", "spatial.exp.med")
SRMSE.all<-matrix(0)
for(i in 1:6){
SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Median")}
SRMSE.all
## [1] 1.028027 1.024558 1.440770 1.440770 1.440770 1.440770
kaggle_income_s$Mean[kaggle_income_s$Mean==0]<-2 # eliminating -inf for log(0)
#multinomial
eq.m = kaggle_income_s$Mean~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.mean<-glm(eq.m)
spatial.multi.mean<-spatialreg::errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
# power
eq.p = log(kaggle_income_s$Mean)~log(kaggle_income_s$mindist)
OLS.power.mean<-lm(eq.p)
spatial.power.mean<-spatialreg::errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
# exponential
eq.e = log(kaggle_income_s$Mean)~kaggle_income_s$mindist
OLS.exp.mean<-glm(eq.e)
spatial.exp.mean<-spatialreg::errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
screenreg(list(OLS.multi.mean, spatial.multi.mean, OLS.power.mean, spatial.power.mean, OLS.exp.mean, spatial.exp.mean))
##
## ========================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## ------------------------------------------------------------------------------------------------------------------------
## (Intercept) 69447.60 *** 69733.13 *** 11.10 *** 11.10 *** 10.91 *** 10.92 ***
## (1902.84) (2334.17) (0.10) (0.11) (0.03) (0.04)
## kaggle_income_s$mindist 128.57 ** 122.66 * -0.00 -0.00
## (46.49) (56.57) (0.00) (0.00)
## kaggle_income_s$mindist^2 -1.81 *** -1.74 ***
## (0.33) (0.41)
## kaggle_income_s$mindist^3 0.01 *** 0.01 ***
## (0.00) (0.00)
## kaggle_income_s$mindist^4 -0.00 *** -0.00 ***
## (0.00) (0.00)
## lambda 0.26 *** 0.08 *** 0.08 ***
## (0.01) (0.01) (0.01)
## log(kaggle_income_s$mindist) -0.04 * -0.04 *
## (0.02) (0.02)
## ------------------------------------------------------------------------------------------------------------------------
## AIC 70344.97 9707.68
## BIC 70381.01 9725.70
## Log Likelihood -35166.49 -35002.44 -4835.01 -4850.84 -4836.79
## Deviance 2669238635769.98 4457.68
## Num. obs. 3000 3000 3000 3000 3000 3000
## Parameters 7 4 4
## AIC (Linear model) 70344.97 9703.71 9707.68
## AIC (Spatial model) 70018.88 9678.03 9681.58
## LR test: statistic 328.09 27.68 28.10
## LR test: p-value 0.00 0.00 0.00
## R^2 0.00
## Adj. R^2 0.00
## ========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Spatial models are doing better than a-spatial
AIC.all<-matrix(0)
models<-c("spatial.multi.mean","spatial.power.mean","spatial.exp.mean")
for(i in 1:3){
AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 70344.972 70018.878 9703.713 9678.028 9707.681 9681.584
Best SRMSE is for multinomial model
SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
models<-c("OLS.multi.mean", "spatial.multi.mean", "OLS.power.mean", "spatial.power.mean", "OLS.exp.mean", "spatial.exp.mean")
SRMSE.all<-matrix(0)
for(i in 1:6){
SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.4492500 0.4149251 1.0999516 1.0999515 1.0999516 1.0999515
kaggle_income_s$Stdev[kaggle_income_s$Stdev==0]<-2 # eliminating -inf for log(0)
#multinomial
eq.m = kaggle_income_s$Stdev~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.std<-glm(eq.m)
spatial.multi.std<-spatialreg::errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
# power
eq.p = log(kaggle_income_s$Stdev)~log(kaggle_income_s$mindist)
OLS.power.std<-lm(eq.p)
spatial.power.std<-spatialreg::errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
# exponential
eq.e = log(kaggle_income_s$Stdev)~kaggle_income_s$mindist
OLS.exp.std<-glm(eq.e)
spatial.exp.std<-spatialreg::errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
screenreg(list(OLS.multi.std, spatial.multi.std, OLS.power.std, spatial.power.std, OLS.exp.std, spatial.exp.std))
##
## =======================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## -----------------------------------------------------------------------------------------------------------------------
## (Intercept) 47922.57 *** 47888.62 *** 10.74 *** 10.74 *** 10.60 *** 10.60 ***
## (1041.47) (1219.64) (0.09) (0.10) (0.03) (0.04)
## kaggle_income_s$mindist 64.00 * 64.46 * -0.00 -0.00
## (25.45) (29.65) (0.00) (0.00)
## kaggle_income_s$mindist^2 -0.80 *** -0.79 ***
## (0.18) (0.21)
## kaggle_income_s$mindist^3 0.00 *** 0.00 ***
## (0.00) (0.00)
## kaggle_income_s$mindist^4 -0.00 *** -0.00 ***
## (0.00) (0.00)
## lambda 0.19 *** 0.07 *** 0.07 ***
## (0.01) (0.01) (0.01)
## log(kaggle_income_s$mindist) -0.03 -0.03
## (0.02) (0.02)
## -----------------------------------------------------------------------------------------------------------------------
## AIC 66728.65 9454.64
## BIC 66764.69 9472.65
## Log Likelihood -33358.32 -33271.78 -4712.41 -4724.32 -4713.65
## Deviance 799596354692.07 4097.10
## Num. obs. 3000 3000 3000 3000 3000 3000
## Parameters 7 4 4
## AIC (Linear model) 66728.65 9451.97 9454.64
## AIC (Spatial model) 66557.56 9432.83 9435.29
## LR test: statistic 173.09 21.14 21.34
## LR test: p-value 0.00 0.00 0.00
## R^2 0.00
## Adj. R^2 0.00
## =======================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Spatial models are doing better than a-spatial
AIC.all<-matrix(0)
models<-c("spatial.multi.std","spatial.power.std","spatial.exp.std")
for(i in 1:3){
AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 66728.647 66557.560 9451.966 9432.825 9454.636 9435.295
Best SRMSE is for multinomial model
SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
models<-c("OLS.multi.std", "spatial.multi.std", "OLS.power.std", "spatial.power.std", "OLS.exp.std", "spatial.exp.std")
SRMSE.all<-matrix(0)
for(i in 1:6){
SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.5372586 0.5258286 1.0999558 1.0999558 1.0999558 1.0999558
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, OLS.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")
plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, spatial.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")
par(mfrow=c(1,2))
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, OLS.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")
plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, spatial.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, OLS.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")
plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, spatial.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")
par(mfrow=c(1,2))
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, OLS.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")
plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, spatial.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, OLS.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")
plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, spatial.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")
par(mfrow=c(1,2))
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), OLS.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), spatial.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, OLS.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")
plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, spatial.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")
The results from all variables are similar. For all models spatial models are doing better than a-spatial and for all variables the best interactions functional form is the multinomial model. However looking at the plots and R2 and Adjusted R2 we can notice that the models perform very poorly. This might be because of the sampling, however we are not able to compare the results on the whole dataset.
Because of a lack of satisfactory results on points data, we dicided to move to aggregated data on county level and analyse using additional variables.
Because we didn’t find any interesting relations using point data, we decided to move to a more aggregated level, using county data. We decided to include additional explanatory variables.
zero_pad <- function(x) { ifelse(str_starts(x, "0"), x, paste0("0",x))}
As a base for all plotting purposes and for joining multiple sources together, we use a shapefile provided by the US Census Bureau. It contains all counties, the primary legal divisions of most states.
uscounties <- readOGR(".", "tl_2017_us_county")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "tl_2017_us_county"
## with 3233 features
## It has 17 fields
## Integer64 fields read as strings: ALAND AWATER
uscounties <- spTransform(uscounties, CRS("+proj=longlat +datum=NAD83"))
uscounties$GEOID = as.character(uscounties$GEOID)
uscounties$GEOID0 = uscounties$GEOID %>% sapply(zero_pad)
The dependent value, income per capita, has been downloaded from the Bureau of Economic Analysis. In the data, there are three types of variables: Median personal income in region, Population in region and Median personal income per capita. We take the newest data from 2018 for income per capita.
bea CAINC1: Annual Personal Income by County https://apps.bea.gov/regional/downloadzip.cfm
income_county <- read.csv("CAINC1__ALL_AREAS_1969_2018.csv", stringsAsFactors=FALSE)
income_county <- income_county[,c("GeoFIPS", "GeoName", "Description", "LineCode", "Unit","X2018")]
income_county <- income_county[1:(dim(income_county)[1]-3),]
income_county <- income_county[income_county$Description=="Per capita personal income (dollars) 2/",]
colnames(income_county) = c("FIPS", "County", "Description", "LineCode", "Unit","Income")
income_county = income_county[!as.character(income_county$FIPS) %>% sapply(str_trim) %>% sapply(str_ends, "000"),]
income_county$FIPS = as.character(income_county$FIPS) %>% sapply(str_trim)
income_county$Income = as.integer(income_county$Income)
income_county$FIPS0 = income_county$FIPS %>% sapply(zero_pad)
income_county = income_county[!is.na(income_county$Income),]
Using data provided by the National Association of Realtors we got the median home prices per county from 4th quarter in 2019. The prices are not from home sales but the values instead from the American Community Survey (ACS).
house_index <- read.csv2("HI_data.csv", stringsAsFactors=FALSE)
colnames(house_index) = c("FIPS", "Price.RangeQ42019", "Geography", "HousingIndex" )
house_index$FIPS <- paste0("0",house_index$FIPS)
house_index$FIPS0 = house_index$FIPS %>% sapply(zero_pad)
Data on Population and Education is provided by the United States Department of Agriculture. As a proxy of the level of education we decided to use the variable “Percent of adults with a bachelor’s degree or higher, 2014-18” and from Population data we got the population per county POP_ESTIMATE_2018 as well as the level of domestic migration DOMESTIC_MIG_2018.
education_county <- read.csv2("Education.csv", stringsAsFactors=FALSE)
colnames(education_county) = c("FIPS", "State", "Area.name", "Education.perc")
education_county$Education.perc = as.numeric(education_county$Education.perc)
education_county = education_county[!is.na(education_county$Education.perc),]
education_county$FIPS0 <- paste0("0",education_county$FIPS)
population_county <- read.csv2("Pop.csv", stringsAsFactors=FALSE)
colnames(population_county) = c("FIPS", "State", "Area.name", "Econ.Typology", "Dom.Migr", "Pop")
population_county$Dom.Migr = as.numeric(population_county$Dom.Migr)
population_county$Pop = as.numeric(population_county$Pop)
population_county = population_county[!is.na(population_county$Dom.Migr),]
population_county = population_county[!is.na(population_county$Pop),]
population_county$FIPS0 <- paste0("0",population_county$FIPS)
All dataframes with variables were merged to spatial dataframe uscounties using the FIPS code, which is an identifier of county. Then the dataframe was cleaned from missing variables by removing suchobservations.
uscounties = merge(uscounties, house_index, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, income_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, population_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, education_county, by.x="GEOID0", by.y="FIPS0")
uscounties = uscounties[!uscounties$STATEFP %in% c("15", "02"),]
uscounties = uscounties[!is.na(uscounties$Education.perc),]
uscounties = uscounties[!is.na(uscounties$HousingIndex),]
uscounties = uscounties[!is.na(uscounties$Income),]
uscounties = uscounties[!is.na(uscounties$Dom.Migr),]
uscounties = uscounties[!is.na(uscounties$Pop),]
In tabs below are visualised distributions of each variable for all counties.
variable<-uscounties$HousingIndex
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
variable<-uscounties$Income
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
variable<-uscounties$Education.perc
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
variable<-uscounties$Pop
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
variable<-uscounties$Dom.Migr
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
For out analysis we decided to choose the knn spatial weights matrix. We rejected the matrix based on radius, because in the south-east part of the country, the states are much more densely packed, and it would be difficult to find an optimal radius across the country. As for contiguity matrix, we decided that since some of the counties were removed, the neighbors might be not that accurate. That is why we dicded to go with the knn spatial weights matrix using k=1.
uscounties.df = as.data.frame(uscounties)
crds<-coordinates(uscounties)
# calculate weights matrix
us.knn<-knearneigh(crds, k=1)
us.knn.nb<-knn2nb(us.knn)
us.sym.knn.nb<-make.sym.nb(us.knn.nb)
us.sym.knn.listw<-nb2listw(us.sym.knn.nb)
We start the analysis by printing the summary of variables, including logarythmic transformations of the variables which we found to be a necessary transformation in later steps of the analysis. Because Domestic Migration variable takes negative and positive values, we decided to take into account only absolute values of the effect. We also added 1 to the variable in order to get rid of errors caused by applying a log to 0 value. All variables were added as new to the dataframe with the suffix “.l”.
uscounties.df %>%
dplyr::mutate(Income.l = log(Income),
Pop.l = log(Pop),
HousingIndex.l = log(HousingIndex),
Education.perc.l = log(Education.perc),
Dom.Migr.l = log(abs(Dom.Migr)+1)) %>%
dplyr::select(GEOID0,
Income, HousingIndex, Dom.Migr, Pop, Education.perc,
Income.l, HousingIndex.l, Dom.Migr.l, Pop.l, Education.perc.l) %>%
summary()
## GEOID0 Income HousingIndex Dom.Migr
## Length:3044 Min. : 18541 Min. : 35114 Min. :-98218.00
## Class :character 1st Qu.: 36505 1st Qu.: 98618 1st Qu.: -126.00
## Mode :character Median : 41836 Median : 129868 Median : -8.00
## Mean : 43944 Mean : 158383 Mean : 9.42
## 3rd Qu.: 48502 3rd Qu.: 180946 3rd Qu.: 161.00
## Max. :251728 Max. :1195477 Max. : 49531.00
## Pop Education.perc Income.l HousingIndex.l
## Min. : 276 Min. : 5.40 Min. : 9.828 Min. :10.47
## 1st Qu.: 11030 1st Qu.:15.00 1st Qu.:10.505 1st Qu.:11.50
## Median : 26010 Median :19.10 Median :10.642 Median :11.77
## Mean : 105012 Mean :21.40 Mean :10.660 Mean :11.84
## 3rd Qu.: 67556 3rd Qu.:25.32 3rd Qu.:10.789 3rd Qu.:12.11
## Max. :10073906 Max. :74.60 Max. :12.436 Max. :13.99
## Dom.Migr.l Pop.l Education.perc.l
## Min. : 0.000 Min. : 5.620 Min. :1.686
## 1st Qu.: 3.951 1st Qu.: 9.308 1st Qu.:2.708
## Median : 4.920 Median :10.166 Median :2.950
## Mean : 5.037 Mean :10.283 Mean :2.983
## 3rd Qu.: 6.065 3rd Qu.:11.121 3rd Qu.:3.232
## Max. :11.495 Max. :16.125 Max. :4.312
uscounties.df = uscounties.df %>% dplyr::mutate(Income.l = log(Income),
Pop.l = log(Pop),
HousingIndex.l = log(HousingIndex),
Education.perc.l = log(Education.perc),
Dom.Migr.l = log(abs(Dom.Migr)+1))
As a next step we calculated the correlations of each variables with each other and plotted the results. The correlations are significant and seem pretty promising.
uscounties.df[,c("Income.l","Dom.Migr.l","HousingIndex.l","Pop.l","Education.perc.l")] %>% chart.Correlation(histogram=TRUE, pch=19)
Next we decided to check the spatial autocorrelation of the dependent variable using Morans I. Basing on the results we can conclude that there is significant positive autocorrelation of the variable.
moran.test(uscounties.df$Income.l, us.sym.knn.listw )
##
## Moran I test under randomisation
##
## data: uscounties.df$Income.l
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = 25.277, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.5494976506 -0.0003286231 0.0004731465
We later plotted the results as a scatterplot of the Income dependent on a spatially lagged value of the Income.
variable<-uscounties.df$Income.l
variable.std<-((variable-mean(variable))/sd(variable))
moran.plot(variable.std, us.sym.knn.listw, pch=20, labels=F, quiet=T)
The results of the moran test, that is in which quarter in the plot the county landed, was plotted on a map. We can notice a concentration of the III quarter - that is low surrounded by low - in the Southern region, in states like Mississippi, Alabama, Arkansas, Kentucky. The states in the Ist quarter are in the Midwestern region (Dakotas, Kansas, Nebraska, Iowa), Central Atlantic (New York, Pennsylvania, Delaware, New Jersey) and theindividual states, like California and parts of Washington.
w.var<-lag.listw(us.sym.knn.listw, variable.std)
cond1<-ifelse(variable.std>=0 & w.var>=0, 1,0) # I quarter
cond2<-ifelse(variable.std>=0 & w.var<0, 2,0) # II quarter
cond3<-ifelse(variable.std<0 & w.var<0, 3,0) # III quarter
cond4<-ifelse(variable.std<0 & w.var>=0, 4,0) # IV quarter
cond.all<-cond1+cond2+cond3+cond4 # all quarters in one
cond<-as.data.frame(cond.all)
# map
brks<-c(1,2,3,4)
cols<-c("#EFF3FF", "#6BAED6", "#2171B5", "#BDD7E7")
plot(uscounties, col=cols[findInterval(cond$cond.all, brks)], lty=0)
plot(us_states$geometry, add=T)
legend("bottomleft", legend=c("I Q - HH – high surrounded by high", "II Q - LH - low surrounded by high", "III Q - LL - low surrounded by low", "IV Q - HL - high surrounded by low"), fill=cols, bty="n", cex=0.80)
title(main="Mapping of Moran scatterplot results for Income")
We also calculated the joint count test. Below can be seen quantiles of variable Income.l
uscounties.df$Income.l %>% quantile()
## 0% 25% 50% 75% 100%
## 9.82774 10.50520 10.64151 10.78936 12.43610
# parameters of graphics
brks1<-c(0,10.4, 10.8, 13)
cols<-c("green", "blue", "red")
var.factor<-factor(cut(uscounties.df$Income.l, breaks=brks1, labels=c("low", "medium", "high")))
The scatterplot represents points representing values of income in given color ranges. We chose, using quantiles, 10.4, 10.8 and 13. Then the counties were colored acording to income group.
# scatterplot of values
plot(uscounties.df$Income.l, bg=cols[findInterval(log(uscounties.df$Income), brks1)], pch=21)
abline(h=brks1[2:4], lty=3)
# spatial distribution with three colours
plot(uscounties, col=cols[findInterval(uscounties.df$Income.l, brks1)])
title(main="Income log")
legend("bottomleft", legend=c("low", "medium", "high"), leglabs(brks1), fill=cols, bty="n")
For all income groups the p-value is very low so we cannot reject the null hypothesis that there is spatial correlation at each income level.
joincount.test(var.factor, us.sym.knn.listw)
##
## Join count test under nonfree sampling
##
## data: var.factor
## weights: us.sym.knn.listw
##
## Std. deviate for low = 13.771, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Same colour statistic Expectation Variance
## 63.20833 18.16563 10.69795
##
##
## Join count test under nonfree sampling
##
## data: var.factor
## weights: us.sym.knn.listw
##
## Std. deviate for medium = 9.508, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Same colour statistic Expectation Variance
## 736.58333 651.01709 80.98987
##
##
## Join count test under nonfree sampling
##
## data: var.factor
## weights: us.sym.knn.listw
##
## Std. deviate for high = 17.264, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Same colour statistic Expectation Variance
## 192.00000 85.06080 38.37036
We decided to start the modelling with the form of model \[log(Income=log(Pop)+log(HousingIndex)+log(Education.perc)+log(|Dom.Migr|+1)\]
form = log(Income)~log(Pop)+log(HousingIndex)+log(Education.perc)+log(abs(Dom.Migr)+1)
It turns out that the model using OLS is not vary accurate with an R2 at 0.55. However all the variables turn out to be significant apart from Dom.Migr.l.
#OLS
OLS_1<-lm(form, data=uscounties.df)
summary(OLS_1)
##
## Call:
## lm(formula = form, data = uscounties.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74902 -0.08740 -0.00307 0.08320 1.11212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.016789 0.090750 88.339 <2e-16 ***
## log(Pop) -0.028099 0.003211 -8.751 <2e-16 ***
## log(HousingIndex) 0.164470 0.009853 16.692 <2e-16 ***
## log(Education.perc) 0.337919 0.010767 31.383 <2e-16 ***
## log(abs(Dom.Migr) + 1) -0.004681 0.002598 -1.802 0.0717 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1583 on 3039 degrees of freedom
## Multiple R-squared: 0.5525, Adjusted R-squared: 0.5519
## F-statistic: 938.2 on 4 and 3039 DF, p-value: < 2.2e-16
Due to a low pvalue of the Global Morans I test we reject the hypothesis of a lack of spatial autocorrelation. That means that the OLS model is not a correct specification of the model and spatial ecnometric models should be used. We don’t know however what form of spatial variables should be added and so in the next steps we will try to find the optimal functional form.
lm.morantest(OLS_1, us.sym.knn.listw)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = form, data = uscounties.df)
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = 17.301, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3755644435 -0.0009809063 0.0004736751
The first used model is the Manski model, which uses all possible spatial paameters, that is spatial lag of y, lag of x and lag of error. As a result we get a model with all parameters significant, apart from Dom.Migr.
# GNS, Manski model (all)
GNS_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)
## Warning: Function sacsarlm moved to the spatialreg package
summary(GNS_1, Nagelkerke=TRUE)
##
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw,
## listw2 = listw2, na.action = na.action, Durbin = Durbin,
## type = type, method = method, quiet = quiet, zero.policy = zero.policy,
## tol.solve = tol.solve, llprof = llprof, interval1 = interval1,
## interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5891645 -0.0654680 -0.0006591 0.0615183 1.1184286
##
## Type: sacmixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.4329632 0.1793139 19.1450 < 2.2e-16
## log(Pop) -0.0265383 0.0037500 -7.0768 1.475e-12
## log(HousingIndex) 0.2468577 0.0146717 16.8254 < 2.2e-16
## log(Education.perc) 0.2385731 0.0128092 18.6251 < 2.2e-16
## log(abs(Dom.Migr) + 1) -0.0017874 0.0025098 -0.7122 0.4763
## lag.log(Pop) 0.0158909 0.0039658 4.0070 6.149e-05
## lag.log(HousingIndex) -0.1910021 0.0155729 -12.2651 < 2.2e-16
## lag.log(Education.perc) -0.0744758 0.0150416 -4.9513 7.371e-07
## lag.log(abs(Dom.Migr) + 1) -0.0011030 0.0027596 -0.3997 0.6894
##
## Rho: 0.58165
## Asymptotic standard error: 0.02082
## z-value: 27.938, p-value: < 2.22e-16
## Lambda: -0.39088
## Asymptotic standard error: 0.026817
## z-value: -14.576, p-value: < 2.22e-16
##
## LR test value: 451.46, p-value: < 2.22e-16
##
## Log likelihood: 1519.414 for sacmixed model
## ML residual variance (sigma squared): 0.014826, (sigma: 0.12176)
## Nagelkerke pseudo-R-squared: 0.61421
## Number of observations: 3044
## Number of parameters estimated: 12
## AIC: -3014.8, (AIC for lm: -2575.4)
For this model and all following the results are similar, where are spatial parameters are significant apart from those related to Dom.Migr.l
# SAC (rho and lambda)
SAC_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAC_1, Nagelkerke=TRUE)
##
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw,
## listw2 = listw2, na.action = na.action, Durbin = Durbin,
## type = type, method = method, quiet = quiet, zero.policy = zero.policy,
## tol.solve = tol.solve, llprof = llprof, interval1 = interval1,
## interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68456244 -0.07887936 -0.00020813 0.07450683 1.23705580
##
## Type: sac
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.3496842 0.2056854 35.7327 <2e-16
## log(Pop) -0.0280881 0.0032579 -8.6216 <2e-16
## log(HousingIndex) 0.1754074 0.0111026 15.7988 <2e-16
## log(Education.perc) 0.3035556 0.0109026 27.8426 <2e-16
## log(abs(Dom.Migr) + 1) -0.0035918 0.0024970 -1.4384 0.1503
##
## Rho: 0.059567
## Asymptotic standard error: 0.021057
## z-value: 2.8288, p-value: 0.0046724
## Lambda: 0.20314
## Asymptotic standard error: 0.024835
## z-value: 8.1793, p-value: 2.2204e-16
##
## LR test value: 325.74, p-value: < 2.22e-16
##
## Log likelihood: 1456.556 for sac model
## ML residual variance (sigma squared): 0.02182, (sigma: 0.14772)
## Nagelkerke pseudo-R-squared: 0.59795
## Number of observations: 3044
## Number of parameters estimated: 8
## AIC: -2897.1, (AIC for lm: -2575.4)
# SDM (rho and x lags)
SDM_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="mixed")
summary(SDM_1, Nagelkerke=TRUE)
##
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, type = type, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7178e-01 -7.8710e-02 -3.6943e-05 7.1507e-02 1.2029e+00
##
## Type: mixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.0402592 0.1472418 41.0227 < 2.2e-16
## log(Pop) -0.0258719 0.0034648 -7.4670 8.193e-14
## log(HousingIndex) 0.2196932 0.0132181 16.6206 < 2.2e-16
## log(Education.perc) 0.2685165 0.0118578 22.6447 < 2.2e-16
## log(abs(Dom.Migr) + 1) -0.0033040 0.0024614 -1.3423 0.17949
## lag.log(Pop) 0.0067342 0.0036808 1.8296 0.06731
## lag.log(HousingIndex) -0.1178494 0.0140635 -8.3798 < 2.2e-16
## lag.log(Education.perc) 0.0163228 0.0133287 1.2246 0.22071
## lag.log(abs(Dom.Migr) + 1) -0.0017060 0.0027424 -0.6221 0.53388
##
## Rho: 0.26138, LR test value: 324.24, p-value: < 2.22e-16
## Asymptotic standard error: 0.014024
## z-value: 18.638, p-value: < 2.22e-16
## Wald statistic: 347.38, p-value: < 2.22e-16
##
## Log likelihood: 1483.516 for mixed model
## ML residual variance (sigma squared): 0.021088, (sigma: 0.14522)
## Nagelkerke pseudo-R-squared: 0.60501
## Number of observations: 3044
## Number of parameters estimated: 11
## AIC: -2945, (AIC for lm: -2622.8)
## LM test for residual autocorrelation
## test value: 68.352, p-value: < 2.22e-16
#SDEM (lambda and x lags)
SDEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, etype="emixed")
summary(SDEM_1)
##
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, etype = etype, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67294082 -0.07809245 -0.00083009 0.07230371 1.20764394
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.13936115 0.12355436 65.8768 < 2.2e-16
## log(Pop) -0.02618536 0.00334065 -7.8384 4.663e-15
## log(HousingIndex) 0.20520741 0.01204106 17.0423 < 2.2e-16
## log(Education.perc) 0.28722481 0.01130742 25.4014 < 2.2e-16
## log(abs(Dom.Migr) + 1) -0.00369600 0.00250748 -1.4740 0.1405
## lag.log(Pop) -0.00077273 0.00358410 -0.2156 0.8293
## lag.log(HousingIndex) -0.06125638 0.01283390 -4.7730 1.815e-06
## lag.log(Education.perc) 0.08924571 0.01233006 7.2381 4.552e-13
## lag.log(abs(Dom.Migr) + 1) -0.00215866 0.00282696 -0.7636 0.4451
##
## Lambda: 0.25972, LR test value: 317.67, p-value: < 2.22e-16
## Asymptotic standard error: 0.014054
## z-value: 18.48, p-value: < 2.22e-16
## Wald statistic: 341.53, p-value: < 2.22e-16
##
## Log likelihood: 1480.233 for error model
## ML residual variance (sigma squared): 0.021146, (sigma: 0.14542)
## Number of observations: 3044
## Number of parameters estimated: 11
## AIC: -2938.5, (AIC for lm: -2622.8)
#SAR (rho)
SAR_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAR_1)
##
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, type = type, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7082361 -0.0816300 -0.0023959 0.0754489 1.2198112
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.2286407 0.1307949 47.6214 < 2.2e-16
## log(Pop) -0.0240740 0.0030313 -7.9418 1.998e-15
## log(HousingIndex) 0.1346401 0.0095517 14.0960 < 2.2e-16
## log(Education.perc) 0.3014655 0.0102722 29.3476 < 2.2e-16
## log(abs(Dom.Migr) + 1) -0.0043433 0.0024406 -1.7796 0.07515
##
## Rho: 0.20707, LR test value: 288.25, p-value: < 2.22e-16
## Asymptotic standard error: 0.01214
## z-value: 17.056, p-value: < 2.22e-16
## Wald statistic: 290.92, p-value: < 2.22e-16
##
## Log likelihood: 1437.812 for lag model
## ML residual variance (sigma squared): 0.022118, (sigma: 0.14872)
## Number of observations: 3044
## Number of parameters estimated: 7
## AIC: -2861.6, (AIC for lm: -2575.4)
## LM test for residual autocorrelation
## test value: 22.041, p-value: 2.6685e-06
#SLX (lags of x)
SLX_1 <- lmSLX(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SLX_1)
##
## Call:
## lm(formula = formula(paste("y ~ ", paste(colnames(x)[-1], collapse = "+"))),
## data = as.data.frame(x), weights = weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74431 -0.08816 -0.00328 0.08023 1.07731
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.1924349 0.1007032 81.352 < 2e-16 ***
## log.Pop. -0.0257950 0.0037458 -6.886 6.92e-12 ***
## log.HousingIndex. 0.2050671 0.0142716 14.369 < 2e-16 ***
## log.Education.perc. 0.2897433 0.0127789 22.673 < 2e-16 ***
## log.abs.Dom.Migr....1. -0.0038651 0.0026609 -1.453 0.146
## lag.log.Pop. 0.0006433 0.0039586 0.162 0.871
## lag.log.HousingIndex. -0.0697066 0.0149178 -4.673 3.10e-06 ***
## lag.log.Education.perc. 0.0997334 0.0136496 7.307 3.48e-13 ***
## lag.log.abs.Dom.Migr....1. -0.0038089 0.0029642 -1.285 0.199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.157 on 3035 degrees of freedom
## Multiple R-squared: 0.5606, Adjusted R-squared: 0.5595
## F-statistic: 484 on 8 and 3035 DF, p-value: < 2.2e-16
#SEM (lambda)
SEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SEM_1)
##
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, etype = etype, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6717050 -0.0795737 -0.0011856 0.0745068 1.2373582
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.8395830 0.1009663 77.6455 <2e-16
## log(Pop) -0.0289286 0.0032799 -8.8199 <2e-16
## log(HousingIndex) 0.1898747 0.0106929 17.7571 <2e-16
## log(Education.perc) 0.2970953 0.0110358 26.9210 <2e-16
## log(abs(Dom.Migr) + 1) -0.0032463 0.0024821 -1.3079 0.1909
##
## Lambda: 0.26514, LR test value: 320.26, p-value: < 2.22e-16
## Asymptotic standard error: 0.01401
## z-value: 18.925, p-value: < 2.22e-16
## Wald statistic: 358.14, p-value: < 2.22e-16
##
## Log likelihood: 1453.818 for error model
## ML residual variance (sigma squared): 0.021474, (sigma: 0.14654)
## Number of observations: 3044
## Number of parameters estimated: 7
## AIC: -2893.6, (AIC for lm: -2575.4)
Below is again the moran test on OLS residuals as above, signifying a spatial autocorrelation not included in this functional form. Below we calculated the Morans I test for the GNS model, and at p-value 0.62 we can conclude that the spatial autocorrelation has been filtered out by the GNS model form. What is interesting, both SAR and SLX models did not get rid of the autocorrelation, so neither rho or lags of x are resposible for that. However only adding lambda in SEM, gave a pvalue of 0.84 which can lead to believe that this is the primary factor removing the autocorrelation.
lm.morantest(OLS_1, us.sym.knn.listw)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = form, data = uscounties.df)
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = 17.301, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3755644435 -0.0009809063 0.0004736751
moran.test(GNS_1$residuals, us.sym.knn.listw)
##
## Moran I test under randomisation
##
## data: GNS_1$residuals
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = -0.31167, p-value = 0.6224
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.0071052191 -0.0003286231 0.0004727383
moran.test(SAR_1$residuals, us.sym.knn.listw)
##
## Moran I test under randomisation
##
## data: SAR_1$residuals
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = 2.5878, p-value = 0.004829
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.0559438427 -0.0003286231 0.0004728473
moran.test(SLX_1$residuals, us.sym.knn.listw)
##
## Moran I test under randomisation
##
## data: SLX_1$residuals
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = 17.494, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.3801823777 -0.0003286231 0.0004731032
moran.test(SEM_1$residuals, us.sym.knn.listw)
##
## Moran I test under randomisation
##
## data: SEM_1$residuals
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = -1.0043, p-value = 0.8424
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.0221669750 -0.0003286231 0.0004728189
As a first step we used the Likelihood ratio test to compare the most unlimited model, that is Manski model, that consists of all possible parameters - spatial lags of y, spatial lag of x and spatial lag of error u. This test as a null hypothesis assumes that the limited model is better, since the distances between the parameters are small. However tests compared to all possible limited models show that the difference is significant. We also copmared the model to OLS using Wald and LR tests, which also resulted in GNS being the better fitted model. Since GNS, being the most general model, is better than any limited model we conclude GNS is the best fitted model.
spatialreg::LR.sarlm(GNS_1, SAC_1)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 125.72, df = 4, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SAC_1
## 1519.414 1456.556
spatialreg::LR.sarlm(GNS_1, SDM_1)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 71.798, df = 1, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SDM_1
## 1519.414 1483.516
spatialreg::LR.sarlm(GNS_1, SDEM_1)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 78.362, df = 1, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SDEM_1
## 1519.414 1480.233
spatialreg::LR.sarlm(GNS_1, SAR_1)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 163.2, df = 5, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SAR_1
## 1519.414 1437.812
spatialreg::LR.sarlm(GNS_1, SLX_1)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 396.04, df = 2, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SLX_1
## 1519.414 1321.396
spatialreg::LR.sarlm(GNS_1, SEM_1)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 131.19, df = 5, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SEM_1
## 1519.414 1453.818
spatialreg::LR1.sarlm(GNS_1)
##
## Likelihood Ratio diagnostics for spatial dependence
##
## data:
## Likelihood ratio = 451.46, df = 6, p-value < 2.2e-16
## sample estimates:
## Log likelihood of spatial lag model Log likelihood of OLS fit y
## 1519.414 1293.686
spatialreg::Wald1.sarlm(GNS_1)
##
## Wald diagnostics for spatial dependence
##
## data:
## Wald statistic = 212.45, df = 6, p-value < 2.2e-16
## sample estimates:
## lambda
## -0.3908798
According to the Akaike Information Criterion, Manski model turns out to be once again the best having the lowest AIC value of -3015.
AIC(GNS_1, SAC_1, SDM_1, SDEM_1, SAR_1, SLX_1, SEM_1, OLS_1)
As we could see in the models above, the variable with Domestic migration turned out to be insignificant for all models, so we decided to check the results after removing that variable. The OLS model still has spatial autocorrelation of residuals according to morans test. However just as with the previous form it performs very poorly with an Adj R2 0.55.
form2 = log(Income)~log(Pop)+log(HousingIndex)+log(Education.perc)
#OLS
OLS_2<-lm(form2, data=uscounties.df)
summary(OLS_2)
##
## Call:
## lm(formula = form2, data = uscounties.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75627 -0.08959 -0.00245 0.08357 1.11011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.053951 0.088407 91.10 <2e-16 ***
## log(Pop) -0.031938 0.002403 -13.29 <2e-16 ***
## log(HousingIndex) 0.162590 0.009801 16.59 <2e-16 ***
## log(Education.perc) 0.338255 0.010770 31.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1584 on 3040 degrees of freedom
## Multiple R-squared: 0.5521, Adjusted R-squared: 0.5516
## F-statistic: 1249 on 3 and 3040 DF, p-value: < 2.2e-16
lm.morantest(OLS_2, us.sym.knn.listw)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = form2, data = uscounties.df)
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = 17.353, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3767650507 -0.0009075540 0.0004736819
Using the Nagelkerke Pseudo R2 we can see that the model fit is good.
# GNS, Manski model (all)
GNS_2<-sacsarlm(form2, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)
## Warning: Function sacsarlm moved to the spatialreg package
summary(GNS_2, Nagelkerke=TRUE)
##
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw,
## listw2 = listw2, na.action = na.action, Durbin = Durbin,
## type = type, method = method, quiet = quiet, zero.policy = zero.policy,
## tol.solve = tol.solve, llprof = llprof, interval1 = interval1,
## interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59166624 -0.06558576 -0.00034357 0.06061551 1.11769690
##
## Type: sacmixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.4472105 0.1788402 19.2754 < 2.2e-16
## log(Pop) -0.0280846 0.0030354 -9.2525 < 2.2e-16
## log(HousingIndex) 0.2464380 0.0146566 16.8142 < 2.2e-16
## log(Education.perc) 0.2386296 0.0128075 18.6320 < 2.2e-16
## lag.log(Pop) 0.0151882 0.0032144 4.7250 2.301e-06
## lag.log(HousingIndex) -0.1920608 0.0155290 -12.3678 < 2.2e-16
## lag.log(Education.perc) -0.0748255 0.0150327 -4.9775 6.440e-07
##
## Rho: 0.58284
## Asymptotic standard error: 0.020736
## z-value: 28.107, p-value: < 2.22e-16
## Lambda: -0.39173
## Asymptotic standard error: 0.026744
## z-value: -14.647, p-value: < 2.22e-16
##
## LR test value: 452.52, p-value: < 2.22e-16
##
## Log likelihood: 1518.322 for sacmixed model
## ML residual variance (sigma squared): 0.014809, (sigma: 0.12169)
## Nagelkerke pseudo-R-squared: 0.61394
## Number of observations: 3044
## Number of parameters estimated: 10
## AIC: -3016.6, (AIC for lm: -2574.1)
We also caluclated Morans I over GNS residuals with the new functional form and we can see the lack of spatial autocorrelation.
moran.test(GNS_2$residuals, us.sym.knn.listw)
##
## Moran I test under randomisation
##
## data: GNS_2$residuals
## weights: us.sym.knn.listw
##
## Moran I statistic standard deviate = -0.30914, p-value = 0.6214
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.0070502370 -0.0003286231 0.0004727459
By comparing two models - GNS1 and GNS2 - with each other using the LogLikelihood Ratio test, we compare both with OLS and check which one has a higher log likelohood ratio. It turns out that GNS2, without Domestic Migration, is the slightly better performing one.
spatialreg::LR1.sarlm(GNS_1)
##
## Likelihood Ratio diagnostics for spatial dependence
##
## data:
## Likelihood ratio = 451.46, df = 6, p-value < 2.2e-16
## sample estimates:
## Log likelihood of spatial lag model Log likelihood of OLS fit y
## 1519.414 1293.686
spatialreg::LR1.sarlm(GNS_2)
##
## Likelihood Ratio diagnostics for spatial dependence
##
## data:
## Likelihood ratio = 452.52, df = 5, p-value < 2.2e-16
## sample estimates:
## Log likelihood of spatial lag model Log likelihood of OLS fit y
## 1518.322 1292.062
Because GNS1 can also be interpreted as an unlimited version of GNS2, we calculated the LR test to compare each other and it turns out, that once again GNS2 - the limited model - is better.
spatialreg::LR.sarlm(GNS_1, GNS_2)
##
## Likelihood ratio for spatial linear models
##
## data:
## Likelihood ratio = 2.1857, df = 2, p-value = 0.3353
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of GNS_2
## 1519.414 1518.322
According to the BP test, both LM model and GNS model show spatial heteroscedasticity due to significantly rejected null hypothesis about the constant variance of residuals. The LOSH statistic for GNS and OLS is very similar and shows that local homogeneity can be found for mainly in areas far from big cities, like in the north of Minnesota National Forests, deserts of Nevada and other national parks and deserted areas. The highest heterogeneity is visible e.g. in the South of Arizona, around San Francisco and near the coast of North Carolina.
plot(x = OLS_2$fitted.values, y = uscounties.df$Income)
plot(x = GNS_2$fitted.values, y = uscounties.df$Income)
bptest(OLS_2)
##
## studentized Breusch-Pagan test
##
## data: OLS_2
## BP = 175.44, df = 3, p-value < 2.2e-16
bptest.sarlm(GNS_2)
## Warning: Method bptest.sarlm moved to the spatialreg package
##
## studentized Breusch-Pagan test
##
## data:
## BP = 143.13, df = 6, p-value < 2.2e-16
losh.ols = LOSH(OLS_2$residuals, us.sym.knn.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude)
losh.gns = LOSH(GNS_2$residuals, us.sym.knn.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude)
shades<-auto.shading(losh.gns[,1], n=5, cols=brewer.pal(9, "Blues")[5:9])
choropleth(uscounties, losh.gns[,1], main="Spatial distribution of LOSH
statistics for residuals from the GNS model", shading = shades, lty=0)
plot(us_states$geometry, add=T)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
shades<-auto.shading(losh.ols[,1], n=5, cols=brewer.pal(9, "Blues")[5:9])
choropleth(uscounties, losh.ols[,1], main="Spatial distribution of LOSH
statistics for residuals from the LM model ", shading = shades, lty=0)
plot(us_states$geometry, add=T)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")
Both OLS and GNS turned out to be spatially heteroskedastic, however they assume that the values are constant in space. One solution to that problem is using the Geographically Weighted Regression (GWR) model. It takes into account spatial autocorrelation as well as spatial heteroskedasticity.
The new dataset on county level however turned out to be much more promising than the point data. This is why as a followup analysis we would suggest performing GWR to find out conclusions that are localised.