Project overview

The scope of this project is to analyse household income in the USA at the county level and as a function of the distance from state capital cities. We begin with exploratory spatial data analysis (ESDA), then perform Moran's I tests and estimate interaction models. To enrich the analysis, additional explanatory variables are added. Finally, the models are compared and their quality is assessed.

For this project the data from https://www.kaggle.com/goldenoakresearch/us-household-income-stats-geo-locations is used.

The point data contain income information as Mean, Median and Standard deviation values. We expect the median to describe the income of the “middle” person living in an area, whereas the standard deviation describes the level of income inequality within the area.

head(kaggle_income)

ESDA

The distributions of mean, median and standard deviation look quite similar, but it is visible that in some counties the median is quite high, which can indicate that there are observations with marked income inequality.
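
The histograms summarised above can be reproduced with a sketch along these lines (a minimal sketch, assuming kaggle_income contains the numeric columns Mean, Median and Stdev):

par(mfrow=c(1,3))
hist(kaggle_income$Mean, breaks=50, col="lightblue", main="Mean income", xlab="USD")
hist(kaggle_income$Median, breaks=50, col="lightblue", main="Median income", xlab="USD")
hist(kaggle_income$Stdev, breaks=50, col="lightblue", main="Std. deviation of income", xlab="USD")
par(mfrow=c(1,1))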

Points plot on map

The tabs below show the points plotted on the map, coloured by the values of each variable.

median

variable <- kaggle_income$Median
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")

mean

variable <- kaggle_income$Mean
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")

std dev

variable <- kaggle_income$Stdev
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")

Moran’s I

As a spatial weights matrix we decided to use the knn matrix with k=1.

inc.knn<-knearneigh(coords.m, k=1) # knn class (k=1)
inc.knn.nb<-knn2nb(inc.knn)
inc.knn.nb<-make.sym.nb(inc.knn.nb)
inc.knn.listw<-nb2listw(inc.knn.nb)

We conducted a Moran's I test for each variable. All tests are significant: the low p-values allow us to reject the null hypothesis and conclude that there is spatial autocorrelation. The Moran's I statistics are positive, indicating positive spatial autocorrelation, i.e. similar values neighbour each other more often than they would at random. The tabs below show the Moran scatterplot for each variable.
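
For reference, the statistic tested below is the global Moran's I, \[ I = \frac{n}{\sum_i \sum_j w_{ij}} \cdot \frac{\sum_i \sum_j w_{ij}(x_i-\bar{x})(x_j-\bar{x})}{\sum_i (x_i-\bar{x})^2}, \] where \(w_{ij}\) are the spatial weights; under the null hypothesis its expected value is \(E[I]=-1/(n-1)\), so values well above this indicate positive spatial autocorrelation.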

Median

moran.test(kaggle_income$Median, inc.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Median  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 24.726, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      1.693882e-01     -3.134600e-05      4.694837e-05
x<-kaggle_income$Median
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="coral3")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Mean

moran.test(kaggle_income$Mean, inc.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Mean  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 85.088, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      5.829837e-01     -3.134600e-05      4.694889e-05
x<-kaggle_income$Mean
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="darkcyan")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Std dev

moran.test(kaggle_income$Stdev, inc.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Stdev  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 67.374, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      4.616191e-01     -3.134600e-05      4.695085e-05
x<-kaggle_income$Stdev
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="chartreuse4")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Distance calculation

We select the state capital cities, calculate the distance from every observation to each capital, and plot the distance to the nearest capital on the map.

state.capital.cities <- us.cities[us.cities$capital==2,c("name", "pop", "lat", "long")]
state.capital.cities <- state.capital.cities[state.capital.cities$name != "Honolulu HI",]
coords <- kaggle_income[,c("Lon", "Lat")]
coords <- as.matrix(coords)
for (city.name in state.capital.cities$name) {
  col.city.name = paste0("dist_",city.name)
  lat = state.capital.cities[state.capital.cities$name==city.name,]$lat
  long = state.capital.cities[state.capital.cities$name==city.name,]$long
  kaggle_income[col.city.name]<-spDistsN1(coords, c(long, lat), longlat=TRUE)
}
kaggle_income$mindist <- NULL
# columns 20 onwards hold the dist_* columns created above; take the row-wise minimum
kaggle_income$mindist = apply(kaggle_income[,20:length(colnames(kaggle_income))], 1, min)
cols = rev(brewer.pal(9, "Blues"))
mindist <- kaggle_income$mindist
brks <- stats::quantile(mindist, probs=seq(0, 1, 0.13))

plot(usmap)
points(kaggle_income[,c("Lon", "Lat")], col=cols[findInterval(mindist, brks)], pch=21, cex=0.7,
       bg=cols[findInterval(mindist, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
title(main="Distances from closest state capital")

Interaction models

Below we model each income variable as a function of the distance to the nearest state capital, using a sample of 3000 points. Modelling on the whole dataset was not feasible due to computational limitations.
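
The three distance-decay forms estimated below (this only restates the formulas used in the code; the "multinomial" form is a fourth-degree polynomial in the distance \(d\) = mindist) are \[ y = \beta_0 + \beta_1 d + \beta_2 d^2 + \beta_3 d^3 + \beta_4 d^4 + \varepsilon \quad \text{(multinomial)}, \] \[ \log y = \beta_0 + \beta_1 \log d + \varepsilon \quad \text{(power)}, \qquad \log y = \beta_0 + \beta_1 d + \varepsilon \quad \text{(exponential)}, \] and each form is estimated both by OLS and as a spatial error model.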

kaggle_income_s = dplyr::sample_n(kaggle_income, 3000)

coords.s = kaggle_income_s[,c("Lon", "Lat")]
coords.m.s = as.matrix(coords.s)

inc.knn<-knearneigh(coords.m.s, k=1) # knn class (k=1)
inc.knn.nb<-knn2nb(inc.knn)
inc.knn.nb<-make.sym.nb(inc.knn.nb)

# knn as listw class
inc.knn.listw.s<-nb2listw(inc.knn.nb)

Median

kaggle_income_s$Median[kaggle_income_s$Median==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Median~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.med<-glm(eq.m)
spatial.multi.med<-spatialreg::errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
# power
eq.p = log(kaggle_income_s$Median)~log(kaggle_income_s$mindist)
OLS.power.med<-lm(eq.p)
spatial.power.med<-spatialreg::errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
# exponential
eq.e = log(kaggle_income_s$Median)~kaggle_income_s$mindist
OLS.exp.med<-glm(eq.e)
spatial.exp.med<-spatialreg::errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
screenreg(list(OLS.multi.med, spatial.multi.med, OLS.power.med, spatial.power.med, OLS.exp.med, spatial.exp.med))
## 
## =========================================================================================================================
##                               Model 1                Model 2        Model 3      Model 4       Model 5       Model 6     
## -------------------------------------------------------------------------------------------------------------------------
## (Intercept)                            92914.72 ***   92795.39 ***    11.16 ***     11.17 ***     10.91 ***     10.92 ***
##                                        (5616.45)      (5904.51)       (0.11)        (0.12)        (0.04)        (0.04)   
## kaggle_income_s$mindist                  239.32         242.63                                    -0.00         -0.00    
##                                         (137.23)       (144.10)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2                 -3.57 ***      -3.58 ***                                                       
##                                           (0.99)         (1.04)                                                          
## kaggle_income_s$mindist^3                  0.01 ***       0.01 ***                                                       
##                                           (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4                 -0.00 ***      -0.00 ***                                                       
##                                           (0.00)         (0.00)                                                          
## lambda                                                    0.06 ***                   0.08 ***                    0.08 ***
##                                                          (0.01)                     (0.01)                      (0.01)   
## log(kaggle_income_s$mindist)                                          -0.06 **      -0.06 *                              
##                                                                       (0.02)        (0.02)                               
## -------------------------------------------------------------------------------------------------------------------------
## AIC                                    76839.07                                                10467.95                  
## BIC                                    76875.11                                                10485.97                  
## Log Likelihood                        -38413.54      -38406.58                   -5214.52      -5230.98      -5216.83    
## Deviance                      23254375025620.02                                                 5743.41                  
## Num. obs.                               3000           3000         3000          3000          3000          3000       
## Parameters                                                7                          4                           4       
## AIC (Linear model)                                    76839.07                   10462.60                    10467.95    
## AIC (Spatial model)                                   76827.15                   10437.03                    10441.65    
## LR test: statistic                                       13.92                      27.57                       28.30    
## LR test: p-value                                          0.00                       0.00                        0.00    
## R^2                                                                    0.00                                              
## Adj. R^2                                                               0.00                                              
## =========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Based on the AIC values below, the spatial models perform better than their a-spatial counterparts.

AIC.all<-matrix(0)
models<-c("spatial.multi.med","spatial.power.med","spatial.exp.med")
for(i in 1:3){
 AIC.all[2*i-1]<-get(models[i])$AIC_lm.model # AIC of the corresponding a-spatial (OLS) model
 AIC.all[2*i]<-AIC(get(models[i]))           # AIC of the spatial error model
}
AIC.all
## [1] 76839.07 76827.15 10462.60 10437.03 10467.95 10441.65

The best SRMSE is obtained for the multinomial model. Note, however, that the fitted values of the power and exponential models are on the log scale, so their SRMSE values are not directly comparable with those of the multinomial model, which is estimated in levels.

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
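
The function above implements the standardised root mean squared error \[ \text{SRMSE} = \frac{1}{\bar{y}} \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)^2}, \] i.e. the RMSE of the fitted values divided by the mean of the observed variable.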

models<-c("OLS.multi.med", "spatial.multi.med", "OLS.power.med", "spatial.power.med", "OLS.exp.med", "spatial.exp.med")
SRMSE.all<-matrix(0)
for(i in 1:6){
 SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Median")}
SRMSE.all
## [1] 1.028027 1.024558 1.440770 1.440770 1.440770 1.440770

Mean

kaggle_income_s$Mean[kaggle_income_s$Mean==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Mean~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.mean<-glm(eq.m)
spatial.multi.mean<-spatialreg::errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
# power
eq.p = log(kaggle_income_s$Mean)~log(kaggle_income_s$mindist)
OLS.power.mean<-lm(eq.p)
spatial.power.mean<-spatialreg::errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
# exponential
eq.e = log(kaggle_income_s$Mean)~kaggle_income_s$mindist
OLS.exp.mean<-glm(eq.e)
spatial.exp.mean<-spatialreg::errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
screenreg(list(OLS.multi.mean, spatial.multi.mean, OLS.power.mean, spatial.power.mean, OLS.exp.mean, spatial.exp.mean))
## 
## ========================================================================================================================
##                               Model 1               Model 2        Model 3      Model 4       Model 5       Model 6     
## ------------------------------------------------------------------------------------------------------------------------
## (Intercept)                           69447.60 ***   69733.13 ***    11.10 ***     11.10 ***     10.91 ***     10.92 ***
##                                       (1902.84)      (2334.17)       (0.10)        (0.11)        (0.03)        (0.04)   
## kaggle_income_s$mindist                 128.57 **      122.66 *                                  -0.00         -0.00    
##                                         (46.49)        (56.57)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2                -1.81 ***      -1.74 ***                                                       
##                                          (0.33)         (0.41)                                                          
## kaggle_income_s$mindist^3                 0.01 ***       0.01 ***                                                       
##                                          (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4                -0.00 ***      -0.00 ***                                                       
##                                          (0.00)         (0.00)                                                          
## lambda                                                   0.26 ***                   0.08 ***                    0.08 ***
##                                                         (0.01)                     (0.01)                      (0.01)   
## log(kaggle_income_s$mindist)                                         -0.04 *       -0.04 *                              
##                                                                      (0.02)        (0.02)                               
## ------------------------------------------------------------------------------------------------------------------------
## AIC                                   70344.97                                                 9707.68                  
## BIC                                   70381.01                                                 9725.70                  
## Log Likelihood                       -35166.49      -35002.44                   -4835.01      -4850.84      -4836.79    
## Deviance                      2669238635769.98                                                 4457.68                  
## Num. obs.                              3000           3000         3000          3000          3000          3000       
## Parameters                                               7                          4                           4       
## AIC (Linear model)                                   70344.97                    9703.71                     9707.68    
## AIC (Spatial model)                                  70018.88                    9678.03                     9681.58    
## LR test: statistic                                     328.09                      27.68                       28.10    
## LR test: p-value                                         0.00                       0.00                        0.00    
## R^2                                                                   0.00                                              
## Adj. R^2                                                              0.00                                              
## ========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Based on the AIC values below, the spatial models again perform better than their a-spatial counterparts.

AIC.all<-matrix(0)
models<-c("spatial.multi.mean","spatial.power.mean","spatial.exp.mean")
for(i in 1:3){
 AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
 AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 70344.972 70018.878  9703.713  9678.028  9707.681  9681.584

The best SRMSE is again obtained for the multinomial model.

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.mean", "spatial.multi.mean", "OLS.power.mean", "spatial.power.mean", "OLS.exp.mean", "spatial.exp.mean")
SRMSE.all<-matrix(0)
for(i in 1:6){
 SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.4492500 0.4149251 1.0999516 1.0999515 1.0999516 1.0999515

Std dev

kaggle_income_s$Stdev[kaggle_income_s$Stdev==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Stdev~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.std<-glm(eq.m)
spatial.multi.std<-spatialreg::errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
# power
eq.p = log(kaggle_income_s$Stdev)~log(kaggle_income_s$mindist)
OLS.power.std<-lm(eq.p)
spatial.power.std<-spatialreg::errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
# exponential
eq.e = log(kaggle_income_s$Stdev)~kaggle_income_s$mindist
OLS.exp.std<-glm(eq.e)
spatial.exp.std<-spatialreg::errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
screenreg(list(OLS.multi.std, spatial.multi.std, OLS.power.std, spatial.power.std, OLS.exp.std, spatial.exp.std))
## 
## =======================================================================================================================
##                               Model 1              Model 2        Model 3      Model 4       Model 5       Model 6     
## -----------------------------------------------------------------------------------------------------------------------
## (Intercept)                          47922.57 ***   47888.62 ***    10.74 ***     10.74 ***     10.60 ***     10.60 ***
##                                      (1041.47)      (1219.64)       (0.09)        (0.10)        (0.03)        (0.04)   
## kaggle_income_s$mindist                 64.00 *        64.46 *                                  -0.00         -0.00    
##                                        (25.45)        (29.65)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2               -0.80 ***      -0.79 ***                                                       
##                                         (0.18)         (0.21)                                                          
## kaggle_income_s$mindist^3                0.00 ***       0.00 ***                                                       
##                                         (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4               -0.00 ***      -0.00 ***                                                       
##                                         (0.00)         (0.00)                                                          
## lambda                                                  0.19 ***                   0.07 ***                    0.07 ***
##                                                        (0.01)                     (0.01)                      (0.01)   
## log(kaggle_income_s$mindist)                                        -0.03         -0.03                                
##                                                                     (0.02)        (0.02)                               
## -----------------------------------------------------------------------------------------------------------------------
## AIC                                  66728.65                                                 9454.64                  
## BIC                                  66764.69                                                 9472.65                  
## Log Likelihood                      -33358.32      -33271.78                   -4712.41      -4724.32      -4713.65    
## Deviance                      799596354692.07                                                 4097.10                  
## Num. obs.                             3000           3000         3000          3000          3000          3000       
## Parameters                                              7                          4                           4       
## AIC (Linear model)                                  66728.65                    9451.97                     9454.64    
## AIC (Spatial model)                                 66557.56                    9432.83                     9435.29    
## LR test: statistic                                    173.09                      21.14                       21.34    
## LR test: p-value                                        0.00                       0.00                        0.00    
## R^2                                                                  0.00                                              
## Adj. R^2                                                             0.00                                              
## =======================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Based on the AIC values below, the spatial models again perform better than their a-spatial counterparts.

AIC.all<-matrix(0)
models<-c("spatial.multi.std","spatial.power.std","spatial.exp.std")
for(i in 1:3){
 AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
 AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 66728.647 66557.560  9451.966  9432.825  9454.636  9435.295

The best SRMSE is again obtained for the multinomial model.

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.std", "spatial.multi.std", "OLS.power.std", "spatial.power.std", "OLS.exp.std", "spatial.exp.std")
SRMSE.all<-matrix(0)
for(i in 1:6){
 SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.5372586 0.5258286 1.0999558 1.0999558 1.0999558 1.0999558

Model quality plots

Median

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, OLS.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, spatial.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, OLS.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, spatial.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Mean

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, OLS.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, spatial.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), OLS.power.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), spatial.power.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, OLS.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, spatial.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Standard Deviation

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, OLS.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, spatial.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), OLS.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), spatial.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, OLS.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, spatial.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Interaction results and modelling on county level

The results are similar across all variables. For every variable the spatial models perform better than the a-spatial ones, and the best interaction functional form is the multinomial model. However, looking at the plots and at the R^2 and adjusted R^2 values, we can see that the models perform very poorly. This might be an effect of the sampling, but we are unable to verify the results on the whole dataset.

Because of the lack of satisfactory results on the point data, we decided to move to data aggregated at the county level and to analyse it using additional variables.

County level with additional variables

Because we did not find any interesting relations using the point data, we moved to a more aggregated level, using county data, and included additional explanatory variables.

# prepend a leading zero to codes that lack one, so that FIPS keys match the GEOID codes after the same treatment
zero_pad <- function(x) { ifelse(str_starts(x, "0"), x, paste0("0",x))}

US counties shapefile

As a base for all plotting purposes and for joining multiple sources together, we use a shapefile provided by the US Census Bureau. It contains all counties, the primary legal divisions of most states.

https://catalog.data.gov/dataset/tiger-line-shapefile-2017-nation-u-s-current-county-and-equivalent-national-shapefile

uscounties <- readOGR(".", "tl_2017_us_county")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "tl_2017_us_county"
## with 3233 features
## It has 17 fields
## Integer64 fields read as strings:  ALAND AWATER
uscounties <- spTransform(uscounties, CRS("+proj=longlat +datum=NAD83"))
uscounties$GEOID = as.character(uscounties$GEOID)
uscounties$GEOID0 = uscounties$GEOID %>% sapply(zero_pad)

Personal income per county

The dependent variable, income per capita, was downloaded from the Bureau of Economic Analysis. The data contain three variables: personal income in the region, population of the region, and personal income per capita. We take the most recent data, for 2018, on income per capita.

bea CAINC1: Annual Personal Income by County https://apps.bea.gov/regional/downloadzip.cfm

income_county <- read.csv("CAINC1__ALL_AREAS_1969_2018.csv", stringsAsFactors=FALSE)
income_county <- income_county[,c("GeoFIPS", "GeoName", "Description", "LineCode", "Unit","X2018")]
income_county <- income_county[1:(dim(income_county)[1]-3),]
income_county <- income_county[income_county$Description=="Per capita personal income (dollars) 2/",]
colnames(income_county) = c("FIPS", "County", "Description", "LineCode", "Unit","Income")
income_county = income_county[!as.character(income_county$FIPS) %>% sapply(str_trim) %>% sapply(str_ends, "000"),]
income_county$FIPS = as.character(income_county$FIPS) %>% sapply(str_trim)
income_county$Income = as.integer(income_county$Income)
income_county$FIPS0 = income_county$FIPS %>% sapply(zero_pad)
income_county = income_county[!is.na(income_county$Income),]

Housing Values

Using data provided by the National Association of Realtors, we obtained the median home prices per county for the fourth quarter of 2019. The prices are not based on home sales; they are home values estimated from the American Community Survey (ACS).

https://www.nar.realtor/research-and-statistics/housing-statistics/county-median-home-prices-and-monthly-mortgage-payment

house_index <- read.csv2("HI_data.csv", stringsAsFactors=FALSE)
colnames(house_index) = c("FIPS", "Price.RangeQ42019", "Geography", "HousingIndex" )
house_index$FIPS <- paste0("0",house_index$FIPS)
house_index$FIPS0 = house_index$FIPS %>% sapply(zero_pad)

Education and population

Data on population and education are provided by the United States Department of Agriculture. As a proxy for the level of education we use the variable “Percent of adults with a bachelor’s degree or higher, 2014-18”; from the population data we take the population per county (POP_ESTIMATE_2018) as well as the level of domestic migration (DOMESTIC_MIG_2018).

education_county <- read.csv2("Education.csv", stringsAsFactors=FALSE)
colnames(education_county) = c("FIPS", "State", "Area.name", "Education.perc")
education_county$Education.perc = as.numeric(education_county$Education.perc)
education_county = education_county[!is.na(education_county$Education.perc),]
education_county$FIPS0 <- paste0("0",education_county$FIPS)
population_county <- read.csv2("Pop.csv", stringsAsFactors=FALSE)
colnames(population_county) = c("FIPS", "State", "Area.name", "Econ.Typology", "Dom.Migr", "Pop")
population_county$Dom.Migr = as.numeric(population_county$Dom.Migr)
population_county$Pop = as.numeric(population_county$Pop)
population_county = population_county[!is.na(population_county$Dom.Migr),]
population_county = population_county[!is.na(population_county$Pop),]
population_county$FIPS0 <- paste0("0",population_county$FIPS)

All dataframes with variables were merged into the spatial dataframe uscounties using the FIPS code, which identifies each county. The dataframe was then cleaned of missing values by removing such observations.

uscounties = merge(uscounties, house_index, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, income_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, population_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, education_county, by.x="GEOID0", by.y="FIPS0")
uscounties = uscounties[!uscounties$STATEFP %in% c("15", "02"),]

uscounties = uscounties[!is.na(uscounties$Education.perc),]
uscounties = uscounties[!is.na(uscounties$HousingIndex),]
uscounties = uscounties[!is.na(uscounties$Income),]
uscounties = uscounties[!is.na(uscounties$Dom.Migr),]
uscounties = uscounties[!is.na(uscounties$Pop),]

ESDA

Maps of variables

The tabs below visualise the distribution of each variable across all counties.

Housing Index

variable<-uscounties$HousingIndex
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

Income

variable<-uscounties$Income
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

Education

variable<-uscounties$Education.perc
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

Population

variable<-uscounties$Pop
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

Domestic migration

variable<-uscounties$Dom.Migr
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

Spatial weights matrix - KNN

For our analysis we decided to use the knn spatial weights matrix. We rejected a matrix based on a fixed radius because in the south-eastern part of the country the spatial units are much more densely packed, and it would be difficult to find a radius that works well across the whole country. As for a contiguity matrix, we decided that, since some counties were removed, the derived neighbours might not be accurate. That is why we decided to go with the knn spatial weights matrix using k=1.
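
For comparison, a radius-based neighbours list could be built with dnearneigh; the sketch below illustrates the rejected alternative (the 300 km cutoff is an arbitrary illustrative value, not one used in this analysis):

# sketch of the rejected radius-based alternative; 300 km is an arbitrary illustrative cutoff
crds <- coordinates(uscounties)
us.radius.nb <- dnearneigh(crds, d1=0, d2=300, longlat=TRUE)  # neighbours within 300 km
summary(us.radius.nb)                                         # neighbour counts vary strongly by region
us.radius.listw <- nb2listw(us.radius.nb, zero.policy=TRUE)   # zero.policy allows counties with no neighbours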

uscounties.df = as.data.frame(uscounties)
crds<-coordinates(uscounties)

# calculate weights matrix
us.knn<-knearneigh(crds, k=1)
us.knn.nb<-knn2nb(us.knn)
us.sym.knn.nb<-make.sym.nb(us.knn.nb)
us.sym.knn.listw<-nb2listw(us.sym.knn.nb)

Logarithmic transformation

We start the analysis by printing a summary of the variables, including logarithmic transformations, which we found to be necessary in later steps of the analysis. Because the domestic migration variable takes both negative and positive values, we decided to take into account only the absolute value of the effect. We also added 1 to the variable in order to avoid errors caused by taking the log of 0. All transformed variables were added to the dataframe with the suffix “.l”.

uscounties.df %>% 
        dplyr::mutate(Income.l = log(Income), 
               Pop.l = log(Pop), 
               HousingIndex.l = log(HousingIndex), 
               Education.perc.l = log(Education.perc),
               Dom.Migr.l = log(abs(Dom.Migr)+1)) %>% 
        dplyr::select(GEOID0,
                      Income, HousingIndex, Dom.Migr, Pop, Education.perc,
                      Income.l, HousingIndex.l, Dom.Migr.l, Pop.l, Education.perc.l) %>%
        summary()
##     GEOID0              Income        HousingIndex        Dom.Migr        
##  Length:3044        Min.   : 18541   Min.   :  35114   Min.   :-98218.00  
##  Class :character   1st Qu.: 36505   1st Qu.:  98618   1st Qu.:  -126.00  
##  Mode  :character   Median : 41836   Median : 129868   Median :    -8.00  
##                     Mean   : 43944   Mean   : 158383   Mean   :     9.42  
##                     3rd Qu.: 48502   3rd Qu.: 180946   3rd Qu.:   161.00  
##                     Max.   :251728   Max.   :1195477   Max.   : 49531.00  
##       Pop           Education.perc     Income.l      HousingIndex.l 
##  Min.   :     276   Min.   : 5.40   Min.   : 9.828   Min.   :10.47  
##  1st Qu.:   11030   1st Qu.:15.00   1st Qu.:10.505   1st Qu.:11.50  
##  Median :   26010   Median :19.10   Median :10.642   Median :11.77  
##  Mean   :  105012   Mean   :21.40   Mean   :10.660   Mean   :11.84  
##  3rd Qu.:   67556   3rd Qu.:25.32   3rd Qu.:10.789   3rd Qu.:12.11  
##  Max.   :10073906   Max.   :74.60   Max.   :12.436   Max.   :13.99  
##    Dom.Migr.l         Pop.l        Education.perc.l
##  Min.   : 0.000   Min.   : 5.620   Min.   :1.686   
##  1st Qu.: 3.951   1st Qu.: 9.308   1st Qu.:2.708   
##  Median : 4.920   Median :10.166   Median :2.950   
##  Mean   : 5.037   Mean   :10.283   Mean   :2.983   
##  3rd Qu.: 6.065   3rd Qu.:11.121   3rd Qu.:3.232   
##  Max.   :11.495   Max.   :16.125   Max.   :4.312
uscounties.df = uscounties.df %>%  dplyr::mutate(Income.l = log(Income), 
                                          Pop.l = log(Pop), 
                                          HousingIndex.l = log(HousingIndex), 
                                          Education.perc.l = log(Education.perc),
                                          Dom.Migr.l = log(abs(Dom.Migr)+1))

As a next step we calculated the pairwise correlations between the variables and plotted the results. The correlations are significant and look quite promising.

uscounties.df[,c("Income.l","Dom.Migr.l","HousingIndex.l","Pop.l","Education.perc.l")] %>% chart.Correlation(histogram=TRUE, pch=19)

Moran's I

Next we checked the spatial autocorrelation of the dependent variable using Moran's I. Based on the results we can conclude that there is significant positive spatial autocorrelation.

moran.test(uscounties.df$Income.l, us.sym.knn.listw )
## 
##  Moran I test under randomisation
## 
## data:  uscounties.df$Income.l  
## weights: us.sym.knn.listw    
## 
## Moran I statistic standard deviate = 25.277, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.5494976506     -0.0003286231      0.0004731465

We then plotted the results as a Moran scatterplot of the standardised income against its spatial lag.

variable<-uscounties.df$Income.l
variable.std<-((variable-mean(variable))/sd(variable))
moran.plot(variable.std, us.sym.knn.listw, pch=20, labels=F, quiet=T)

The result of the Moran scatterplot, that is, the quadrant in which each county lands, was plotted on a map. We can notice a concentration of quadrant III (low surrounded by low) in the Southern region, in states such as Mississippi, Alabama, Arkansas and Kentucky. Counties in quadrant I are found in the Midwest (the Dakotas, Kansas, Nebraska, Iowa), the Central Atlantic (New York, Pennsylvania, Delaware, New Jersey) and in individual states such as California and parts of Washington.

w.var<-lag.listw(us.sym.knn.listw, variable.std) 

cond1<-ifelse(variable.std>=0 & w.var>=0, 1,0)  # quadrant I   (high surrounded by high)
cond2<-ifelse(variable.std<0  & w.var>=0, 2,0)  # quadrant II  (low surrounded by high)
cond3<-ifelse(variable.std<0  & w.var<0, 3,0)   # quadrant III (low surrounded by low)
cond4<-ifelse(variable.std>=0 & w.var<0, 4,0)   # quadrant IV  (high surrounded by low)
cond.all<-cond1+cond2+cond3+cond4 # all quarters in one
cond<-as.data.frame(cond.all)

# map
brks<-c(1,2,3,4)
cols<-c("#EFF3FF", "#6BAED6", "#2171B5", "#BDD7E7")
plot(uscounties, col=cols[findInterval(cond$cond.all, brks)], lty=0)
plot(us_states$geometry, add=T)
legend("bottomleft", legend=c("I Q - HH – high surrounded by high", "II Q - LH - low surrounded by high", "III Q - LL - low surrounded by low", "IV Q - HL - high surrounded by low"), fill=cols, bty="n", cex=0.80)
title(main="Mapping of Moran scatterplot results for Income")

Join-count test

We also performed the join count test. Below are the quantiles of the variable Income.l.

uscounties.df$Income.l %>% quantile()
##       0%      25%      50%      75%     100% 
##  9.82774 10.50520 10.64151 10.78936 12.43610
# parameters of graphics
brks1<-c(0,10.4, 10.8, 13) 
cols<-c("green", "blue", "red")
var.factor<-factor(cut(uscounties.df$Income.l, breaks=brks1, labels=c("low", "medium", "high")))

The scatterplot shows the income values coloured according to the chosen ranges; based on the quantiles, we chose the breaks 10.4, 10.8 and 13. The counties were then coloured on the map according to their income group.

# scatterplot of values
plot(uscounties.df$Income.l, bg=cols[findInterval(log(uscounties.df$Income), brks1)], pch=21)
abline(h=brks1[2:4], lty=3)

# spatial distribution with three colours
plot(uscounties, col=cols[findInterval(uscounties.df$Income.l, brks1)])
title(main="Income log")
legend("bottomleft", legend=c("low", "medium", "high"), leglabs(brks1), fill=cols, bty="n")

For all income groups the p-value is very low, so we reject the null hypothesis of no spatial clustering: counties of the same income group are joined more often than would be expected by chance at each income level.
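
For reference, the "Same colour statistic" reported below is the weighted count of joins between counties of the same income class, \[ J_k = \tfrac{1}{2}\sum_i\sum_j w_{ij}\, z_i^{(k)} z_j^{(k)}, \] where \(z_i^{(k)}=1\) if county \(i\) belongs to class \(k\) and 0 otherwise; the test compares it with its expectation under random (nonfree) sampling.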

joincount.test(var.factor, us.sym.knn.listw)
## 
##  Join count test under nonfree sampling
## 
## data:  var.factor 
## weights: us.sym.knn.listw 
## 
## Std. deviate for low = 13.771, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Same colour statistic           Expectation              Variance 
##              63.20833              18.16563              10.69795 
## 
## 
##  Join count test under nonfree sampling
## 
## data:  var.factor 
## weights: us.sym.knn.listw 
## 
## Std. deviate for medium = 9.508, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Same colour statistic           Expectation              Variance 
##             736.58333             651.01709              80.98987 
## 
## 
##  Join count test under nonfree sampling
## 
## data:  var.factor 
## weights: us.sym.knn.listw 
## 
## Std. deviate for high = 17.264, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Same colour statistic           Expectation              Variance 
##             192.00000              85.06080              38.37036

Modelling

We decided to start the modelling with the following functional form: \[\log(Income) = \beta_0 + \beta_1\log(Pop) + \beta_2\log(HousingIndex) + \beta_3\log(Education.perc) + \beta_4\log(|Dom.Migr|+1) + \varepsilon\]

form = log(Income)~log(Pop)+log(HousingIndex)+log(Education.perc)+log(abs(Dom.Migr)+1)

OLS

It turns out that the OLS model is not very accurate, with an R^2 of 0.55. However, all the variables turn out to be significant apart from Dom.Migr.l.

#OLS
OLS_1<-lm(form, data=uscounties.df)
summary(OLS_1)
## 
## Call:
## lm(formula = form, data = uscounties.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74902 -0.08740 -0.00307  0.08320  1.11212 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.016789   0.090750  88.339   <2e-16 ***
## log(Pop)               -0.028099   0.003211  -8.751   <2e-16 ***
## log(HousingIndex)       0.164470   0.009853  16.692   <2e-16 ***
## log(Education.perc)     0.337919   0.010767  31.383   <2e-16 ***
## log(abs(Dom.Migr) + 1) -0.004681   0.002598  -1.802   0.0717 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1583 on 3039 degrees of freedom
## Multiple R-squared:  0.5525, Adjusted R-squared:  0.5519 
## F-statistic: 938.2 on 4 and 3039 DF,  p-value: < 2.2e-16

Checking for spatial autocorrelation of residuals in OLS

Due to the low p-value of the global Moran's I test we reject the hypothesis of no spatial autocorrelation in the residuals. That means the OLS model is not a correct specification and spatial econometric models should be used. We do not know, however, which spatial terms should be added, so in the next steps we will try to find the optimal functional form.

lm.morantest(OLS_1, us.sym.knn.listw)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = form, data = uscounties.df)
## weights: us.sym.knn.listw
## 
## Moran I statistic standard deviate = 17.301, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3755644435    -0.0009809063     0.0004736751

GNS

The first model used is the GNS (Manski) model, which includes all possible spatial parameters, that is the spatial lag of y, the lags of x and the spatially lagged error. As a result we get a model in which all parameters are significant apart from those related to Dom.Migr.
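
In matrix notation the GNS specification estimated below is \[ y = \rho W y + X\beta + W X\theta + u, \qquad u = \lambda W u + \varepsilon, \] where \(W\) is the spatial weights matrix. The models in the following sections are restrictions of this form, e.g. \(\theta = 0\) gives the SAC model, \(\lambda = 0\) the SDM, and \(\rho = 0\) the SDEM.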

# GNS, Manski model (all)
GNS_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)
summary(GNS_1, Nagelkerke=TRUE)
## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.5891645 -0.0654680 -0.0006591  0.0615183  1.1184286 
## 
## Type: sacmixed 
## Coefficients: (asymptotic standard errors) 
##                              Estimate Std. Error  z value  Pr(>|z|)
## (Intercept)                 3.4329632  0.1793139  19.1450 < 2.2e-16
## log(Pop)                   -0.0265383  0.0037500  -7.0768 1.475e-12
## log(HousingIndex)           0.2468577  0.0146717  16.8254 < 2.2e-16
## log(Education.perc)         0.2385731  0.0128092  18.6251 < 2.2e-16
## log(abs(Dom.Migr) + 1)     -0.0017874  0.0025098  -0.7122    0.4763
## lag.log(Pop)                0.0158909  0.0039658   4.0070 6.149e-05
## lag.log(HousingIndex)      -0.1910021  0.0155729 -12.2651 < 2.2e-16
## lag.log(Education.perc)    -0.0744758  0.0150416  -4.9513 7.371e-07
## lag.log(abs(Dom.Migr) + 1) -0.0011030  0.0027596  -0.3997    0.6894
## 
## Rho: 0.58165
## Asymptotic standard error: 0.02082
##     z-value: 27.938, p-value: < 2.22e-16
## Lambda: -0.39088
## Asymptotic standard error: 0.026817
##     z-value: -14.576, p-value: < 2.22e-16
## 
## LR test value: 451.46, p-value: < 2.22e-16
## 
## Log likelihood: 1519.414 for sacmixed model
## ML residual variance (sigma squared): 0.014826, (sigma: 0.12176)
## Nagelkerke pseudo-R-squared: 0.61421 
## Number of observations: 3044 
## Number of parameters estimated: 12 
## AIC: -3014.8, (AIC for lm: -2575.4)

SAC

For this model and all the following ones the results are similar: all parameters are significant apart from those related to Dom.Migr.l.

# SAC (rho and lambda)
SAC_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAC_1, Nagelkerke=TRUE)
## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -0.68456244 -0.07887936 -0.00020813  0.07450683  1.23705580 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)             7.3496842  0.2056854 35.7327   <2e-16
## log(Pop)               -0.0280881  0.0032579 -8.6216   <2e-16
## log(HousingIndex)       0.1754074  0.0111026 15.7988   <2e-16
## log(Education.perc)     0.3035556  0.0109026 27.8426   <2e-16
## log(abs(Dom.Migr) + 1) -0.0035918  0.0024970 -1.4384   0.1503
## 
## Rho: 0.059567
## Asymptotic standard error: 0.021057
##     z-value: 2.8288, p-value: 0.0046724
## Lambda: 0.20314
## Asymptotic standard error: 0.024835
##     z-value: 8.1793, p-value: 2.2204e-16
## 
## LR test value: 325.74, p-value: < 2.22e-16
## 
## Log likelihood: 1456.556 for sac model
## ML residual variance (sigma squared): 0.02182, (sigma: 0.14772)
## Nagelkerke pseudo-R-squared: 0.59795 
## Number of observations: 3044 
## Number of parameters estimated: 8 
## AIC: -2897.1, (AIC for lm: -2575.4)

SDM

# SDM (rho and x lags)
SDM_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="mixed") 
summary(SDM_1, Nagelkerke=TRUE)
## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -6.7178e-01 -7.8710e-02 -3.6943e-05  7.1507e-02  1.2029e+00 
## 
## Type: mixed 
## Coefficients: (asymptotic standard errors) 
##                              Estimate Std. Error z value  Pr(>|z|)
## (Intercept)                 6.0402592  0.1472418 41.0227 < 2.2e-16
## log(Pop)                   -0.0258719  0.0034648 -7.4670 8.193e-14
## log(HousingIndex)           0.2196932  0.0132181 16.6206 < 2.2e-16
## log(Education.perc)         0.2685165  0.0118578 22.6447 < 2.2e-16
## log(abs(Dom.Migr) + 1)     -0.0033040  0.0024614 -1.3423   0.17949
## lag.log(Pop)                0.0067342  0.0036808  1.8296   0.06731
## lag.log(HousingIndex)      -0.1178494  0.0140635 -8.3798 < 2.2e-16
## lag.log(Education.perc)     0.0163228  0.0133287  1.2246   0.22071
## lag.log(abs(Dom.Migr) + 1) -0.0017060  0.0027424 -0.6221   0.53388
## 
## Rho: 0.26138, LR test value: 324.24, p-value: < 2.22e-16
## Asymptotic standard error: 0.014024
##     z-value: 18.638, p-value: < 2.22e-16
## Wald statistic: 347.38, p-value: < 2.22e-16
## 
## Log likelihood: 1483.516 for mixed model
## ML residual variance (sigma squared): 0.021088, (sigma: 0.14522)
## Nagelkerke pseudo-R-squared: 0.60501 
## Number of observations: 3044 
## Number of parameters estimated: 11 
## AIC: -2945, (AIC for lm: -2622.8)
## LM test for residual autocorrelation
## test value: 68.352, p-value: < 2.22e-16

SDEM

#SDEM (lambda and x lags)
SDEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, etype="emixed")
summary(SDEM_1)
## 
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -0.67294082 -0.07809245 -0.00083009  0.07230371  1.20764394 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                               Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)                 8.13936115  0.12355436 65.8768 < 2.2e-16
## log(Pop)                   -0.02618536  0.00334065 -7.8384 4.663e-15
## log(HousingIndex)           0.20520741  0.01204106 17.0423 < 2.2e-16
## log(Education.perc)         0.28722481  0.01130742 25.4014 < 2.2e-16
## log(abs(Dom.Migr) + 1)     -0.00369600  0.00250748 -1.4740    0.1405
## lag.log(Pop)               -0.00077273  0.00358410 -0.2156    0.8293
## lag.log(HousingIndex)      -0.06125638  0.01283390 -4.7730 1.815e-06
## lag.log(Education.perc)     0.08924571  0.01233006  7.2381 4.552e-13
## lag.log(abs(Dom.Migr) + 1) -0.00215866  0.00282696 -0.7636    0.4451
## 
## Lambda: 0.25972, LR test value: 317.67, p-value: < 2.22e-16
## Asymptotic standard error: 0.014054
##     z-value: 18.48, p-value: < 2.22e-16
## Wald statistic: 341.53, p-value: < 2.22e-16
## 
## Log likelihood: 1480.233 for error model
## ML residual variance (sigma squared): 0.021146, (sigma: 0.14542)
## Number of observations: 3044 
## Number of parameters estimated: 11 
## AIC: -2938.5, (AIC for lm: -2622.8)

SAR

#SAR (rho)
SAR_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw) 
summary(SAR_1)
## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.7082361 -0.0816300 -0.0023959  0.0754489  1.2198112 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                          Estimate Std. Error z value  Pr(>|z|)
## (Intercept)             6.2286407  0.1307949 47.6214 < 2.2e-16
## log(Pop)               -0.0240740  0.0030313 -7.9418 1.998e-15
## log(HousingIndex)       0.1346401  0.0095517 14.0960 < 2.2e-16
## log(Education.perc)     0.3014655  0.0102722 29.3476 < 2.2e-16
## log(abs(Dom.Migr) + 1) -0.0043433  0.0024406 -1.7796   0.07515
## 
## Rho: 0.20707, LR test value: 288.25, p-value: < 2.22e-16
## Asymptotic standard error: 0.01214
##     z-value: 17.056, p-value: < 2.22e-16
## Wald statistic: 290.92, p-value: < 2.22e-16
## 
## Log likelihood: 1437.812 for lag model
## ML residual variance (sigma squared): 0.022118, (sigma: 0.14872)
## Number of observations: 3044 
## Number of parameters estimated: 7 
## AIC: -2861.6, (AIC for lm: -2575.4)
## LM test for residual autocorrelation
## test value: 22.041, p-value: 2.6685e-06

SLX

#SLX (lags of x)
SLX_1 <- lmSLX(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SLX_1)
## 
## Call:
## lm(formula = formula(paste("y ~ ", paste(colnames(x)[-1], collapse = "+"))), 
##     data = as.data.frame(x), weights = weights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74431 -0.08816 -0.00328  0.08023  1.07731 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 8.1924349  0.1007032  81.352  < 2e-16 ***
## log.Pop.                   -0.0257950  0.0037458  -6.886 6.92e-12 ***
## log.HousingIndex.           0.2050671  0.0142716  14.369  < 2e-16 ***
## log.Education.perc.         0.2897433  0.0127789  22.673  < 2e-16 ***
## log.abs.Dom.Migr....1.     -0.0038651  0.0026609  -1.453    0.146    
## lag.log.Pop.                0.0006433  0.0039586   0.162    0.871    
## lag.log.HousingIndex.      -0.0697066  0.0149178  -4.673 3.10e-06 ***
## lag.log.Education.perc.     0.0997334  0.0136496   7.307 3.48e-13 ***
## lag.log.abs.Dom.Migr....1. -0.0038089  0.0029642  -1.285    0.199    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.157 on 3035 degrees of freedom
## Multiple R-squared:  0.5606, Adjusted R-squared:  0.5595 
## F-statistic:   484 on 8 and 3035 DF,  p-value: < 2.2e-16

SEM

#SEM (lambda)
SEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SEM_1)
## 
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.6717050 -0.0795737 -0.0011856  0.0745068  1.2373582 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)             7.8395830  0.1009663 77.6455   <2e-16
## log(Pop)               -0.0289286  0.0032799 -8.8199   <2e-16
## log(HousingIndex)       0.1898747  0.0106929 17.7571   <2e-16
## log(Education.perc)     0.2970953  0.0110358 26.9210   <2e-16
## log(abs(Dom.Migr) + 1) -0.0032463  0.0024821 -1.3079   0.1909
## 
## Lambda: 0.26514, LR test value: 320.26, p-value: < 2.22e-16
## Asymptotic standard error: 0.01401
##     z-value: 18.925, p-value: < 2.22e-16
## Wald statistic: 358.14, p-value: < 2.22e-16
## 
## Log likelihood: 1453.818 for error model
## ML residual variance (sigma squared): 0.021474, (sigma: 0.14654)
## Number of observations: 3044 
## Number of parameters estimated: 7 
## AIC: -2893.6, (AIC for lm: -2575.4)

Model quality

Spatial autocorrelation of residuals

Below we repeat the Moran's I test on the OLS residuals, which again indicates spatial autocorrelation not captured by this functional form. We then computed Moran's I for the GNS residuals; with a p-value of 0.62 we can conclude that the spatial autocorrelation has been filtered out by the GNS specification. Interestingly, neither the SAR nor the SLX model removes the autocorrelation, so neither rho nor the lags of x are responsible for it on their own. Adding only lambda, as in the SEM, gives a p-value of 0.84, which suggests that the spatial error term is the primary factor removing the autocorrelation.

OLS

lm.morantest(OLS_1, us.sym.knn.listw)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = form, data = uscounties.df)
## weights: us.sym.knn.listw
## 
## Moran I statistic standard deviate = 17.301, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3755644435    -0.0009809063     0.0004736751

GNS

moran.test(GNS_1$residuals, us.sym.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  GNS_1$residuals  
## weights: us.sym.knn.listw    
## 
## Moran I statistic standard deviate = -0.31167, p-value = 0.6224
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0071052191     -0.0003286231      0.0004727383

SAR

moran.test(SAR_1$residuals, us.sym.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  SAR_1$residuals  
## weights: us.sym.knn.listw    
## 
## Moran I statistic standard deviate = 2.5878, p-value = 0.004829
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.0559438427     -0.0003286231      0.0004728473

SLX

moran.test(SLX_1$residuals, us.sym.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  SLX_1$residuals  
## weights: us.sym.knn.listw    
## 
## Moran I statistic standard deviate = 17.494, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.3801823777     -0.0003286231      0.0004731032

SEM

moran.test(SEM_1$residuals, us.sym.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  SEM_1$residuals  
## weights: us.sym.knn.listw    
## 
## Moran I statistic standard deviate = -1.0043, p-value = 0.8424
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0221669750     -0.0003286231      0.0004728189
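
For a side-by-side view, the residual Moran's I statistics and p-values of all five models can be collected in a single table. A small convenience sketch using the model objects estimated above (for OLS the plain moran.test on residuals is used here, rather than lm.morantest):

res <- list(OLS = OLS_1$residuals, GNS = GNS_1$residuals, SAR = SAR_1$residuals,
            SLX = SLX_1$residuals, SEM = SEM_1$residuals)
# one column per model: Moran's I of the residuals and the associated p-value
sapply(res, function(r) {
  mt <- moran.test(r, us.sym.knn.listw)
  c(I = unname(mt$estimate["Moran I statistic"]), p.value = mt$p.value)
})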

GoF Test

As a first step we used the likelihood ratio (LR) test to compare the least restricted model, i.e. the Manski (GNS) model, which contains all possible spatial terms: the spatial lag of y, the spatial lags of x and the spatially autocorrelated error u. The null hypothesis of this test is that the restricted model is sufficient, i.e. that the omitted parameters do not significantly improve the fit. The tests against every possible restricted model show that the difference is significant in each case. We also compared the model to OLS using the Wald and LR tests, which likewise indicate that GNS fits better. Since GNS, the most general model, outperforms every restricted model, we conclude that GNS is the best-fitting specification.

spatialreg::LR.sarlm(GNS_1, SAC_1)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 125.72, df = 4, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SAC_1 
##                1519.414                1456.556
spatialreg::LR.sarlm(GNS_1, SDM_1)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 71.798, df = 1, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SDM_1 
##                1519.414                1483.516
spatialreg::LR.sarlm(GNS_1, SDEM_1)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 78.362, df = 1, p-value < 2.2e-16
## sample estimates:
##  Log likelihood of GNS_1 Log likelihood of SDEM_1 
##                 1519.414                 1480.233
spatialreg::LR.sarlm(GNS_1, SAR_1)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 163.2, df = 5, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SAR_1 
##                1519.414                1437.812
spatialreg::LR.sarlm(GNS_1, SLX_1)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 396.04, df = 2, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SLX_1 
##                1519.414                1321.396
spatialreg::LR.sarlm(GNS_1, SEM_1)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 131.19, df = 5, p-value < 2.2e-16
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of SEM_1 
##                1519.414                1453.818
spatialreg::LR1.sarlm(GNS_1)
## 
##  Likelihood Ratio diagnostics for spatial dependence
## 
## data:  
## Likelihood ratio = 451.46, df = 6, p-value < 2.2e-16
## sample estimates:
## Log likelihood of spatial lag model         Log likelihood of OLS fit y 
##                            1519.414                            1293.686
spatialreg::Wald1.sarlm(GNS_1)
## 
##  Wald diagnostics for spatial dependence
## 
## data:  
## Wald statistic = 212.45, df = 6, p-value < 2.2e-16
## sample estimates:
##     lambda 
## -0.3908798
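
The same comparison can be summarised by putting the log-likelihoods of all fitted models next to each other; a convenience sketch (logLik works on both the lm and the sarlm objects):

sort(sapply(list(GNS=GNS_1, SAC=SAC_1, SDM=SDM_1, SDEM=SDEM_1,
                 SAR=SAR_1, SLX=SLX_1, SEM=SEM_1, OLS=OLS_1),
            function(m) as.numeric(logLik(m))), decreasing=TRUE)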

AIC

According to the Akaike Information Criterion, the Manski model once again turns out to be the best, with the lowest AIC value of about -3015.

AIC(GNS_1, SAC_1, SDM_1, SDEM_1, SAR_1, SLX_1, SEM_1, OLS_1)
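
Since AIC() returns a data frame when given several models, sorting it by the AIC column makes the ranking explicit; a small sketch using the objects above:

aic.tab <- AIC(GNS_1, SAC_1, SDM_1, SDEM_1, SAR_1, SLX_1, SEM_1, OLS_1)
aic.tab[order(aic.tab$AIC), ]   # lowest AIC (the Manski/GNS model) first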

Removing insignificant variables

As we could see in the models above, the domestic migration variable turned out to be insignificant in all models, so we decided to check the results after removing it. According to Moran's test, the residuals of the OLS model still show spatial autocorrelation. And, just as with the previous functional form, it still performs rather poorly, with an adjusted R2 of about 0.55.

form2 = log(Income)~log(Pop)+log(HousingIndex)+log(Education.perc)
#OLS
OLS_2<-lm(form2, data=uscounties.df)
summary(OLS_2)
## 
## Call:
## lm(formula = form2, data = uscounties.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75627 -0.08959 -0.00245  0.08357  1.11011 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          8.053951   0.088407   91.10   <2e-16 ***
## log(Pop)            -0.031938   0.002403  -13.29   <2e-16 ***
## log(HousingIndex)    0.162590   0.009801   16.59   <2e-16 ***
## log(Education.perc)  0.338255   0.010770   31.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1584 on 3040 degrees of freedom
## Multiple R-squared:  0.5521, Adjusted R-squared:  0.5516 
## F-statistic:  1249 on 3 and 3040 DF,  p-value: < 2.2e-16
lm.morantest(OLS_2, us.sym.knn.listw)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = form2, data = uscounties.df)
## weights: us.sym.knn.listw
## 
## Moran I statistic standard deviate = 17.353, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3767650507    -0.0009075540     0.0004736819

The Nagelkerke pseudo-R2 of about 0.61 reported below indicates a good model fit.

# GNS, Manski model (all)
GNS_2<-sacsarlm(form2, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)
## Warning: Function sacsarlm moved to the spatialreg package
summary(GNS_2, Nagelkerke=TRUE)
## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -0.59166624 -0.06558576 -0.00034357  0.06061551  1.11769690 
## 
## Type: sacmixed 
## Coefficients: (asymptotic standard errors) 
##                           Estimate Std. Error  z value  Pr(>|z|)
## (Intercept)              3.4472105  0.1788402  19.2754 < 2.2e-16
## log(Pop)                -0.0280846  0.0030354  -9.2525 < 2.2e-16
## log(HousingIndex)        0.2464380  0.0146566  16.8142 < 2.2e-16
## log(Education.perc)      0.2386296  0.0128075  18.6320 < 2.2e-16
## lag.log(Pop)             0.0151882  0.0032144   4.7250 2.301e-06
## lag.log(HousingIndex)   -0.1920608  0.0155290 -12.3678 < 2.2e-16
## lag.log(Education.perc) -0.0748255  0.0150327  -4.9775 6.440e-07
## 
## Rho: 0.58284
## Asymptotic standard error: 0.020736
##     z-value: 28.107, p-value: < 2.22e-16
## Lambda: -0.39173
## Asymptotic standard error: 0.026744
##     z-value: -14.647, p-value: < 2.22e-16
## 
## LR test value: 452.52, p-value: < 2.22e-16
## 
## Log likelihood: 1518.322 for sacmixed model
## ML residual variance (sigma squared): 0.014809, (sigma: 0.12169)
## Nagelkerke pseudo-R-squared: 0.61394 
## Number of observations: 3044 
## Number of parameters estimated: 10 
## AIC: -3016.6, (AIC for lm: -2574.1)
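
As a rough cross-check of the reported fit (not the Nagelkerke formula itself), the squared correlation between fitted and observed log-incomes can be computed directly:

# squared correlation between GNS_2 fitted values and observed log(Income)
cor(GNS_2$fitted.values, log(uscounties.df$Income))^2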

We also calculated Moran's I on the GNS residuals under the new functional form; the test again shows no spatial autocorrelation.

moran.test(GNS_2$residuals, us.sym.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  GNS_2$residuals  
## weights: us.sym.knn.listw    
## 
## Moran I statistic standard deviate = -0.30914, p-value = 0.6214
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0070502370     -0.0003286231      0.0004727459

To compare the two models, GNS_1 and GNS_2, we used the likelihood ratio diagnostics: each model is compared with its OLS counterpart, and we check which one yields the higher likelihood ratio. It turns out that GNS_2, without domestic migration, performs slightly better.

spatialreg::LR1.sarlm(GNS_1)
## 
##  Likelihood Ratio diagnostics for spatial dependence
## 
## data:  
## Likelihood ratio = 451.46, df = 6, p-value < 2.2e-16
## sample estimates:
## Log likelihood of spatial lag model         Log likelihood of OLS fit y 
##                            1519.414                            1293.686
spatialreg::LR1.sarlm(GNS_2)
## 
##  Likelihood Ratio diagnostics for spatial dependence
## 
## data:  
## Likelihood ratio = 452.52, df = 5, p-value < 2.2e-16
## sample estimates:
## Log likelihood of spatial lag model         Log likelihood of OLS fit y 
##                            1518.322                            1292.062

Because GNS_1 can also be interpreted as an unrestricted version of GNS_2, we ran an LR test comparing the two directly. The difference is insignificant (p-value of about 0.34), so once again the restricted model, GNS_2, is preferred.

spatialreg::LR.sarlm(GNS_1, GNS_2)
## 
##  Likelihood ratio for spatial linear models
## 
## data:  
## Likelihood ratio = 2.1857, df = 2, p-value = 0.3353
## sample estimates:
## Log likelihood of GNS_1 Log likelihood of GNS_2 
##                1519.414                1518.322

Heteroskedasticity

According to the Breusch-Pagan test, both the OLS and the GNS model show heteroskedasticity: the null hypothesis of constant residual variance is clearly rejected in both cases. The LOSH statistics for the GNS and OLS residuals are very similar and show that local homogeneity is found mainly in areas far from big cities, such as the national forests of northern Minnesota, the deserts of Nevada and other national parks and sparsely populated areas. The highest heterogeneity is visible, for example, in southern Arizona, around San Francisco and near the coast of North Carolina.

# fitted vs. observed values, both on the log scale used in the models
plot(x = OLS_2$fitted.values, y = log(uscounties.df$Income))

plot(x = GNS_2$fitted.values, y = log(uscounties.df$Income))

bptest(OLS_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  OLS_2
## BP = 175.44, df = 3, p-value < 2.2e-16
bptest.sarlm(GNS_2)
## Warning: Method bptest.sarlm moved to the spatialreg package
## 
##  studentized Breusch-Pagan test
## 
## data:  
## BP = 143.13, df = 6, p-value < 2.2e-16
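
For the OLS specification, one standard response to the rejected BP test is to report heteroskedasticity-consistent standard errors; a minimal sketch, assuming the lmtest and sandwich packages are available:

library(lmtest)
library(sandwich)
# coefficient tests for OLS_2 with HC1 (White-type) robust standard errors
coeftest(OLS_2, vcov = vcovHC(OLS_2, type = "HC1"))
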
# LOSH: local spatial heteroskedasticity statistics for the OLS and GNS residuals
losh.ols = LOSH(OLS_2$residuals, us.sym.knn.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude)
losh.gns = LOSH(GNS_2$residuals, us.sym.knn.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude)
shades<-auto.shading(losh.gns[,1], n=5, cols=brewer.pal(9, "Blues")[5:9])
choropleth(uscounties, losh.gns[,1], main="Spatial distribution of LOSH
statistics for residuals from the GNS model", shading = shades, lty=0)
plot(us_states$geometry, add=T)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

shades<-auto.shading(losh.ols[,1], n=5, cols=brewer.pal(9, "Blues")[5:9])
choropleth(uscounties, losh.ols[,1], main="Spatial distribution of LOSH 
statistics for residuals from the LM model ", shading = shades, lty=0)
plot(us_states$geometry, add=T)
choro.legend(-123, 28, shades, under="below", over="above", between="to", cex=0.6, bty="n")

Summary and next steps

Both the OLS and the GNS residuals turned out to be heteroskedastic, even though these models assume that the estimated relationships are constant across space. One solution to this problem is the Geographically Weighted Regression (GWR) model, which lets the coefficients vary over space and can therefore accommodate spatial heterogeneity as well as part of the spatial autocorrelation.

The new county-level dataset, however, turned out to be much more promising than the point data. As a follow-up analysis we would therefore suggest fitting a GWR model to obtain localised conclusions; a rough sketch of such a fit is given below.
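
A rough sketch using the spgwr package (one of several GWR implementations); the county-centroid coordinate object is a hypothetical name, and form2 is reused from above:

library(spgwr)

# county centroid coordinates -- hypothetical helper object; any two-column
# matrix of coordinates matching the rows of uscounties.df would do
county.crds <- sp::coordinates(uscounties)

# adaptive bandwidth chosen by cross-validation, then the local GWR fit
gwr.bw  <- gwr.sel(form2, data=uscounties.df, coords=county.crds, adapt=TRUE)
gwr.fit <- gwr(form2, data=uscounties.df, coords=county.crds,
               adapt=gwr.bw, hatmatrix=TRUE)
gwr.fit   # ranges of the local coefficients and a quasi-global R2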