# install.packages('texreg')
library(spdep)
library(rgdal)
library(maptools)
library(sp)
library(RColorBrewer)
library(classInt)
library(GISTools)
library(maps)
library(viridis)
library(corrplot)
library(texreg)
library(ggplot2)
library(gridExtra) 
library(readr)
library(cluster)
library(factoextra)
library(stringr)
kaggle_income <- read.csv("kaggle_income.csv")

usmap <- readOGR(".", "cb_2018_us_state_500k")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "cb_2018_us_state_500k"
## with 56 features
## It has 9 fields
## Integer64 fields read as strings:  ALAND AWATER
usmap <- spTransform(usmap, CRS("+proj=longlat +datum=NAD83"))

usmap = usmap[!(usmap$NAME %in% c("Puerto Rico", "American Samoa", "United States Virgin Islands" , "Alaska", "Hawaii", "Guam", "Commonwealth of the Northern Mariana Islands")),]
kaggle_income[,"Lon"] %>% min()
## [1] -175.86
kaggle_income[,"Lon"] %>% max()
## [1] -65.50082
kaggle_income[,"Lat"] %>% min()
## [1] 17.92909
kaggle_income[,"Lat"] %>% max()
## [1] 71.2535
kaggle_income = kaggle_income[(kaggle_income$Lon>-125)&(kaggle_income$Lon<-66)&(kaggle_income$Lat>24)&(kaggle_income$Lat<50),]

START

coords = kaggle_income[,c("Lon", "Lat")]
coords.m = as.matrix(coords)
ggp1 = ggplot(data=kaggle_income, aes(x=Mean))+geom_density(color="coral3", fill="coral", alpha=0.3) + theme_light()
ggp2 = ggplot(data=kaggle_income, aes(x=Median))+geom_density(color="darkcyan", fill="cyan3", alpha=0.3) + theme_light()
ggp3 = ggplot(data=kaggle_income, aes(x=Stdev))+geom_density(color="chartreuse4", fill="chartreuse3", alpha=0.3) + theme_light()
grid.arrange(ggp1, ggp2, ggp3, ncol = 3) 

Points plot on map

median

variable <- kaggle_income$Median
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])

mean

variable <- kaggle_income$Mean
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])

std dev

variable <- kaggle_income$Stdev
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])

Moran’s I

inc.knn<-knearneigh(coords.m, k=1) # knn class (k=1)
## Warning in knearneigh(coords.m, k = 1): knearneigh: identical points found
inc.knn.nb<-knn2nb(inc.knn)

# checking for matrix symmetry
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: FALSE"
inc.knn.nb<-make.sym.nb(inc.knn.nb)
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: TRUE"
# knn as listw class
inc.knn.listw<-nb2listw(inc.knn.nb)

All tests are significant, with a low p-value that allows us to reject the null hypothesis and assume there is a spatial autocorrelation. The value of the Morans I statistic is positive signifying a positive

Median

moran.test(kaggle_income$Median, inc.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Median  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 24.726, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      1.693882e-01     -3.134600e-05      4.694837e-05
x<-kaggle_income$Median
zx<-as.data.frame(scale(x))  
wzx<-lag.listw(inc.knn.listw, zx$V1) 
morlm<-lm(wzx~zx$V1)  
summary(morlm)
## 
## Call:
## lm(formula = wzx ~ zx$V1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.37462 -0.49356 -0.32937  0.00814  2.61268 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.015652   0.004954   -3.16  0.00158 ** 
## zx$V1        0.169388   0.004954   34.20  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8848 on 31901 degrees of freedom
## Multiple R-squared:  0.03536,    Adjusted R-squared:  0.03533 
## F-statistic:  1169 on 1 and 31901 DF,  p-value: < 2.2e-16
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="coral3")  
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Mean

moran.test(kaggle_income$Mean, inc.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Mean  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 85.088, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      5.829837e-01     -3.134600e-05      4.694889e-05
x<-kaggle_income$Mean
zx<-as.data.frame(scale(x))  
wzx<-lag.listw(inc.knn.listw, zx$V1) 
morlm<-lm(wzx~zx$V1)  
summary(morlm)
## 
## Call:
## lm(formula = wzx ~ zx$V1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5850 -0.4375 -0.0744  0.3741  7.1161 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.016918   0.004112  -4.114  3.9e-05 ***
## zx$V1        0.582984   0.004112 141.761  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7345 on 31901 degrees of freedom
## Multiple R-squared:  0.3865, Adjusted R-squared:  0.3865 
## F-statistic: 2.01e+04 on 1 and 31901 DF,  p-value: < 2.2e-16
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="darkcyan")  
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Std dev

moran.test(kaggle_income$Stdev, inc.knn.listw)
## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Stdev  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 67.374, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      4.616191e-01     -3.134600e-05      4.695085e-05
x<-kaggle_income$Stdev
zx<-as.data.frame(scale(x))  
wzx<-lag.listw(inc.knn.listw, zx$V1) 
morlm<-lm(wzx~zx$V1)  
summary(morlm)
## 
## Call:
## lm(formula = wzx ~ zx$V1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0390 -0.5030 -0.0140  0.5099  3.9653 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.010419   0.004533  -2.299   0.0215 *  
## zx$V1        0.461619   0.004533 101.845   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8096 on 31901 degrees of freedom
## Multiple R-squared:  0.2454, Adjusted R-squared:  0.2453 
## F-statistic: 1.037e+04 on 1 and 31901 DF,  p-value: < 2.2e-16
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="chartreuse4")  
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Distance

# Select capital cities
state.capital.cities <- us.cities[us.cities$capital==2,c("name", "pop", "lat", "long")]
state.capital.cities <- state.capital.cities[state.capital.cities$name != "Honolulu HI",]
state.capital.cities
##                  name     pop   lat    long
## 5           Albany NY   93576 42.67  -73.80
## 26       Annapolis MD   36300 38.98  -76.49
## 41         Atlanta GA  424096 33.76  -84.42
## 46         Augusta ME   18626 44.32  -69.77
## 50          Austin TX  683404 30.31  -97.75
## 58     Baton Rouge LA  222217 30.45  -91.13
## 84        Bismarck ND   56927 46.81 -100.77
## 94           Boise ID  193628 43.61 -116.23
## 98          Boston MA  567759 42.34  -71.02
## 144    Carson City NV   58350 39.15 -119.74
## 165     Charleston WV   49804 38.35  -81.63
## 172       Cheyenne WY   55833 41.15 -104.79
## 196       Columbia SC  118020 34.04  -80.89
## 199       Columbus OH  741677 39.99  -82.99
## 203        Concord NH   42967 43.23  -71.56
## 247         Denver CO  556575 39.77 -104.87
## 248     Des Moines IA  192050 41.58  -93.62
## 253          Dover DE   34288 39.16  -75.53
## 332      Frankfort KY   27210 38.20  -84.86
## 385     Harrisburg PA   47576 40.28  -76.88
## 387       Hartford CT  123836 41.77  -72.68
## 392         Helena MT   27383 46.60 -112.03
## 422   Indianapolis IN  771725 39.78  -86.15
## 430        Jackson MS  175085 32.32  -90.21
## 435 Jefferson City MO   39062 38.57  -92.19
## 441         Juneau AK   31187 58.30 -134.42
## 485        Lansing MI  117236 42.71  -84.55
## 507        Lincoln NE  245301 40.82  -96.69
## 509    Little Rock AR  184323 34.72  -92.35
## 536        Madison WI  227642 43.08  -89.39
## 583     Montgomery AL  197653 32.35  -86.28
## 585     Montpelier VT    8003 44.26  -72.57
## 601      Nashville TN  523547 36.17  -86.78
## 647  Oklahoma City OK  538141 35.47  -97.51
## 649        Olympia WA   45403 47.04 -122.89
## 694        Phoenix AZ 1450884 33.54 -112.07
## 696         Pierre SD   14052 44.37 -100.34
## 726     Providence RI  178295 41.82  -71.42
## 732        Raleigh NC  350822 35.82  -78.66
## 754       Richmond VA  189498 37.53  -77.47
## 778     Sacramento CA  480392 38.57 -121.47
## 787     Saint Paul MN  272469 44.95  -93.10
## 791          Salem OR  148942 44.92 -123.02
## 794 Salt Lake City UT  177318 40.78 -111.93
## 821       Santa Fe NM   70598 35.68 -105.95
## 863    Springfield IL  116725 39.78  -89.64
## 886    Tallahassee FL  154114 30.46  -84.28
## 910         Topeka KS  121229 39.04  -95.69
## 915        Trenton NJ   84653 40.22  -74.76
coords <- kaggle_income[,c("Lon", "Lat")]
coords <- as.matrix(coords)
for (city.name in state.capital.cities$name) {
  col.city.name = print(paste0("dist_",city.name))
  lat = state.capital.cities[state.capital.cities$name==city.name,]$lat
  long = state.capital.cities[state.capital.cities$name==city.name,]$long
  kaggle_income[col.city.name]<-spDistsN1(coords, c(long, lat), longlat=TRUE)
}
## [1] "dist_Albany NY"
## [1] "dist_Annapolis MD"
## [1] "dist_Atlanta GA"
## [1] "dist_Augusta ME"
## [1] "dist_Austin TX"
## [1] "dist_Baton Rouge LA"
## [1] "dist_Bismarck ND"
## [1] "dist_Boise ID"
## [1] "dist_Boston MA"
## [1] "dist_Carson City NV"
## [1] "dist_Charleston WV"
## [1] "dist_Cheyenne WY"
## [1] "dist_Columbia SC"
## [1] "dist_Columbus OH"
## [1] "dist_Concord NH"
## [1] "dist_Denver CO"
## [1] "dist_Des Moines IA"
## [1] "dist_Dover DE"
## [1] "dist_Frankfort KY"
## [1] "dist_Harrisburg PA"
## [1] "dist_Hartford CT"
## [1] "dist_Helena MT"
## [1] "dist_Indianapolis IN"
## [1] "dist_Jackson MS"
## [1] "dist_Jefferson City MO"
## [1] "dist_Juneau AK"
## [1] "dist_Lansing MI"
## [1] "dist_Lincoln NE"
## [1] "dist_Little Rock AR"
## [1] "dist_Madison WI"
## [1] "dist_Montgomery AL"
## [1] "dist_Montpelier VT"
## [1] "dist_Nashville TN"
## [1] "dist_Oklahoma City OK"
## [1] "dist_Olympia WA"
## [1] "dist_Phoenix AZ"
## [1] "dist_Pierre SD"
## [1] "dist_Providence RI"
## [1] "dist_Raleigh NC"
## [1] "dist_Richmond VA"
## [1] "dist_Sacramento CA"
## [1] "dist_Saint Paul MN"
## [1] "dist_Salem OR"
## [1] "dist_Salt Lake City UT"
## [1] "dist_Santa Fe NM"
## [1] "dist_Springfield IL"
## [1] "dist_Tallahassee FL"
## [1] "dist_Topeka KS"
## [1] "dist_Trenton NJ"
kaggle_income$mindist <- NULL
kaggle_income$mindist = apply(kaggle_income[,20:length(colnames(kaggle_income))], 1, min)
cols = rev(brewer.pal(9, "Blues"))
mindist <- kaggle_income$mindist
brks <- stats::quantile(mindist, probs=seq(0, 1, 0.13))

plot(usmap)
points(kaggle_income[,c("Lon", "Lat")], col=cols[findInterval(mindist, brks)], pch=21, cex=0.7, 
       bg=cols[findInterval(mindist, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
title(main="Distances from closest state capital")

Interactions

kaggle_income_s = dplyr::sample_n(kaggle_income, 3000)

coords.s = kaggle_income_s[,c("Lon", "Lat")]
coords.m.s = as.matrix(coords.s)

inc.knn<-knearneigh(coords.m.s, k=1) # knn class (k=1)
## Warning in knearneigh(coords.m.s, k = 1): knearneigh: identical points found
inc.knn.nb<-knn2nb(inc.knn)

# checking for matrix symmetry
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: FALSE"
inc.knn.nb<-make.sym.nb(inc.knn.nb)
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: TRUE"
# knn as listw class
inc.knn.listw.s<-nb2listw(inc.knn.nb)

Median

kaggle_income_s$Median[kaggle_income_s$Median==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Median~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.med<-glm(eq.m)
spatial.multi.med<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Warning: Function errorsarlm moved to the spatialreg package
## Registered S3 methods overwritten by 'spatialreg':
##   method                   from 
##   residuals.stsls          spdep
##   deviance.stsls           spdep
##   coef.stsls               spdep
##   print.stsls              spdep
##   summary.stsls            spdep
##   print.summary.stsls      spdep
##   residuals.gmsar          spdep
##   deviance.gmsar           spdep
##   coef.gmsar               spdep
##   fitted.gmsar             spdep
##   print.gmsar              spdep
##   summary.gmsar            spdep
##   print.summary.gmsar      spdep
##   print.lagmess            spdep
##   summary.lagmess          spdep
##   print.summary.lagmess    spdep
##   residuals.lagmess        spdep
##   deviance.lagmess         spdep
##   coef.lagmess             spdep
##   fitted.lagmess           spdep
##   logLik.lagmess           spdep
##   fitted.SFResult          spdep
##   print.SFResult           spdep
##   fitted.ME_res            spdep
##   print.ME_res             spdep
##   print.lagImpact          spdep
##   plot.lagImpact           spdep
##   summary.lagImpact        spdep
##   HPDinterval.lagImpact    spdep
##   print.summary.lagImpact  spdep
##   print.sarlm              spdep
##   summary.sarlm            spdep
##   residuals.sarlm          spdep
##   deviance.sarlm           spdep
##   coef.sarlm               spdep
##   vcov.sarlm               spdep
##   fitted.sarlm             spdep
##   logLik.sarlm             spdep
##   anova.sarlm              spdep
##   predict.sarlm            spdep
##   print.summary.sarlm      spdep
##   print.sarlm.pred         spdep
##   as.data.frame.sarlm.pred spdep
##   residuals.spautolm       spdep
##   deviance.spautolm        spdep
##   coef.spautolm            spdep
##   fitted.spautolm          spdep
##   print.spautolm           spdep
##   summary.spautolm         spdep
##   logLik.spautolm          spdep
##   print.summary.spautolm   spdep
##   print.WXImpact           spdep
##   summary.WXImpact         spdep
##   print.summary.WXImpact   spdep
##   predict.SLX              spdep
# power
eq.p = log(kaggle_income_s$Median)~log(kaggle_income_s$mindist)
OLS.power.med<-lm(eq.p)
spatial.power.med<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
# exponential
eq.e = log(kaggle_income_s$Median)~kaggle_income_s$mindist
OLS.exp.med<-glm(eq.e)
spatial.exp.med<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
screenreg(list(OLS.multi.med, spatial.multi.med, OLS.power.med, spatial.power.med, OLS.exp.med, spatial.exp.med))
## 
## =========================================================================================================================
##                               Model 1                Model 2        Model 3      Model 4       Model 5       Model 6     
## -------------------------------------------------------------------------------------------------------------------------
## (Intercept)                           107441.01 ***  107765.32 ***    11.21 ***     11.21 ***     10.99 ***     10.99 ***
##                                        (5512.62)      (5904.13)       (0.10)        (0.10)        (0.03)        (0.04)   
## kaggle_income_s$mindist                 -158.25        -166.34                                    -0.00 *       -0.00 *  
##                                         (132.84)       (141.98)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2                 -0.77          -0.71                                                           
##                                           (0.94)         (1.01)                                                          
## kaggle_income_s$mindist^3                  0.00           0.00                                                           
##                                           (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4                 -0.00 *        -0.00 *                                                         
##                                           (0.00)         (0.00)                                                          
## lambda                                                    0.08 ***                   0.06 ***                    0.06 ***
##                                                          (0.01)                     (0.02)                      (0.02)   
## log(kaggle_income_s$mindist)                                          -0.06 **      -0.06 **                             
##                                                                       (0.02)        (0.02)                               
## -------------------------------------------------------------------------------------------------------------------------
## AIC                                    76768.66                                                 9645.91                  
## BIC                                    76804.70                                                 9663.93                  
## Log Likelihood                        -38378.33      -38365.51                   -4811.75      -4819.95      -4812.68    
## Deviance                      22714919906323.84                                                 4366.83                  
## Num. obs.                               3000           3000         3000          3000          3000          3000       
## Parameters                                                7                          4                           4       
## AIC (Linear model)                                    76768.66                    9643.65                     9645.91    
## AIC (Spatial model)                                   76745.01                    9631.50                     9633.37    
## LR test: statistic                                       25.65                      14.15                       14.54    
## LR test: p-value                                          0.00                       0.00                        0.00    
## R^2                                                                    0.00                                              
## Adj. R^2                                                               0.00                                              
## =========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Spatial models are doing better than a-spatial

AIC.all<-matrix(0)
models<-c("spatial.multi.med","spatial.power.med","spatial.exp.med")
for(i in 1:3){
  AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
  AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 76768.658 76745.012  9643.649  9631.502  9645.906  9633.368

Best SRMSE is for multinomial model

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.med", "spatial.multi.med", "OLS.power.med", "spatial.power.med", "OLS.exp.med", "spatial.exp.med")
SRMSE.all<-matrix(0)
for(i in 1:6){
  SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Median")}
SRMSE.all
## [1] 1.013628 1.007353 1.430255 1.430255 1.430255 1.430255

Mean

kaggle_income_s$Mean[kaggle_income_s$Mean==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Mean~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.mean<-glm(eq.m)
spatial.multi.mean<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Warning: Function errorsarlm moved to the spatialreg package
# power
eq.p = log(kaggle_income_s$Mean)~log(kaggle_income_s$mindist)
OLS.power.mean<-lm(eq.p)
spatial.power.mean<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
# exponential
eq.e = log(kaggle_income_s$Mean)~kaggle_income_s$mindist
OLS.exp.mean<-glm(eq.e)
spatial.exp.mean<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
screenreg(list(OLS.multi.mean, spatial.multi.mean, OLS.power.mean, spatial.power.mean, OLS.exp.mean, spatial.exp.mean))
## 
## ========================================================================================================================
##                               Model 1               Model 2        Model 3      Model 4       Model 5       Model 6     
## ------------------------------------------------------------------------------------------------------------------------
## (Intercept)                           73979.98 ***   75215.32 ***    11.12 ***     11.12 ***     11.00 ***     11.00 ***
##                                       (1925.86)      (2415.57)       (0.08)        (0.09)        (0.03)        (0.03)   
## kaggle_income_s$mindist                  39.62           8.03                                    -0.00 *       -0.00 *  
##                                         (46.41)        (57.51)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2                -1.15 ***      -0.91 *                                                         
##                                          (0.33)         (0.41)                                                          
## kaggle_income_s$mindist^3                 0.00 ***       0.00 **                                                        
##                                          (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4                -0.00 ***      -0.00 **                                                        
##                                          (0.00)         (0.00)                                                          
## lambda                                                   0.29 ***                   0.05 **                     0.05 ** 
##                                                         (0.01)                     (0.02)                      (0.02)   
## log(kaggle_income_s$mindist)                                         -0.04 *       -0.04 *                              
##                                                                      (0.02)        (0.02)                               
## ------------------------------------------------------------------------------------------------------------------------
## AIC                                   70458.64                                                 8632.45                  
## BIC                                   70494.68                                                 8650.47                  
## Log Likelihood                       -35223.32      -35029.64                   -4308.64      -4313.22      -4308.39    
## Deviance                      2772319618034.09                                                 3114.97                  
## Num. obs.                              3000           3000         3000          3000          3000          3000       
## Parameters                                               7                          4                           4       
## AIC (Linear model)                                   70458.64                    8632.86                     8632.45    
## AIC (Spatial model)                                  70073.27                    8625.28                     8624.78    
## LR test: statistic                                     387.37                       9.58                        9.67    
## LR test: p-value                                         0.00                       0.00                        0.00    
## R^2                                                                   0.00                                              
## Adj. R^2                                                              0.00                                              
## ========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Spatial models are doing better than a-spatial

AIC.all<-matrix(0)
models<-c("spatial.multi.mean","spatial.power.mean","spatial.exp.mean")
for(i in 1:3){
  AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
  AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 70458.645 70073.270  8632.858  8625.278  8632.449  8624.783

Best SRMSE is for multinomial model

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.mean", "spatial.multi.mean", "OLS.power.mean", "spatial.power.mean", "OLS.exp.mean", "spatial.exp.mean")
SRMSE.all<-matrix(0)
for(i in 1:6){
  SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.4506187 0.4102389 1.1003455 1.1003454 1.1003455 1.1003454

Std dev

kaggle_income_s$Stdev[kaggle_income_s$Stdev==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Stdev~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.std<-glm(eq.m)
spatial.multi.std<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Warning: Function errorsarlm moved to the spatialreg package
# power
eq.p = log(kaggle_income_s$Stdev)~log(kaggle_income_s$mindist)
OLS.power.std<-lm(eq.p)
spatial.power.std<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
# exponential
eq.e = log(kaggle_income_s$Stdev)~kaggle_income_s$mindist
OLS.exp.std<-glm(eq.e)
spatial.exp.std<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
screenreg(list(OLS.multi.std, spatial.multi.std, OLS.power.std, spatial.power.std, OLS.exp.std, spatial.exp.std))
## 
## =======================================================================================================================
##                               Model 1              Model 2        Model 3      Model 4       Model 5       Model 6     
## -----------------------------------------------------------------------------------------------------------------------
## (Intercept)                          49176.30 ***   49251.10 ***    10.76 ***     10.76 ***     10.68 ***     10.68 ***
##                                      (1026.16)      (1218.46)       (0.08)        (0.08)        (0.03)        (0.03)   
## kaggle_income_s$mindist                 55.30 *        51.62                                    -0.00 *       -0.00 *  
##                                        (24.73)        (29.15)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2               -0.76 ***      -0.72 ***                                                       
##                                         (0.18)         (0.21)                                                          
## kaggle_income_s$mindist^3                0.00 ***       0.00 ***                                                       
##                                         (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4               -0.00 ***      -0.00 ***                                                       
##                                         (0.00)         (0.00)                                                          
## lambda                                                  0.21 ***                   0.02                        0.02    
##                                                        (0.01)                     (0.02)                      (0.02)   
## log(kaggle_income_s$mindist)                                        -0.03         -0.03                                
##                                                                     (0.02)        (0.02)                               
## -----------------------------------------------------------------------------------------------------------------------
## AIC                                  66681.35                                                 8305.33                  
## BIC                                  66717.39                                                 8323.35                  
## Log Likelihood                      -33334.67      -33238.47                   -4149.63      -4149.67      -4148.82    
## Deviance                      787088200271.66                                                 2793.18                  
## Num. obs.                             3000           3000         3000          3000          3000          3000       
## Parameters                                              7                          4                           4       
## AIC (Linear model)                                  66681.35                    8306.98                     8305.33    
## AIC (Spatial model)                                 66490.93                    8307.26                     8305.63    
## LR test: statistic                                    192.42                       1.72                        1.70    
## LR test: p-value                                        0.00                       0.19                        0.19    
## R^2                                                                  0.00                                              
## Adj. R^2                                                             0.00                                              
## =======================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Spatial models are doing better than a-spatial

AIC.all<-matrix(0)
models<-c("spatial.multi.std","spatial.power.std","spatial.exp.std")
for(i in 1:3){
  AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
  AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 66681.347 66490.930  8306.981  8307.262  8305.335  8305.632

Best SRMSE is for multinomial model

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.std", "spatial.multi.std", "OLS.power.std", "spatial.power.std", "OLS.exp.std", "spatial.exp.std")
SRMSE.all<-matrix(0)
for(i in 1:6){
  SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.5407556 0.5283016 1.1003497 1.1003497 1.1003497 1.1003497

Model quality plots

Median

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, OLS.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, spatial.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, OLS.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, spatial.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Mean

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, OLS.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, spatial.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, OLS.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, spatial.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Standard Deviation

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, OLS.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, spatial.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), OLS.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), spatial.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, OLS.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, spatial.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Additional variables

cost of living, homeless, house index, crime (types) per county/zip code

bea CAINC1: Annual Personal Income by County https://apps.bea.gov/regional/downloadzip.cfm

Housing Median Home Prices https://www.nar.realtor/research-and-statistics/housing-statistics/county-median-home-prices-and-monthly-mortgage-payment

zero_pad <- function(x) { ifelse(str_starts(x, "0"), x, paste0("0",x))}

US counties shapefile

uscounties <- readOGR(".", "tl_2017_us_county")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "tl_2017_us_county"
## with 3233 features
## It has 17 fields
## Integer64 fields read as strings:  ALAND AWATER
uscounties <- spTransform(uscounties, CRS("+proj=longlat +datum=NAD83"))
uscounties$GEOID = as.character(uscounties$GEOID)
uscounties$GEOID0 = uscounties$GEOID %>% sapply(zero_pad)

Personal income per county

income_county <- read.csv("CAINC1__ALL_AREAS_1969_2018.csv", stringsAsFactors=FALSE)
income_county <- income_county[,c("GeoFIPS", "GeoName", "Description", "LineCode", "Unit","X2018")]
income_county <- income_county[1:(dim(income_county)[1]-3),]
income_county <- income_county[income_county$Description=="Personal income (thousands of dollars)",]
colnames(income_county) = c("FIPS", "County", "Description", "LineCode", "Unit","Income")
income_county = income_county[!as.character(income_county$FIPS) %>% sapply(str_trim) %>% sapply(str_ends, "000"),]
income_county$FIPS = as.character(income_county$FIPS) %>% sapply(str_trim)
income_county$Income = as.integer(income_county$Income)
## Warning: NAs introduced by coercion
income_county$FIPS0 = income_county$FIPS %>% sapply(zero_pad)

House Index

house_index <- read.csv2("HI_data.csv", stringsAsFactors=FALSE)
colnames(house_index) = c("FIPS", "Price.RangeQ42019", "Geography", "HousingIndex" )
house_index$FIPS <- paste0("0",house_index$FIPS)
house_index$Geography = house_index$Geography %>% sapply(str_replace, "De Witt", "DeWitt")
house_index$FIPS0 = house_index$FIPS %>% sapply(zero_pad)

Education

education_county <- read.csv2("Education.csv", stringsAsFactors=FALSE)
colnames(education_county) = c("FIPS", "State", "Area.name", "Education.perc")
education_county$Education.perc = as.numeric(education_county$Education.perc)
education_county = education_county[!is.na(education_county$Education.perc),]
education_county$FIPS0 <- paste0("0",education_county$FIPS)

Population

population_county <- read.csv2("Pop.csv", stringsAsFactors=FALSE)
colnames(population_county) = c("FIPS", "State", "Area.name", "Econ.Typology", "Dom.Migr", "Pop")
population_county$Dom.Migr = as.numeric(population_county$Dom.Migr)
population_county$Pop = as.numeric(population_county$Pop)
population_county = population_county[!is.na(population_county$Dom.Migr),]
population_county = population_county[!is.na(population_county$Pop),]
population_county$FIPS0 <- paste0("0",population_county$FIPS)
uscounties = merge(uscounties, house_index, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, income_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, population_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, education_county, by.x="GEOID0", by.y="FIPS0")
uscounties = uscounties[!uscounties$STATEFP %in% c("15", "02"),]

uscounties = uscounties[!is.na(uscounties$Education.perc),]
uscounties = uscounties[!is.na(uscounties$HousingIndex),]
uscounties = uscounties[!is.na(uscounties$Income),]
uscounties = uscounties[!is.na(uscounties$Econ.Typology),]
uscounties = uscounties[!is.na(uscounties$Dom.Migr),]
uscounties = uscounties[!is.na(uscounties$Pop),]

Maps of variables

Housing Index

variable<-uscounties$HousingIndex
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Income

variable<-uscounties$Income
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Education

variable<-uscounties$Education.perc
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Population

variable<-uscounties$Pop
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Domestic migration

variable<-uscounties$Dom.Migr
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Spatial weights matrix

KNN

uscounties.df = as.data.frame(uscounties)
crds<-coordinates(uscounties)

# calculate weights matrix
us.knn<-knearneigh(crds, k=1)
us.knn.nb<-knn2nb(us.knn)
us.sym.knn.nb<-make.sym.nb(us.knn.nb)
us.sym.knn.listw<-nb2listw(us.sym.knn.nb)
M<-cor(uscounties.df[,c("Dom.Migr","HousingIndex", "Pop", "Education.perc")])
corrplot(M)

form = log(Income)~Pop+HousingIndex+Education.perc+Dom.Migr
# GNS, Manski model (all)
GNS_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)
## Warning: Function sacsarlm moved to the spatialreg package
summary(GNS_1)
## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.219885 -0.430513  0.051912  0.502844  2.036685 
## 
## Type: sacmixed 
## Coefficients: (asymptotic standard errors) 
##                      Estimate Std. Error z value  Pr(>|z|)
## (Intercept)        1.7660e+01 3.0450e-01 57.9962 < 2.2e-16
## Pop                2.0950e-06 7.5370e-08 27.7968 < 2.2e-16
## HousingIndex       3.1966e-06 3.3315e-07  9.5950 < 2.2e-16
## Education.perc     4.2094e-02 3.1613e-03 13.3155 < 2.2e-16
## Dom.Migr           7.9429e-05 6.5624e-06 12.1037 < 2.2e-16
## lag.Pop            1.3480e-06 8.9170e-08 15.1171 < 2.2e-16
## lag.HousingIndex   2.0255e-06 3.7899e-07  5.3445 9.068e-08
## lag.Education.perc 1.1461e-02 3.6973e-03  3.1000  0.001936
## lag.Dom.Migr       5.6074e-05 7.2591e-06  7.7246 1.132e-14
## 
## Rho: -0.42174
## Asymptotic standard error: 0.023213
##     z-value: -18.168, p-value: < 2.22e-16
## Lambda: 0.66235
## Asymptotic standard error: 0.015754
##     z-value: 42.044, p-value: < 2.22e-16
## 
## LR test value: 774.27, p-value: < 2.22e-16
## 
## Log likelihood: -4131.385 for sacmixed model
## ML residual variance (sigma squared): 0.53705, (sigma: 0.73284)
## Number of observations: 3044 
## Number of parameters estimated: 12 
## AIC: 8286.8, (AIC for lm: 9049)
# SAC (rho and lambda)
SAC_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAC_1)
## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -8.061383 -0.544972  0.086083  0.641902  2.402963 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                  Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    1.0567e+01 2.9889e-01 35.3552 < 2.2e-16
## Pop            1.9255e-06 7.0171e-08 27.4405 < 2.2e-16
## HousingIndex   2.9650e-06 3.2724e-07  9.0606 < 2.2e-16
## Education.perc 4.3327e-02 2.9820e-03 14.5298 < 2.2e-16
## Dom.Migr       7.4546e-05 6.0359e-06 12.3504 < 2.2e-16
## 
## Rho: 0.1346
## Asymptotic standard error: 0.02219
##     z-value: 6.0659, p-value: 1.3119e-09
## Lambda: 0.21442
## Asymptotic standard error: 0.025894
##     z-value: 8.2806, p-value: 2.2204e-16
## 
## LR test value: 601.74, p-value: < 2.22e-16
## 
## Log likelihood: -4217.654 for sac model
## ML residual variance (sigma squared): 0.89609, (sigma: 0.94662)
## Number of observations: 3044 
## Number of parameters estimated: 8 
## AIC: 8451.3, (AIC for lm: 9049)
# SDM (rho and x lags)
SDM_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="mixed") 
summary(SDM_1)
## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.755311 -0.530582  0.081613  0.634963  2.330427 
## 
## Type: mixed 
## Coefficients: (numerical Hessian approximate standard errors) 
##                       Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)         8.0787e+00  1.6731e-01 48.2870 < 2.2e-16
## Pop                 1.8141e-06  7.3142e-08 24.8021 < 2.2e-16
## HousingIndex        2.7254e-06  4.3920e-07  6.2054 5.455e-10
## Education.perc      4.6333e-02  3.1864e-03 14.5407 < 2.2e-16
## Dom.Migr            6.8381e-05  6.2857e-06 10.8788 < 2.2e-16
## lag.Pop            -9.6857e-08  9.8321e-08 -0.9851    0.3246
## lag.HousingIndex   -3.8078e-07  4.9309e-07 -0.7722    0.4400
## lag.Education.perc -2.2539e-02  3.5741e-03 -6.3063 2.858e-10
## lag.Dom.Migr        4.3336e-06  8.0089e-06  0.5411    0.5884
## 
## Rho: 0.34999, LR test value: 615.83, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.012946
##     z-value: 27.035, p-value: < 2.22e-16
## Wald statistic: 730.9, p-value: < 2.22e-16
## 
## Log likelihood: -4187.434 for mixed model
## ML residual variance (sigma squared): 0.84186, (sigma: 0.91753)
## Number of observations: 3044 
## Number of parameters estimated: 11 
## AIC: 8396.9, (AIC for lm: 9010.7)
#SDEM (lambda and x lags)
SDEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, etype="emixed")
summary(SDEM_1)
## 
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.714501 -0.533814  0.079177  0.637080  2.310477 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                       Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)         1.2429e+01  7.3899e-02 168.1852 < 2.2e-16
## Pop                 1.9696e-06  7.0818e-08  27.8122 < 2.2e-16
## HousingIndex        2.9041e-06  3.5404e-07   8.2028 2.220e-16
## Education.perc      4.4272e-02  3.0648e-03  14.4451 < 2.2e-16
## Dom.Migr            7.6099e-05  6.1529e-06  12.3680 < 2.2e-16
## lag.Pop             6.1167e-07  7.1543e-08   8.5497 < 2.2e-16
## lag.HousingIndex    7.2878e-07  3.7768e-07   1.9296   0.05365
## lag.Education.perc -7.5739e-03  3.3392e-03  -2.2681   0.02332
## lag.Dom.Migr        3.1340e-05  6.3166e-06   4.9616 6.993e-07
## 
## Lambda: 0.3509, LR test value: 616.25, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.012733
##     z-value: 27.558, p-value: < 2.22e-16
## Wald statistic: 759.43, p-value: < 2.22e-16
## 
## Log likelihood: -4187.227 for error model
## ML residual variance (sigma squared): 0.84135, (sigma: 0.91725)
## Number of observations: 3044 
## Number of parameters estimated: 11 
## AIC: 8396.5, (AIC for lm: 9010.7)
#SAR (rho)
SAR_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw) 
summary(SAR_1)
## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.895077 -0.539679  0.086321  0.642340  2.458343 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##                  Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    8.5184e+00 1.5549e-01 54.7829 < 2.2e-16
## Pop            1.8452e-06 6.7280e-08 27.4264 < 2.2e-16
## HousingIndex   2.0318e-06 2.6397e-07  7.6971 1.399e-14
## Education.perc 4.0013e-02 2.7275e-03 14.6704 < 2.2e-16
## Dom.Migr       7.5200e-05 5.7502e-06 13.0778 < 2.2e-16
## 
## Rho: 0.29637, LR test value: 566.44, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.011469
##     z-value: 25.841, p-value: < 2.22e-16
## Wald statistic: 667.75, p-value: < 2.22e-16
## 
## Log likelihood: -4235.303 for lag model
## ML residual variance (sigma squared): 0.89097, (sigma: 0.94391)
## Number of observations: 3044 
## Number of parameters estimated: 7 
## AIC: 8484.6, (AIC for lm: 9049)
#SLX (lags of x)
SLX_1 <- lmSLX(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SLX_1)
## 
## Call:
## lm(formula = formula(paste("y ~ ", paste(colnames(x)[-1], collapse = "+"))), 
##     data = as.data.frame(x), weights = weights)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1240 -0.5948  0.1281  0.7240  2.4385 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.238e+01  5.783e-02 214.030  < 2e-16 ***
## Pop                 2.055e-06  7.987e-08  25.735  < 2e-16 ***
## HousingIndex        2.817e-06  4.600e-07   6.125 1.02e-09 ***
## Education.perc      4.412e-02  3.611e-03  12.218  < 2e-16 ***
## Dom.Migr            8.025e-05  6.852e-06  11.712  < 2e-16 ***
## lag.Pop             5.036e-07  7.928e-08   6.352 2.44e-10 ***
## lag.HousingIndex    7.451e-07  4.772e-07   1.561    0.119    
## lag.Education.perc -5.500e-03  3.832e-03  -1.435    0.151    
## lag.Dom.Migr        3.015e-05  6.843e-06   4.406 1.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.061 on 3035 degrees of freedom
## Multiple R-squared:  0.5406, Adjusted R-squared:  0.5394 
## F-statistic: 446.4 on 8 and 3035 DF,  p-value: < 2.2e-16
#SEM (lambda)
SEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SEM_1)
## 
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.537158 -0.535216  0.076571  0.626528  3.317991 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                  Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    1.2368e+01 5.3042e-02 233.170 < 2.2e-16
## Pop            1.8597e-06 6.9592e-08  26.722 < 2.2e-16
## HousingIndex   3.7812e-06 3.1551e-07  11.985 < 2.2e-16
## Education.perc 4.1921e-02 3.0193e-03  13.885 < 2.2e-16
## Dom.Migr       6.8902e-05 5.9837e-06  11.515 < 2.2e-16
## 
## Lambda: 0.34815, LR test value: 579.2, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.013123
##     z-value: 26.53, p-value: < 2.22e-16
## Wald statistic: 703.87, p-value: < 2.22e-16
## 
## Log likelihood: -4228.921 for error model
## ML residual variance (sigma squared): 0.86594, (sigma: 0.93056)
## Number of observations: 3044 
## Number of parameters estimated: 7 
## AIC: 8471.8, (AIC for lm: 9049)
#OLS
OLS_1<-lm(form, data=uscounties.df)
summary(OLS_1)
## 
## Call:
## lm(formula = form, data = uscounties.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0086  -0.6060   0.1288   0.7388   2.5707 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.237e+01  5.006e-02  247.10   <2e-16 ***
## Pop            2.267e-06  7.388e-08   30.68   <2e-16 ***
## HousingIndex   3.707e-06  2.896e-07   12.80   <2e-16 ***
## Education.perc 3.923e-02  3.087e-03   12.71   <2e-16 ***
## Dom.Migr       9.036e-05  6.475e-06   13.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.069 on 3039 degrees of freedom
## Multiple R-squared:  0.5335, Adjusted R-squared:  0.5329 
## F-statistic:   869 on 4 and 3039 DF,  p-value: < 2.2e-16

Clustering

Optimal clustering using silhouette

Optimal number of clusters k=2

fviz_nbclust(coords[1:5000,], clara, method="silhouette")

Optimal clustering using silhouette

Optimal number of clusters k=2

gap<-clusGap(coords[1:5000,], FUN=clara, K.max=50, B=5)
fviz_gap_stat(gap)
fviz_gap_stat(gap, linecolor="red", maxSE=list(method="globalmax"))

9

c9<-clara(coords, 9, metric="euclidean")
fviz_cluster(c9, pch=19, geom="point")
fviz_silhouette(c9)
kaggle_income$c9_clust = c9$clustering
ggplot(kaggle_income[,c("Median", "c9_clust")], aes(x=Median))+
  geom_density()+
  facet_wrap(~c9_clust,  ncol=3)

2

c2<-clara(coords, 2, metric="euclidean", sampsize=1000)
fviz_cluster(c2, pch=19, geom="point")
fviz_silhouette(c2)

44

c44<-clara(coords, 44, metric="euclidean")
fviz_cluster(c44, pch=19, geom="point")
fviz_silhouette(c44)
kaggle_income$c44_clust = c44$clustering
ggplot(kaggle_income[,c("Median", "c44_clust")], aes(x=Median))+
  geom_density()+
  facet_wrap(~c44_clust,  ncol=8)
ggplot(kaggle_income[,c("Median", "State_Code")], aes(x=Median))+
  geom_density()+
  facet_wrap(~State_Code,  ncol=8)