# install.packages('texreg')
library(spdep)
library(rgdal)
library(maptools)
library(sp)
library(RColorBrewer)
library(classInt)
library(GISTools)
library(maps)
library(viridis)
library(corrplot)
library(texreg)
library(ggplot2)
library(gridExtra) 
library(readr)
library(cluster)
library(factoextra)
library(stringr)

kaggle_income <- read.csv("kaggle_income.csv")

usmap <- readOGR(".", "cb_2018_us_state_500k")

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "cb_2018_us_state_500k"
## with 56 features
## It has 9 fields
## Integer64 fields read as strings:  ALAND AWATER

usmap <- spTransform(usmap, CRS("+proj=longlat +datum=NAD83"))

usmap = usmap[!(usmap$NAME %in% c("Puerto Rico", "American Samoa", "United States Virgin Islands" , "Alaska", "Hawaii", "Guam", "Commonwealth of the Northern Mariana Islands")),]

kaggle_income[,"Lon"] %>% min()

## [1] -175.86

kaggle_income[,"Lon"] %>% max()

## [1] -65.50082

kaggle_income[,"Lat"] %>% min()

## [1] 17.92909

kaggle_income[,"Lat"] %>% max()

## [1] 71.2535

kaggle_income = kaggle_income[(kaggle_income$Lon>-125)&(kaggle_income$Lon<-66)&(kaggle_income$Lat>24)&(kaggle_income$Lat<50),]

START

coords = kaggle_income[,c("Lon", "Lat")]
coords.m = as.matrix(coords)

ggp1 = ggplot(data=kaggle_income, aes(x=Mean))+geom_density(color="coral3", fill="coral", alpha=0.3) + theme_light()
ggp2 = ggplot(data=kaggle_income, aes(x=Median))+geom_density(color="darkcyan", fill="cyan3", alpha=0.3) + theme_light()
ggp3 = ggplot(data=kaggle_income, aes(x=Stdev))+geom_density(color="chartreuse4", fill="chartreuse3", alpha=0.3) + theme_light()
grid.arrange(ggp1, ggp2, ggp3, ncol = 3)

Points plot on map

median

variable <- kaggle_income$Median
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])

mean

variable <- kaggle_income$Mean
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])

std dev

variable <- kaggle_income$Stdev
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
       col=cols[findInterval(variable, brks)])

Moran’s I

inc.knn<-knearneigh(coords.m, k=1) # knn class (k=1)

## Warning in knearneigh(coords.m, k = 1): knearneigh: identical points found

inc.knn.nb<-knn2nb(inc.knn)

# checking for matrix symmetry
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))

## [1] "Is matrix symmetric: FALSE"

inc.knn.nb<-make.sym.nb(inc.knn.nb)
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))

## [1] "Is matrix symmetric: TRUE"

# knn as listw class
inc.knn.listw<-nb2listw(inc.knn.nb)

All tests are significant, with a low p-value that allows us to reject the null hypothesis and assume there is a spatial autocorrelation. The value of the Morans I statistic is positive signifying a positive

Median

moran.test(kaggle_income$Median, inc.knn.listw)

## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Median  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 24.726, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      1.693882e-01     -3.134600e-05      4.694837e-05

x<-kaggle_income$Median
zx<-as.data.frame(scale(x))  
wzx<-lag.listw(inc.knn.listw, zx$V1) 
morlm<-lm(wzx~zx$V1)  
summary(morlm)

## 
## Call:
## lm(formula = wzx ~ zx$V1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.37462 -0.49356 -0.32937  0.00814  2.61268 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.015652   0.004954   -3.16  0.00158 ** 
## zx$V1        0.169388   0.004954   34.20  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8848 on 31901 degrees of freedom
## Multiple R-squared:  0.03536,    Adjusted R-squared:  0.03533 
## F-statistic:  1169 on 1 and 31901 DF,  p-value: < 2.2e-16

slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="coral3")  
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Mean

moran.test(kaggle_income$Mean, inc.knn.listw)

## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Mean  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 85.088, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      5.829837e-01     -3.134600e-05      4.694889e-05

x<-kaggle_income$Mean
zx<-as.data.frame(scale(x))  
wzx<-lag.listw(inc.knn.listw, zx$V1) 
morlm<-lm(wzx~zx$V1)  
summary(morlm)

## 
## Call:
## lm(formula = wzx ~ zx$V1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5850 -0.4375 -0.0744  0.3741  7.1161 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.016918   0.004112  -4.114  3.9e-05 ***
## zx$V1        0.582984   0.004112 141.761  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7345 on 31901 degrees of freedom
## Multiple R-squared:  0.3865, Adjusted R-squared:  0.3865 
## F-statistic: 2.01e+04 on 1 and 31901 DF,  p-value: < 2.2e-16

slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="darkcyan")  
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Std dev

moran.test(kaggle_income$Stdev, inc.knn.listw)

## 
##  Moran I test under randomisation
## 
## data:  kaggle_income$Stdev  
## weights: inc.knn.listw    
## 
## Moran I statistic standard deviate = 67.374, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      4.616191e-01     -3.134600e-05      4.695085e-05

x<-kaggle_income$Stdev
zx<-as.data.frame(scale(x))  
wzx<-lag.listw(inc.knn.listw, zx$V1) 
morlm<-lm(wzx~zx$V1)  
summary(morlm)

## 
## Call:
## lm(formula = wzx ~ zx$V1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0390 -0.5030 -0.0140  0.5099  3.9653 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.010419   0.004533  -2.299   0.0215 *  
## zx$V1        0.461619   0.004533 101.845   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8096 on 31901 degrees of freedom
## Multiple R-squared:  0.2454, Adjusted R-squared:  0.2453 
## F-statistic: 1.037e+04 on 1 and 31901 DF,  p-value: < 2.2e-16

slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="chartreuse4")  
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)

Distance

# Select capital cities
state.capital.cities <- us.cities[us.cities$capital==2,c("name", "pop", "lat", "long")]
state.capital.cities <- state.capital.cities[state.capital.cities$name != "Honolulu HI",]
state.capital.cities

##                  name     pop   lat    long
## 5           Albany NY   93576 42.67  -73.80
## 26       Annapolis MD   36300 38.98  -76.49
## 41         Atlanta GA  424096 33.76  -84.42
## 46         Augusta ME   18626 44.32  -69.77
## 50          Austin TX  683404 30.31  -97.75
## 58     Baton Rouge LA  222217 30.45  -91.13
## 84        Bismarck ND   56927 46.81 -100.77
## 94           Boise ID  193628 43.61 -116.23
## 98          Boston MA  567759 42.34  -71.02
## 144    Carson City NV   58350 39.15 -119.74
## 165     Charleston WV   49804 38.35  -81.63
## 172       Cheyenne WY   55833 41.15 -104.79
## 196       Columbia SC  118020 34.04  -80.89
## 199       Columbus OH  741677 39.99  -82.99
## 203        Concord NH   42967 43.23  -71.56
## 247         Denver CO  556575 39.77 -104.87
## 248     Des Moines IA  192050 41.58  -93.62
## 253          Dover DE   34288 39.16  -75.53
## 332      Frankfort KY   27210 38.20  -84.86
## 385     Harrisburg PA   47576 40.28  -76.88
## 387       Hartford CT  123836 41.77  -72.68
## 392         Helena MT   27383 46.60 -112.03
## 422   Indianapolis IN  771725 39.78  -86.15
## 430        Jackson MS  175085 32.32  -90.21
## 435 Jefferson City MO   39062 38.57  -92.19
## 441         Juneau AK   31187 58.30 -134.42
## 485        Lansing MI  117236 42.71  -84.55
## 507        Lincoln NE  245301 40.82  -96.69
## 509    Little Rock AR  184323 34.72  -92.35
## 536        Madison WI  227642 43.08  -89.39
## 583     Montgomery AL  197653 32.35  -86.28
## 585     Montpelier VT    8003 44.26  -72.57
## 601      Nashville TN  523547 36.17  -86.78
## 647  Oklahoma City OK  538141 35.47  -97.51
## 649        Olympia WA   45403 47.04 -122.89
## 694        Phoenix AZ 1450884 33.54 -112.07
## 696         Pierre SD   14052 44.37 -100.34
## 726     Providence RI  178295 41.82  -71.42
## 732        Raleigh NC  350822 35.82  -78.66
## 754       Richmond VA  189498 37.53  -77.47
## 778     Sacramento CA  480392 38.57 -121.47
## 787     Saint Paul MN  272469 44.95  -93.10
## 791          Salem OR  148942 44.92 -123.02
## 794 Salt Lake City UT  177318 40.78 -111.93
## 821       Santa Fe NM   70598 35.68 -105.95
## 863    Springfield IL  116725 39.78  -89.64
## 886    Tallahassee FL  154114 30.46  -84.28
## 910         Topeka KS  121229 39.04  -95.69
## 915        Trenton NJ   84653 40.22  -74.76

coords <- kaggle_income[,c("Lon", "Lat")]
coords <- as.matrix(coords)
for (city.name in state.capital.cities$name) {
  col.city.name = print(paste0("dist_",city.name))
  lat = state.capital.cities[state.capital.cities$name==city.name,]$lat
  long = state.capital.cities[state.capital.cities$name==city.name,]$long
  kaggle_income[col.city.name]<-spDistsN1(coords, c(long, lat), longlat=TRUE)
}

## [1] "dist_Albany NY"
## [1] "dist_Annapolis MD"
## [1] "dist_Atlanta GA"
## [1] "dist_Augusta ME"
## [1] "dist_Austin TX"
## [1] "dist_Baton Rouge LA"
## [1] "dist_Bismarck ND"
## [1] "dist_Boise ID"
## [1] "dist_Boston MA"
## [1] "dist_Carson City NV"
## [1] "dist_Charleston WV"
## [1] "dist_Cheyenne WY"
## [1] "dist_Columbia SC"
## [1] "dist_Columbus OH"
## [1] "dist_Concord NH"
## [1] "dist_Denver CO"
## [1] "dist_Des Moines IA"
## [1] "dist_Dover DE"
## [1] "dist_Frankfort KY"
## [1] "dist_Harrisburg PA"
## [1] "dist_Hartford CT"
## [1] "dist_Helena MT"
## [1] "dist_Indianapolis IN"
## [1] "dist_Jackson MS"
## [1] "dist_Jefferson City MO"
## [1] "dist_Juneau AK"
## [1] "dist_Lansing MI"
## [1] "dist_Lincoln NE"
## [1] "dist_Little Rock AR"
## [1] "dist_Madison WI"
## [1] "dist_Montgomery AL"
## [1] "dist_Montpelier VT"
## [1] "dist_Nashville TN"
## [1] "dist_Oklahoma City OK"
## [1] "dist_Olympia WA"
## [1] "dist_Phoenix AZ"
## [1] "dist_Pierre SD"
## [1] "dist_Providence RI"
## [1] "dist_Raleigh NC"
## [1] "dist_Richmond VA"
## [1] "dist_Sacramento CA"
## [1] "dist_Saint Paul MN"
## [1] "dist_Salem OR"
## [1] "dist_Salt Lake City UT"
## [1] "dist_Santa Fe NM"
## [1] "dist_Springfield IL"
## [1] "dist_Tallahassee FL"
## [1] "dist_Topeka KS"
## [1] "dist_Trenton NJ"

kaggle_income$mindist <- NULL
kaggle_income$mindist = apply(kaggle_income[,20:length(colnames(kaggle_income))], 1, min)

cols = rev(brewer.pal(9, "Blues"))
mindist <- kaggle_income$mindist
brks <- stats::quantile(mindist, probs=seq(0, 1, 0.13))

plot(usmap)
points(kaggle_income[,c("Lon", "Lat")], col=cols[findInterval(mindist, brks)], pch=21, cex=0.7, 
       bg=cols[findInterval(mindist, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
title(main="Distances from closest state capital")

Interactions

kaggle_income_s = dplyr::sample_n(kaggle_income, 3000)

coords.s = kaggle_income_s[,c("Lon", "Lat")]
coords.m.s = as.matrix(coords.s)

inc.knn<-knearneigh(coords.m.s, k=1) # knn class (k=1)

## Warning in knearneigh(coords.m.s, k = 1): knearneigh: identical points found

inc.knn.nb<-knn2nb(inc.knn)

# checking for matrix symmetry
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))

## [1] "Is matrix symmetric: FALSE"

inc.knn.nb<-make.sym.nb(inc.knn.nb)
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))

## [1] "Is matrix symmetric: TRUE"

# knn as listw class
inc.knn.listw.s<-nb2listw(inc.knn.nb)

Median

kaggle_income_s$Median[kaggle_income_s$Median==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Median~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.med<-glm(eq.m)
spatial.multi.med<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)

## Warning: Function errorsarlm moved to the spatialreg package

## Registered S3 methods overwritten by 'spatialreg':
##   method                   from 
##   residuals.stsls          spdep
##   deviance.stsls           spdep
##   coef.stsls               spdep
##   print.stsls              spdep
##   summary.stsls            spdep
##   print.summary.stsls      spdep
##   residuals.gmsar          spdep
##   deviance.gmsar           spdep
##   coef.gmsar               spdep
##   fitted.gmsar             spdep
##   print.gmsar              spdep
##   summary.gmsar            spdep
##   print.summary.gmsar      spdep
##   print.lagmess            spdep
##   summary.lagmess          spdep
##   print.summary.lagmess    spdep
##   residuals.lagmess        spdep
##   deviance.lagmess         spdep
##   coef.lagmess             spdep
##   fitted.lagmess           spdep
##   logLik.lagmess           spdep
##   fitted.SFResult          spdep
##   print.SFResult           spdep
##   fitted.ME_res            spdep
##   print.ME_res             spdep
##   print.lagImpact          spdep
##   plot.lagImpact           spdep
##   summary.lagImpact        spdep
##   HPDinterval.lagImpact    spdep
##   print.summary.lagImpact  spdep
##   print.sarlm              spdep
##   summary.sarlm            spdep
##   residuals.sarlm          spdep
##   deviance.sarlm           spdep
##   coef.sarlm               spdep
##   vcov.sarlm               spdep
##   fitted.sarlm             spdep
##   logLik.sarlm             spdep
##   anova.sarlm              spdep
##   predict.sarlm            spdep
##   print.summary.sarlm      spdep
##   print.sarlm.pred         spdep
##   as.data.frame.sarlm.pred spdep
##   residuals.spautolm       spdep
##   deviance.spautolm        spdep
##   coef.spautolm            spdep
##   fitted.spautolm          spdep
##   print.spautolm           spdep
##   summary.spautolm         spdep
##   logLik.spautolm          spdep
##   print.summary.spautolm   spdep
##   print.WXImpact           spdep
##   summary.WXImpact         spdep
##   print.summary.WXImpact   spdep
##   predict.SLX              spdep

# power
eq.p = log(kaggle_income_s$Median)~log(kaggle_income_s$mindist)
OLS.power.med<-lm(eq.p)
spatial.power.med<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)

## Warning: Function errorsarlm moved to the spatialreg package

# exponential
eq.e = log(kaggle_income_s$Median)~kaggle_income_s$mindist
OLS.exp.med<-glm(eq.e)
spatial.exp.med<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)

## Warning: Function errorsarlm moved to the spatialreg package

screenreg(list(OLS.multi.med, spatial.multi.med, OLS.power.med, spatial.power.med, OLS.exp.med, spatial.exp.med))

## 
## =========================================================================================================================
##                               Model 1                Model 2        Model 3      Model 4       Model 5       Model 6     
## -------------------------------------------------------------------------------------------------------------------------
## (Intercept)                           107441.01 ***  107765.32 ***    11.21 ***     11.21 ***     10.99 ***     10.99 ***
##                                        (5512.62)      (5904.13)       (0.10)        (0.10)        (0.03)        (0.04)   
## kaggle_income_s$mindist                 -158.25        -166.34                                    -0.00 *       -0.00 *  
##                                         (132.84)       (141.98)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2                 -0.77          -0.71                                                           
##                                           (0.94)         (1.01)                                                          
## kaggle_income_s$mindist^3                  0.00           0.00                                                           
##                                           (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4                 -0.00 *        -0.00 *                                                         
##                                           (0.00)         (0.00)                                                          
## lambda                                                    0.08 ***                   0.06 ***                    0.06 ***
##                                                          (0.01)                     (0.02)                      (0.02)   
## log(kaggle_income_s$mindist)                                          -0.06 **      -0.06 **                             
##                                                                       (0.02)        (0.02)                               
## -------------------------------------------------------------------------------------------------------------------------
## AIC                                    76768.66                                                 9645.91                  
## BIC                                    76804.70                                                 9663.93                  
## Log Likelihood                        -38378.33      -38365.51                   -4811.75      -4819.95      -4812.68    
## Deviance                      22714919906323.84                                                 4366.83                  
## Num. obs.                               3000           3000         3000          3000          3000          3000       
## Parameters                                                7                          4                           4       
## AIC (Linear model)                                    76768.66                    9643.65                     9645.91    
## AIC (Spatial model)                                   76745.01                    9631.50                     9633.37    
## LR test: statistic                                       25.65                      14.15                       14.54    
## LR test: p-value                                          0.00                       0.00                        0.00    
## R^2                                                                    0.00                                              
## Adj. R^2                                                               0.00                                              
## =========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Spatial models are doing better than a-spatial

AIC.all<-matrix(0)
models<-c("spatial.multi.med","spatial.power.med","spatial.exp.med")
for(i in 1:3){
  AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
  AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all

## [1] 76768.658 76745.012  9643.649  9631.502  9645.906  9633.368

Best SRMSE is for multinomial model

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.med", "spatial.multi.med", "OLS.power.med", "spatial.power.med", "OLS.exp.med", "spatial.exp.med")
SRMSE.all<-matrix(0)
for(i in 1:6){
  SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Median")}
SRMSE.all

## [1] 1.013628 1.007353 1.430255 1.430255 1.430255 1.430255

Mean

kaggle_income_s$Mean[kaggle_income_s$Mean==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Mean~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.mean<-glm(eq.m)
spatial.multi.mean<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)

## Warning: Function errorsarlm moved to the spatialreg package

# power
eq.p = log(kaggle_income_s$Mean)~log(kaggle_income_s$mindist)
OLS.power.mean<-lm(eq.p)
spatial.power.mean<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)

## Warning: Function errorsarlm moved to the spatialreg package

# exponential
eq.e = log(kaggle_income_s$Mean)~kaggle_income_s$mindist
OLS.exp.mean<-glm(eq.e)
spatial.exp.mean<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)

## Warning: Function errorsarlm moved to the spatialreg package

screenreg(list(OLS.multi.mean, spatial.multi.mean, OLS.power.mean, spatial.power.mean, OLS.exp.mean, spatial.exp.mean))

## 
## ========================================================================================================================
##                               Model 1               Model 2        Model 3      Model 4       Model 5       Model 6     
## ------------------------------------------------------------------------------------------------------------------------
## (Intercept)                           73979.98 ***   75215.32 ***    11.12 ***     11.12 ***     11.00 ***     11.00 ***
##                                       (1925.86)      (2415.57)       (0.08)        (0.09)        (0.03)        (0.03)   
## kaggle_income_s$mindist                  39.62           8.03                                    -0.00 *       -0.00 *  
##                                         (46.41)        (57.51)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2                -1.15 ***      -0.91 *                                                         
##                                          (0.33)         (0.41)                                                          
## kaggle_income_s$mindist^3                 0.00 ***       0.00 **                                                        
##                                          (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4                -0.00 ***      -0.00 **                                                        
##                                          (0.00)         (0.00)                                                          
## lambda                                                   0.29 ***                   0.05 **                     0.05 ** 
##                                                         (0.01)                     (0.02)                      (0.02)   
## log(kaggle_income_s$mindist)                                         -0.04 *       -0.04 *                              
##                                                                      (0.02)        (0.02)                               
## ------------------------------------------------------------------------------------------------------------------------
## AIC                                   70458.64                                                 8632.45                  
## BIC                                   70494.68                                                 8650.47                  
## Log Likelihood                       -35223.32      -35029.64                   -4308.64      -4313.22      -4308.39    
## Deviance                      2772319618034.09                                                 3114.97                  
## Num. obs.                              3000           3000         3000          3000          3000          3000       
## Parameters                                               7                          4                           4       
## AIC (Linear model)                                   70458.64                    8632.86                     8632.45    
## AIC (Spatial model)                                  70073.27                    8625.28                     8624.78    
## LR test: statistic                                     387.37                       9.58                        9.67    
## LR test: p-value                                         0.00                       0.00                        0.00    
## R^2                                                                   0.00                                              
## Adj. R^2                                                              0.00                                              
## ========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Spatial models are doing better than a-spatial

AIC.all<-matrix(0)
models<-c("spatial.multi.mean","spatial.power.mean","spatial.exp.mean")
for(i in 1:3){
  AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
  AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all

## [1] 70458.645 70073.270  8632.858  8625.278  8632.449  8624.783

Best SRMSE is for multinomial model

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.mean", "spatial.multi.mean", "OLS.power.mean", "spatial.power.mean", "OLS.exp.mean", "spatial.exp.mean")
SRMSE.all<-matrix(0)
for(i in 1:6){
  SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all

## [1] 0.4506187 0.4102389 1.1003455 1.1003454 1.1003455 1.1003454

Std dev

kaggle_income_s$Stdev[kaggle_income_s$Stdev==0]<-2 # eliminating -inf for log(0)

#multinomial
eq.m = kaggle_income_s$Stdev~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.std<-glm(eq.m)
spatial.multi.std<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)

## Warning: Function errorsarlm moved to the spatialreg package

# power
eq.p = log(kaggle_income_s$Stdev)~log(kaggle_income_s$mindist)
OLS.power.std<-lm(eq.p)
spatial.power.std<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)

## Warning: Function errorsarlm moved to the spatialreg package

# exponential
eq.e = log(kaggle_income_s$Stdev)~kaggle_income_s$mindist
OLS.exp.std<-glm(eq.e)
spatial.exp.std<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)

## Warning: Function errorsarlm moved to the spatialreg package

screenreg(list(OLS.multi.std, spatial.multi.std, OLS.power.std, spatial.power.std, OLS.exp.std, spatial.exp.std))

## 
## =======================================================================================================================
##                               Model 1              Model 2        Model 3      Model 4       Model 5       Model 6     
## -----------------------------------------------------------------------------------------------------------------------
## (Intercept)                          49176.30 ***   49251.10 ***    10.76 ***     10.76 ***     10.68 ***     10.68 ***
##                                      (1026.16)      (1218.46)       (0.08)        (0.08)        (0.03)        (0.03)   
## kaggle_income_s$mindist                 55.30 *        51.62                                    -0.00 *       -0.00 *  
##                                        (24.73)        (29.15)                                   (0.00)        (0.00)   
## kaggle_income_s$mindist^2               -0.76 ***      -0.72 ***                                                       
##                                         (0.18)         (0.21)                                                          
## kaggle_income_s$mindist^3                0.00 ***       0.00 ***                                                       
##                                         (0.00)         (0.00)                                                          
## kaggle_income_s$mindist^4               -0.00 ***      -0.00 ***                                                       
##                                         (0.00)         (0.00)                                                          
## lambda                                                  0.21 ***                   0.02                        0.02    
##                                                        (0.01)                     (0.02)                      (0.02)   
## log(kaggle_income_s$mindist)                                        -0.03         -0.03                                
##                                                                     (0.02)        (0.02)                               
## -----------------------------------------------------------------------------------------------------------------------
## AIC                                  66681.35                                                 8305.33                  
## BIC                                  66717.39                                                 8323.35                  
## Log Likelihood                      -33334.67      -33238.47                   -4149.63      -4149.67      -4148.82    
## Deviance                      787088200271.66                                                 2793.18                  
## Num. obs.                             3000           3000         3000          3000          3000          3000       
## Parameters                                              7                          4                           4       
## AIC (Linear model)                                  66681.35                    8306.98                     8305.33    
## AIC (Spatial model)                                 66490.93                    8307.26                     8305.63    
## LR test: statistic                                    192.42                       1.72                        1.70    
## LR test: p-value                                        0.00                       0.19                        0.19    
## R^2                                                                  0.00                                              
## Adj. R^2                                                             0.00                                              
## =======================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Spatial models are doing better than a-spatial

AIC.all<-matrix(0)
models<-c("spatial.multi.std","spatial.power.std","spatial.exp.std")
for(i in 1:3){
  AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
  AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all

## [1] 66681.347 66490.930  8306.981  8307.262  8305.335  8305.632

Best SRMSE is for multinomial model

SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}

models<-c("OLS.multi.std", "spatial.multi.std", "OLS.power.std", "spatial.power.std", "OLS.exp.std", "spatial.exp.std")
SRMSE.all<-matrix(0)
for(i in 1:6){
  SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all

## [1] 0.5407556 0.5283016 1.1003497 1.1003497 1.1003497 1.1003497

Model quality plots

Median

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, OLS.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, spatial.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, OLS.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, spatial.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Mean

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, OLS.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, spatial.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, OLS.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, spatial.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Standard Deviation

Multinomial

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, OLS.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")

plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, spatial.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")

Power

par(mfrow=c(1,2))

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), OLS.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")

plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), spatial.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")

Exponential

par(mfrow=c(1,2))

plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, OLS.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")

plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, spatial.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")

Additional variables

cost of living, homeless, house index, crime (types) per county/zip code

bea CAINC1: Annual Personal Income by County https://apps.bea.gov/regional/downloadzip.cfm

Housing Median Home Prices https://www.nar.realtor/research-and-statistics/housing-statistics/county-median-home-prices-and-monthly-mortgage-payment

zero_pad <- function(x) { ifelse(str_starts(x, "0"), x, paste0("0",x))}

US counties shapefile

uscounties <- readOGR(".", "tl_2017_us_county")

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "tl_2017_us_county"
## with 3233 features
## It has 17 fields
## Integer64 fields read as strings:  ALAND AWATER

uscounties <- spTransform(uscounties, CRS("+proj=longlat +datum=NAD83"))
uscounties$GEOID = as.character(uscounties$GEOID)
uscounties$GEOID0 = uscounties$GEOID %>% sapply(zero_pad)

Personal income per county

income_county <- read.csv("CAINC1__ALL_AREAS_1969_2018.csv", stringsAsFactors=FALSE)
income_county <- income_county[,c("GeoFIPS", "GeoName", "Description", "LineCode", "Unit","X2018")]
income_county <- income_county[1:(dim(income_county)[1]-3),]
income_county <- income_county[income_county$Description=="Personal income (thousands of dollars)",]
colnames(income_county) = c("FIPS", "County", "Description", "LineCode", "Unit","Income")
income_county = income_county[!as.character(income_county$FIPS) %>% sapply(str_trim) %>% sapply(str_ends, "000"),]
income_county$FIPS = as.character(income_county$FIPS) %>% sapply(str_trim)
income_county$Income = as.integer(income_county$Income)

## Warning: NAs introduced by coercion

income_county$FIPS0 = income_county$FIPS %>% sapply(zero_pad)

House Index

house_index <- read.csv2("HI_data.csv", stringsAsFactors=FALSE)
colnames(house_index) = c("FIPS", "Price.RangeQ42019", "Geography", "HousingIndex" )
house_index$FIPS <- paste0("0",house_index$FIPS)
house_index$Geography = house_index$Geography %>% sapply(str_replace, "De Witt", "DeWitt")
house_index$FIPS0 = house_index$FIPS %>% sapply(zero_pad)

Education

education_county <- read.csv2("Education.csv", stringsAsFactors=FALSE)
colnames(education_county) = c("FIPS", "State", "Area.name", "Education.perc")
education_county$Education.perc = as.numeric(education_county$Education.perc)
education_county = education_county[!is.na(education_county$Education.perc),]
education_county$FIPS0 <- paste0("0",education_county$FIPS)

Population

population_county <- read.csv2("Pop.csv", stringsAsFactors=FALSE)
colnames(population_county) = c("FIPS", "State", "Area.name", "Econ.Typology", "Dom.Migr", "Pop")
population_county$Dom.Migr = as.numeric(population_county$Dom.Migr)
population_county$Pop = as.numeric(population_county$Pop)
population_county = population_county[!is.na(population_county$Dom.Migr),]
population_county = population_county[!is.na(population_county$Pop),]
population_county$FIPS0 <- paste0("0",population_county$FIPS)

uscounties = merge(uscounties, house_index, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, income_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, population_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, education_county, by.x="GEOID0", by.y="FIPS0")
uscounties = uscounties[!uscounties$STATEFP %in% c("15", "02"),]

uscounties = uscounties[!is.na(uscounties$Education.perc),]
uscounties = uscounties[!is.na(uscounties$HousingIndex),]
uscounties = uscounties[!is.na(uscounties$Income),]
uscounties = uscounties[!is.na(uscounties$Econ.Typology),]
uscounties = uscounties[!is.na(uscounties$Dom.Migr),]
uscounties = uscounties[!is.na(uscounties$Pop),]

Maps of variables

Housing Index

variable<-uscounties$HousingIndex
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Income

variable<-uscounties$Income
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Education

variable<-uscounties$Education.perc
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Population

variable<-uscounties$Pop
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Domestic migration

variable<-uscounties$Dom.Migr
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)

Spatial weights matrix

KNN

uscounties.df = as.data.frame(uscounties)

crds<-coordinates(uscounties)

# calculate weights matrix
us.knn<-knearneigh(crds, k=1)
us.knn.nb<-knn2nb(us.knn)
us.sym.knn.nb<-make.sym.nb(us.knn.nb)
us.sym.knn.listw<-nb2listw(us.sym.knn.nb)

M<-cor(uscounties.df[,c("Dom.Migr","HousingIndex", "Pop", "Education.perc")])
corrplot(M)

form = log(Income)~Pop+HousingIndex+Education.perc+Dom.Migr

# GNS, Manski model (all)
GNS_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)

## Warning: Function sacsarlm moved to the spatialreg package

summary(GNS_1)

## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.219885 -0.430513  0.051912  0.502844  2.036685 
## 
## Type: sacmixed 
## Coefficients: (asymptotic standard errors) 
##                      Estimate Std. Error z value  Pr(>|z|)
## (Intercept)        1.7660e+01 3.0450e-01 57.9962 < 2.2e-16
## Pop                2.0950e-06 7.5370e-08 27.7968 < 2.2e-16
## HousingIndex       3.1966e-06 3.3315e-07  9.5950 < 2.2e-16
## Education.perc     4.2094e-02 3.1613e-03 13.3155 < 2.2e-16
## Dom.Migr           7.9429e-05 6.5624e-06 12.1037 < 2.2e-16
## lag.Pop            1.3480e-06 8.9170e-08 15.1171 < 2.2e-16
## lag.HousingIndex   2.0255e-06 3.7899e-07  5.3445 9.068e-08
## lag.Education.perc 1.1461e-02 3.6973e-03  3.1000  0.001936
## lag.Dom.Migr       5.6074e-05 7.2591e-06  7.7246 1.132e-14
## 
## Rho: -0.42174
## Asymptotic standard error: 0.023213
##     z-value: -18.168, p-value: < 2.22e-16
## Lambda: 0.66235
## Asymptotic standard error: 0.015754
##     z-value: 42.044, p-value: < 2.22e-16
## 
## LR test value: 774.27, p-value: < 2.22e-16
## 
## Log likelihood: -4131.385 for sacmixed model
## ML residual variance (sigma squared): 0.53705, (sigma: 0.73284)
## Number of observations: 3044 
## Number of parameters estimated: 12 
## AIC: 8286.8, (AIC for lm: 9049)

# SAC (rho and lambda)
SAC_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAC_1)

## 
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw, 
##     listw2 = listw2, na.action = na.action, Durbin = Durbin, 
##     type = type, method = method, quiet = quiet, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, interval1 = interval1, 
##     interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -8.061383 -0.544972  0.086083  0.641902  2.402963 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                  Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    1.0567e+01 2.9889e-01 35.3552 < 2.2e-16
## Pop            1.9255e-06 7.0171e-08 27.4405 < 2.2e-16
## HousingIndex   2.9650e-06 3.2724e-07  9.0606 < 2.2e-16
## Education.perc 4.3327e-02 2.9820e-03 14.5298 < 2.2e-16
## Dom.Migr       7.4546e-05 6.0359e-06 12.3504 < 2.2e-16
## 
## Rho: 0.1346
## Asymptotic standard error: 0.02219
##     z-value: 6.0659, p-value: 1.3119e-09
## Lambda: 0.21442
## Asymptotic standard error: 0.025894
##     z-value: 8.2806, p-value: 2.2204e-16
## 
## LR test value: 601.74, p-value: < 2.22e-16
## 
## Log likelihood: -4217.654 for sac model
## ML residual variance (sigma squared): 0.89609, (sigma: 0.94662)
## Number of observations: 3044 
## Number of parameters estimated: 8 
## AIC: 8451.3, (AIC for lm: 9049)

# SDM (rho and x lags)
SDM_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="mixed") 
summary(SDM_1)

## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.755311 -0.530582  0.081613  0.634963  2.330427 
## 
## Type: mixed 
## Coefficients: (numerical Hessian approximate standard errors) 
##                       Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)         8.0787e+00  1.6731e-01 48.2870 < 2.2e-16
## Pop                 1.8141e-06  7.3142e-08 24.8021 < 2.2e-16
## HousingIndex        2.7254e-06  4.3920e-07  6.2054 5.455e-10
## Education.perc      4.6333e-02  3.1864e-03 14.5407 < 2.2e-16
## Dom.Migr            6.8381e-05  6.2857e-06 10.8788 < 2.2e-16
## lag.Pop            -9.6857e-08  9.8321e-08 -0.9851    0.3246
## lag.HousingIndex   -3.8078e-07  4.9309e-07 -0.7722    0.4400
## lag.Education.perc -2.2539e-02  3.5741e-03 -6.3063 2.858e-10
## lag.Dom.Migr        4.3336e-06  8.0089e-06  0.5411    0.5884
## 
## Rho: 0.34999, LR test value: 615.83, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.012946
##     z-value: 27.035, p-value: < 2.22e-16
## Wald statistic: 730.9, p-value: < 2.22e-16
## 
## Log likelihood: -4187.434 for mixed model
## ML residual variance (sigma squared): 0.84186, (sigma: 0.91753)
## Number of observations: 3044 
## Number of parameters estimated: 11 
## AIC: 8396.9, (AIC for lm: 9010.7)

#SDEM (lambda and x lags)
SDEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, etype="emixed")
summary(SDEM_1)

## 
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.714501 -0.533814  0.079177  0.637080  2.310477 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                       Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept)         1.2429e+01  7.3899e-02 168.1852 < 2.2e-16
## Pop                 1.9696e-06  7.0818e-08  27.8122 < 2.2e-16
## HousingIndex        2.9041e-06  3.5404e-07   8.2028 2.220e-16
## Education.perc      4.4272e-02  3.0648e-03  14.4451 < 2.2e-16
## Dom.Migr            7.6099e-05  6.1529e-06  12.3680 < 2.2e-16
## lag.Pop             6.1167e-07  7.1543e-08   8.5497 < 2.2e-16
## lag.HousingIndex    7.2878e-07  3.7768e-07   1.9296   0.05365
## lag.Education.perc -7.5739e-03  3.3392e-03  -2.2681   0.02332
## lag.Dom.Migr        3.1340e-05  6.3166e-06   4.9616 6.993e-07
## 
## Lambda: 0.3509, LR test value: 616.25, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.012733
##     z-value: 27.558, p-value: < 2.22e-16
## Wald statistic: 759.43, p-value: < 2.22e-16
## 
## Log likelihood: -4187.227 for error model
## ML residual variance (sigma squared): 0.84135, (sigma: 0.91725)
## Number of observations: 3044 
## Number of parameters estimated: 11 
## AIC: 8396.5, (AIC for lm: 9010.7)

#SAR (rho)
SAR_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw) 
summary(SAR_1)

## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.895077 -0.539679  0.086321  0.642340  2.458343 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##                  Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    8.5184e+00 1.5549e-01 54.7829 < 2.2e-16
## Pop            1.8452e-06 6.7280e-08 27.4264 < 2.2e-16
## HousingIndex   2.0318e-06 2.6397e-07  7.6971 1.399e-14
## Education.perc 4.0013e-02 2.7275e-03 14.6704 < 2.2e-16
## Dom.Migr       7.5200e-05 5.7502e-06 13.0778 < 2.2e-16
## 
## Rho: 0.29637, LR test value: 566.44, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.011469
##     z-value: 25.841, p-value: < 2.22e-16
## Wald statistic: 667.75, p-value: < 2.22e-16
## 
## Log likelihood: -4235.303 for lag model
## ML residual variance (sigma squared): 0.89097, (sigma: 0.94391)
## Number of observations: 3044 
## Number of parameters estimated: 7 
## AIC: 8484.6, (AIC for lm: 9049)

#SLX (lags of x)
SLX_1 <- lmSLX(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SLX_1)

## 
## Call:
## lm(formula = formula(paste("y ~ ", paste(colnames(x)[-1], collapse = "+"))), 
##     data = as.data.frame(x), weights = weights)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1240 -0.5948  0.1281  0.7240  2.4385 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.238e+01  5.783e-02 214.030  < 2e-16 ***
## Pop                 2.055e-06  7.987e-08  25.735  < 2e-16 ***
## HousingIndex        2.817e-06  4.600e-07   6.125 1.02e-09 ***
## Education.perc      4.412e-02  3.611e-03  12.218  < 2e-16 ***
## Dom.Migr            8.025e-05  6.852e-06  11.712  < 2e-16 ***
## lag.Pop             5.036e-07  7.928e-08   6.352 2.44e-10 ***
## lag.HousingIndex    7.451e-07  4.772e-07   1.561    0.119    
## lag.Education.perc -5.500e-03  3.832e-03  -1.435    0.151    
## lag.Dom.Migr        3.015e-05  6.843e-06   4.406 1.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.061 on 3035 degrees of freedom
## Multiple R-squared:  0.5406, Adjusted R-squared:  0.5394 
## F-statistic: 446.4 on 8 and 3035 DF,  p-value: < 2.2e-16

#SEM (lambda)
SEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SEM_1)

## 
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -7.537158 -0.535216  0.076571  0.626528  3.317991 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                  Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    1.2368e+01 5.3042e-02 233.170 < 2.2e-16
## Pop            1.8597e-06 6.9592e-08  26.722 < 2.2e-16
## HousingIndex   3.7812e-06 3.1551e-07  11.985 < 2.2e-16
## Education.perc 4.1921e-02 3.0193e-03  13.885 < 2.2e-16
## Dom.Migr       6.8902e-05 5.9837e-06  11.515 < 2.2e-16
## 
## Lambda: 0.34815, LR test value: 579.2, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.013123
##     z-value: 26.53, p-value: < 2.22e-16
## Wald statistic: 703.87, p-value: < 2.22e-16
## 
## Log likelihood: -4228.921 for error model
## ML residual variance (sigma squared): 0.86594, (sigma: 0.93056)
## Number of observations: 3044 
## Number of parameters estimated: 7 
## AIC: 8471.8, (AIC for lm: 9049)

#OLS
OLS_1<-lm(form, data=uscounties.df)
summary(OLS_1)

## 
## Call:
## lm(formula = form, data = uscounties.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0086  -0.6060   0.1288   0.7388   2.5707 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.237e+01  5.006e-02  247.10   <2e-16 ***
## Pop            2.267e-06  7.388e-08   30.68   <2e-16 ***
## HousingIndex   3.707e-06  2.896e-07   12.80   <2e-16 ***
## Education.perc 3.923e-02  3.087e-03   12.71   <2e-16 ***
## Dom.Migr       9.036e-05  6.475e-06   13.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.069 on 3039 degrees of freedom
## Multiple R-squared:  0.5335, Adjusted R-squared:  0.5329 
## F-statistic:   869 on 4 and 3039 DF,  p-value: < 2.2e-16

Clustering

Optimal clustering using silhouette

Optimal number of clusters k=2

fviz_nbclust(coords[1:5000,], clara, method="silhouette")

Optimal clustering using silhouette

Optimal number of clusters k=2

gap<-clusGap(coords[1:5000,], FUN=clara, K.max=50, B=5)
fviz_gap_stat(gap)
fviz_gap_stat(gap, linecolor="red", maxSE=list(method="globalmax"))

9

c9<-clara(coords, 9, metric="euclidean")
fviz_cluster(c9, pch=19, geom="point")
fviz_silhouette(c9)

kaggle_income$c9_clust = c9$clustering

ggplot(kaggle_income[,c("Median", "c9_clust")], aes(x=Median))+
  geom_density()+
  facet_wrap(~c9_clust,  ncol=3)

2

c2<-clara(coords, 2, metric="euclidean", sampsize=1000)
fviz_cluster(c2, pch=19, geom="point")
fviz_silhouette(c2)

44

c44<-clara(coords, 44, metric="euclidean")
fviz_cluster(c44, pch=19, geom="point")
fviz_silhouette(c44)

kaggle_income$c44_clust = c44$clustering

ggplot(kaggle_income[,c("Median", "c44_clust")], aes(x=Median))+
  geom_density()+
  facet_wrap(~c44_clust,  ncol=8)

ggplot(kaggle_income[,c("Median", "State_Code")], aes(x=Median))+
  geom_density()+
  facet_wrap(~State_Code,  ncol=8)

Main

Krystian Andruszek, Ewa Sobolewska

20/06/2020

START

Points plot on map

median

mean

std dev

Moran’s I

Median

Mean

Std dev

Distance

Interactions

Median

Mean

Std dev

Model quality plots

Median

Multinomial

Power

Exponential

Mean

Multinomial

Power

Exponential

Standard Deviation

Multinomial

Power

Exponential

Additional variables

US counties shapefile

Personal income per county

House Index

Education

Population

Maps of variables

Housing Index

Income

Education

Population

Domestic migration

Spatial weights matrix

KNN

Clustering

9

2

44