# install.packages('texreg')
library(spdep)
library(rgdal)
library(maptools)
library(sp)
library(RColorBrewer)
library(classInt)
library(GISTools)
library(maps)
library(viridis)
library(corrplot)
library(texreg)
library(ggplot2)
library(gridExtra)
library(readr)
library(cluster)
library(factoextra)
library(stringr)
kaggle_income <- read.csv("kaggle_income.csv")
usmap <- readOGR(".", "cb_2018_us_state_500k")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "cb_2018_us_state_500k"
## with 56 features
## It has 9 fields
## Integer64 fields read as strings: ALAND AWATER
usmap <- spTransform(usmap, CRS("+proj=longlat +datum=NAD83"))
usmap = usmap[!(usmap$NAME %in% c("Puerto Rico", "American Samoa", "United States Virgin Islands" , "Alaska", "Hawaii", "Guam", "Commonwealth of the Northern Mariana Islands")),]
kaggle_income[,"Lon"] %>% min()
## [1] -175.86
kaggle_income[,"Lon"] %>% max()
## [1] -65.50082
kaggle_income[,"Lat"] %>% min()
## [1] 17.92909
kaggle_income[,"Lat"] %>% max()
## [1] 71.2535
kaggle_income = kaggle_income[(kaggle_income$Lon>-125)&(kaggle_income$Lon<-66)&(kaggle_income$Lat>24)&(kaggle_income$Lat<50),]
coords = kaggle_income[,c("Lon", "Lat")]
coords.m = as.matrix(coords)
ggp1 = ggplot(data=kaggle_income, aes(x=Mean))+geom_density(color="coral3", fill="coral", alpha=0.3) + theme_light()
ggp2 = ggplot(data=kaggle_income, aes(x=Median))+geom_density(color="darkcyan", fill="cyan3", alpha=0.3) + theme_light()
ggp3 = ggplot(data=kaggle_income, aes(x=Stdev))+geom_density(color="chartreuse4", fill="chartreuse3", alpha=0.3) + theme_light()
grid.arrange(ggp1, ggp2, ggp3, ncol = 3)
variable <- kaggle_income$Median
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
col=cols[findInterval(variable, brks)])
variable <- kaggle_income$Mean
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
col=cols[findInterval(variable, brks)])
variable <- kaggle_income$Stdev
brks <- stats::quantile(variable, probs=seq(0, 1, 0.12))
cols = brewer.pal(9, "Blues")
plot(usmap)
points(coords, pch=19, cex=0.5,
col=cols[findInterval(variable, brks)])
inc.knn<-knearneigh(coords.m, k=1) # knn class (k=1)
## Warning in knearneigh(coords.m, k = 1): knearneigh: identical points found
inc.knn.nb<-knn2nb(inc.knn)
# checking for matrix symmetry
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: FALSE"
inc.knn.nb<-make.sym.nb(inc.knn.nb)
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: TRUE"
# knn as listw class
inc.knn.listw<-nb2listw(inc.knn.nb)
All tests are significant, with a low p-value that allows us to reject the null hypothesis and assume there is a spatial autocorrelation. The value of the Morans I statistic is positive signifying a positive
moran.test(kaggle_income$Median, inc.knn.listw)
##
## Moran I test under randomisation
##
## data: kaggle_income$Median
## weights: inc.knn.listw
##
## Moran I statistic standard deviate = 24.726, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 1.693882e-01 -3.134600e-05 4.694837e-05
x<-kaggle_income$Median
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
summary(morlm)
##
## Call:
## lm(formula = wzx ~ zx$V1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.37462 -0.49356 -0.32937 0.00814 2.61268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.015652 0.004954 -3.16 0.00158 **
## zx$V1 0.169388 0.004954 34.20 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8848 on 31901 degrees of freedom
## Multiple R-squared: 0.03536, Adjusted R-squared: 0.03533
## F-statistic: 1169 on 1 and 31901 DF, p-value: < 2.2e-16
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="coral3")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)
moran.test(kaggle_income$Mean, inc.knn.listw)
##
## Moran I test under randomisation
##
## data: kaggle_income$Mean
## weights: inc.knn.listw
##
## Moran I statistic standard deviate = 85.088, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 5.829837e-01 -3.134600e-05 4.694889e-05
x<-kaggle_income$Mean
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
summary(morlm)
##
## Call:
## lm(formula = wzx ~ zx$V1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5850 -0.4375 -0.0744 0.3741 7.1161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.016918 0.004112 -4.114 3.9e-05 ***
## zx$V1 0.582984 0.004112 141.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7345 on 31901 degrees of freedom
## Multiple R-squared: 0.3865, Adjusted R-squared: 0.3865
## F-statistic: 2.01e+04 on 1 and 31901 DF, p-value: < 2.2e-16
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="darkcyan")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)
moran.test(kaggle_income$Stdev, inc.knn.listw)
##
## Moran I test under randomisation
##
## data: kaggle_income$Stdev
## weights: inc.knn.listw
##
## Moran I statistic standard deviate = 67.374, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 4.616191e-01 -3.134600e-05 4.695085e-05
x<-kaggle_income$Stdev
zx<-as.data.frame(scale(x))
wzx<-lag.listw(inc.knn.listw, zx$V1)
morlm<-lm(wzx~zx$V1)
summary(morlm)
##
## Call:
## lm(formula = wzx ~ zx$V1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0390 -0.5030 -0.0140 0.5099 3.9653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.010419 0.004533 -2.299 0.0215 *
## zx$V1 0.461619 0.004533 101.845 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8096 on 31901 degrees of freedom
## Multiple R-squared: 0.2454, Adjusted R-squared: 0.2453
## F-statistic: 1.037e+04 on 1 and 31901 DF, p-value: < 2.2e-16
slope<-morlm$coefficients[2]
intercept<-morlm$coefficients[1]
plot(zx$V1, wzx, xlab="zx",ylab="spatial lag of zx", pch=".", col="chartreuse4")
abline(intercept, slope)
abline(h=0, lty=2)
abline(v=0, lty=2)
# Select capital cities
state.capital.cities <- us.cities[us.cities$capital==2,c("name", "pop", "lat", "long")]
state.capital.cities <- state.capital.cities[state.capital.cities$name != "Honolulu HI",]
state.capital.cities
## name pop lat long
## 5 Albany NY 93576 42.67 -73.80
## 26 Annapolis MD 36300 38.98 -76.49
## 41 Atlanta GA 424096 33.76 -84.42
## 46 Augusta ME 18626 44.32 -69.77
## 50 Austin TX 683404 30.31 -97.75
## 58 Baton Rouge LA 222217 30.45 -91.13
## 84 Bismarck ND 56927 46.81 -100.77
## 94 Boise ID 193628 43.61 -116.23
## 98 Boston MA 567759 42.34 -71.02
## 144 Carson City NV 58350 39.15 -119.74
## 165 Charleston WV 49804 38.35 -81.63
## 172 Cheyenne WY 55833 41.15 -104.79
## 196 Columbia SC 118020 34.04 -80.89
## 199 Columbus OH 741677 39.99 -82.99
## 203 Concord NH 42967 43.23 -71.56
## 247 Denver CO 556575 39.77 -104.87
## 248 Des Moines IA 192050 41.58 -93.62
## 253 Dover DE 34288 39.16 -75.53
## 332 Frankfort KY 27210 38.20 -84.86
## 385 Harrisburg PA 47576 40.28 -76.88
## 387 Hartford CT 123836 41.77 -72.68
## 392 Helena MT 27383 46.60 -112.03
## 422 Indianapolis IN 771725 39.78 -86.15
## 430 Jackson MS 175085 32.32 -90.21
## 435 Jefferson City MO 39062 38.57 -92.19
## 441 Juneau AK 31187 58.30 -134.42
## 485 Lansing MI 117236 42.71 -84.55
## 507 Lincoln NE 245301 40.82 -96.69
## 509 Little Rock AR 184323 34.72 -92.35
## 536 Madison WI 227642 43.08 -89.39
## 583 Montgomery AL 197653 32.35 -86.28
## 585 Montpelier VT 8003 44.26 -72.57
## 601 Nashville TN 523547 36.17 -86.78
## 647 Oklahoma City OK 538141 35.47 -97.51
## 649 Olympia WA 45403 47.04 -122.89
## 694 Phoenix AZ 1450884 33.54 -112.07
## 696 Pierre SD 14052 44.37 -100.34
## 726 Providence RI 178295 41.82 -71.42
## 732 Raleigh NC 350822 35.82 -78.66
## 754 Richmond VA 189498 37.53 -77.47
## 778 Sacramento CA 480392 38.57 -121.47
## 787 Saint Paul MN 272469 44.95 -93.10
## 791 Salem OR 148942 44.92 -123.02
## 794 Salt Lake City UT 177318 40.78 -111.93
## 821 Santa Fe NM 70598 35.68 -105.95
## 863 Springfield IL 116725 39.78 -89.64
## 886 Tallahassee FL 154114 30.46 -84.28
## 910 Topeka KS 121229 39.04 -95.69
## 915 Trenton NJ 84653 40.22 -74.76
coords <- kaggle_income[,c("Lon", "Lat")]
coords <- as.matrix(coords)
for (city.name in state.capital.cities$name) {
col.city.name = print(paste0("dist_",city.name))
lat = state.capital.cities[state.capital.cities$name==city.name,]$lat
long = state.capital.cities[state.capital.cities$name==city.name,]$long
kaggle_income[col.city.name]<-spDistsN1(coords, c(long, lat), longlat=TRUE)
}
## [1] "dist_Albany NY"
## [1] "dist_Annapolis MD"
## [1] "dist_Atlanta GA"
## [1] "dist_Augusta ME"
## [1] "dist_Austin TX"
## [1] "dist_Baton Rouge LA"
## [1] "dist_Bismarck ND"
## [1] "dist_Boise ID"
## [1] "dist_Boston MA"
## [1] "dist_Carson City NV"
## [1] "dist_Charleston WV"
## [1] "dist_Cheyenne WY"
## [1] "dist_Columbia SC"
## [1] "dist_Columbus OH"
## [1] "dist_Concord NH"
## [1] "dist_Denver CO"
## [1] "dist_Des Moines IA"
## [1] "dist_Dover DE"
## [1] "dist_Frankfort KY"
## [1] "dist_Harrisburg PA"
## [1] "dist_Hartford CT"
## [1] "dist_Helena MT"
## [1] "dist_Indianapolis IN"
## [1] "dist_Jackson MS"
## [1] "dist_Jefferson City MO"
## [1] "dist_Juneau AK"
## [1] "dist_Lansing MI"
## [1] "dist_Lincoln NE"
## [1] "dist_Little Rock AR"
## [1] "dist_Madison WI"
## [1] "dist_Montgomery AL"
## [1] "dist_Montpelier VT"
## [1] "dist_Nashville TN"
## [1] "dist_Oklahoma City OK"
## [1] "dist_Olympia WA"
## [1] "dist_Phoenix AZ"
## [1] "dist_Pierre SD"
## [1] "dist_Providence RI"
## [1] "dist_Raleigh NC"
## [1] "dist_Richmond VA"
## [1] "dist_Sacramento CA"
## [1] "dist_Saint Paul MN"
## [1] "dist_Salem OR"
## [1] "dist_Salt Lake City UT"
## [1] "dist_Santa Fe NM"
## [1] "dist_Springfield IL"
## [1] "dist_Tallahassee FL"
## [1] "dist_Topeka KS"
## [1] "dist_Trenton NJ"
kaggle_income$mindist <- NULL
kaggle_income$mindist = apply(kaggle_income[,20:length(colnames(kaggle_income))], 1, min)
cols = rev(brewer.pal(9, "Blues"))
mindist <- kaggle_income$mindist
brks <- stats::quantile(mindist, probs=seq(0, 1, 0.13))
plot(usmap)
points(kaggle_income[,c("Lon", "Lat")], col=cols[findInterval(mindist, brks)], pch=21, cex=0.7,
bg=cols[findInterval(mindist, brks)])
legend("bottomleft", legend=round(brks,3), fill=cols, cex=0.8, bty="n")
title(main="Distances from closest state capital")
kaggle_income_s = dplyr::sample_n(kaggle_income, 3000)
coords.s = kaggle_income_s[,c("Lon", "Lat")]
coords.m.s = as.matrix(coords.s)
inc.knn<-knearneigh(coords.m.s, k=1) # knn class (k=1)
## Warning in knearneigh(coords.m.s, k = 1): knearneigh: identical points found
inc.knn.nb<-knn2nb(inc.knn)
# checking for matrix symmetry
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: FALSE"
inc.knn.nb<-make.sym.nb(inc.knn.nb)
print(paste0("Is matrix symmetric: ", is.symmetric.nb(inc.knn.nb)))
## [1] "Is matrix symmetric: TRUE"
# knn as listw class
inc.knn.listw.s<-nb2listw(inc.knn.nb)
kaggle_income_s$Median[kaggle_income_s$Median==0]<-2 # eliminating -inf for log(0)
#multinomial
eq.m = kaggle_income_s$Median~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.med<-glm(eq.m)
spatial.multi.med<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Warning: Function errorsarlm moved to the spatialreg package
## Registered S3 methods overwritten by 'spatialreg':
## method from
## residuals.stsls spdep
## deviance.stsls spdep
## coef.stsls spdep
## print.stsls spdep
## summary.stsls spdep
## print.summary.stsls spdep
## residuals.gmsar spdep
## deviance.gmsar spdep
## coef.gmsar spdep
## fitted.gmsar spdep
## print.gmsar spdep
## summary.gmsar spdep
## print.summary.gmsar spdep
## print.lagmess spdep
## summary.lagmess spdep
## print.summary.lagmess spdep
## residuals.lagmess spdep
## deviance.lagmess spdep
## coef.lagmess spdep
## fitted.lagmess spdep
## logLik.lagmess spdep
## fitted.SFResult spdep
## print.SFResult spdep
## fitted.ME_res spdep
## print.ME_res spdep
## print.lagImpact spdep
## plot.lagImpact spdep
## summary.lagImpact spdep
## HPDinterval.lagImpact spdep
## print.summary.lagImpact spdep
## print.sarlm spdep
## summary.sarlm spdep
## residuals.sarlm spdep
## deviance.sarlm spdep
## coef.sarlm spdep
## vcov.sarlm spdep
## fitted.sarlm spdep
## logLik.sarlm spdep
## anova.sarlm spdep
## predict.sarlm spdep
## print.summary.sarlm spdep
## print.sarlm.pred spdep
## as.data.frame.sarlm.pred spdep
## residuals.spautolm spdep
## deviance.spautolm spdep
## coef.spautolm spdep
## fitted.spautolm spdep
## print.spautolm spdep
## summary.spautolm spdep
## logLik.spautolm spdep
## print.summary.spautolm spdep
## print.WXImpact spdep
## summary.WXImpact spdep
## print.summary.WXImpact spdep
## predict.SLX spdep
# power
eq.p = log(kaggle_income_s$Median)~log(kaggle_income_s$mindist)
OLS.power.med<-lm(eq.p)
spatial.power.med<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
# exponential
eq.e = log(kaggle_income_s$Median)~kaggle_income_s$mindist
OLS.exp.med<-glm(eq.e)
spatial.exp.med<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
screenreg(list(OLS.multi.med, spatial.multi.med, OLS.power.med, spatial.power.med, OLS.exp.med, spatial.exp.med))
##
## =========================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## -------------------------------------------------------------------------------------------------------------------------
## (Intercept) 107441.01 *** 107765.32 *** 11.21 *** 11.21 *** 10.99 *** 10.99 ***
## (5512.62) (5904.13) (0.10) (0.10) (0.03) (0.04)
## kaggle_income_s$mindist -158.25 -166.34 -0.00 * -0.00 *
## (132.84) (141.98) (0.00) (0.00)
## kaggle_income_s$mindist^2 -0.77 -0.71
## (0.94) (1.01)
## kaggle_income_s$mindist^3 0.00 0.00
## (0.00) (0.00)
## kaggle_income_s$mindist^4 -0.00 * -0.00 *
## (0.00) (0.00)
## lambda 0.08 *** 0.06 *** 0.06 ***
## (0.01) (0.02) (0.02)
## log(kaggle_income_s$mindist) -0.06 ** -0.06 **
## (0.02) (0.02)
## -------------------------------------------------------------------------------------------------------------------------
## AIC 76768.66 9645.91
## BIC 76804.70 9663.93
## Log Likelihood -38378.33 -38365.51 -4811.75 -4819.95 -4812.68
## Deviance 22714919906323.84 4366.83
## Num. obs. 3000 3000 3000 3000 3000 3000
## Parameters 7 4 4
## AIC (Linear model) 76768.66 9643.65 9645.91
## AIC (Spatial model) 76745.01 9631.50 9633.37
## LR test: statistic 25.65 14.15 14.54
## LR test: p-value 0.00 0.00 0.00
## R^2 0.00
## Adj. R^2 0.00
## =========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Spatial models are doing better than a-spatial
AIC.all<-matrix(0)
models<-c("spatial.multi.med","spatial.power.med","spatial.exp.med")
for(i in 1:3){
AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 76768.658 76745.012 9643.649 9631.502 9645.906 9633.368
Best SRMSE is for multinomial model
SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
models<-c("OLS.multi.med", "spatial.multi.med", "OLS.power.med", "spatial.power.med", "OLS.exp.med", "spatial.exp.med")
SRMSE.all<-matrix(0)
for(i in 1:6){
SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Median")}
SRMSE.all
## [1] 1.013628 1.007353 1.430255 1.430255 1.430255 1.430255
kaggle_income_s$Mean[kaggle_income_s$Mean==0]<-2 # eliminating -inf for log(0)
#multinomial
eq.m = kaggle_income_s$Mean~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.mean<-glm(eq.m)
spatial.multi.mean<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Warning: Function errorsarlm moved to the spatialreg package
# power
eq.p = log(kaggle_income_s$Mean)~log(kaggle_income_s$mindist)
OLS.power.mean<-lm(eq.p)
spatial.power.mean<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
# exponential
eq.e = log(kaggle_income_s$Mean)~kaggle_income_s$mindist
OLS.exp.mean<-glm(eq.e)
spatial.exp.mean<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
screenreg(list(OLS.multi.mean, spatial.multi.mean, OLS.power.mean, spatial.power.mean, OLS.exp.mean, spatial.exp.mean))
##
## ========================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## ------------------------------------------------------------------------------------------------------------------------
## (Intercept) 73979.98 *** 75215.32 *** 11.12 *** 11.12 *** 11.00 *** 11.00 ***
## (1925.86) (2415.57) (0.08) (0.09) (0.03) (0.03)
## kaggle_income_s$mindist 39.62 8.03 -0.00 * -0.00 *
## (46.41) (57.51) (0.00) (0.00)
## kaggle_income_s$mindist^2 -1.15 *** -0.91 *
## (0.33) (0.41)
## kaggle_income_s$mindist^3 0.00 *** 0.00 **
## (0.00) (0.00)
## kaggle_income_s$mindist^4 -0.00 *** -0.00 **
## (0.00) (0.00)
## lambda 0.29 *** 0.05 ** 0.05 **
## (0.01) (0.02) (0.02)
## log(kaggle_income_s$mindist) -0.04 * -0.04 *
## (0.02) (0.02)
## ------------------------------------------------------------------------------------------------------------------------
## AIC 70458.64 8632.45
## BIC 70494.68 8650.47
## Log Likelihood -35223.32 -35029.64 -4308.64 -4313.22 -4308.39
## Deviance 2772319618034.09 3114.97
## Num. obs. 3000 3000 3000 3000 3000 3000
## Parameters 7 4 4
## AIC (Linear model) 70458.64 8632.86 8632.45
## AIC (Spatial model) 70073.27 8625.28 8624.78
## LR test: statistic 387.37 9.58 9.67
## LR test: p-value 0.00 0.00 0.00
## R^2 0.00
## Adj. R^2 0.00
## ========================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Spatial models are doing better than a-spatial
AIC.all<-matrix(0)
models<-c("spatial.multi.mean","spatial.power.mean","spatial.exp.mean")
for(i in 1:3){
AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 70458.645 70073.270 8632.858 8625.278 8632.449 8624.783
Best SRMSE is for multinomial model
SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
models<-c("OLS.multi.mean", "spatial.multi.mean", "OLS.power.mean", "spatial.power.mean", "OLS.exp.mean", "spatial.exp.mean")
SRMSE.all<-matrix(0)
for(i in 1:6){
SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.4506187 0.4102389 1.1003455 1.1003454 1.1003455 1.1003454
kaggle_income_s$Stdev[kaggle_income_s$Stdev==0]<-2 # eliminating -inf for log(0)
#multinomial
eq.m = kaggle_income_s$Stdev~kaggle_income_s$mindist+I(kaggle_income_s$mindist^2)+ I(kaggle_income_s$mindist^3)+ I(kaggle_income_s$mindist^4)
OLS.multi.std<-glm(eq.m)
spatial.multi.std<-errorsarlm(eq.m, data=kaggle_income_s, inc.knn.listw.s, tol.solve=2e-40)
## Warning: Function errorsarlm moved to the spatialreg package
# power
eq.p = log(kaggle_income_s$Stdev)~log(kaggle_income_s$mindist)
OLS.power.std<-lm(eq.p)
spatial.power.std<-errorsarlm(eq.p, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
# exponential
eq.e = log(kaggle_income_s$Stdev)~kaggle_income_s$mindist
OLS.exp.std<-glm(eq.e)
spatial.exp.std<-errorsarlm(eq.e, data=kaggle_income_s, inc.knn.listw.s)
## Warning: Function errorsarlm moved to the spatialreg package
screenreg(list(OLS.multi.std, spatial.multi.std, OLS.power.std, spatial.power.std, OLS.exp.std, spatial.exp.std))
##
## =======================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## -----------------------------------------------------------------------------------------------------------------------
## (Intercept) 49176.30 *** 49251.10 *** 10.76 *** 10.76 *** 10.68 *** 10.68 ***
## (1026.16) (1218.46) (0.08) (0.08) (0.03) (0.03)
## kaggle_income_s$mindist 55.30 * 51.62 -0.00 * -0.00 *
## (24.73) (29.15) (0.00) (0.00)
## kaggle_income_s$mindist^2 -0.76 *** -0.72 ***
## (0.18) (0.21)
## kaggle_income_s$mindist^3 0.00 *** 0.00 ***
## (0.00) (0.00)
## kaggle_income_s$mindist^4 -0.00 *** -0.00 ***
## (0.00) (0.00)
## lambda 0.21 *** 0.02 0.02
## (0.01) (0.02) (0.02)
## log(kaggle_income_s$mindist) -0.03 -0.03
## (0.02) (0.02)
## -----------------------------------------------------------------------------------------------------------------------
## AIC 66681.35 8305.33
## BIC 66717.39 8323.35
## Log Likelihood -33334.67 -33238.47 -4149.63 -4149.67 -4148.82
## Deviance 787088200271.66 2793.18
## Num. obs. 3000 3000 3000 3000 3000 3000
## Parameters 7 4 4
## AIC (Linear model) 66681.35 8306.98 8305.33
## AIC (Spatial model) 66490.93 8307.26 8305.63
## LR test: statistic 192.42 1.72 1.70
## LR test: p-value 0.00 0.19 0.19
## R^2 0.00
## Adj. R^2 0.00
## =======================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Spatial models are doing better than a-spatial
AIC.all<-matrix(0)
models<-c("spatial.multi.std","spatial.power.std","spatial.exp.std")
for(i in 1:3){
AIC.all[2*i-1]<-get(models[i])$AIC_lm.model
AIC.all[2*i]<-AIC(get(models[i]))
}
AIC.all
## [1] 66681.347 66490.930 8306.981 8307.262 8305.335 8305.632
Best SRMSE is for multinomial model
SRMSE<-function(model, dataset, variable) { sqrt(sum((model$fitted.values-dataset[,variable])^2)/ dim(dataset)[1]) / (mean(dataset[,variable]))}
models<-c("OLS.multi.std", "spatial.multi.std", "OLS.power.std", "spatial.power.std", "OLS.exp.std", "spatial.exp.std")
SRMSE.all<-matrix(0)
for(i in 1:6){
SRMSE.all[i]<-SRMSE(get(models[i]), kaggle_income_s, "Mean")}
SRMSE.all
## [1] 0.5407556 0.5283016 1.1003497 1.1003497 1.1003497 1.1003497
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, OLS.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")
plot(kaggle_income_s$mindist, kaggle_income_s$Median)
points(kaggle_income_s$mindist, spatial.multi.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")
par(mfrow=c(1,2))
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Median))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, OLS.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")
plot(kaggle_income_s$mindist, log(kaggle_income_s$Median))
points(kaggle_income_s$mindist, spatial.exp.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, OLS.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")
plot(kaggle_income_s$mindist, kaggle_income_s$Mean)
points(kaggle_income_s$mindist, spatial.multi.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")
par(mfrow=c(1,2))
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), OLS.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Mean))
points(log(kaggle_income_s$mindist), spatial.power.med$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, OLS.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")
plot(kaggle_income_s$mindist, log(kaggle_income_s$Mean))
points(kaggle_income_s$mindist, spatial.exp.mean$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, OLS.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, multinominal, fitted values")
plot(kaggle_income_s$mindist, kaggle_income_s$Stdev)
points(kaggle_income_s$mindist, spatial.multi.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, multinominal, fitted values")
par(mfrow=c(1,2))
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), OLS.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, power, fitted values")
plot(log(kaggle_income_s$mindist), log(kaggle_income_s$Stdev))
points(log(kaggle_income_s$mindist), spatial.power.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, power, fitted values")
par(mfrow=c(1,2))
plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, OLS.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="OLS, exponential, fitted values")
plot(kaggle_income_s$mindist, log(kaggle_income_s$Stdev))
points(kaggle_income_s$mindist, spatial.exp.std$fitted.values, col="red")
abline(h=1, lty=3)
title(main="spatial, exponential, fitted values")
cost of living, homeless, house index, crime (types) per county/zip code
bea CAINC1: Annual Personal Income by County https://apps.bea.gov/regional/downloadzip.cfm
Housing Median Home Prices https://www.nar.realtor/research-and-statistics/housing-statistics/county-median-home-prices-and-monthly-mortgage-payment
zero_pad <- function(x) { ifelse(str_starts(x, "0"), x, paste0("0",x))}
uscounties <- readOGR(".", "tl_2017_us_county")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\esobolewska\Documents\Spatial_econometrics\data", layer: "tl_2017_us_county"
## with 3233 features
## It has 17 fields
## Integer64 fields read as strings: ALAND AWATER
uscounties <- spTransform(uscounties, CRS("+proj=longlat +datum=NAD83"))
uscounties$GEOID = as.character(uscounties$GEOID)
uscounties$GEOID0 = uscounties$GEOID %>% sapply(zero_pad)
income_county <- read.csv("CAINC1__ALL_AREAS_1969_2018.csv", stringsAsFactors=FALSE)
income_county <- income_county[,c("GeoFIPS", "GeoName", "Description", "LineCode", "Unit","X2018")]
income_county <- income_county[1:(dim(income_county)[1]-3),]
income_county <- income_county[income_county$Description=="Personal income (thousands of dollars)",]
colnames(income_county) = c("FIPS", "County", "Description", "LineCode", "Unit","Income")
income_county = income_county[!as.character(income_county$FIPS) %>% sapply(str_trim) %>% sapply(str_ends, "000"),]
income_county$FIPS = as.character(income_county$FIPS) %>% sapply(str_trim)
income_county$Income = as.integer(income_county$Income)
## Warning: NAs introduced by coercion
income_county$FIPS0 = income_county$FIPS %>% sapply(zero_pad)
house_index <- read.csv2("HI_data.csv", stringsAsFactors=FALSE)
colnames(house_index) = c("FIPS", "Price.RangeQ42019", "Geography", "HousingIndex" )
house_index$FIPS <- paste0("0",house_index$FIPS)
house_index$Geography = house_index$Geography %>% sapply(str_replace, "De Witt", "DeWitt")
house_index$FIPS0 = house_index$FIPS %>% sapply(zero_pad)
education_county <- read.csv2("Education.csv", stringsAsFactors=FALSE)
colnames(education_county) = c("FIPS", "State", "Area.name", "Education.perc")
education_county$Education.perc = as.numeric(education_county$Education.perc)
education_county = education_county[!is.na(education_county$Education.perc),]
education_county$FIPS0 <- paste0("0",education_county$FIPS)
population_county <- read.csv2("Pop.csv", stringsAsFactors=FALSE)
colnames(population_county) = c("FIPS", "State", "Area.name", "Econ.Typology", "Dom.Migr", "Pop")
population_county$Dom.Migr = as.numeric(population_county$Dom.Migr)
population_county$Pop = as.numeric(population_county$Pop)
population_county = population_county[!is.na(population_county$Dom.Migr),]
population_county = population_county[!is.na(population_county$Pop),]
population_county$FIPS0 <- paste0("0",population_county$FIPS)
uscounties = merge(uscounties, house_index, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, income_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, population_county, by.x="GEOID0", by.y="FIPS0")
uscounties = merge(uscounties, education_county, by.x="GEOID0", by.y="FIPS0")
uscounties = uscounties[!uscounties$STATEFP %in% c("15", "02"),]
uscounties = uscounties[!is.na(uscounties$Education.perc),]
uscounties = uscounties[!is.na(uscounties$HousingIndex),]
uscounties = uscounties[!is.na(uscounties$Income),]
uscounties = uscounties[!is.na(uscounties$Econ.Typology),]
uscounties = uscounties[!is.na(uscounties$Dom.Migr),]
uscounties = uscounties[!is.na(uscounties$Pop),]
variable<-uscounties$HousingIndex
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
variable<-uscounties$Income
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
variable<-uscounties$Education.perc
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
variable<-uscounties$Pop
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
variable<-uscounties$Dom.Migr
shades<-auto.shading(variable, n=6, cols=brewer.pal(9, "Blues")[4:9])
choropleth(uscounties, variable, shading=shades)
uscounties.df = as.data.frame(uscounties)
crds<-coordinates(uscounties)
# calculate weights matrix
us.knn<-knearneigh(crds, k=1)
us.knn.nb<-knn2nb(us.knn)
us.sym.knn.nb<-make.sym.nb(us.knn.nb)
us.sym.knn.listw<-nb2listw(us.sym.knn.nb)
M<-cor(uscounties.df[,c("Dom.Migr","HousingIndex", "Pop", "Education.perc")])
corrplot(M)
form = log(Income)~Pop+HousingIndex+Education.perc+Dom.Migr
# GNS, Manski model (all)
GNS_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="sacmixed", tol.solve=1e-20)
## Warning: Function sacsarlm moved to the spatialreg package
summary(GNS_1)
##
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw,
## listw2 = listw2, na.action = na.action, Durbin = Durbin,
## type = type, method = method, quiet = quiet, zero.policy = zero.policy,
## tol.solve = tol.solve, llprof = llprof, interval1 = interval1,
## interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.219885 -0.430513 0.051912 0.502844 2.036685
##
## Type: sacmixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.7660e+01 3.0450e-01 57.9962 < 2.2e-16
## Pop 2.0950e-06 7.5370e-08 27.7968 < 2.2e-16
## HousingIndex 3.1966e-06 3.3315e-07 9.5950 < 2.2e-16
## Education.perc 4.2094e-02 3.1613e-03 13.3155 < 2.2e-16
## Dom.Migr 7.9429e-05 6.5624e-06 12.1037 < 2.2e-16
## lag.Pop 1.3480e-06 8.9170e-08 15.1171 < 2.2e-16
## lag.HousingIndex 2.0255e-06 3.7899e-07 5.3445 9.068e-08
## lag.Education.perc 1.1461e-02 3.6973e-03 3.1000 0.001936
## lag.Dom.Migr 5.6074e-05 7.2591e-06 7.7246 1.132e-14
##
## Rho: -0.42174
## Asymptotic standard error: 0.023213
## z-value: -18.168, p-value: < 2.22e-16
## Lambda: 0.66235
## Asymptotic standard error: 0.015754
## z-value: 42.044, p-value: < 2.22e-16
##
## LR test value: 774.27, p-value: < 2.22e-16
##
## Log likelihood: -4131.385 for sacmixed model
## ML residual variance (sigma squared): 0.53705, (sigma: 0.73284)
## Number of observations: 3044
## Number of parameters estimated: 12
## AIC: 8286.8, (AIC for lm: 9049)
# SAC (rho and lambda)
SAC_1<-sacsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAC_1)
##
## Call:spatialreg::sacsarlm(formula = formula, data = data, listw = listw,
## listw2 = listw2, na.action = na.action, Durbin = Durbin,
## type = type, method = method, quiet = quiet, zero.policy = zero.policy,
## tol.solve = tol.solve, llprof = llprof, interval1 = interval1,
## interval2 = interval2, trs1 = trs1, trs2 = trs2, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.061383 -0.544972 0.086083 0.641902 2.402963
##
## Type: sac
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0567e+01 2.9889e-01 35.3552 < 2.2e-16
## Pop 1.9255e-06 7.0171e-08 27.4405 < 2.2e-16
## HousingIndex 2.9650e-06 3.2724e-07 9.0606 < 2.2e-16
## Education.perc 4.3327e-02 2.9820e-03 14.5298 < 2.2e-16
## Dom.Migr 7.4546e-05 6.0359e-06 12.3504 < 2.2e-16
##
## Rho: 0.1346
## Asymptotic standard error: 0.02219
## z-value: 6.0659, p-value: 1.3119e-09
## Lambda: 0.21442
## Asymptotic standard error: 0.025894
## z-value: 8.2806, p-value: 2.2204e-16
##
## LR test value: 601.74, p-value: < 2.22e-16
##
## Log likelihood: -4217.654 for sac model
## ML residual variance (sigma squared): 0.89609, (sigma: 0.94662)
## Number of observations: 3044
## Number of parameters estimated: 8
## AIC: 8451.3, (AIC for lm: 9049)
# SDM (rho and x lags)
SDM_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, type="mixed")
summary(SDM_1)
##
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, type = type, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.755311 -0.530582 0.081613 0.634963 2.330427
##
## Type: mixed
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.0787e+00 1.6731e-01 48.2870 < 2.2e-16
## Pop 1.8141e-06 7.3142e-08 24.8021 < 2.2e-16
## HousingIndex 2.7254e-06 4.3920e-07 6.2054 5.455e-10
## Education.perc 4.6333e-02 3.1864e-03 14.5407 < 2.2e-16
## Dom.Migr 6.8381e-05 6.2857e-06 10.8788 < 2.2e-16
## lag.Pop -9.6857e-08 9.8321e-08 -0.9851 0.3246
## lag.HousingIndex -3.8078e-07 4.9309e-07 -0.7722 0.4400
## lag.Education.perc -2.2539e-02 3.5741e-03 -6.3063 2.858e-10
## lag.Dom.Migr 4.3336e-06 8.0089e-06 0.5411 0.5884
##
## Rho: 0.34999, LR test value: 615.83, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.012946
## z-value: 27.035, p-value: < 2.22e-16
## Wald statistic: 730.9, p-value: < 2.22e-16
##
## Log likelihood: -4187.434 for mixed model
## ML residual variance (sigma squared): 0.84186, (sigma: 0.91753)
## Number of observations: 3044
## Number of parameters estimated: 11
## AIC: 8396.9, (AIC for lm: 9010.7)
#SDEM (lambda and x lags)
SDEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw, etype="emixed")
summary(SDEM_1)
##
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, etype = etype, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.714501 -0.533814 0.079177 0.637080 2.310477
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2429e+01 7.3899e-02 168.1852 < 2.2e-16
## Pop 1.9696e-06 7.0818e-08 27.8122 < 2.2e-16
## HousingIndex 2.9041e-06 3.5404e-07 8.2028 2.220e-16
## Education.perc 4.4272e-02 3.0648e-03 14.4451 < 2.2e-16
## Dom.Migr 7.6099e-05 6.1529e-06 12.3680 < 2.2e-16
## lag.Pop 6.1167e-07 7.1543e-08 8.5497 < 2.2e-16
## lag.HousingIndex 7.2878e-07 3.7768e-07 1.9296 0.05365
## lag.Education.perc -7.5739e-03 3.3392e-03 -2.2681 0.02332
## lag.Dom.Migr 3.1340e-05 6.3166e-06 4.9616 6.993e-07
##
## Lambda: 0.3509, LR test value: 616.25, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.012733
## z-value: 27.558, p-value: < 2.22e-16
## Wald statistic: 759.43, p-value: < 2.22e-16
##
## Log likelihood: -4187.227 for error model
## ML residual variance (sigma squared): 0.84135, (sigma: 0.91725)
## Number of observations: 3044
## Number of parameters estimated: 11
## AIC: 8396.5, (AIC for lm: 9010.7)
#SAR (rho)
SAR_1 <- lagsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SAR_1)
##
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, type = type, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.895077 -0.539679 0.086321 0.642340 2.458343
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.5184e+00 1.5549e-01 54.7829 < 2.2e-16
## Pop 1.8452e-06 6.7280e-08 27.4264 < 2.2e-16
## HousingIndex 2.0318e-06 2.6397e-07 7.6971 1.399e-14
## Education.perc 4.0013e-02 2.7275e-03 14.6704 < 2.2e-16
## Dom.Migr 7.5200e-05 5.7502e-06 13.0778 < 2.2e-16
##
## Rho: 0.29637, LR test value: 566.44, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.011469
## z-value: 25.841, p-value: < 2.22e-16
## Wald statistic: 667.75, p-value: < 2.22e-16
##
## Log likelihood: -4235.303 for lag model
## ML residual variance (sigma squared): 0.89097, (sigma: 0.94391)
## Number of observations: 3044
## Number of parameters estimated: 7
## AIC: 8484.6, (AIC for lm: 9049)
#SLX (lags of x)
SLX_1 <- lmSLX(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SLX_1)
##
## Call:
## lm(formula = formula(paste("y ~ ", paste(colnames(x)[-1], collapse = "+"))),
## data = as.data.frame(x), weights = weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1240 -0.5948 0.1281 0.7240 2.4385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.238e+01 5.783e-02 214.030 < 2e-16 ***
## Pop 2.055e-06 7.987e-08 25.735 < 2e-16 ***
## HousingIndex 2.817e-06 4.600e-07 6.125 1.02e-09 ***
## Education.perc 4.412e-02 3.611e-03 12.218 < 2e-16 ***
## Dom.Migr 8.025e-05 6.852e-06 11.712 < 2e-16 ***
## lag.Pop 5.036e-07 7.928e-08 6.352 2.44e-10 ***
## lag.HousingIndex 7.451e-07 4.772e-07 1.561 0.119
## lag.Education.perc -5.500e-03 3.832e-03 -1.435 0.151
## lag.Dom.Migr 3.015e-05 6.843e-06 4.406 1.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.061 on 3035 degrees of freedom
## Multiple R-squared: 0.5406, Adjusted R-squared: 0.5394
## F-statistic: 446.4 on 8 and 3035 DF, p-value: < 2.2e-16
#SEM (lambda)
SEM_1 <- errorsarlm(form, data=uscounties.df, listw=us.sym.knn.listw)
summary(SEM_1)
##
## Call:spatialreg::errorsarlm(formula = formula, data = data, listw = listw,
## na.action = na.action, Durbin = Durbin, etype = etype, method = method,
## quiet = quiet, zero.policy = zero.policy, interval = interval,
## tol.solve = tol.solve, trs = trs, control = control)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.537158 -0.535216 0.076571 0.626528 3.317991
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2368e+01 5.3042e-02 233.170 < 2.2e-16
## Pop 1.8597e-06 6.9592e-08 26.722 < 2.2e-16
## HousingIndex 3.7812e-06 3.1551e-07 11.985 < 2.2e-16
## Education.perc 4.1921e-02 3.0193e-03 13.885 < 2.2e-16
## Dom.Migr 6.8902e-05 5.9837e-06 11.515 < 2.2e-16
##
## Lambda: 0.34815, LR test value: 579.2, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.013123
## z-value: 26.53, p-value: < 2.22e-16
## Wald statistic: 703.87, p-value: < 2.22e-16
##
## Log likelihood: -4228.921 for error model
## ML residual variance (sigma squared): 0.86594, (sigma: 0.93056)
## Number of observations: 3044
## Number of parameters estimated: 7
## AIC: 8471.8, (AIC for lm: 9049)
#OLS
OLS_1<-lm(form, data=uscounties.df)
summary(OLS_1)
##
## Call:
## lm(formula = form, data = uscounties.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0086 -0.6060 0.1288 0.7388 2.5707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.237e+01 5.006e-02 247.10 <2e-16 ***
## Pop 2.267e-06 7.388e-08 30.68 <2e-16 ***
## HousingIndex 3.707e-06 2.896e-07 12.80 <2e-16 ***
## Education.perc 3.923e-02 3.087e-03 12.71 <2e-16 ***
## Dom.Migr 9.036e-05 6.475e-06 13.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.069 on 3039 degrees of freedom
## Multiple R-squared: 0.5335, Adjusted R-squared: 0.5329
## F-statistic: 869 on 4 and 3039 DF, p-value: < 2.2e-16
Optimal clustering using silhouette
Optimal number of clusters k=2
fviz_nbclust(coords[1:5000,], clara, method="silhouette")
Optimal clustering using silhouette
Optimal number of clusters k=2
gap<-clusGap(coords[1:5000,], FUN=clara, K.max=50, B=5)
fviz_gap_stat(gap)
fviz_gap_stat(gap, linecolor="red", maxSE=list(method="globalmax"))
c9<-clara(coords, 9, metric="euclidean")
fviz_cluster(c9, pch=19, geom="point")
fviz_silhouette(c9)
kaggle_income$c9_clust = c9$clustering
ggplot(kaggle_income[,c("Median", "c9_clust")], aes(x=Median))+
geom_density()+
facet_wrap(~c9_clust, ncol=3)
c2<-clara(coords, 2, metric="euclidean", sampsize=1000)
fviz_cluster(c2, pch=19, geom="point")
fviz_silhouette(c2)
c44<-clara(coords, 44, metric="euclidean")
fviz_cluster(c44, pch=19, geom="point")
fviz_silhouette(c44)
kaggle_income$c44_clust = c44$clustering
ggplot(kaggle_income[,c("Median", "c44_clust")], aes(x=Median))+
geom_density()+
facet_wrap(~c44_clust, ncol=8)
ggplot(kaggle_income[,c("Median", "State_Code")], aes(x=Median))+
geom_density()+
facet_wrap(~State_Code, ncol=8)