3rd Mini-project Answers

library(tidyverse)
library(rgdal)
library(maptools)
library(sp)
library(raster)
library(sf)
library(tmap)
library(spatstat)
library(spdep)
library(RColorBrewer)

Question 1. Cafes in the downtown of Boston and San Francisco

# read in input objects 
sanf_cbd_cafes_ppp <- read_rds("sanf_cbd_cafes_ppp.rds") # planar point pattern object 
sanf_innercity_owin <- read_rds("sanf_innercity_owin.rds") # 5 miles from the cbd window object  
sanf_pctyoung <- read_rds("sanf_pctyoung.rds") # % 25-34 college graduates im object

# Average distance to the nearest neighbor 
ann.p <- mean(nndist(sanf_cbd_cafes_ppp, k = 1))
ann.p

## [1] 82.781

# Monte Carlo: Null hypothesis is complete spatial random 
n <- 599L # Number of simulations
ann.r <- vector(length = n) # Create an empty object to be used to store simulated ANN values

for (i in 1:n){
  rand.p   <- rpoint(n=sanf_cbd_cafes_ppp$n, win=sanf_innercity_owin) # Generate random point locations
  ann.r[i] <- mean(nndist(rand.p, k=1))  # Tally the ANN values
}

hist(ann.r, main=NULL, las=1, breaks=40, col="bisque", xlim=range(ann.p, ann.r))
abline(v=ann.p, col="blue")

N.greater <- sum(ann.r > ann.p)
p <- min(N.greater + 1, n + 1 - N.greater)/(n+1)
p

## [1] 0.001666667

# Monte Carlo: Null hypothesis is complete spatial random, after taking % 25-34 college graduates into account 
n <- 599L
ann.r <- vector(length=n)

for (i in 1:n){
  rand.p   <- rpoint(n=sanf_cbd_cafes_ppp$n, f=sanf_pctyoung) # f defines the probability density of the points
  ann.r[i] <- mean(nndist(rand.p, k=1))
}

hist(ann.r, main=NULL, las=1, breaks=40, col="bisque", xlim=range(ann.p, ann.r))
abline(v=ann.p, col="blue")

N.greater <- sum(ann.r > ann.p)
p <- min(N.greater + 1, n + 1 - N.greater)/(n+1)
p

## [1] 0.001666667

Question 2. Residential segregation by race, income, and educational attainment in Fulton County, GA

# three variables are available: % non-Hispanic Whte, % college graduates, & log(median household income)
# these variables are already standardized (i.e., mean 0 standard deviation 1)
fulton2010 <- read_rds("fulton2010.rds") %>% as('Spatial')

# plot neighbor links 
nb <- poly2nb(fulton2010, queen = TRUE)
xy <- sp::coordinates(fulton2010)
plot(fulton2010, border = "grey", lwd=1)
plot(nb, xy, col="red", lwd=0.1, add=TRUE)

# compute spatial weights 
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)

By race

# compute the spatially weighted mean for the neighbors of each tract 
nhw.lag <- lag.listw(lw, fulton2010$pct_nhw)
# run regression and plot 
M <- lm(nhw.lag ~ fulton2010$pct_nhw)
plot(nhw.lag ~ fulton2010$pct_nhw, pch = 20, asp = 1, las = 1)
abline(M, col = "red")

coef(M)[2]

## fulton2010$pct_nhw 
##          0.8192557

# run Monte Carlo simulation 
# this helps determine the statistical significance of the computed Moran's I 
n <- 599L   # Define the number of simulations
I.r <- vector(length=n)  # Create an empty vector

for (i in seq_along(I.r)){
  # Randomly shuffle income values
  x <- sample(fulton2010$pct_nhw, replace=FALSE)
  # Compute new set of lagged values
  x.lag <- lag.listw(lw, x)
  # Compute the regression slope and store its value
  M.r    <- lm(x.lag ~ x)
  I.r[i] <- coef(M.r)[2]
}

hist(I.r, main=NULL, xlab="Moran's I", las=1)
abline(v=coef(M)[2], col="red")

N.greater <- sum(coef(M)[2] > I.r)
p <- min(N.greater + 1, n + 1 - N.greater) / (n + 1)
p # your value might be slightly different from the printed value below b/c of the random simulation

## [1] 0.001666667

moran.test(fulton2010$pct_nhw, lw)

## 
##  Moran I test under randomisation
## 
## data:  fulton2010$pct_nhw  
## weights: lw    
## 
## Moran I statistic standard deviate = 19.354, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.819255661      -0.004926108       0.001813372

MC<- moran.mc(fulton2010$pct_nhw, lw, nsim = 599)
MC

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  fulton2010$pct_nhw 
## weights: lw  
## number of simulations + 1: 600 
## 
## statistic = 0.81926, observed rank = 600, p-value = 0.001667
## alternative hypothesis: greater

plot(MC, main="", las=1)

By educational attainment

# compute the spatially weighted mean for the neighbors of each tract 
coll.lag <- lag.listw(lw, fulton2010$pct_coll)
# run regression and plot 
M <- lm(coll.lag ~ fulton2010$pct_coll)
plot(coll.lag ~ fulton2010$pct_coll, pch = 20, asp = 1, las = 1)
abline(M, col = "red")

coef(M)[2]

## fulton2010$pct_coll 
##           0.7614924

# run Monte Carlo simulation 
# this helps determine the statistical significance of the computed Moran's I 
n <- 599L   # Define the number of simulations
I.r <- vector(length=n)  # Create an empty vector

for (i in seq_along(I.r)){
  # Randomly shuffle income values
  x <- sample(fulton2010$pct_coll, replace=FALSE)
  # Compute new set of lagged values
  x.lag <- lag.listw(lw, x)
  # Compute the regression slope and store its value
  M.r    <- lm(x.lag ~ x)
  I.r[i] <- coef(M.r)[2]
}

hist(I.r, main=NULL, xlab="Moran's I", las=1)
abline(v=coef(M)[2], col="red")

N.greater <- sum(coef(M)[2] > I.r)
p <- min(N.greater + 1, n + 1 - N.greater) / (n + 1)
p # your value might be slightly different from the printed value below b/c of the random simulation

## [1] 0.001666667

moran.test(fulton2010$pct_coll, lw)

## 
##  Moran I test under randomisation
## 
## data:  fulton2010$pct_coll  
## weights: lw    
## 
## Moran I statistic standard deviate = 18.008, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.761492428      -0.004926108       0.001811385

MC<- moran.mc(fulton2010$pct_coll, lw, nsim = 599)
MC

## 
##  Monte-Carlo simulation of Moran I
## 
## data:  fulton2010$pct_coll 
## weights: lw  
## number of simulations + 1: 600 
## 
## statistic = 0.76149, observed rank = 600, p-value = 0.001667
## alternative hypothesis: greater

plot(MC, main="", las=1)

Question 3. The count of property crime in the City of Chicago, IL

chi.poly <- maptools::readShapePoly('foreclosures.shp')
# the median home value of owner-occupied units for each tract  
chi.poly@data$oomedval_lg <- log(chi.poly@data$oomedval+1) 
# recode NA to zero 
chi.poly@data$property2 <- ifelse(is.na(chi.poly@data$property) == FALSE, chi.poly@data$property, 0)

tmap_mode("view")
tm_basemap("Stamen.TonerLite", alpha = 0.25) + 
  tm_shape(chi.poly) + 
  tm_fill(col = "property2", alpha = 0.75, style = "quantile", n = 10, 
          palette = "OrRd") + 
  tm_borders(col = "white", lwd = 0.5)

chi.ols2 <-lm(property2 ~ est_fcs_rt + oomedval_lg, data=chi.poly@data)
summary(chi.ols2)

## 
## Call:
## lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data = chi.poly@data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -862.09 -206.17  -76.24  119.80 2929.92 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  151.900     64.531   2.354   0.0188 *  
## est_fcs_rt    18.699      2.999   6.234 6.99e-10 ***
## oomedval_lg   12.157      5.396   2.253   0.0245 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 336.1 on 894 degrees of freedom
## Multiple R-squared:  0.04997,    Adjusted R-squared:  0.04784 
## F-statistic: 23.51 on 2 and 894 DF,  p-value: 1.12e-10

# create a list of neighbors for each polygon 
list.queen <- poly2nb(chi.poly, queen=TRUE)
# next, supplement the neighbor list with spatial weights 
W <- nb2listw(list.queen, style="W", zero.policy=TRUE) # "W" row-standardize weights 
# W
# plot(W,coordinates(chi.poly))

moran.lm2<-lm.morantest(chi.ols2, W, alternative="two.sided")
print(moran.lm2)

## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data =
## chi.poly@data)
## weights: W
## 
## Moran I statistic standard deviate = 18.6, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3391759402    -0.0020608813     0.0003365739

LM2 <- lm.LMtests(chi.ols2, W, test="all")
print(LM2)

## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data =
## chi.poly@data)
## weights: W
## 
## LMerr = 337.22, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data =
## chi.poly@data)
## weights: W
## 
## LMlag = 339.61, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data =
## chi.poly@data)
## weights: W
## 
## RLMerr = 2.156, df = 1, p-value = 0.142
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data =
## chi.poly@data)
## weights: W
## 
## RLMlag = 4.5498, df = 1, p-value = 0.03292
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = property2 ~ est_fcs_rt + oomedval_lg, data =
## chi.poly@data)
## weights: W
## 
## SARMA = 341.77, df = 2, p-value < 2.2e-16

Since the p-value of RLMlag is smaller than that of RLMerr, we proceed with a spatial lag model.

chi.sar2 <- lagsarlm(property2 ~ est_fcs_rt + oomedval_lg, data=chi.poly@data, W)
summary(chi.sar2)

## 
## Call:
## lagsarlm(formula = property2 ~ est_fcs_rt + oomedval_lg, data = chi.poly@data, 
##     listw = W)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -814.611 -162.156  -54.249  102.418 2343.951 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept) -62.5149    58.8557 -1.0622 0.2881573
## est_fcs_rt   10.2921     2.6471  3.8880 0.0001011
## oomedval_lg  14.3355     4.7757  3.0017 0.0026846
## 
## Rho: 0.56477, LR test value: 202.48, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.034758
##     z-value: 16.249, p-value: < 2.22e-16
## Wald statistic: 264.03, p-value: < 2.22e-16
## 
## Log likelihood: -6388.386 for lag model
## ML residual variance (sigma squared): 84726, (sigma: 291.08)
## Number of observations: 897 
## Number of parameters estimated: 5 
## AIC: 12787, (AIC for lm: 12987)

Save residuals from the OLS and spatial lag models.

chi.poly@data$chi.ols2.res <- resid(chi.ols2) #residuals ols
chi.poly@data$chi.sar2.res <- resid(chi.sar2) #residual sar

sp::spplot(
  chi.poly, 
  "chi.ols2.res", 
  at = seq(min(chi.poly@data$chi.ols2.res,na.rm=TRUE), max(chi.poly@data$chi.ols2.res,na.rm=TRUE), length=12),
  col.regions = rev(brewer.pal(11,"RdBu"))
)

sp::spplot(
  chi.poly, 
  "chi.sar2.res", 
  at = seq(min(chi.poly@data$chi.sar2.res,na.rm=TRUE), max(chi.poly@data$chi.sar2.res,na.rm=TRUE), length=12),
  col.regions = rev(brewer.pal(11,"RdBu"))
)

3rd Mini-project Answers

April 15, 2019 (Monday)

Question 1. Cafes in the downtown of Boston and San Francisco

Question 2. Residential segregation by race, income, and educational attainment in Fulton County, GA

By race

By educational attainment

Question 3. The count of property crime in the City of Chicago, IL