Data Prep and Visualization

Download the zipfile that contains the spatial data, Load in the data. Show the differences between readOGR and readShapeSpatial with projections.

library(rgdal)
library(maptools)
library(spdep)
library(ggplot2)

setwd("~/Dropbox/classes/Geog515_f13/Labs/MoranExercise/NYData/")
# You have at least two ways to bring in a shapefile
NY <- readOGR(".", "NY8_utm18")  #readOGR is in rgdal

## OGR data source with driver: ESRI Shapefile 
## Source: ".", layer: "NY8_utm18"
## with 281 features and 17 fields
## Feature type: wkbPolygon with 2 dimensions

# Look at the projection info:
proj4string(NY)

## [1] "+proj=utm +zone=18 +ellps=WGS84 +units=m +no_defs"

# Bring in the data using readShapeSpatial
NY2 <- readShapeSpatial("NY8_utm18")  #readShapeSpatial is in maptools
# Look at the projection info:
proj4string(NY2)  # Oops, it is missing

## [1] NA

# edit(file='NY8_utm18.prj') # But there it is. # Uncomment this line to
# run
NY2 <- readShapeSpatial("NY8_utm18", IDvar = "AREAKEY", CRS("+proj=utm +zone=18 +units=m"))  #readShapeSpatial is in maptools
proj4string(NY2)

## [1] "+proj=utm +zone=18 +units=m +ellps=WGS84"

A few different ways to plot the Cases variable

# A few plotting options 1) Quick and dirty with spplot
spplot(NY, "Cases")

plot of chunk unnamed-chunk-2


# 2) A little more elegantly with ggplot
library(ggplot2)
# Create useful id's and row.names
NY2@data$id <- NY2@data$AREAKEY
row.names(NY2@data) <- NY2@data$AREAKEY
# Create the polygons
NY.ggplot <- fortify(NY2)
# Merge the Cases value to the polygons
NY.ggplot <- cbind(NY.ggplot, Cases = NY2[NY.ggplot$id, "Cases"])
ggplot(data = NY.ggplot) + geom_polygon(aes(x = long, y = lat, fill = Cases, 
    group = group)) + coord_equal()

plot of chunk unnamed-chunk-2


# 3) And now with a little cartographic thought.  As a choropleth with
# classes
NY.ggplot$Cases_cat <- cut(NY.ggplot$Cases, breaks = c(0, 2, 4, 6, 9))
ggplot(data = NY.ggplot) + geom_polygon(aes(x = long, y = lat, fill = Cases_cat, 
    group = group)) + coord_equal() + scale_fill_brewer("Cases", palette = "YlOrRd")

plot of chunk unnamed-chunk-2


# 4) And on google maps with the ggmap library
library(ggmap)
NY.ll <- spTransform(NY2, CRS("+proj=longlat +ellps=WGS84"))

NY.ll@data$id <- NY.ll@data$AREAKEY
row.names(NY.ll@data) <- NY.ll@data$AREAKEY
# Create the polygons
NY.ggplot <- fortify(NY.ll)
# Merge the Cases value to the polygons
NY.ggplot <- cbind(NY.ggplot, Cases = NY.ll[NY.ggplot$id, "Cases"])
NY.ggplot$Cases_cat <- cut(NY.ggplot$Cases, breaks = c(0, 2, 4, 6, 9))


map <- get_map(location = "Cortland, NY", maptype = "roadmap", zoom = 8, color = "bw")  #get map
ggmap(map, extent = "panel") + geom_polygon(aes(x = long, y = lat, fill = Cases_cat, 
    group = group), data = NY.ggplot, alpha = 0.7) + scale_fill_brewer("Cases", 
    palette = "YlOrRd")

plot of chunk unnamed-chunk-2

Create weight matrices and do some Morans I tests

######################################################## Create a
######################################################## neighborhood list
######################################################## from the
######################################################## shapefile
NY_nb <- poly2nb(NY, queen = FALSE)
summary(NY_nb)

## Neighbour list object:
## Number of regions: 281 
## Number of nonzero links: 1528 
## Percentage nonzero weights: 1.935 
## Average number of links: 5.438 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 
##  6  8 18 53 67 48 49 20  8  2  2 
## 6 least connected regions:
## 55 97 100 101 244 245 with 1 link
## 2 most connected regions:
## 34 82 with 11 links



# Plot boundaries and edges
plot(NY, border = "grey60", axes = TRUE)
title("Spatial Connectivity")
plot(NY_nb, coordinates(NY), pch = 19, cex = 0.6, add = TRUE)

plot of chunk unnamed-chunk-3


help(nb2listw)
# Construct row-standardized spatial weights
NY_W <- nb2listw(NY_nb, style = "W", zero.policy = TRUE)
# Construct unstandardized spatial weights
NY_B <- nb2listw(NY_nb, style = "B", zero.policy = TRUE)


########### Calculate Moran's I of cases
help(moran.test)
moran.test(NY$Cases, NY_W)  #row-standardized

## 
##  Moran's I test under randomisation
## 
## data:  NY$Cases  
## weights: NY_W  
## 
## Moran I statistic standard deviate = 4.005, p-value = 3.097e-05
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.146251         -0.003571          0.001399

moran.test(NY$Cases, NY_B)  # unstandardized

## 
##  Moran's I test under randomisation
## 
## data:  NY$Cases  
## weights: NY_B  
## 
## Moran I statistic standard deviate = 3.385, p-value = 0.0003562
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.117285         -0.003571          0.001275

Question for the student: Make sure that you are able to interpret each of the above summary outputs before proceeding.

# Because Jessica asked
# 
# Calculate the range of possible I statistics
1/range(eigenw(NY_W))

## [1] -1.594  1.000

1/range(eigenw(NY_B))

## [1] -0.3052  0.1619

We'll do Moran's I test four ways. Twice with each weight matrix, and twice with the randomisation option set to TRUE and FALSE.

A little caution on the randomisation option. This is not the Monte Carlo randomisation that people think of with Moran's I. In fact, randomisation is a bit of a misnomer. The randomisation option is a mathematical correction that tries to correct Moran's I for non-normality. It tries to correct the test for skewed data.

# Calculate tests assuming normality randomisation =T applies a correction
# for kurtosis
help(moran.test)
moran.test(NY$Cases, NY_W)

## 
##  Moran's I test under randomisation
## 
## data:  NY$Cases  
## weights: NY_W  
## 
## Moran I statistic standard deviate = 4.005, p-value = 3.097e-05
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.146251         -0.003571          0.001399

moran.test(NY$Cases, NY_W, randomisation = F)

## 
##  Moran's I test under normality
## 
## data:  NY$Cases  
## weights: NY_W  
## 
## Moran I statistic standard deviate = 4.001, p-value = 3.159e-05
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.146251         -0.003571          0.001403

moran.test(NY$Cases, NY_B)

## 
##  Moran's I test under randomisation
## 
## data:  NY$Cases  
## weights: NY_B  
## 
## Moran I statistic standard deviate = 3.385, p-value = 0.0003562
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.117285         -0.003571          0.001275

moran.test(NY$Cases, NY_B, randomisation = F)

## 
##  Moran's I test under normality
## 
## data:  NY$Cases  
## weights: NY_B  
## 
## Moran I statistic standard deviate = 3.381, p-value = 0.0003614
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.117285         -0.003571          0.001278

# Just to show that we are calculating a z-score for I
pnorm((0.117285431 - -0.003571429)/sqrt(0.00127793), lower.tail = F)

## [1] 0.0003614

# You should see the same p-value as in the last moran.test

Each test gives approximately the same results. This is good. Wildly different results would be difficult to interpret. Make sure that you understand this before proceeding.

Here is the Moran's I calculated by randomly scrambling the data, again, with each weight matrix:

# Calculate tests using Monte Carlo
help(moran.mc)
W.mc <- moran.mc(NY$Cases, NY_W, nsim = 999)
W.mc

## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  NY$Cases 
## weights: NY_W  
## number of simulations + 1: 1000 
## 
## statistic = 0.1463, observed rank = 999, p-value = 0.001
## alternative hypothesis: greater


B.mc <- moran.mc(NY$Cases, NY_B, nsim = 999)
B.mc

## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  NY$Cases 
## weights: NY_B  
## number of simulations + 1: 1000 
## 
## statistic = 0.1173, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Bootstrapping the rates, rather than the Counts.

There is a real serious problem with the above analysis. We are assuming that each region is exchangeable under the null hypothesis.

But what is the actual hypothesis? That disease counts are exchangeable? or that disease rates are exchangeable?

Disease counts are controlled by both (1) the distribution of disease, which we are interested in, and (2) the distribution of population, which we are not interested in. If we find clustering - which we did - is that clustering due to clustering of the disease, or just clustering of the population?

Ideally, we would like to exchange the disease rates, while controlling for the population distribution.

What we will do now is assume that the count is a Poisson random variable, not a normal random variable. Under the null hypothesis of no spatial pattern, we will assume that the probability of disease is the same everywhere. Then, the number of cases in a tract should be a random variable with mean = Pop x rate.

What we will do is simulate a bunch of different maps of counts. Each count will simulated as a poisson variable, with a constant rate, but with population equal to the population of that tract. For each map, we will calculate Moran's I. At the end, we can calculate where the Moran's I of the actual data fits in with the Moran's I of the simulations.

###########################################################
library(boot)
# Create a function for Moran's I of counts assuming constant risk
# Calculate global rate of disease
r <- sum(NY$Cases)/sum(NY$POP8)
# Calculate predicted number of cases in each tract The predicted number
# is Population * rate (i.e. Pop * cases/person)
rni <- r * NY$POP8
help(boot)
# We will need to specify the data, the statistic (moran's I), and the
# random number generator Write a function to simulate a poisson RV with
# local risk rate rni
CR <- function(var, mle) rpois(n = length(var), lambda = mle)
# Mnemonic: CR = Constant Risk We will use the boot function, which will
# return something caled mle, but we will tell it to use the local risk
# rate Wrapper function to calculate Moran's I This wrapper is slow
MoranI.pboot2 <- function(data, i, listw, ...) {
    I <- moran.test(x = data, listw = listw, randomisation = FALSE)$estimate[[1]]
    return(I)
}
# This wrapper is fast, but more cryptic
MoranI.pboot <- function(data, i, listw, n, S0, ...) {
    moran.test <- moran(x = data, listw = listw, n = n, S0 = S0)
    I <- moran.test$I
    return(I)
}

# This is fast
boot2 <- boot(NY$Cases, statistic = MoranI.pboot, R = 999, sim = "parametric", 
    ran.gen = CR, listw = NY_B, n = length(NY$Cases), S0 = Szero(NY_B), mle = rni)

# This is slow boot3 <- boot(data=NY$Cases, statistic=MoranI.pboot2,
# R=999, sim='parametric', ran.gen=CR, listw=NY_B, mle=rni)

# There are competing ways to get p-values: Standardize using the mean and
# sd of the bootstrap statistics
z.stat <- (boot2$t0 - mean(boot2$t))/sd(boot2$t)
pnorm(z.stat, lower.tail = FALSE)

## [1] 0.07822

# Calculate the fraction of t's greater than or equal to the sample value
# of t the fraction must include the sample value, since the sample value
# is trivially greater than or equal to the sample value.  This is the
# plus 1.
(sum(boot2$t > boot2$t0) + 1)/1000

## [1] 0.079


####### Plot the two different monte carlo tests For the permutation and
####### poisson approaches: plot the histogram and the actual value Create
####### a data frame with the 999 simulated t-values and the 1 real
####### t-value
gg.df <- data.frame(poisson = c(boot2$t, boot2$t0), permutation = c(B.mc$res))
# The last value in each column should be the statistic from the data.
# Verify this
tail(gg.df)

##        poisson permutation
## 995   0.112392    0.040022
## 996  -0.004674    0.011668
## 997   0.038657   -0.070888
## 998   0.047552   -0.004966
## 999   0.041612   -0.038559
## 1000  0.117285    0.117285


# I finally figured out what to do from here....
# http://wiki.stdout.org/rcookbook/Graphs/Plotting%20distributions%20%28ggplot2%29/
# First, melt those two 1000 number columns into one long column with 2000
# numbers, but with an extra column indicating if poisson or permulation
# Monte Carlo.
library(reshape)
gg.df2 <- melt(gg.df)
gplt <- ggplot(gg.df2, aes(x = value, fill = variable)) + geom_histogram(position = "identity", 
    alpha = 0.5)
gplt <- gplt + geom_vline(aes(xintercept = boot2$t0))
gplt

plot of chunk unnamed-chunk-7