Alex Crawford R-Journal

GEOG 5023: Quantitative Methods In Geography

Lecture 2:

Introduction:

Help Function: help() – opens help information for the indicated function in the Help tab

help(log)

Combine Function: c() – creates a vector object from the listed values

x <- c(1, 2, 4, 8, 16)
x
## [1]  1  2  4  8 16

R Arithmetic: +, -, *, /
R Arithmetic Functions: sqrt(),log(),exp()

methusela <- 1 + 3
exp(methusela)
## [1] 54.6
prometheus <- x * 6
sqrt(prometheus - 1)
## [1] 2.236 3.317 4.796 6.856 9.747

loga <- log(2.718)
logb <- log(10)
logc <- log(10, 2.718)
logd <- log(2.718, 10)
logtest <- c(loga, logb, logc, logd)
logtest  # log() is a natural log function unless specified as log(x,base = y); shorthand is log(x,y) (see above)
## [1] 0.9999 2.3026 2.3028 0.4342

R Logic: & is and, | is or, == is equals, != is not equals; you can also use <, >, <=, >=
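
A quick sketch of these operators in action, reusing the x vector from above (the results are noted in the comments):

x > 4  # element-wise comparison: FALSE FALSE FALSE TRUE TRUE
x >= 2 & x <= 8  # combine conditions with & (and): FALSE TRUE TRUE TRUE FALSE
x == 16 | x < 2  # combine conditions with | (or): TRUE FALSE FALSE FALSE TRUE
x != 4  # not equal to: TRUE TRUE FALSE TRUE TRUE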

Reading & Importing Files:

Importing csv files:

CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/CO2.csv", 
    header = TRUE)  #Normal method of importing a csv file with headers
# CO2 <- read.csv(file.choose(), header = T) # Cheater's method; gives a
# pop-up window; you select the file directory-style

Reading a data.frame:

summary(CO2)  # To get a summary of data in the imported table
##      Plant           Spec             Type         Treatment 
##  Min.   : 1.0   Mc1    : 7   Mississippi:42   chilled   :42  
##  1st Qu.:21.8   Mc2    : 7   Quebec     :42   nonchilled:42  
##  Median :42.5   Mc3    : 7                                   
##  Mean   :42.5   Mn1    : 7                                   
##  3rd Qu.:63.2   Mn2    : 7                                   
##  Max.   :84.0   Mn3    : 7                                   
##                 (Other):42                                   
##       conc          uptake    
##  Min.   :  95   Min.   : 7.7  
##  1st Qu.: 175   1st Qu.:17.9  
##  Median : 350   Median :28.3  
##  Mean   : 435   Mean   :27.2  
##  3rd Qu.: 675   3rd Qu.:37.1  
##  Max.   :1000   Max.   :45.5  
## 
head(CO2)  # To get a sample of first six rows & headers
##   Plant Spec   Type  Treatment conc uptake
## 1     1  Qn1 Quebec nonchilled   95   16.0
## 2     2  Qn1 Quebec nonchilled  175   30.4
## 3     3  Qn1 Quebec nonchilled  250   34.8
## 4     4  Qn1 Quebec nonchilled  350   37.2
## 5     5  Qn1 Quebec nonchilled  500   35.3
## 6     6  Qn1 Quebec nonchilled  675   39.2
names(CO2)  # To get header names
## [1] "Plant"     "Spec"      "Type"      "Treatment" "conc"      "uptake"

Selecting Parts of a Data Frame:

# use data.frame.name$column.name:
CO2$conc
##  [1]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [15]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [29]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [43]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [57]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [71]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
# use bracket notation, where data.frame[row criteria , column criteria]
# determines what you get
CO2[, "conc"]  # Note how the blank space before the comma means all rows
##  [1]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [15]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [29]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [43]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [57]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [71]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
CO2[, 5]  # Can also call out the column or row by number
##  [1]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [15]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [29]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [43]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [57]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
## [71]   95  175  250  350  500  675 1000   95  175  250  350  500  675 1000
CO2.Q <- CO2[CO2$Type == "Quebec", ]  # creates new dataset only including rows where Type is Quebec; remember to keep the comma to select all rows!
CO2.Q <- subset(CO2, Type == "Quebec")  # Can also use the subset function to do the same thing

Creating New Variables

# Creating a new variable by assigning values based on logical expressions
# (think reclassify in ArcGIS)
CO2$concat[CO2$conc <= 200] <- 1
CO2$concat[CO2$conc > 200 & CO2$conc <= 400] <- 2
CO2$concat[CO2$conc > 400 & CO2$conc <= 600] <- 3
CO2$concat[CO2$conc > 600] <- 4
# Or create a new variable by arithmetic
CO2$conc1000 <- CO2$conc * 1000
head(CO2)
##   Plant Spec   Type  Treatment conc uptake concat conc1000
## 1     1  Qn1 Quebec nonchilled   95   16.0      1    95000
## 2     2  Qn1 Quebec nonchilled  175   30.4      1   175000
## 3     3  Qn1 Quebec nonchilled  250   34.8      2   250000
## 4     4  Qn1 Quebec nonchilled  350   37.2      2   350000
## 5     5  Qn1 Quebec nonchilled  500   35.3      3   500000
## 6     6  Qn1 Quebec nonchilled  675   39.2      4   675000

Basic Graphs:

Histograms: hist() – Useful Arguments:

# breaks= (if a single number) suggests the number of bins (it is a
# suggestion only); breaks= (if a vector) gives the breakpoints b/wn bins;
# breaks= seq(x, y, by = z) where x = min, y = max, and z = bin size
# col= gives the color of the bars; border= gives the color of the border
# around the bars; main= 'title'; ylab= 'y label'; xlab= 'x label'
hist(CO2$uptake, col = "blue", xlab = "Rate (umol/m^2 sec)", main = "CO2 Uptake Rates", 
    border = "black", breaks = seq(0, 50, by = 2))

[Plot: histogram "CO2 Uptake Rates"]

Box Plots: boxplot() – Useful Arguments:

# col= color of box; main= 'title'; ylab= 'y label'; xlab= 'x label'
# Use the ~ symbol to say 'with respect to' (compare by categories):
boxplot(CO2$uptake ~ CO2$Type, main = "CO2 Uptake Rates", ylab = "Rate (umol/m^2 sec)", 
    col = rainbow(2))

[Plot: boxplot "CO2 Uptake Rates" by Type]

Scatter Plots: plot(,) – Useful Arguments:

# If you only give one variable, the other will be an index
# pch= number that determines point type
# xlim=c(x1,x2) and ylim=c(y1,y2) provide ranges for x and y
# col= color of points; main= 'title'; ylab= 'y label'; xlab= 'x label'
plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate", 
    pch = 4, col = "red")

[Plot: scatter plot of CO2 uptake rate vs. CO2 concentration]

Line Graphs: plot() with the added argument type = "l". Add a legend using legend(); note its several arguments

# ylim=c(y1,y2) to set the range for the y-axis; xlim=c(x1,x2) to set the
# range for the x-axis; main= 'title'; ylab= 'y label'; xlab= 'x label'
thames <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/thames.csv", 
    sep = ",", header = T)
t1 <- 1:nrow(thames)  # Creates a vector of integers from 1 to the number of rows in thames
plot(t1, thames$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Feildes", 
    type = "l", ylim = c(-1, 100))
# Add another line to the same graph using lines(); lty= determines
# dotted/dashed/solid
lines(t1, thames$Redbridge, lty = 2, col = "red")
# legend(position, names, line types, colors)
legend("topright", c("Feildes", "Redbridge"), lty = c(1, 2), col = c("black", 
    "red"))

[Plot: line graph "Flow Rate for Feildes" with Redbridge line and legend]

# Use dev.off() to clear the screen and the panels

Homework 1:

Calculating Standard Deviation:

Basics: mean(), sum(), and length()

rain <- c(16, 18, 14, 22, 27, 17, 19, 17, 17, 22, 20, 22)
mean(rain)  # Returns the average of data set
## [1] 19.25
sum(rain)  # Returns the total of data set
## [1] 231
length(rain)  # Returns the length of the list
## [1] 12

Function for a Population Standard Deviation stdevp()

stdevp <- function(someData) {
    return(sqrt(mean((someData - mean(someData))^2)))
}
stdevp(rain)
## [1] 3.394
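
For comparison, R's built-in sd() uses the sample formula (dividing by n - 1 rather than n), so it will not exactly match stdevp(); a minimal sketch of the relationship using the same rain vector:

sd(rain)  # built-in sample standard deviation (divides by n - 1)
sqrt(sum((rain - mean(rain))^2)/(length(rain) - 1))  # the sample formula written out
stdevp(rain) * sqrt(length(rain)/(length(rain) - 1))  # rescales the population SD to the sample SD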

Homework 2

Importing & Reading Data

Importing a dbf file:

library("foreign", lib.loc = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library")
MN <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/mnmappluto.dbf")
MN <- MN[MN$YCoord > 0 & MN$XCoord > 0, ]

Reading a table: dim() & grep()

# SS: dim() returns the number of rows and columns in a table
dim(MN)
## [1] 43318    78
# SS: grep() is a text-searching function; it returns the row numbers where
# the text was found.  AC: Arguments include the text to search for, the data,
# and ignore.case=TRUE to make the search NOT case sensitive
CrawfordBldgs <- grep("crawford", MN$OwnerName, ignore.case = TRUE)
MN[CrawfordBldgs, c("Address", "AssessTot", "OwnerName")]
##                   Address AssessTot           OwnerName
## 19634 214 WEST 122 STREET     15772      CRAWFORD STACY
## 31738  320 EAST 51 STREET    101760        J H CRAWFORD
## 40468 531 WEST 150 STREET     35591     CRAWFORD, JOANN
## 41676        118 2 AVENUE    913500 CRAWFORD REALTY CO.

Logical Statements

Selecting NA Values: is.na()

# Returns TRUE for NA values.
is.na(MN[1:100, "HistDist"])
##   [1] FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [23]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [34]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [45]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [100]  TRUE

IF/ELSE: ifelse(logicexpression,ifTRUE,ifFALSE)

# SS: The function ifelse() takes three arguments, a logical expression, a
# value for true, and a value for false. If the logical expression
# evaluates to “TRUE” the first value is used, if not the second value is
# used.
MN$HD <- ifelse(is.na(MN[, "HistDist"]), 0, 1)

Conversions

Converting to a Factor

# AC: We convert MN$HD to a factor because it is a dummy variable for
# categories, not a ratio level variable
MN$HD <- as.factor(MN$HD)
inHD <- MN[MN$HD == 1, ]
outHD <- MN[MN$HD == 0, ]

The %in% Operator

# AC: the %in% operator selects all rows where the block column contains
# values in our list of blocks.
blocks <- inHD$Block
HDB <- MN[MN$Block %in% blocks, ]
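
On its own, %in% just returns a logical vector marking which elements of the left-hand vector appear in the right-hand one; a tiny sketch with made-up values:

c(1, 2, 3, 4, 5) %in% c(2, 4, 6)  # FALSE TRUE FALSE TRUE FALSE
# Inside the brackets above, that logical vector keeps only the rows whose Block matches.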

Hypothesis Testing

T Tests

# Arguments for t.test(): x= the data being tested; y= the comparison data
# (for two-sample t-tests); mu= a fixed value (for one-sample t-tests);
# alternative= can be 'two.sided', 'less', or 'greater' depending on the
# hypothesis test (default = 'two.sided'); conf.level= the confidence level
# (default = 0.95); var.equal= TRUE or FALSE, depending on whether the two
# variances are treated as equal (default = FALSE, the Welch test).
# AC: Two-sample t-test (two-sided, variances not assumed equal) comparing
# assessment values for buildings in and out of an historic district.
# Null Hypothesis: The buildings in a historic district have the same value
# as those outside of a historic district, and any difference between the
# two groups is due to random chance.
t.test(x = inHD$AssessTot, y = outHD$AssessTot)  #Hypothesis Test 1
## 
##  Welch Two Sample t-test
## 
## data:  inHD$AssessTot and outHD$AssessTot 
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -1724546 -1327117 
## sample estimates:
## mean of x mean of y 
##   1233743   2759574

Correlation

# AC: cor(x,y) finds the correlation between two data columns (x and y) --
# a measure of the strength of their linear relationship (not the slope of
# the best-fit line)
cor(MN$BldgArea, MN$AssessTot)
## [1] 0.4225
# AC: Types of correlation: the Pearson correlation coefficient is used to
# summarize the relationship between two continuous variables.  Kendall's
# τ (“tau”) and Spearman's rank correlation coefficient compute the
# association between two variables by sorting each variable and
# assigning each observation a rank.
cor(MN$BldgArea, MN$AssessTot, method = "pearson")  # Pearson is the default; continuous
## [1] 0.4225
cor(MN$BldgArea, MN$AssessTot, method = "kendall")  # Ranked
## [1] 0.61
cor(MN$BldgArea, MN$AssessTot, method = "spearman")  # Ranked
## [1] 0.78

Correlation Hypothesis Testing

# SS: cor.test() performs a hypothesis test on an observed correlation; it
# tests the null hypothesis that the true correlation is zero.  SS: it is
# more likely to return a significant result if the sample size is larger
# or if the correlation is larger
cor.test(MN$BldgArea, MN$AssessTot)
## 
##  Pearson's product-moment correlation
## 
## data:  MN$BldgArea and MN$AssessTot 
## t = 97.02, df = 43316, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0 
## 95 percent confidence interval:
##  0.4148 0.4302 
## sample estimates:
##    cor 
## 0.4225

Plotting

Normal QQ Plot

# qqnorm() provides a visual test of normality; data should plot on a
# diagonal line if normal
qqnorm(MN$AssessTot, ylim = c(0, 5e+08), main = "Normal Q-Q Plot for Property Assessment Values")  # One outlier omitted
# Use qqline() to add a diagonal line; makes it easier to judge
qqline(MN$AssessTot)

[Plot: "Normal Q-Q Plot for Property Assessment Values" with qqline]

Layout Feature

# matrix(data=c(Fig#,Fig#),#rows,#columns) creates a matrix to use for the
# layout
matrx <- matrix(data = c(1, 2), 1, 2)
# layout(matrix) creates a layout for multiple plots based on the matrix
# parameters.
layout(matrx)
# layout.show(n=#ofFigs) displays blank holders for each figure.
layout.show(n = 2)

[Plot: layout.show() displaying two empty figure panels]

# Feel free to plot!
plot(MN$YCoord ~ MN$XCoord, xlab = "X Coordinate", ylab = "Y Coordinate", main = "Manhattan Building Locations")
plot(MN$AssessTot, MN$YCoord, xlim = c(0, 5e+08), main = "Value by Y Coord", 
    ylab = "Y Coordinate (Larger = Further North)", xlab = "Property Assessment Value ($)")

[Plot: "Manhattan Building Locations" and "Value by Y Coord"]

Exercise 3

Making a linear regression model

object <- lm(Y ~ X1+X2+…,data=) creates a linear model. We assign the model to an object so that we can extract information from that object later.
data= is for the data frame you're using. Unnecessary if you're diligent about the $ notation.

# Load Data
larain <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/LA_RAIN.csv", 
    header = TRUE)
names(larain)
## [1] "Year"    "APMAM"   "APSAB"   "APSLAKE" "OPBPC"   "OPRC"    "OPSLAKE"
## [8] "BSAAM"
opslake <- lm(BSAAM ~ OPSLAKE, data = larain)

Exploring the results

After running the model, use extractor functions to learn about it.
summary() is the best starting point. It returns most of the data you'd be looking for.

summary(opslake)
## 
## Call:
## lm(formula = BSAAM ~ OPSLAKE, data = larain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17604  -5338    332   3411  20876 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    27015       3219    8.39  1.9e-10 ***
## OPSLAKE         3752        216   17.39  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 8920 on 41 degrees of freedom
## Multiple R-squared: 0.881,   Adjusted R-squared: 0.878 
## F-statistic:  303 on 1 and 41 DF,  p-value: <2e-16
names(opslake)  # 12 components!
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

Reading the summary
1) Call:
lm(formula = BSAAM ~ OPSLAKE, data = larain)
This shows you the formula for the model.

2) Residuals:
     Min       1Q   Median       3Q      Max
-17603.8  -5338.0    332.1   3410.6  20875.6
The median should be near 0 (not great here…) and the 1Q and 3Q (and the min and max) should be roughly equal in magnitude (again, ehh…).

3) Coefficients:
             Estimate  Std. Error  t value  Pr(>|t|)
(Intercept)   27014.6      3218.9    8.393  1.93e-10
OPSLAKE        3752.5       215.7   17.394   < 2e-16
The estimate provides the magnitude of the relationship. The standard error is reported (rather than standard deviation) because the coefficient is an estimated value for the real relationship. The t statistic and p-value are the t-test performed to determine if the coefficient (and intercept) are different from 0 (i.e. to determine if the relationship estimated really exists).

4) Residual standard error: 8922 on 41 degrees of freedom
This is the variation of the residuals about the regression line.

5) Multiple R-squared: 0.8807, Adjusted R-squared: 0.8778
The R-squared value is the percentage of variation in y that can be explained by the model. Adjusted R-squared is more conservative, penalizing the use of more independent variables to increase explanatory power. R^2 = 1 - (RSS/TSS) = MSS/TSS

6) F-statistic: 302.6 on 1 and 41 DF, p-value: < 2.2e-16
This tests whether the model as a whole has explanatory power (i.e. whether all the slope coefficients are jointly zero). Not interesting if there's only one coefficient, because it is then redundant with that coefficient's t-test.
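
The same quantities can be pulled straight out of the model object; a minimal sketch using the opslake model and larain data from above (nothing new assumed):

coef(opslake)  # intercept and slope estimates
head(fitted(opslake))  # fitted BSAAM values for the first six observations
RSS <- sum(resid(opslake)^2)  # residual sum of squares
TSS <- sum((larain$BSAAM - mean(larain$BSAAM))^2)  # total sum of squares
1 - RSS/TSS  # recomputes R^2 = 1 - (RSS/TSS); matches the Multiple R-squared from summary()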

Quadratic Regression

To make it quadratic, you have to include the I() function, which tells R to treat the expression inside it arithmetically, so OPSLAKE^2 enters the model as a squared term.
Use anova(model1,model2) to determine if two models are significantly different. If significant, this means the more complex model really is better.

opslake2 <- lm(BSAAM ~ OPSLAKE + I(OPSLAKE^2), data = larain)
summary(opslake2)  # adj. R2 = 0.8755 -- note, this is also an ANOVA
## 
## Call:
## lm(formula = BSAAM ~ OPSLAKE + I(OPSLAKE^2), data = larain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18060  -5452    290   3790  20250 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   23800.2     7089.2    3.36   0.0017 ** 
## OPSLAKE        4220.2      942.3    4.48  6.1e-05 ***
## I(OPSLAKE^2)    -14.0       27.4   -0.51   0.6128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9000 on 40 degrees of freedom
## Multiple R-squared: 0.881,   Adjusted R-squared: 0.876 
## F-statistic:  149 on 2 and 40 DF,  p-value: <2e-16
anova(opslake, opslake2)  # This ANOVA is comparing two models.  So it's a different sort of ANOVA.
## Analysis of Variance Table
## 
## Model 1: BSAAM ~ OPSLAKE
## Model 2: BSAAM ~ OPSLAKE + I(OPSLAKE^2)
##   Res.Df      RSS Df Sum of Sq    F Pr(>F)
## 1     41 3.26e+09                         
## 2     40 3.24e+09  1  21097409 0.26   0.61

regsubsets()

Use regsubsets to select the best regression models (linear or otherwise)

# regsubsets(DEPENDENT ~ ., data = data.frame): using the period after the ~
# means that you're going to try ALL other variables as independent
# variables.
library(HH)  # Need HH loaded.
## Loading required package: lattice
## Loading required package: grid
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: splines
## Loading required package: leaps
## Loading required package: RColorBrewer
## Loading required package: latticeExtra
subsetsmodel <- regsubsets(BSAAM ~ ., data = larain)
summary(subsetsmodel)
## Subset selection object
## Call: regsubsets.formula(BSAAM ~ ., data = larain)
## 7 Variables  (and intercept)
##         Forced in Forced out
## Year        FALSE      FALSE
## APMAM       FALSE      FALSE
## APSAB       FALSE      FALSE
## APSLAKE     FALSE      FALSE
## OPBPC       FALSE      FALSE
## OPRC        FALSE      FALSE
## OPSLAKE     FALSE      FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
##          Year APMAM APSAB APSLAKE OPBPC OPRC OPSLAKE
## 1  ( 1 ) " "  " "   " "   " "     " "   " "  "*"    
## 2  ( 1 ) " "  " "   " "   "*"     " "   " "  "*"    
## 3  ( 1 ) " "  " "   " "   "*"     " "   "*"  "*"    
## 4  ( 1 ) "*"  " "   " "   "*"     " "   "*"  "*"    
## 5  ( 1 ) "*"  " "   "*"   "*"     " "   "*"  "*"    
## 6  ( 1 ) "*"  "*"   "*"   "*"     " "   "*"  "*"    
## 7  ( 1 ) "*"  "*"   "*"   "*"     "*"   "*"  "*"
# The asterisks indicate the variables to use.
coef(subsetsmodel, 1:3)  # See the resulting coefficients for the best 1-, 2-, and 3-variable models.
## [[1]]
## (Intercept)     OPSLAKE 
##       27015        3752 
## 
## [[2]]
## (Intercept)     APSLAKE     OPSLAKE 
##       19145        1769        3690 
## 
## [[3]]
## (Intercept)     APSLAKE        OPRC     OPSLAKE 
##       15425        1712        1797        2390
names(subsetsmodel)  # See the names of variables stored in the regsubsets object.
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"   
##  [6] "first"     "last"      "vorder"    "tol"       "rss"      
## [11] "bound"     "nvmax"     "ress"      "ir"        "nbest"    
## [16] "lopt"      "il"        "ier"       "xnames"    "method"   
## [21] "force.in"  "force.out" "sserr"     "intercept" "lindep"   
## [26] "nullrss"   "nn"        "call"

Exercise 4

Introduction to Poly Shapefiles

Reading a Poly Shapefile

# This is loaded as a spatial polygons data frame (SpatialPolygonsDataFrame),
# which means I can display it as a map.
library(sp)
library(maptools)
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)
USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
plot(USA)

[Plot: map of US counties from USAcopy.shp]

List the pieces of the shapefile using slotNames()

# This can be useful especially if there are a lot of fields for the data.
slotNames(USA)
## [1] "data"        "polygons"    "plotOrder"   "bbox"        "proj4string"

The @data slot

summary(USA)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
##       min    max
## x -124.73 -66.97
## y   24.96  49.37
## Is projected: NA 
## proj4string : [NA]
## Data attributes:
##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 136   51     : 136  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3105   (Other)   :2960   (Other) :2222   (Other):2222  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19048  
##  005    :  48   01005  :   1   Median :  622   Median :29217  
##  009    :  47   01007  :   1   Mean   :  965   Mean   :30699  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46012  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2828   (Other):3105                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :     0   Min.   :      0   Min.   :    0   Min.   :    0  
##  1st Qu.:  2926   1st Qu.:   1778   1st Qu.:19042   1st Qu.:    0  
##  Median :  6357   Median :   4041   Median :29211   Median :   14  
##  Mean   : 19055   Mean   :  17940   Mean   :30656   Mean   :  145  
##  3rd Qu.: 15894   3rd Qu.:  10418   3rd Qu.:46008   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct      Kerry_pct      Nader_pct    
##  Min.   :      0   Min.   : 0.0   Min.   : 0.0   Min.   :0.000  
##  1st Qu.:   4808   1st Qu.:52.7   1st Qu.:30.2   1st Qu.:0.000  
##  Median :  10407   Median :61.2   Median :38.5   Median :0.302  
##  Mean   :  37140   Mean   :60.6   Mean   :38.9   Mean   :0.401  
##  3rd Qu.:  26552   3rd Qu.:69.4   3rd Qu.:46.8   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.8   Max.   :90.0   Max.   :4.467  
##                                                                 
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15466  
##  Median :  65.6   Median : 33.4   Median :12.2   Median :17448  
##  Mean   :  93.0   Mean   : 35.3   Mean   :13.0   Mean   :17788  
##  3rd Qu.: 117.5   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.0   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35850  
##  Median :15.1   Median :11.6   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.87   Mean   : 52015  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.18  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.3   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
## 
# In general, use @data to see just the attribute data.
summary(USA@data)
##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 136   51     : 136  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3105   (Other)   :2960   (Other) :2222   (Other):2222  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19048  
##  005    :  48   01005  :   1   Median :  622   Median :29217  
##  009    :  47   01007  :   1   Mean   :  965   Mean   :30699  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46012  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2828   (Other):3105                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :     0   Min.   :      0   Min.   :    0   Min.   :    0  
##  1st Qu.:  2926   1st Qu.:   1778   1st Qu.:19042   1st Qu.:    0  
##  Median :  6357   Median :   4041   Median :29211   Median :   14  
##  Mean   : 19055   Mean   :  17940   Mean   :30656   Mean   :  145  
##  3rd Qu.: 15894   3rd Qu.:  10418   3rd Qu.:46008   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct      Kerry_pct      Nader_pct    
##  Min.   :      0   Min.   : 0.0   Min.   : 0.0   Min.   :0.000  
##  1st Qu.:   4808   1st Qu.:52.7   1st Qu.:30.2   1st Qu.:0.000  
##  Median :  10407   Median :61.2   Median :38.5   Median :0.302  
##  Mean   :  37140   Mean   :60.6   Mean   :38.9   Mean   :0.401  
##  3rd Qu.:  26552   3rd Qu.:69.4   3rd Qu.:46.8   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.8   Max.   :90.0   Max.   :4.467  
##                                                                 
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15466  
##  Median :  65.6   Median : 33.4   Median :12.2   Median :17448  
##  Mean   :  93.0   Mean   : 35.3   Mean   :13.0   Mean   :17788  
##  3rd Qu.: 117.5   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.0   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35850  
##  Median :15.1   Median :11.6   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.87   Mean   : 52015  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.18  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.3   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
## 
# The only difference for the summary function is that without @data it also
# reports the coordinates (bounding box) and projection.

Color Brewer Palettes

# Color Brewer links with the shapefile to make thematic maps.
display.brewer.all()  # Displays all the colors available

[Plot: all RColorBrewer palettes]

# Use brewer.pal(#,'Type') to create a palette
pal7 <- brewer.pal(7, "Spectral")  # Makes a 7-color spectral palette
display.brewer.pal(7, "Spectral")  # This displays the colors

[Plot: the 7-color Spectral palette]

pal7  # Returns an esoteric list of hex color codes.  Not helpful on its own
## [1] "#D53E4F" "#FC8D59" "#FEE08B" "#FFFFBF" "#E6F598" "#99D594" "#3288BD"

Classifying Data using classIntervals() and findColours()

# SS: Create a column that holds the percent of all votes that went to
# G.W. Bush in 2004.
USA$BushPct <- USA$Bush/USA$Total
# Create the classification intervals using classIntervals(DATA, n=
# #OFCLASSES, style='STYLE') Styles are the basics like
# 'quantile','equal','fixed','sd','jenks', and apparently 'pretty'
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
## Warning: var has missing values, omitted in finding classes
cats7  # Returns the bins for each classification.
## style: quantile
## [0.09308,0.4746)  [0.4746,0.5415)  [0.5415,0.5907)  [0.5907,0.6336) 
##              444              444              444              444 
##  [0.6336,0.6801)  [0.6801,0.7421)  [0.7421,0.9283] 
##              444              444              444
# Link the color palette to the categories using
# findColours(CATEGORIES,PALETTE)
SevenColors <- findColours(cats7, pal7)
# Draw a map using specified data and colors
plot(USA, col = SevenColors)  # The col= argument provides the coloring scheme.

[Plot: choropleth map of Bush vote percentage in 7 quantile classes]

Multiple Regression

Multiple regression is very similar to simple linear regression. The only difference is that you can add independent variables using the plus sign. These show up in the coefficients table from the summary() function and are read just like the coefficient in a simple linear regression.

# Example: This linear model uses the percentage of the county population
# that is urban, the percentage that is 'poor', the percentage of
# households that are female-headed, and the median age in 2000.  Null
# Hypothesis: The model does not explain the variation in percentage vote
# for George Bush better than the mean percentage does.
lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + MEDAGE2000, USA)
summary(lm1)  # The adj. R^2 is 0.2402
## 
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + MEDAGE2000, 
##     data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5688 -0.0732  0.0108  0.0807  0.2666 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.76e-01   2.40e-02   32.33   <2e-16 ***
## pcturban    -6.05e-05   8.35e-05   -0.72    0.469    
## pctfemhh    -1.32e-02   5.10e-04  -25.95   <2e-16 ***
## pctpoor      2.76e-03   3.29e-04    8.38   <2e-16 ***
## MEDAGE2000  -1.10e-03   5.57e-04   -1.98    0.048 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.11 on 3103 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared: 0.241,   Adjusted R-squared: 0.24 
## F-statistic:  247 on 4 and 3103 DF,  p-value: <2e-16

Extracting Values

# Residuals and predicted values can be extracted using resid() and
# predict(), respectively, or they can be recalled using MODEL$residuals
# and MODEL$fitted.values.
lm1.resid <- resid(lm1)  # Extracting Residuals
lm1.predicted <- predict(lm1)  # Extracting Predicted Values

Paneling Graphics: par(mfrow = c( nrow, ncol)) partitions the plot area where nrow is the number of rows and ncol is the number of columns

par(mfrow = c(2, 2))
plot(lm1)

[Plot: lm1 regression diagnostics, 2 x 2 panel]

Lab 1

Transformations

CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/co2_LAB1.csv", 
    header = T)
mauna <- subset(CO2, site == 0)
jolla <- subset(CO2, site == 1)

## The ladder() function shows an array of possible transformations for
## both variables -- the aim being to improve normality.  No need to
## memorize formulae because the orange and green headings tell you what
## to do to achieve a particular distribution.
library(HH)
ladder(co2 ~ time, CO2)

[Plot: ladder() of transformations for co2 ~ time]

Shapiro-Wilk Test

# Use to test for normality.  Null Hypotheses: The distributions of these
# data are normal.  Alternative Hypotheses: The distributions of these
# data are not normal.
shapiro.test(CO2$co2)  # Significance indicates distribution is not normal.
## 
##  Shapiro-Wilk normality test
## 
## data:  CO2$co2 
## W = 0.963, p-value = 1.202e-14
# p-value indicates the probability of having the given test statistic if
# the distribution were normal... one of the few times you dislike
# rejecting the null hypothesis.

Wilcoxon Rank Test

# If the data cannot be assumed normally distributed, use wilcox.test(),
# which acts like a t-test, only with ranked data.  (Outliers no longer have
# a major impact.)  If paired=T, it performs a paired test, where
# corresponding instances (same row #) are compared.  Appropriate, for
# example, when you have one hundred sites tested at time 1 and time 2:
# site A at time one should be compared to site A at time two.
# Null Hypothesis: The average CO2 concentration is the same at each site
# and any variation is due to random chance.  Alternative Hypothesis: The
# average CO2 concentration is different at La Jolla than at Mauna Loa.
wilcox.test(mauna$co2, jolla$co2, paired = T)  # Therefore, there is justification for using one site over the other.  Since Mauna Loa is considered more favorable, only Mauna Loa will be used to answer questions 2 & 3.
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mauna$co2 and jolla$co2 
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0
summary(jolla$co2 - mauna$co2)  # Mean difference of 0.9496 ppm
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -5.84   -0.30    1.92    0.95    2.60    4.81

Adding Lines to a Model Plot
abline

lm1 <- lm(co2 ~ time, mauna)
plot(mauna$co2 ~ mauna$time, main = "Fit for Linear Regression Model", xlab = "Time (years)", 
    ylab = "CO2 concentration (ppm)")
abline(coef(lm1), col = "blue", lwd = 3)

[Plot: "Fit for Linear Regression Model" with abline]

# abline(coef(MODEL)) is good if you have a linear regression.  It looks
# for a slope and an intercept.

curve

mauna$cent <- mauna$time - mean(mauna$time)
lm2 <- lm(co2 ~ cent + I(cent^2), mauna)
plot(co2 ~ cent, mauna, main = "CO2 Concentration Regression Models", xlab = "Centered Years (1969-2007)", 
    ylab = "CO2 Concentration (ppm)", cex = 0.6)
curve(predict(lm2, data.frame(cent = x), type = "resp"), add = TRUE, col = "red", 
    lwd = 3)

[Plot: "CO2 Concentration Regression Models" with fitted quadratic curve]

# curve(predict(MODEL,data.frame(XVARIABLE=x),type='resp'),add=T) looks a
# bit complicated.  Use it if your regression model is nonlinear.  The one
# trick is that you need to define the x variable using the same
# independent variable (and data frame) as when you made the model.

lines

CO2$cent <- CO2$time - mean(CO2$time)
lm3 <- lm(co2 ~ cent + I(cent^2) + site, CO2)
plot(CO2$co2 ~ CO2$cent, pch = CO2$site, cex = 0.6, main = "Fits for Quadratic Split Model", 
    xlab = "Centered Years (1969-2007)", ylab = "CO2 Concentration (ppm)")
lines(predict(lm3) ~ cent, data = CO2, subset = site == 0, col = "red", lwd = 2)
lines(predict(lm3) ~ cent, data = CO2, subset = site == 1, col = "blue", lwd = 2)
# lines(predict(MODEL)~X-VARIABLE, data= X-DATAFRAME, subset =
# DUMMYVARIABLE == ##) is useful when you have a dummy variable and want
# to draw a separate line for each subset.  Like with curve, you have to
# include the x variable from the dataframe you made the model from.  Also
# note you have to use subset = DUMMYVARIABLE == ##, which involves a
# logical argument.
legend("bottomright", c("Mauna Loa", "La Jolla"), lty = c(1, 1), col = c("red", 
    "blue"))

[Plot: "Fits for Quadratic Split Model" with a fitted line for each site]

Exercises 5A & B

Logistic Regression

bd <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/babybirthdata.csv")
# MAGE = mother's age, casegrp: 0 = no defect, 1 = defects

# Logistic Regression is a type of generalized linear model.  Use it when
# the outcome is categorical (regardless of independent variable).  The
# function is glm(DVAR ~ IVARs, family=binomial('logit'))
bdlog <- glm(bd$casegrp ~ bd$MAGE, family = binomial("logit"))
# Use glm() instead of lm() for a generalized linear model.  Use
# family=binomial('logit') to specify a logit link.  Other steps work the
# same as linear regression.

Plotting Logistic Regression – Almost a Lost Cause

plot(casegrp ~ MAGE, data = bd)  # Our normal plots are not helpful b/c we only have two possible outcomes per x value -- can't see the # of instances for each x

[Plot: casegrp vs. MAGE]


plot(jitter(casegrp) ~ MAGE, data = bd)  # jitter adds some randomness to the y values... gives a better sense of spread -- but Seth doesn't like it

[Plot: jittered casegrp vs. MAGE]


plot(bd$MAGE, fitted(glm(casegrp ~ MAGE, binomial, data = bd)), xlab = "Maternal Age", 
    ylab = "P(Birth Defect)", pch = 15)  # Plots the pi hat i (probability of a y=1) on the y-axis; this is actually useful

[Plot: fitted P(Birth Defect) vs. Maternal Age]

Odds Ratio

summary(bdlog)
## 
## Call:
## glm(formula = bd$casegrp ~ bd$MAGE, family = binomial("logit"))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.567  -0.240  -0.147  -0.089   3.610  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.837      0.328    2.56    0.011 *  
## bd$MAGE       -0.199      0.015  -13.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2096.3  on 11750  degrees of freedom
## AIC: 2100
## 
## Number of Fisher Scoring iterations: 8
# We need to turn the estimates into an odds ratio or probability value in
# order to interpret them.  We can get the probability equation using the
# intercept (b0) and the coefficient on the independent variable (b1)

# Calculating an odds ratio: This is a fairly simple operation.  All you
# need is the exp() function and the estimate from logistic regression.
library(MASS)
exp(-0.19872)  # This returns a value less than one, indicating that the odds go down
## [1] 0.8198
# this means that for every 1-year increase in MAGE, the odds of a
# birth defect go down by ~18% (since (OR-1)*100% = % change and OR =
# 0.820).  Note that when OR > 1 there is a positive relationship; when OR <
# 1 there is a negative relationship.  To obtain a confidence interval,
# wrap confint() in exp():
exp(confint(bdlog))  # returns a range of 0.795 to 0.844 of the odds ratio
## Waiting for profiling to be done...
##              2.5 % 97.5 %
## (Intercept) 1.2214 4.4155
## bd$MAGE     0.7955 0.8438
# The default is a 95% confidence interval.
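
The same odds ratio can be pulled straight from the model object instead of re-typing the estimate; a minimal sketch (the coefficient name bd$MAGE comes from the model call above):

exp(coef(bdlog)["bd$MAGE"])  # odds ratio for a 1-year increase in MAGE
(exp(coef(bdlog)["bd$MAGE"]) - 1) * 100  # percent change in the odds (about -18%)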

Logistic Regression & Categorical Independent Variables

bd$magecat3 <- ifelse(bd$MAGE > 25, c(1), c(0))  # Older than 25 (1 is true, 0 is false)
bd$magecat2 <- ifelse(bd$MAGE >= 20 & bd$MAGE <= 25, c(1), c(0))  # Aged 20 to 25
bd$magecat1 <- ifelse(bd$MAGE < 20, c(1), c(0))  # Younger than 20
# Fit a model with new categories:
bdlog2 <- glm(casegrp ~ magecat1 + magecat2, data = bd, binomial)
# Remember: always drop one of the dummy variable cats; otherwise, it
# gives NA as the coeff
summary(bdlog2)
## 
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2, family = binomial, 
##     data = bd)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.378  -0.232  -0.106  -0.106   3.222  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.186      0.167   -31.0  < 2e-16 ***
## magecat1       2.582      0.196    13.2  < 2e-16 ***
## magecat2       1.582      0.195     8.1  5.4e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2113.7  on 11749  degrees of freedom
## AIC: 2120
## 
## Number of Fisher Scoring iterations: 8
# Our odds ratios will be in comparison to anyone older than 25:
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
##                    OR    2.5 %    97.5 %
## (Intercept)  0.005594 0.003956  0.007633
## magecat1    13.228246 9.092820 19.682178
## magecat2     4.863064 3.350905  7.219970
# Tells us that women in magecat1 have about 13 times the odds of a birth
# defect compared to the reference group (over 25).  Tells us that women in
# magecat2 have about 5 times the odds of a birth defect compared to the
# reference group (over 25).

# Calculating Pi i: This involves reading the summary table estimates and
# plugging them into the equation exp(b0 + biXi...)/(1 + exp(b0 + biXi...))
exp(-5.1861 + (2.5824 * (1)) + (1.5817 * (0)))/(1 + exp(-5.1861 + (2.5824 * 
    (1)) + (1.5817 * (0))))  # for under age 20
## [1] 0.0689
# Note that above, I multiplied b1 by x1 and b2 by x2... and for women
# under 20, x1 = magecat1 = 1 and x2 = magecat2 = 0.  Returns 0.0689
exp(-5.1861 + (2.5824 * (0)) + (1.5817 * (1)))/(1 + exp(-5.1861 + (2.5824 * 
    (0)) + (1.5817 * (1))))  # for ages 20 to 25
## [1] 0.02648
# Returns: 0.0265
exp(-5.1861 + 2.5824 * (0) + 1.5817 * (0))/(1 + exp(-5.1861 + 2.5824 * (0) + 
    1.5817 * (0)))  # for over age 25
## [1] 0.005563
# Returns: 0.0055
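
The same probabilities can be obtained without typing the estimates by hand by using predict() with type = "response"; a minimal sketch using the bdlog2 model from above:

predict(bdlog2, newdata = data.frame(magecat1 = c(1, 0, 0), magecat2 = c(0, 1, 0)),
    type = "response")  # P(birth defect) for under 20, 20 to 25, and over 25, respectively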

Significance Testing

# Significance Testing
bd.log <- glm(casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, binomial, 
    data = bd)
# Wald's Test shows us whether a particular independent variable explains
# a part of the variation in the dependent variable (Null Hypothesis = No)
# Deviance Test is like the F-test for normal regression. It tests whether
# the model overall has explanatory value (Null Hypothesis = None)
summary(bd.log)
## 
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, 
##     family = binomial, data = bd)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.510  -0.247  -0.121  -0.091   3.315  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -4.907      0.188  -26.13  < 2e-16 ***
## magecat1       2.257      0.208   10.87  < 2e-16 ***
## magecat2       1.432      0.198    7.25  4.3e-13 ***
## bthparity2    -0.582      0.151   -3.85  0.00012 ***
## smoke          0.676      0.155    4.36  1.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2319.1  on 11739  degrees of freedom
## Residual deviance: 2078.1  on 11735  degrees of freedom
##   (12 observations deleted due to missingness)
## AIC: 2088
## 
## Number of Fisher Scoring iterations: 8
# This changes the odds ratios
exp(cbind(OR = coef(bd.log), CI = confint(bd.log)))
## Waiting for profiling to be done...
##                   OR    2.5 %   97.5 %
## (Intercept) 0.007393 0.005033  0.01052
## magecat1    9.555339 6.419046 14.52035
## magecat2    4.185062 2.869565  6.24008
## bthparity2  0.558605 0.413840  0.74906
## smoke       1.965743 1.440422  2.64938
# Ex. maybe some of the difference b/wn cat 1 and cat 3 was that there were
# more smokers in cat 1

Comparing Logistic Models with ANOVA

anova(bd.log, test = "LRT")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: casegrp
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       11739       2319             
## magecat1    1    134.2     11738       2185  < 2e-16 ***
## magecat2    1     77.1     11737       2108  < 2e-16 ***
## bthparity2  1     12.4     11736       2095  0.00042 ***
## smoke       1     17.2     11735       2078  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This is giving us the bare model and adding one independent variable at a
# time (cumulative) -- the ORDER MATTERS.  The anova() function is smart
# enough to know a logistic regression from a linear regression.
anova(bd.log, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: casegrp
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       11739       2319             
## magecat1    1    134.2     11738       2185  < 2e-16 ***
## magecat2    1     77.1     11737       2108  < 2e-16 ***
## bthparity2  1     12.4     11736       2095  0.00042 ***
## smoke       1     17.2     11735       2078  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This one is the same idea, just a different method.
drop1(bd.log, test = "LRT")
## Single term deletions
## 
## Model:
## casegrp ~ magecat1 + magecat2 + bthparity2 + smoke
##            Df Deviance  AIC   LRT Pr(>Chi)    
## <none>            2078 2088                   
## magecat1    1     2216 2224 137.9  < 2e-16 ***
## magecat2    1     2139 2147  60.7  6.6e-15 ***
## bthparity2  1     2093 2101  15.4  8.9e-05 ***
## smoke       1     2095 2103  17.2  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Can also try this backwards by starting with all independent variables
# and dropping one at a time.

Test for Multicollinearity

# The vif(MODEL) function tests for multicollinearity in a multiple
# regression.  A VIF value around 10 or higher is generally considered
# problematic and merits exploring the removal of a variable.
library(foreign)
Elect04 <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/2004_Election_Counties.dbf")
m8a <- lm(Bush_pct ~ ASIA + AIAN + UNDER18 + ginirev + unemploy + pctcoled + 
    pctfemhh + Nader_pct, data = Elect04)
library(car)
## Loading required package: nnet
## Attaching package: 'car'
## The following object(s) are masked from 'package:HH':
## 
## logit, vif
m8a_vif <- vif(m8a)
summary(m8a_vif)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.13    1.17    1.45    1.42    1.65    1.69
# Mean VIF = 1.418; range is 1.127 to 1.686.  So there isn't much concern
# here about multicollinearity.

Exercise 7: Time Series

Material from Lab 1

CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/co2_LAB1.csv", 
    header = T)
mauna <- subset(CO2, site == 0)
mauna$cent <- mauna$time - mean(mauna$time)
lm2 <- lm(co2 ~ cent + I(cent^2), mauna)

Run Durbin-Watson Test

library(lmtest)
## Loading required package: zoo
## Attaching package: 'zoo'
## The following object(s) are masked from 'package:base':
## 
## as.Date, as.Date.numeric
# Null Hypothesis: rho = 0; there is only random error and no
# autocorrelation
dwtest(lm2)  # DW = 0.3067, p-value < 2.2e-16
## 
##  Durbin-Watson test
## 
## data:  lm2 
## DW = 0.3067, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# This tells us that there is autocorrelation.

Plot Residuals on a Correlogram to Examine the Seasonality of Residuals

acf(lm2$residuals, main = "Correlogram of lm2 Residuals")  # That is some fine seasonality.

[Plot: "Correlogram of lm2 Residuals"]

Create a time series and plot it using decompose

# Create a time series using ts()
lm2.ts <- ts(mauna$co2, start = min(mauna$year), frequency = 12)
# start= indicates the first observation time unit. start=min(mauna$year)
# indicates that we should start in the minimum year (1969).  frequency=
# indicates the number of observations per unit of time. (In this case we
# have 12.)
head(mauna)
##   year month time   co2 site   cent
## 1 1969     2 1969 324.4    0 -19.42
## 2 1969     3 1969 325.6    0 -19.33
## 3 1969     4 1969 326.7    0 -19.25
## 4 1969     5 1969 327.3    0 -19.17
## 5 1969     6 1970 326.8    0 -19.08
## 6 1969     7 1970 325.9    0 -19.00
# Plotting with decompose() separates a time series into trend, seasonal,
# and random parts.
plot(decompose(lm2.ts))

[Plot: decomposition of lm2.ts into observed, trend, seasonal, and random components]

Create a Seasonal Factor and Refit Model with Dummy

# cycle() assigns values for each observation within each time unit (its
# position in the seasonal cycle).
season <- cycle(lm2.ts)  # In this case, we're using months in a time unit of years.
head(season)
## [1] 1 2 3 4 5 6
# Compare to the time() function, which treats observations as fractions
# of the time unit
time <- time(lm2.ts)  # In this case, each month adds 0.08333 to the year
head(time)
## [1] 1969 1969 1969 1969 1969 1969

# Refitting the model with factor(season) as a dummy.
lm3 <- lm(mauna$co2 ~ mauna$cent + I(mauna$cent^2) + factor(season))

Diagnostics on Time Series Model

summary(lm3)  # adj. R^2 = 0.9983
## 
## Call:
## lm(formula = mauna$co2 ~ mauna$cent + I(mauna$cent^2) + factor(season))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8937 -0.5283  0.0676  0.4837  1.7820 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.51e+02   1.21e-01 2901.81  < 2e-16 ***
## mauna$cent        1.55e+00   2.96e-03  522.78  < 2e-16 ***
## I(mauna$cent^2)   9.92e-03   2.94e-04   33.69  < 2e-16 ***
## factor(season)2   8.01e-01   1.63e-01    4.92  1.2e-06 ***
## factor(season)3   1.92e+00   1.63e-01   11.83  < 2e-16 ***
## factor(season)4   2.30e+00   1.63e-01   14.14  < 2e-16 ***
## factor(season)5   1.63e+00   1.63e-01   10.00  < 2e-16 ***
## factor(season)6  -2.12e-02   1.63e-01   -0.13      0.9    
## factor(season)7  -2.18e+00   1.63e-01  -13.38  < 2e-16 ***
## factor(season)8  -3.90e+00   1.63e-01  -23.97  < 2e-16 ***
## factor(season)9  -3.94e+00   1.63e-01  -24.25  < 2e-16 ***
## factor(season)10 -2.79e+00   1.63e-01  -17.16  < 2e-16 ***
## factor(season)11 -1.59e+00   1.63e-01   -9.75  < 2e-16 ***
## factor(season)12 -6.69e-01   1.64e-01   -4.09  5.1e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.718 on 453 degrees of freedom
## Multiple R-squared: 0.998,   Adjusted R-squared: 0.998 
## F-statistic: 2.13e+04 on 13 and 453 DF,  p-value: <2e-16
# Why isn't June significant? I don't know, but the model does fit better.
par(mfrow = c(1, 2))
ts.plot(cbind(lm2.ts, lm3$fitted), lty = 1:2, col = c(1, 3), lwd = c(1, 2), 
    main = "CO2 Concentration Seasonal Quadratic Model", ylab = "CO2 Concentration (ppm)", 
    xlab = "Years")
plot(lm3$residuals ~ lm3$fitted.values, main = "Residuals for Seasonal Quadratic Model", 
    xlab = "Fitted Values (ppm)", ylab = "Residuals")

[Plot: "CO2 Concentration Seasonal Quadratic Model" and "Residuals for Seasonal Quadratic Model"]

Cochrane Orcutt

# The Cochrane-Orcutt Procedure attempts to correct for the
# autocorrelation.  It also returns a value for rho.  Note that to read
# the output, you just type the object name; don't use the summary()
# function.
library(orcutt)
lm4 <- cochrane.orcutt(lm2)
lm4  # Gives a rho of 0.8466136; the new model has an adj. R^2 of 0.9995
## $Cochrane.Orcutt
## 
## Call:
## lm(formula = YB ~ XB - 1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.732 -0.898  0.516  0.884  2.175 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## XB(Intercept) 3.50e+02   5.34e-01  655.21   <2e-16 ***
## XBcent        1.54e+00   3.20e-02   48.28   <2e-16 ***
## XBI(cent^2)   9.77e-03   3.18e-03    3.08   0.0022 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 1.18 on 463 degrees of freedom
## Multiple R-squared:    1,    Adjusted R-squared:    1 
## F-statistic: 3.25e+05 on 3 and 463 DF,  p-value: <2e-16 
## 
## 
## $rho
## [1] 0.8466
## 
## $number.interaction
## [1] 1

Exercise 8A: Spatial Autocorrelation

Load Tools and Data

# Load tools and read in data.
library(maptools)
library(spdep)
## Loading required package: boot
## Attaching package: 'boot'
## The following object(s) are masked from 'package:car':
## 
## logit
## The following object(s) are masked from 'package:HH':
## 
## logit
## The following object(s) are masked from 'package:survival':
## 
## aml
## The following object(s) are masked from 'package:lattice':
## 
## melanoma
## Loading required package: Matrix
## Loading required package: nlme
## Loading required package: deldir
## deldir 0.0-21
## Loading required package: coda
## Attaching package: 'coda'
## The following object(s) are masked from 'package:HH':
## 
## acfplot
library(rgdal)
## rgdal: version: 0.8-5, (SVN revision 449) Geospatial Data Abstraction
## Library extensions to R successfully loaded Loaded GDAL runtime: GDAL
## 1.9.0, released 2011/12/29 Path to GDAL shared files:
## /Library/Frameworks/R.framework/Versions/2.15/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480] Path to
## PROJ.4 shared files:
## /Library/Frameworks/R.framework/Versions/2.15/Resources/library/rgdal/proj

sids <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/sids2/sids2.shp")
class(sids)  # Checking the file type
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"

Assigning and Changing Projection Data

# Assignment of a projection using a CRS code:
# http://www.spatialreference.org/ref/
proj4string(sids) <- CRS("+proj=longlat +ellps=WGS84")

# Transforming (reprojecting) to other coordinate systems using EPSG/ESRI codes:
sids_Albers <- spTransform(sids, CRS("+init=epsg:2163"))  # Albers Equal Area
sids_SP <- spTransform(sids, CRS("+init=ESRI:102719"))  # NC State Plane

par(mfrow = c(3, 1))
plot(sids, axes = T)
title("WGS84")
plot(sids_SP, axes = T)
title("NC State Plane (feet)")
plot(sids_Albers, axes = T)
title("Albers Equal Area Conic USA\n(seems wrong)")

[Plot: county maps in WGS84, NC State Plane, and Albers Equal Area projections]

1) Identify Neighbors

Contiguity based neighbors

# Find neighbors using Queen's or Rook's case.
library(spdep)
sids_nbq <- poly2nb(sids)  # Queen's Case
sids_nbr <- poly2nb(sids, queen = FALSE)  # Rook's Case

# Extract the coordinates for the locations:
coords <- coordinates(sids)
dev.off()  #clears the screen and the panels
## null device 
##           1
par(mfrow = c(2, 1))
# Plot the links for neighbors:
plot(sids_nbq, coords)  # Queen's case -- selects all locations that share an edge or a point with the focus location.
plot(sids, add = T)  #Add the location outlines
plot(sids_nbr, coords)  # Rook's case -- selects all locations that share an edge with the focus location.
plot(sids, add = T)  #Add the location outlines

Plot: neighbor links for the Queen's case (top) and Rook's case (bottom) overlaid on the county outlines.
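
To see how much the two definitions differ without plotting (a small added check, not in the original), summary() on an nb object reports the link counts.

summary(sids_nbq)  # Queen's case: includes corner-touching neighbors
summary(sids_nbr)  # Rook's case: edge-sharing neighbors only, so slightly fewer links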

Distance-based neighbors (k nearest neighbors)

# Set coords using state plane projection:
coords <- coordinates(sids_SP)
# use knn2nb(knearneigh(COORDS, k=##)) to identify nearest neighbors based
# on the coordinates. k= determines the number of neighbors to select for
# each location.  k-nearest neighbor identifies the k nearest neighbors
# (by centroid), irrespective of the actual distance.  Only the relative
# distance matters.
sids_kn1 <- knn2nb(knearneigh(coords, k = 1))
sids_kn2 <- knn2nb(knearneigh(coords, k = 2))
sids_kn4 <- knn2nb(knearneigh(coords, k = 4))

# Plotting:
dev.off()  # Closes the current plot device, which also resets the panel layout
## null device 
##           1
plot(sids_SP)
plot(sids_kn2, coords, add = T)  # Adds 2 links for each location.

Distance-based neighbors (specified distance)

# Distance-based will select all centroids within a certain distance of
# the focus location.  This can result in 0 neighbors.  Use the k1-nearest
# neighbor as a starting point if you want each location to have at least
# 1 neighbor.  Use unlist(nbdists(K-NEAREST, COORDS))
dist <- unlist(nbdists(sids_kn1, coords))  # Using the State Plane version so that the distances are easier to interpret
summary(dist)  # The summary gives a good sense of what might be an appropriate threshold. The maximum distance of a k1 neighbor is the highest you can make the minimum threshold without having a location with 0 neighbors.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   40100   89800   97600   96300  107000  135000
max_k1 <- max(dist)  # Store the max dist as an object.

# Now you can set various thresholds using dnearneigh(COORDS, d1=#,
# d2=#*MAX_K1).  d1 is usually 0.  For d2, if you multiply the k1 max by a
# number under 1, you will end up with some locations that have 0 neighbors.
sids_kd1 <- dnearneigh(coords, d1 = 0, d2 = 0.75 * max_k1)
sids_kd2 <- dnearneigh(coords, d1 = 0, d2 = 1 * max_k1)
sids_kd3 <- dnearneigh(coords, d1 = 0, d2 = 1.5 * max_k1)
# This can also be done by raw distance:
sids_ran1 <- dnearneigh(coords, d1 = 0, d2 = 134600)
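
As a sanity check on each threshold (added here, not in the original exercise), card() gives the neighbor count per region, so you can count how many regions end up with 0 neighbors.

sum(card(sids_kd1) == 0)  # Regions with no neighbors at 0.75 * max_k1 (may be > 0)
sum(card(sids_kd2) == 0)  # Should be 0 at the full k1 maximum distance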

2) Assigning Weights to the Chosen Neighbors

Row Standardization

# Not all neighbors are equal. Some should be weighted more than others.

# For instance, if location 1 has 3 neighbors and location 2 has 5
# neighbors, the neighbors for location 1 need to be weighted differently
# than those for location 2.  This is resolved by row standardization,
# which assigns the appropriate weight depending on how many neighbors a
# location has.

# The function nb2listw(NEIGHBORS_LIST) performs row standardization.
sids_nbq_w <- nb2listw(sids_nbq)  # Example with Queen's case neighbors
sids_nbq_w
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 490 
## Percentage nonzero weights: 4.9 
## Average number of links: 4.9 
## 
## Weights style: W 
## Weights constants summary:
##     n    nn  S0    S1    S2
## W 100 10000 100 44.65 410.5
sids_nbq_w$neighbours[1:3]  # The neighbours section of the weights object gives you the id #s of all a location's neighbors.
## [[1]]
## [1]  2 18 19
## 
## [[2]]
## [1]  1  3 18
## 
## [[3]]
## [1]  2 10 18 23 25
sids_nbq_w$weights[1:3]  # The weights section of the weights object gives you the decimal weight for each neighbor (the weight is always 1/x, where x is the number of neighbors).
## [[1]]
## [1] 0.3333 0.3333 0.3333
## 
## [[2]]
## [1] 0.3333 0.3333 0.3333
## 
## [[3]]
## [1] 0.2 0.2 0.2 0.2 0.2

Binary Weights

# Since row standardization gives more weight to the neighbors of locations
# with few neighbors, you may prefer binary weights when looking for global
# characterizations. Binary weighting gives locations with many neighbors
# relatively more influence than row standardization does.
sids_nbq_wb <- nb2listw(sids_nbq, style = "B")
sids_nbq_wb
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 490 
## Percentage nonzero weights: 4.9 
## Average number of links: 4.9 
## 
## Weights style: B 
## Weights constants summary:
##     n    nn  S0  S1    S2
## B 100 10000 490 980 10696
sids_nbq_wb$weights[1:3]  # Notice how all the neighbors here are given equal weight!
## [[1]]
## [1] 1 1 1
## 
## [[2]]
## [1] 1 1 1
## 
## [[3]]
## [1] 1 1 1 1 1

Regions with No Neighbors

# Islands do not have neighbors in a queen's or rook's case. Use k nearest
# neighbor to rectify, or use 'zero.policy=T' as an argument when
# assigning weights:
sids_nbq_w <- nb2listw(sids_nbq, zero.policy = T)
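
A quick way to find any islands (an added check, not in the original notes): look for regions whose neighbor count is 0.

which(card(sids_nbq) == 0)  # Row indices of regions with no queen's-case neighbors; integer(0) if none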

IDW

# Inverse distance weighting bases weights on the distance between
# centroids.
dists <- nbdists(sids_nbq, coordinates(sids_SP))
idw <- lapply(dists, function(x) 1/(x/1000))  # The function can be whatever you want; so you could make more distant neighbors worth more! Although that is usually ridiculous...
sids_nbq_idwb <- nb2listw(sids_nbq, glist = idw, style = "B")
summary(unlist(sids_nbq_idwb$weights))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00412 0.00627 0.00764 0.00804 0.00927 0.02490
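
The same inverse-distance weights can also be row standardized rather than kept as raw values; a hedged variant (not part of the original exercise):

sids_nbq_idww <- nb2listw(sids_nbq, glist = idw, style = "W")  # Row-standardized IDW weights
summary(unlist(sids_nbq_idww$weights))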

3) Examine Spatial Autocorrelation

Moran's I

# To test spatial dependence, you need: (1) a variable of interest, and (2)
# a lagged version of that variable (based on the weights determined in
# step 2).
sids$NWBIR79  #births to non-white mothers
##   [1]    19    12   260   145  1197  1237   139   371   844   176   597
##  [12]  1369  1058   650  1492  2980   933   222    33   310   491     5
##  [23]    76   950  5031  7089  1397  1161  1086  4948  2274     4  2696
##  [34]   360    10  1033  6221     3  1305   177   150   941   407   651
##  [45]   141   128   483   687  2330  1504  3059   914  1206  1349    62
##  [56]    73  1163   406   664   905   576  3073  1453  1729   350   307
##  [67]  1151 11631  1203   588   528   264    45  2047   104  2194    79
##  [78]    22  1524   277    42 10614   305  1348  1161  1172   169  1227
##  [89]  1218     5  2342  1436  3568  6899   487  1023   763  1832  2100
## [100]   841
sids$NWBIR79_lag <- lag(sids_nbq_w, sids$NWBIR79)  #lag using row standardized weights
lm1 <- lm(sids$NWBIR79_lag ~ sids$NWBIR79)
# The regression coefficient IS the Moran's I statistic.
summary(lm1)
## 
## Call:
## lm(formula = sids$NWBIR79_lag ~ sids$NWBIR79)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -1667   -803   -266    644   3036 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.21e+03   1.26e+02    9.59  9.2e-16 ***
## sids$NWBIR79 1.50e-01   5.27e-02    2.84   0.0055 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 1040 on 98 degrees of freedom
## Multiple R-squared: 0.0761,  Adjusted R-squared: 0.0667 
## F-statistic: 8.08 on 1 and 98 DF,  p-value: 0.00546
plot(y = sids$NWBIR79_lag, x = sids$NWBIR79, ylab = "Births to non-white mothers lag", 
    xlab = "Births to non-white mothers")
title("The relationship between a variable \n and the lag of itself is Moran’s I")
## Warning: conversion failure on ' and the lag of itself is Moran’s I' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning: conversion failure on ' and the lag of itself is Moran’s I' in
## 'mbcsToSbcs': dot substituted for <80>
## Warning: conversion failure on ' and the lag of itself is Moran’s I' in
## 'mbcsToSbcs': dot substituted for <99>

Plot: births to non-white mothers (NWBIR79) against its spatial lag; the slope of the fitted line is Moran's I.


## Alternate way of getting Moran's I: find the value using
## moran.test(VARIABLE, listw=WEIGHTSLIST, zero.policy=T).  VARIABLE is the
## variable you wish to test for autocorrelation, listw= is the weights list
## object, and zero.policy=T protects against islands (units without any
## neighbors).
moran.test(sids$NWBIR79, listw = sids_nbq_w, zero.policy = T)
## 
##  Moran's I test under randomisation
## 
## data:  sids$NWBIR79  
## weights: sids_nbq_w  
##  
## Moran I statistic standard deviate = 2.613, p-value = 0.004487
## alternative hypothesis: greater 
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.149686         -0.010101          0.003739
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.
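
As a check (an addition, not in the original notes), the slope from lm1 should reproduce the Moran's I estimate reported by moran.test() above.

coef(lm1)[2]  # ~0.150, essentially the Moran I statistic estimate (0.1497)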

Exercise 8B - Testing Moran's I and Local Moran's I

Load Data

# Load libraries

library(spdep)
library(classInt)
library(gstat)

# Load the soco dataset with the load() function; it automatically creates
# an object called soco.
load("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/soco.rda")

Moran's I - Global

# Find neighbors for each location and save them to an object.
soco_nbq <- poly2nb(soco)  # Queen’s neighborhood method
# Create the weight matrix for the neighbors
soco_nbq_w <- nb2listw(soco_nbq)

# Moran's I Test
moran.test(soco$PERPOV, listw = soco_nbq_w, zero.policy = T)
## 
##  Moran's I test under randomisation
## 
## data:  soco$PERPOV  
## weights: soco_nbq_w  
##  
## Moran I statistic standard deviate = 26.96, p-value < 2.2e-16
## alternative hypothesis: greater 
## sample estimates:
## Moran I statistic       Expectation          Variance 
##         0.4346902        -0.0007215         0.0002609
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.

Testing Significance of Moran's I Statistic

# Simulate the Moran's I statistic using moran.mc().  listw= is the weight
# matrix object; nsim= is the number of simulations to include.  Always make
# it 99, 999, 9999, etc., because the real-world Moran's I is added to the
# list as well; we want a round total in the end so the p-value is intuitive.
socoMoranSim <- moran.mc(soco$PERPOV, listw = soco_nbq_w, nsim = 999)
socoMoranSim
## 
##  Monte-Carlo simulation of Moran's I
## 
## data:  soco$PERPOV 
## weights: soco_nbq_w  
## number of simulations + 1: 1000 
##  
## statistic = 0.4347, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
names(socoMoranSim)  # socoMoranSim$res is the Moran's I result
## [1] "statistic"   "parameter"   "p.value"     "alternative" "method"     
## [6] "data.name"   "res"

# Plot the Moran's I Simulation Results
hist(socoMoranSim$res, breaks = 50)

Plot: histogram of the simulated Moran's I values (50 breaks).
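
Two small additions (not in the original notes): mark the observed statistic on the histogram, and recover the pseudo p-value by hand from the simulation results (the res vector holds the simulated values plus the observed one).

abline(v = socoMoranSim$statistic, col = "red", lwd = 2)  # Observed I sits far in the right tail
sum(socoMoranSim$res >= socoMoranSim$statistic) / length(socoMoranSim$res)  # 1/1000 = 0.001 here, matching the reported p-value (assuming no ties)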


# Moran's I Plot Quadrants.  Use moran.plot(VARIABLE, WEIGHTSLIST,
# labels=as.character(LABELS)).  This plot will automatically create labels,
# so you might as well assign something intuitive.
socoMoranPlot <- moran.plot(soco$PPOV, soco_nbq_w, labels = as.character(soco$CNTY_ST), 
    xlab = "Percent Poverty", ylab = "Lag of Percent Poverty")

Plot: Moran scatterplot of percent poverty against its lag, labeled by county.

Local Moran's I

# Calculate the local Moran's I statistic using localmoran(DATA, WEIGHTS)
# where WEIGHTS is the list of weighted neighbors.
soco_locm <- localmoran(soco$PPOV, soco_nbq_w)
summary(soco_locm)
##        Ii              E.Ii               Var.Ii            Z.Ii       
##  Min.   :-1.778   Min.   :-0.000722   Min.   :0.0901   Min.   :-3.983  
##  1st Qu.: 0.012   1st Qu.:-0.000722   1st Qu.:0.1420   1st Qu.: 0.029  
##  Median : 0.219   Median :-0.000722   Median :0.1658   Median : 0.517  
##  Mean   : 0.589   Mean   :-0.000722   Mean   :0.1861   Mean   : 1.408  
##  3rd Qu.: 0.746   3rd Qu.:-0.000722   3rd Qu.:0.1990   3rd Qu.: 1.761  
##  Max.   : 9.335   Max.   :-0.000722   Max.   :0.9981   Max.   :18.977  
##    Pr(z > 0)     
##  Min.   :0.0000  
##  1st Qu.:0.0391  
##  Median :0.3025  
##  Mean   :0.2900  
##  3rd Qu.:0.4886  
##  Max.   :1.0000
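
The fifth column, Pr(z > 0), holds the local p-values used below to flag significant locations; a small added look (not in the original):

head(soco_locm[, 5])  # These are the values referenced later as soco_locm[, 5]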

## Manually make a Moran plot.  Standardize the variable using the scale()
## function and save it to a new column.
soco$sPPOV <- scale(soco$PPOV)
# Create a lagged variable using the lag.listw(WEIGHTS, SCALED_DATA)
# function.  Note: with row-standardized weights, the 'lag' variable is the
# weighted average of the variable across each location's neighbors.
soco$lag_sPPOV <- lag.listw(soco_nbq_w, soco$sPPOV)
summary(soco$sPPOV)
##        V1        
##  Min.   :-2.151  
##  1st Qu.:-0.700  
##  Median :-0.123  
##  Mean   : 0.000  
##  3rd Qu.: 0.591  
##  Max.   : 4.062
summary(soco$lag_sPPOV)
##        V1         
##  Min.   :-1.9925  
##  1st Qu.:-0.5298  
##  Median :-0.0762  
##  Mean   : 0.0085  
##  3rd Qu.: 0.4514  
##  Max.   : 2.6478
# Plot the lag v. the scaled variable.
plot(x = soco$sPPOV, y = soco$lag_sPPOV, main = "Moran Scatterplot PPOV", xlab = "Scaled % Poverty", 
    ylab = "Scaled Lag of % Poverty")
abline(h = 0, v = 0)  # Divides the plot into quadrants
abline(lm(soco$lag_sPPOV ~ soco$sPPOV), lty = 3, lwd = 4, col = "red")  # linear model used to find line of best fit.

# Investigate outliers by clicking on one or two and then hitting escape
# (or click finish)
identify(soco$sPPOV, soco$lag_sPPOV, soco$CNTY_ST, cex = 0.8)

Plot: Moran scatterplot of scaled % poverty against its scaled lag, split into quadrants with a best-fit line.

## integer(0)
## Quadrant I: high poverty counties surrounded by high poverty counties.
## Quadrant II: low poverty counties surrounded by high poverty counties.
## Quadrant III: low poverty counties surrounded by low poverty counties.
## Quadrant IV: high poverty counties surrounded by low poverty counties.

# Identify the Moran plot quadrant for each observation
soco$quad_sig <- NA  # Start by creating a new column in the original data frame.
# Assign the appropriate quadrant based on whether the scaled and lag
# values are positive or negative. If not significant, assign a 5.
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV >= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 1
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV <= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 2
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 3
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV >= 0) & (soco_locm[, 5] <= 0.05), 
    "quad_sig"] <- 4
soco@data[(soco_locm[, 5] > 0.05), "quad_sig"] <- 5
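
A quick tabulation (an added check, not in the original notes) shows how many counties fall into each class before mapping.

table(soco$quad_sig)  # Counts per class: 1 = High-High, 2 = Low-Low, 3 = High-Low, 4 = Low-High, 5 = not significant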

Plotting Results

# Presenting the results (approach from Paul Voss).  Set the breaks for the
# thematic map classes.
breaks <- seq(1, 5, 1)
# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Signif.")
# see ?findInterval - This is necessary for making a map
np <- findInterval(soco$quad_sig, breaks)
# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
plot(soco, col = colors[np])  #colors[np] manually sets the color for each county
mtext("Local Moran's I\n% Poverty in Southern Counties", cex = 1.5, side = 3, 
    line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")

Plot: map of the Local Moran's I classes for % poverty in southern counties, with legend.

# Correlogram
cor10 <- sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "I")
cor10  # Notice how the values for Moran's I go from significantly positive to significantly negative, crossing 0 around a lag of 8.
## Spatial correlogram for soco$PPOV 
## method: Moran's I
##            estimate expectation  variance standard deviate Pr(I) two sided
## 1 (1387)   5.89e-01   -7.22e-04  2.61e-04            36.54         < 2e-16
## 2 (1387)   4.58e-01   -7.22e-04  1.30e-04            40.20         < 2e-16
## 3 (1387)   3.34e-01   -7.22e-04  8.85e-05            35.60         < 2e-16
## 4 (1387)   2.40e-01   -7.22e-04  7.03e-05            28.69         < 2e-16
## 5 (1387)   1.48e-01   -7.22e-04  5.90e-05            19.39         < 2e-16
## 6 (1387)   8.35e-02   -7.22e-04  5.03e-05            11.88         < 2e-16
## 7 (1387)   3.09e-02   -7.22e-04  4.33e-05             4.81         1.5e-06
## 8 (1387)  -1.46e-02   -7.22e-04  3.83e-05            -2.24           0.025
## 9 (1387)  -5.41e-02   -7.22e-04  3.47e-05            -9.06         < 2e-16
## 10 (1387) -7.38e-02   -7.22e-04  3.23e-05           -12.85         < 2e-16
##              
## 1 (1387)  ***
## 2 (1387)  ***
## 3 (1387)  ***
## 4 (1387)  ***
## 5 (1387)  ***
## 6 (1387)  ***
## 7 (1387)  ***
## 8 (1387)  *  
## 9 (1387)  ***
## 10 (1387) ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cor10, main = "Correlogram of % Poverty")

Plot: correlogram of Moran's I for % poverty across lags 1 through 10.