Help Function: help() – opens help information in the Help tab for the indicated function
help(log)
Combine Function: c() – creates a vector object from the list
x <- c(1, 2, 4, 8, 16)
x
## [1] 1 2 4 8 16
R Arithmetic: +, -, *, /
R Arithmetic Functions: sqrt(),log(),exp()
methusela <- 1 + 3
exp(methusela)
## [1] 54.6
prometheus <- x * 6
sqrt(prometheus - 1)
## [1] 2.236 3.317 4.796 6.856 9.747
loga <- log(2.718)
logb <- log(10)
logc <- log(10, 2.718)
logd <- log(2.718, 10)
logtest <- c(loga, logb, logc, logd)
logtest # log() is a natural log function unless specified as log(x,base = y); shorthand is log(x,y) (see above)
## [1] 0.9999 2.3026 2.3028 0.4342
R Logic: & is and, | is or, == is equals, != is not equals, also can use <, >, <=, >=
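A quick sketch applying these operators to the vector x from above (expected results shown as comments):
x > 4 # FALSE FALSE FALSE TRUE TRUE
x > 2 & x < 16 # FALSE FALSE TRUE TRUE FALSE
x == 8 | x == 1 # TRUE FALSE FALSE TRUE FALSE
x != 1 # FALSE TRUE TRUE TRUE TRUE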
Importing csv files:
CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/CO2.csv",
header = TRUE) #Normal method of importing a csv file with headers
# CO2 <- read.csv(file.choose(), header = T) # Cheater's method; gives a
# pop-up window; you select the file directory-style
Reading a data.frame:
summary(CO2) # To get a summary of data in the imported table
## Plant Spec Type Treatment
## Min. : 1.0 Mc1 : 7 Mississippi:42 chilled :42
## 1st Qu.:21.8 Mc2 : 7 Quebec :42 nonchilled:42
## Median :42.5 Mc3 : 7
## Mean :42.5 Mn1 : 7
## 3rd Qu.:63.2 Mn2 : 7
## Max. :84.0 Mn3 : 7
## (Other):42
## conc uptake
## Min. : 95 Min. : 7.7
## 1st Qu.: 175 1st Qu.:17.9
## Median : 350 Median :28.3
## Mean : 435 Mean :27.2
## 3rd Qu.: 675 3rd Qu.:37.1
## Max. :1000 Max. :45.5
##
head(CO2) # To see the first six rows & headers
## Plant Spec Type Treatment conc uptake
## 1 1 Qn1 Quebec nonchilled 95 16.0
## 2 2 Qn1 Quebec nonchilled 175 30.4
## 3 3 Qn1 Quebec nonchilled 250 34.8
## 4 4 Qn1 Quebec nonchilled 350 37.2
## 5 5 Qn1 Quebec nonchilled 500 35.3
## 6 6 Qn1 Quebec nonchilled 675 39.2
names(CO2) # To get header names
## [1] "Plant" "Spec" "Type" "Treatment" "conc" "uptake"
Selecting Parts of a Data Frame:
# use data.frame.name$column.name:
CO2$conc
## [1] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [15] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [29] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [43] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [57] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [71] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
# use bracket notation where data.frame[row criteria , column criteria]
# determine what you print
CO2[, "conc"] # Note how the blank space before the comma means all rows
## [1] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [15] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [29] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [43] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [57] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [71] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
CO2[, 5] # Can also call out the column or row by number
## [1] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [15] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [29] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [43] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [57] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
## [71] 95 175 250 350 500 675 1000 95 175 250 350 500 675 1000
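# A quick illustrative sketch: row and column criteria can be combined in
# one bracket call (the row numbers and threshold here are arbitrary)
CO2[1:3, c("conc", "uptake")] # First three rows, two named columns
CO2[CO2$conc > 500, "uptake"] # Uptake values for rows where conc exceeds 500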
CO2.Q <- CO2[CO2$Type == "Quebec", ] # creates new dataset only including rows where Type is Quebec; remember to keep the comma to select all rows!
CO2.Q <- subset(CO2, Type == "Quebec") # Can also use the subset function to do the same thing
Creating New Variables
# Creating a new variable by assigning values based on logical expressions
# (think reclassify in ArcGIS)
CO2$concat[CO2$conc <= 200] <- 1
CO2$concat[CO2$conc > 200 & CO2$conc <= 400] <- 2
CO2$concat[CO2$conc > 400 & CO2$conc <= 600] <- 3
CO2$concat[CO2$conc > 600] <- 4
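# An equivalent sketch using cut(); the column name concat2 is just
# illustrative, and note cut() returns a factor rather than a numeric code
CO2$concat2 <- cut(CO2$conc, breaks = c(0, 200, 400, 600, Inf), labels = 1:4)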
# Or create a new variable by arithmetic
CO2$conc1000 <- CO2$conc * 1000
head(CO2)
## Plant Spec Type Treatment conc uptake concat conc1000
## 1 1 Qn1 Quebec nonchilled 95 16.0 1 95000
## 2 2 Qn1 Quebec nonchilled 175 30.4 1 175000
## 3 3 Qn1 Quebec nonchilled 250 34.8 2 250000
## 4 4 Qn1 Quebec nonchilled 350 37.2 2 350000
## 5 5 Qn1 Quebec nonchilled 500 35.3 3 500000
## 6 6 Qn1 Quebec nonchilled 675 39.2 4 675000
Histograms: hist() – Useful Arguments:
# breaks= (if a number) suggests roughly how many bins to use (R treats it
# as a suggestion only); breaks= (if a vector) gives the exact breakpoints
# between bins, e.g. breaks = seq(x, y, by = z) where x = min, y = max, and
# z = bin size
# col= gives the fill color of the bars, border= gives the color of the
# border around the bars, main= 'title', ylab= 'y label', xlab= 'x label'
hist(CO2$uptake, col = "blue", xlab = "Rate (umol/m^2 sec)", main = "CO2 Uptake Rates",
border = "black", breaks = seq(0, 50, by = 2))
Box Plots: boxplot() – Useful Arguments:
# col= color of box, main= 'title', ylab= 'y label', xlab= 'x label'
# Use the ~ symbol to say 'with respect to' (compare by categories):
boxplot(CO2$uptake ~ CO2$Type, main = "CO2 Uptake Rates", ylab = "Rate (umol/m^2 sec)",
col = rainbow(2))
Scatter Plots: plot(,) – Useful Arguments:
# If you only give one variable, the other will be an index
# pch= number that determines point type
# xlim=c(x1,x2) and ylim=c(y1,y2) provide ranges for x and y
# col= color of points, main= 'title', ylab= 'y labels', xlab= 'x labels'
plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate",
pch = 4, col = "red")
Line Graphs: plot() with the added argument type = "l". Add a legend using legend(); note its several arguments
# ylim=c(y1,y2) to set range for y-axis xlim=c(x1,x2) to set range for
# x-axis main= 'title' ylab= 'y labels' xlab= 'x labels'
thames <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/thames.csv",
sep = ",", header = T)
t1 <- 1:nrow(thames) # Creates a vector of integers from 1 to the number of rows in thames
plot(t1, thames$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Feildes",
type = "l", ylim = c(-1, 100))
# Add another line to the same graph using lines()
# lty= determines dotted/dashed/solid
lines(t1, thames$Redbridge, lty = 2, col = "red")
# legend(position, names, line types, colors)
legend("topright", c("Feildes", "Redbridge"), lty = c(1, 2), col = c("black",
"red"))
# Use dev.off() to clear the screen and the panels
Basics: mean(), sum(), and length()
rain <- c(16, 18, 14, 22, 27, 17, 19, 17, 17, 22, 20, 22)
mean(rain) # Returns the average of data set
## [1] 19.25
sum(rain) # Returns the total of data set
## [1] 231
length(rain) # Returns the length of the list
## [1] 12
Function for a Population Standard Deviation stdevp()
stdevp <- function(someData) {
return(sqrt(mean((someData - mean(someData))^2)))
}
stdevp(rain)
## [1] 3.394
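# For comparison, a minimal sketch: the built-in sd() gives the SAMPLE
# standard deviation (n - 1 denominator); the two are related by a factor
sd(rain) # Sample standard deviation
sd(rain) * sqrt((length(rain) - 1)/length(rain)) # Should equal stdevp(rain)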
Importing a dbf file:
library("foreign", lib.loc = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library")
MN <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/mnmappluto.dbf")
MN <- MN[MN$YCoord > 0 & MN$XCoord > 0, ]
Reading a table: dim() & grep()
# SS: dim() returns the number of rows and columns in a table
dim(MN)
## [1] 43318 78
# SS: grep() is a text-searching function; it returns the row numbers where
# the text was found. AC: Arguments include the text to search for, the
# data, and ignore.case=TRUE to make the search NOT case sensitive
CrawfordBldgs <- grep("crawford", MN$OwnerName, ignore.case = TRUE)
MN[CrawfordBldgs, c("Address", "AssessTot", "OwnerName")]
## Address AssessTot OwnerName
## 19634 214 WEST 122 STREET 15772 CRAWFORD STACY
## 31738 320 EAST 51 STREET 101760 J H CRAWFORD
## 40468 531 WEST 150 STREET 35591 CRAWFORD, JOANN
## 41676 118 2 AVENUE 913500 CRAWFORD REALTY CO.
Selecting NA Values: is.na()
# Returns TRUE for NA values.
is.na(MN[1:100, "HistDist"])
## [1] FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [100] TRUE
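# A quick counting sketch: wrap is.na() in sum() or mean() to tally NAs
sum(is.na(MN$HistDist)) # Number of lots not in a historic district
mean(is.na(MN$HistDist)) # Proportion of lots not in a historic district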
IF/ELSE: ifelse(logicexpression,ifTRUE,ifFALSE)
# SS: The function ifelse() takes three arguments: a logical expression, a
# value for true, and a value for false. If the logical expression
# evaluates to TRUE, the first value is used; if not, the second value is
# used.
MN$HD <- ifelse(is.na(MN[, "HistDist"]), 0, 1)
Converting to a Factor
# AC: We convert MN$HD to a factor because it is a dummy variable for
# categories, not a ratio level variable
MN$HD <- as.factor(MN$HD)
inHD <- MN[MN$HD == 1, ]
outHD <- MN[MN$HD == 0, ]
The %in% Operator
# AC: the %in% operator selects all rows where the block column contains
# values in our list of blocks.
blocks <- inHD$Block
HDB <- MN[MN$Block %in% blocks, ]
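# A toy sketch of %in% on simple vectors (the values are just illustrative):
c("a", "b", "c") %in% c("b", "d") # FALSE TRUE FALSE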
T Tests
# Arguments for t.test():
# x= the data being tested
# y= the comparison data (for two-sample t-tests)
# mu= a fixed value (for one-sample t-tests)
# alternative= can be 'two.sided', 'less', or 'greater' depending on the
# hypothesis test. Default = 'two.sided'
# conf.level= confidence level of the interval. Default = 0.95
# var.equal= TRUE or FALSE; depends on whether the two variances are
# treated as equal. Default = FALSE (Welch's t-test)
# AC: Two-sample t-test (two-sided, variances not assumed equal) comparing
# assessment values for buildings in and out of an historic district.
# Null Hypothesis: The buildings in a historic district have the same value
# as those outside of a historic district, and any difference between the
# two groups is due to random chance.
t.test(x = inHD$AssessTot, y = outHD$AssessTot) #Hypothesis Test 1
##
## Welch Two Sample t-test
##
## data: inHD$AssessTot and outHD$AssessTot
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1724546 -1327117
## sample estimates:
## mean of x mean of y
## 1233743 2759574
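# A one-sample sketch using mu= (reusing the rain vector from above; the
# hypothesized mean of 20 is just illustrative):
t.test(rain, mu = 20, alternative = "two.sided")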
Correlation
# AC: cor(x,y) finds the correlation coefficient between two data columns
# (x and y); note this is not the slope of the best-fit line unless both
# variables are standardized
cor(MN$BldgArea, MN$AssessTot)
## [1] 0.4225
# AC: Types of correlation: Pearson correlation coefficient is used to
# summarize the relationship between two continuous variables. Kendall's
# τ (“tau”) and Spearman's rank correlation coefficient compute the
# association between two variables by sorting each variable and
# assigning each observation a rank.
cor(MN$BldgArea, MN$AssessTot, method = "pearson") # Pearson is the default; continuous
## [1] 0.4225
cor(MN$BldgArea, MN$AssessTot, method = "kendall") # Ranked
## [1] 0.61
cor(MN$BldgArea, MN$AssessTot, method = "spearman") # Ranked
## [1] 0.78
Correlation Hypothesis Testing
# SS: cor.test() performs a hypothesis test on an observed correlation; it
# tests the null hypothesis that the true correlation is zero. SS: it is
# more likely to return a significant result when the sample size is larger
# or when the correlation is larger
cor.test(MN$BldgArea, MN$AssessTot)
##
## Pearson's product-moment correlation
##
## data: MN$BldgArea and MN$AssessTot
## t = 97.02, df = 43316, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4148 0.4302
## sample estimates:
## cor
## 0.4225
Normal QQ Plot
# qqnorm() provides a visual test of normality; data should plot on a
# diagonal line if normal
qqnorm(MN$AssessTot, ylim = c(0, 5e+08), main = "Normal Q-Q Plot for Property Assessment Values") # One outlier omitted
# Use qqline() to add a diagonal line; makes it easier to judge
qqline(MN$AssessTot)
Layout Feature
# matrix(data=c(Fig#,Fig#),#rows,#columns) creates a matrix to use for the
# layout
matrx <- matrix(data = c(1, 2), 1, 2)
# layout(matrix) creates a layout for multiple plots based on the matrix
# parameters.
layout(matrx)
# layout.show(n=#ofFigs) displays blank holders for each figure.
layout.show(n = 2)
# Feel free to plot!
plot(MN$YCoord ~ MN$XCoord, xlab = "X Coordinate", ylab = "Y Coordinate", main = "Manhattan Building Locations")
plot(MN$AssessTot, MN$YCoord, xlim = c(0, 5e+08), main = "Value by Y Coord",
ylab = "Y Coordinate (Larger = Further North)", xlab = "Property Assessment Value ($)")
object <- lm(Y ~ X1+X2+…,data=) creates a linear model. We assign the model to an object so that we can extract information from that object later.
data= is for the data frame you're using. Unnecessary if you're diligent about the $ notation.
# Load Data
larain <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/LA_RAIN.csv",
header = TRUE)
names(larain)
## [1] "Year" "APMAM" "APSAB" "APSLAKE" "OPBPC" "OPRC" "OPSLAKE"
## [8] "BSAAM"
opslake <- lm(BSAAM ~ OPSLAKE, data = larain)
After running the model, use extractor function to learn about it.
summary() is the best starting point. It returns most of the data you'd be looking for.
summary(opslake)
##
## Call:
## lm(formula = BSAAM ~ OPSLAKE, data = larain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17604 -5338 332 3411 20876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27015 3219 8.39 1.9e-10 ***
## OPSLAKE 3752 216 17.39 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8920 on 41 degrees of freedom
## Multiple R-squared: 0.881, Adjusted R-squared: 0.878
## F-statistic: 303 on 1 and 41 DF, p-value: <2e-16
names(opslake) # 12 components in the model object!
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
Reading the summary
1) Call:
lm(formula = BSAAM ~ OPSLAKE, data = larain)
This shows you the formula for the model.
2) Residuals:
Min 1Q Median 3Q Max
-17603.8 -5338.0 332.1 3410.6 20875.6
Median should be near 0 (not great here…) and the 1Q and 3Q and min and max should be roughly equal in magnitude (again, ehh…).
3) Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 27014.6 3218.9 8.393 1.93e-10
OPSLAKE 3752.5 215.7 17.394 < 2e-16
The estimate provides the magnitude of the relationship. The standard error is reported (rather than standard deviation) because the coefficient is an estimated value for the real relationship. The t statistic and p-value are the t-test performed to determine if the coefficient (and intercept) are different from 0 (i.e. to determine if the relationship estimated really exists).
4) Residual standard error: 8922 on 41 degrees of freedom
This is the variation of the residuals about the regression line.
5) Multiple R-squared: 0.8807, Adjusted R-squared: 0.8778
The R-squared value is the percentage of variation in y that can be explained by the model. Adjusted R-squared is more conservative, penalizing the use of additional independent variables to increase explanatory power. R2 = 1 - (RSS/TSS) = MSS/TSS. (A sketch for pulling these values out of the summary object follows this list.)
6) F-statistic: 302.6 on 1 and 41 DF, p-value: < 2.2e-16
This tests whether the model as a whole explains a significant amount of variation (i.e. whether at least one coefficient differs from 0). Not interesting if there's only one coefficient, because then it's just redundant with that coefficient's t-test.
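A minimal sketch for pulling these statistics out of the summary object (the object name s is just illustrative; the component names come from summary.lm):
s <- summary(opslake)
s$r.squared # Multiple R-squared
s$adj.r.squared # Adjusted R-squared
s$sigma # Residual standard error
s$fstatistic # F-statistic and its degrees of freedom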
To make the model quadratic, you have to wrap the squared term in the I() function, which tells R to treat ^2 as arithmetic rather than formula notation.
Use anova(model1,model2) to determine if two models are different. If significant, this means the better-fitting model really is better.
opslake2 <- lm(BSAAM ~ OPSLAKE + I(OPSLAKE^2), data = larain)
summary(opslake2) # adj. R2 = 0.8755 -- note, this is also an ANOVA
##
## Call:
## lm(formula = BSAAM ~ OPSLAKE + I(OPSLAKE^2), data = larain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18060 -5452 290 3790 20250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23800.2 7089.2 3.36 0.0017 **
## OPSLAKE 4220.2 942.3 4.48 6.1e-05 ***
## I(OPSLAKE^2) -14.0 27.4 -0.51 0.6128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9000 on 40 degrees of freedom
## Multiple R-squared: 0.881, Adjusted R-squared: 0.876
## F-statistic: 149 on 2 and 40 DF, p-value: <2e-16
anova(opslake, opslake2) # This ANOVA is comparing two models. So it's a different sort of ANOVA.
## Analysis of Variance Table
##
## Model 1: BSAAM ~ OPSLAKE
## Model 2: BSAAM ~ OPSLAKE + I(OPSLAKE^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 41 3.26e+09
## 2 40 3.24e+09 1 21097409 0.26 0.61
Use regsubsets to select the best regression models (linear or otherwise)
# regsubsets(DEPENDENT~.,data= data.frame) Using the period after the ~
# means that you're going to try ALL other variables as independent
# variables.
library(HH) # Need HH loaded.
## Loading required package: lattice
## Loading required package: grid
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: splines
## Loading required package: leaps
## Loading required package: RColorBrewer
## Loading required package: latticeExtra
subsetsmodel <- regsubsets(BSAAM ~ ., data = larain)
summary(subsetsmodel)
## Subset selection object
## Call: regsubsets.formula(BSAAM ~ ., data = larain)
## 7 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## APMAM FALSE FALSE
## APSAB FALSE FALSE
## APSLAKE FALSE FALSE
## OPBPC FALSE FALSE
## OPRC FALSE FALSE
## OPSLAKE FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: exhaustive
## Year APMAM APSAB APSLAKE OPBPC OPRC OPSLAKE
## 1 ( 1 ) " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " "*" " " " " "*"
## 3 ( 1 ) " " " " " " "*" " " "*" "*"
## 4 ( 1 ) "*" " " " " "*" " " "*" "*"
## 5 ( 1 ) "*" " " "*" "*" " " "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# The asterisks indicate the variables to use.
coef(subsetsmodel, 1:3) # See the resulting coefficients.
## [[1]]
## (Intercept) OPSLAKE
## 27015 3752
##
## [[2]]
## (Intercept) APSLAKE OPSLAKE
## 19145 1769 3690
##
## [[3]]
## (Intercept) APSLAKE OPRC OPSLAKE
## 15425 1712 1797 2390
names(subsetsmodel) # See the names of variables stored in the regsubsets object.
## [1] "np" "nrbar" "d" "rbar" "thetab"
## [6] "first" "last" "vorder" "tol" "rss"
## [11] "bound" "nvmax" "ress" "ir" "nbest"
## [16] "lopt" "il" "ier" "xnames" "method"
## [21] "force.in" "force.out" "sserr" "intercept" "lindep"
## [26] "nullrss" "nn" "call"
Reading a Poly Shapefile
# This is loaded as a spatial polygons data frame, which means I can
# display it as a map.
library(sp)
library(maptools)
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)
USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
plot(USA)
List the pieces of the shapefile using slotNames()
# This can be useful especially if there are a lot of fields for the data.
slotNames(USA)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
The @data Slot
summary(USA)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -124.73 -66.97
## y 24.96 49.37
## Is projected: NA
## proj4string : [NA]
## Data attributes:
## SP_ID NAME STATE_NAME STATE_FIPS
## 0 : 1 Washington: 32 Texas : 254 48 : 254
## 1 : 1 Jefferson : 26 Georgia : 159 13 : 159
## 10 : 1 Franklin : 25 Virginia: 136 51 : 136
## 100 : 1 Jackson : 24 Kentucky: 120 21 : 120
## 1000 : 1 Lincoln : 24 Missouri: 115 29 : 115
## 1001 : 1 Madison : 20 Kansas : 105 20 : 105
## (Other):3105 (Other) :2960 (Other) :2222 (Other):2222
## CNTY_FIPS FIPS AREA FIPS_num
## 001 : 48 01001 : 1 Min. : 2 Min. : 1001
## 003 : 48 01003 : 1 1st Qu.: 435 1st Qu.:19048
## 005 : 48 01005 : 1 Median : 622 Median :29217
## 009 : 47 01007 : 1 Mean : 965 Mean :30699
## 007 : 46 01009 : 1 3rd Qu.: 931 3rd Qu.:46012
## 011 : 46 01011 : 1 Max. :20175 Max. :56045
## (Other):2828 (Other):3105
## Bush Kerry County_F Nader
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 2926 1st Qu.: 1778 1st Qu.:19042 1st Qu.: 0
## Median : 6357 Median : 4041 Median :29211 Median : 14
## Mean : 19055 Mean : 17940 Mean :30656 Mean : 145
## 3rd Qu.: 15894 3rd Qu.: 10418 3rd Qu.:46008 3rd Qu.: 67
## Max. :954764 Max. :1670341 Max. :56045 Max. :13251
##
## Total Bush_pct Kerry_pct Nader_pct
## Min. : 0 Min. : 0.0 Min. : 0.0 Min. :0.000
## 1st Qu.: 4808 1st Qu.:52.7 1st Qu.:30.2 1st Qu.:0.000
## Median : 10407 Median :61.2 Median :38.5 Median :0.302
## Mean : 37140 Mean :60.6 Mean :38.9 Mean :0.401
## 3rd Qu.: 26552 3rd Qu.:69.4 3rd Qu.:46.8 3rd Qu.:0.633
## Max. :2625105 Max. :92.8 Max. :90.0 Max. :4.467
##
## MDratio pcturban pctfemhh pcincome
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 37.3 1st Qu.: 0.0 1st Qu.: 9.6 1st Qu.:15466
## Median : 65.6 Median : 33.4 Median :12.2 Median :17448
## Mean : 93.0 Mean : 35.3 Mean :13.0 Mean :17788
## 3rd Qu.: 117.5 3rd Qu.: 56.5 3rd Qu.:15.4 3rd Qu.:19818
## Max. :2189.5 Max. :100.0 Max. :41.1 Max. :58096
##
## pctpoor pctcoled unemploy homevalu
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.:11.0 1st Qu.: 9.0 1st Qu.: 3.90 1st Qu.: 35850
## Median :15.1 Median :11.6 Median : 5.30 Median : 44400
## Mean :16.5 Mean :13.1 Mean : 5.87 Mean : 52015
## 3rd Qu.:20.4 3rd Qu.:15.3 3rd Qu.: 7.20 3rd Qu.: 58600
## Max. :63.1 Max. :53.4 Max. :37.90 Max. :500001
##
## popdens Obese Noins HISP_LAT
## Min. : 0 Min. :0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 15 1st Qu.:0.320 1st Qu.:0.100 1st Qu.: 0.90
## Median : 39 Median :0.340 Median :0.120 Median : 1.80
## Mean : 194 Mean :0.335 Mean :0.129 Mean : 6.18
## 3rd Qu.: 93 3rd Qu.:0.360 3rd Qu.:0.150 3rd Qu.: 5.10
## Max. :53801 Max. :0.630 Max. :0.410 Max. :97.50
##
## MEDAGE2000 PEROVER65
## Min. : 0.0 Min. : 0.0
## 1st Qu.:35.2 1st Qu.:12.1
## Median :37.4 Median :14.4
## Mean :37.3 Mean :14.8
## 3rd Qu.:39.8 3rd Qu.:17.1
## Max. :54.3 Max. :34.7
##
# In general, use @data to see just the attribute data.
summary(USA@data)
## SP_ID NAME STATE_NAME STATE_FIPS
## 0 : 1 Washington: 32 Texas : 254 48 : 254
## 1 : 1 Jefferson : 26 Georgia : 159 13 : 159
## 10 : 1 Franklin : 25 Virginia: 136 51 : 136
## 100 : 1 Jackson : 24 Kentucky: 120 21 : 120
## 1000 : 1 Lincoln : 24 Missouri: 115 29 : 115
## 1001 : 1 Madison : 20 Kansas : 105 20 : 105
## (Other):3105 (Other) :2960 (Other) :2222 (Other):2222
## CNTY_FIPS FIPS AREA FIPS_num
## 001 : 48 01001 : 1 Min. : 2 Min. : 1001
## 003 : 48 01003 : 1 1st Qu.: 435 1st Qu.:19048
## 005 : 48 01005 : 1 Median : 622 Median :29217
## 009 : 47 01007 : 1 Mean : 965 Mean :30699
## 007 : 46 01009 : 1 3rd Qu.: 931 3rd Qu.:46012
## 011 : 46 01011 : 1 Max. :20175 Max. :56045
## (Other):2828 (Other):3105
## Bush Kerry County_F Nader
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 2926 1st Qu.: 1778 1st Qu.:19042 1st Qu.: 0
## Median : 6357 Median : 4041 Median :29211 Median : 14
## Mean : 19055 Mean : 17940 Mean :30656 Mean : 145
## 3rd Qu.: 15894 3rd Qu.: 10418 3rd Qu.:46008 3rd Qu.: 67
## Max. :954764 Max. :1670341 Max. :56045 Max. :13251
##
## Total Bush_pct Kerry_pct Nader_pct
## Min. : 0 Min. : 0.0 Min. : 0.0 Min. :0.000
## 1st Qu.: 4808 1st Qu.:52.7 1st Qu.:30.2 1st Qu.:0.000
## Median : 10407 Median :61.2 Median :38.5 Median :0.302
## Mean : 37140 Mean :60.6 Mean :38.9 Mean :0.401
## 3rd Qu.: 26552 3rd Qu.:69.4 3rd Qu.:46.8 3rd Qu.:0.633
## Max. :2625105 Max. :92.8 Max. :90.0 Max. :4.467
##
## MDratio pcturban pctfemhh pcincome
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 37.3 1st Qu.: 0.0 1st Qu.: 9.6 1st Qu.:15466
## Median : 65.6 Median : 33.4 Median :12.2 Median :17448
## Mean : 93.0 Mean : 35.3 Mean :13.0 Mean :17788
## 3rd Qu.: 117.5 3rd Qu.: 56.5 3rd Qu.:15.4 3rd Qu.:19818
## Max. :2189.5 Max. :100.0 Max. :41.1 Max. :58096
##
## pctpoor pctcoled unemploy homevalu
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.:11.0 1st Qu.: 9.0 1st Qu.: 3.90 1st Qu.: 35850
## Median :15.1 Median :11.6 Median : 5.30 Median : 44400
## Mean :16.5 Mean :13.1 Mean : 5.87 Mean : 52015
## 3rd Qu.:20.4 3rd Qu.:15.3 3rd Qu.: 7.20 3rd Qu.: 58600
## Max. :63.1 Max. :53.4 Max. :37.90 Max. :500001
##
## popdens Obese Noins HISP_LAT
## Min. : 0 Min. :0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 15 1st Qu.:0.320 1st Qu.:0.100 1st Qu.: 0.90
## Median : 39 Median :0.340 Median :0.120 Median : 1.80
## Mean : 194 Mean :0.335 Mean :0.129 Mean : 6.18
## 3rd Qu.: 93 3rd Qu.:0.360 3rd Qu.:0.150 3rd Qu.: 5.10
## Max. :53801 Max. :0.630 Max. :0.410 Max. :97.50
##
## MEDAGE2000 PEROVER65
## Min. : 0.0 Min. : 0.0
## 1st Qu.:35.2 1st Qu.:12.1
## Median :37.4 Median :14.4
## Mean :37.3 Mean :14.8
## 3rd Qu.:39.8 3rd Qu.:17.1
## Max. :54.3 Max. :34.7
##
# The only difference for the summary function is that without @data, it
# also reports the coordinates and projection; with @data you see just the
# attribute table.
Color Brewer Palettes
# Color Brewer links with the shapefile to make thematic maps.
display.brewer.all() # Displays all the colors available
# Use brewer.pal(#,'Type') to create a palette
pal7 <- brewer.pal(7, "Spectral") # Makes a 7-color spectral palette
display.brewer.pal(7, "Spectral") # This displays the colors
pal7 # Returns esoteric list of color codes. Not helpful
## [1] "#D53E4F" "#FC8D59" "#FEE08B" "#FFFFBF" "#E6F598" "#99D594" "#3288BD"
Classifying Data using classIntervals() and findColours()
# SS: Create a column that holds the percent of all votes that went to
# G.W. Bush in 2004.
USA$BushPct <- USA$Bush/USA$Total
# Create the classification intervals using classIntervals(DATA, n=
# #OFCLASSES, style='STYLE') Styles are the basics like
# 'quantile','equal','fixed','sd','jenks', and apparently 'pretty'
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
## Warning: var has missing values, omitted in finding classes
cats7 # Returns the bins for each classification.
## style: quantile
## [0.09308,0.4746) [0.4746,0.5415) [0.5415,0.5907) [0.5907,0.6336)
## 444 444 444 444
## [0.6336,0.6801) [0.6801,0.7421) [0.7421,0.9283]
## 444 444 444
# Link the color palette to the categories using
# findColours(CATEGORIES,PALETTE)
SevenColors <- findColours(cats7, pal7)
# Draw a map using the specified data and colors
plot(USA, col = SevenColors) # The col= argument provides the coloring scheme.
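# A sketch for adding a legend, assuming findColours() attaches "palette"
# and "table" attributes to its result (as in the classInt examples):
legend("bottomleft", legend = names(attr(SevenColors, "table")),
    fill = attr(SevenColors, "palette"), bty = "n", cex = 0.8)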
Multiple regression is very similar to simple linear regression. The only difference is that you can add independent variables using the plus sign. These show up in the coefficients table from the summary() function and are read just like the coefficient in a simple linear regression.
# Example: This linear model uses the percentage of the county population
# that is urban and the percentage that is 'poor', the percentage of
# households that are female-headed, and the median age in 2000. Null
# Hypothesis: The model does not explain the variation in perctage vote
# for George Bush better than the mean percentage.
lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + MEDAGE2000, USA)
summary(lm1) # The adj. R^2 is 0.2402
##
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + MEDAGE2000,
## data = USA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5688 -0.0732 0.0108 0.0807 0.2666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.76e-01 2.40e-02 32.33 <2e-16 ***
## pcturban -6.05e-05 8.35e-05 -0.72 0.469
## pctfemhh -1.32e-02 5.10e-04 -25.95 <2e-16 ***
## pctpoor 2.76e-03 3.29e-04 8.38 <2e-16 ***
## MEDAGE2000 -1.10e-03 5.57e-04 -1.98 0.048 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.11 on 3103 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.241, Adjusted R-squared: 0.24
## F-statistic: 247 on 4 and 3103 DF, p-value: <2e-16
Extracting Values
# Residuals and predicted values can be extracted using resid() and
# predict(), respectively, or they can be recalled using MODEL$residuals
# and MODEL$fitted.values.
lm1.resid <- resid(lm1) # Extracting Residuals
lm1.predicted <- predict(lm1) # Extracting Predicted Values
Paneling Graphics: par(mfrow = c( nrow, ncol)) partitions the plot area where nrow is the number of rows and ncol is the number of columns
par(mfrow = c(2, 2))
plot(lm1)
Transformations
CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/co2_LAB1.csv",
header = T)
mauna <- subset(CO2, site == 0)
jolla <- subset(CO2, site == 1)
## The ladder() function shows an array of possible transformations for
## both variables -- the aim being to improve normality. No need to
## memorize formulae because the orange and green headings tell you what
## to do to achieve a particular distribution.
library(HH)
ladder(co2 ~ time, CO2)
Shapiro-Wilk Test
# Use to test for normality. Null Hypotheses: The distributions of these
# data are normal. Alternative Hypotheses: The distributions of these
# data are not normal.
shapiro.test(CO2$co2) # Significance indicates distribution is not normal.
##
## Shapiro-Wilk normality test
##
## data: CO2$co2
## W = 0.963, p-value = 1.202e-14
# p-value indicates the probability of having the given test statistic if
# the distribution were normal... one of the few times you dislike
# rejecting the null hypothesis.
Wilcoxon Rank Test
# If data are not normally distributed, use wilcox.test(), which acts like
# a t-test, only with ranked data. (Outliers no longer have a major
# impact.) If paired=T, it performs a paired test, where corresponding
# observations (same row #) are compared. Appropriate, for example, when
# you have one hundred sites tested at time 1 and time 2: site A at time
# one should be compared to site A at time two. Null Hypothesis: The
# average CO2 concentration is the same at each site and any variation is
# due to random chance. Alternative Hypothesis: The average CO2
# concentration at La Jolla is different from that at Mauna Loa.
wilcox.test(mauna$co2, jolla$co2, paired = T) # Therefore, there is justification for using one site over the other. Since Mauna Loa is considered more favorable, only Mauna Loa will be used to answer questions 2 & 3.
##
## Wilcoxon signed rank test with continuity correction
##
## data: mauna$co2 and jolla$co2
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0
summary(jolla$co2 - mauna$co2) # Mean difference of 0.9496 ppm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.84 -0.30 1.92 0.95 2.60 4.81
Adding Lines to a Model Plot
abline
lm1 <- lm(co2 ~ time, mauna)
plot(mauna$co2 ~ mauna$time, main = "Fit for Linear Regression Model", xlab = "Time (years)",
ylab = "CO2 concentration (ppm)")
abline(coef(lm1), col = "blue", lwd = 3)
# abline(coef(MODEL)) is good if you have a linear regression. It looks
# for a slope and an intercept.
curve
mauna$cent <- mauna$time - mean(mauna$time)
lm2 <- lm(co2 ~ cent + I(cent^2), mauna)
plot(co2 ~ cent, mauna, main = "CO2 Concentration Regression Models", xlab = "Centered Years (1969-2007)",
ylab = "CO2 Concentration (ppm)", cex = 0.6)
curve(predict(lm2, data.frame(cent = x), type = "resp"), add = TRUE, col = "red",
lwd = 3)
# curve(predict(MODEL,data.frame(XVARIABLE=x),type='resp'),add=T) looks a
# bit complicated. Use it if your regression model is nonlinear. The one
# trick is that you need to define the x variables using the same
# independent variable (and data frame) as when you made the model.
lines
CO2$cent <- CO2$time - mean(CO2$time)
lm3 <- lm(co2 ~ cent + I(cent^2) + site, CO2)
plot(CO2$co2 ~ CO2$cent, pch = CO2$site, cex = 0.6, main = "Fits for Quadratic Split Model",
xlab = "Centered Years (1969-2007)", ylab = "CO2 Concentration (ppm)")
lines(predict(lm3) ~ cent, data = CO2, subset = site == 0, col = "red", lwd = 2)
lines(predict(lm3) ~ cent, data = CO2, subset = site == 1, col = "blue", lwd = 2)
# lines(predict(MODEL)~X-VARIABLE, data= X-DATAFRAME, subset =
# DUMMYVARIABLE == ##) is useful when you have a dummy variable and want
# to draw a separate line for each subset. Like with curve, you have to
# include the x variable from the dataframe you made the model from. Also
# note you have to use subset = DUMMYVARIABLE == ##, which involves a
# logical argument.
legend("bottomright", c("Mauna Loa", "La Jolla"), lty = c(1, 1), col = c("red",
"blue"))
bd <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/babybirthdata.csv")
# MAGE = mother's age, casegrp: 0 = no defect, 1 = defects
# Logistic Regression is a type of generalized linear model. Use it when
# the outcome is categorical (regardless of the independent variables). The
# function is glm(DVAR ~ IVARs, family=binomial('logit'))
bdlog <- glm(bd$casegrp ~ bd$MAGE, family = binomial("logit"))
# Use glm instead of lm for generalized linear model Use
# family=binomial('logit') to specify a logit test. Other steps work the
# same as linear regression.
plot(casegrp ~ MAGE, data = bd) # Our normal plots are not helpful b/c we only have two possible outcomes per x value -- can't see the # of instances for each x
plot(jitter(casegrp) ~ MAGE, data = bd) # jitter adds some randomness to the y values... gives a better sense of spread -- but Seth doesn't like it
plot(bd$MAGE, fitted(glm(casegrp ~ MAGE, binomial, data = bd)), xlab = "Maternal Age",
ylab = "P(Birth Defect)", pch = 15) # Plots the pi hat i (probability of a y=1) on the y-axis; this is actually useful
summary(bdlog)
##
## Call:
## glm(formula = bd$casegrp ~ bd$MAGE, family = binomial("logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.567 -0.240 -0.147 -0.089 3.610
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.837 0.328 2.56 0.011 *
## bd$MAGE -0.199 0.015 -13.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2096.3 on 11750 degrees of freedom
## AIC: 2100
##
## Number of Fisher Scoring iterations: 8
# We need to turn the estimates into an odds ratio or probability value in
# order to interpret them. We can get the probability equation using the
# intercept (b0) and the independent variable coefficient (b1)
# Calculating an odds ratio: This is a fairly simple operation. All you
# need is the exp() function and the estimate from logistic regression.
library(MASS)
exp(-0.19872) # This returns a value less than one, indicating that the odds go down
## [1] 0.8198
# this means that for every 1-year increase in MAGE, the odds of a
# birth defect go down by ~18% (since (OR-1)*100% = % change and OR =
# 0.820). Note that when OR > 1 there is a positive relationship; when OR <
# 1 there is a negative relationship. To obtain a confidence interval,
# add the function confint():
exp(confint(bdlog)) # returns a range of 0.795 to 0.844 of the odds ratio
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.2214 4.4155
## bd$MAGE 0.7955 0.8438
# The default is a 95% confidence interval.
bd$magecat3 <- ifelse(bd$MAGE > 25, c(1), c(0)) # Older than 25 (1 is true, 0 is false)
bd$magecat2 <- ifelse(bd$MAGE >= 20 & bd$MAGE <= 25, c(1), c(0)) # Aged 20 to 25
bd$magecat1 <- ifelse(bd$MAGE < 20, c(1), c(0)) # Younger than 20
# Fit a model with the new categories:
bdlog2 <- glm(casegrp ~ magecat1 + magecat2, data = bd, binomial)
# Remember: always drop one of the dummy variable cats; otherwise, it
# gives NA as the coeff
summary(bdlog2)
##
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2, family = binomial,
## data = bd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.378 -0.232 -0.106 -0.106 3.222
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.186 0.167 -31.0 < 2e-16 ***
## magecat1 2.582 0.196 13.2 < 2e-16 ***
## magecat2 1.582 0.195 8.1 5.4e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2113.7 on 11749 degrees of freedom
## AIC: 2120
##
## Number of Fisher Scoring iterations: 8
# Our odds ratios will be in comparison to anyone older than 25:
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.005594 0.003956 0.007633
## magecat1 13.228246 9.092820 19.682178
## magecat2 4.863064 3.350905 7.219970
# Tells us that women in magecat1 have about 13 times the odds of a birth
# defect compared to the reference group (over 25). Tells us that women in
# magecat2 have about 5 times the odds of a birth defect compared to the
# reference group (over 25).
# Calculating pi_i: This involves reading the summary table estimates and
# plugging them into the equation exp(b0 + b1X1 + ...)/(1 + exp(b0 + b1X1 + ...))
exp(-5.1861 + (2.5824 * (1)) + (1.5817 * (0)))/(1 + exp(-5.1861 + (2.5824 *
(1)) + (1.5817 * (0)))) # for under age 20
## [1] 0.0689
# Note that above, I multiplied b1 by x1 and b2 by x2... and for women
# under 20, x1 = magecat1 = 1 and x2 = magecat2 = 0. Returns 0.0689
exp(-5.1861 + (2.5824 * (0)) + (1.5817 * (1)))/(1 + exp(-5.1861 + (2.5824 *
(0)) + (1.5817 * (1)))) # for age 20 to 25
## [1] 0.02648
# Returns: 0.0265
exp(-5.1861 + 2.5824 * (0) + 1.5817 * (0))/(1 + exp(-5.1861 + 2.5824 * (0) +
1.5817 * (0))) # for over age 25
## [1] 0.005563
# Returns: 0.0055
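# An equivalent sketch using predict() with type = 'response'; for women
# under 20 this should match the manual calculation above (about 0.069)
predict(bdlog2, newdata = data.frame(magecat1 = 1, magecat2 = 0), type = "response")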
# Significance Testing
bd.log <- glm(casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, binomial,
data = bd)
# Wald's Test shows us whether a particular independent variable explains
# a part of the variation in the dependent variable (Null Hypothesis = No)
# Deviance Test is like the F-test for normal regression. It tests whether
# the model overall has explanatory value (Null Hypothesis = None)
summary(bd.log)
##
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2 + bthparity2 + smoke,
## family = binomial, data = bd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.510 -0.247 -0.121 -0.091 3.315
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.907 0.188 -26.13 < 2e-16 ***
## magecat1 2.257 0.208 10.87 < 2e-16 ***
## magecat2 1.432 0.198 7.25 4.3e-13 ***
## bthparity2 -0.582 0.151 -3.85 0.00012 ***
## smoke 0.676 0.155 4.36 1.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2319.1 on 11739 degrees of freedom
## Residual deviance: 2078.1 on 11735 degrees of freedom
## (12 observations deleted due to missingness)
## AIC: 2088
##
## Number of Fisher Scoring iterations: 8
# This changes the odds ratios
exp(cbind(OR = coef(bd.log), CI = confint(bd.log)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.007393 0.005033 0.01052
## magecat1 9.555339 6.419046 14.52035
## magecat2 4.185062 2.869565 6.24008
## bthparity2 0.558605 0.413840 0.74906
## smoke 1.965743 1.440422 2.64938
# Ex. maybe some of the difference b/wn cat 1 and cat 3 was that there were
# more smokers in cat 1
anova(bd.log, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: casegrp
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11739 2319
## magecat1 1 134.2 11738 2185 < 2e-16 ***
## magecat2 1 77.1 11737 2108 < 2e-16 ***
## bthparity2 1 12.4 11736 2095 0.00042 ***
## smoke 1 17.2 11735 2078 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This starts with the bare model and adds an independent variable one by
# one... cumulative -- the ORDER MATTERS. The anova() function is smart
# enough to know a logistic regression from a linear regression
anova(bd.log, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: casegrp
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11739 2319
## magecat1 1 134.2 11738 2185 < 2e-16 ***
## magecat2 1 77.1 11737 2108 < 2e-16 ***
## bthparity2 1 12.4 11736 2095 0.00042 ***
## smoke 1 17.2 11735 2078 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This one is the same idea, just a different method.
drop1(bd.log, test = "LRT")
## Single term deletions
##
## Model:
## casegrp ~ magecat1 + magecat2 + bthparity2 + smoke
## Df Deviance AIC LRT Pr(>Chi)
## <none> 2078 2088
## magecat1 1 2216 2224 137.9 < 2e-16 ***
## magecat2 1 2139 2147 60.7 6.6e-15 ***
## bthparity2 1 2093 2101 15.4 8.9e-05 ***
## smoke 1 2095 2103 17.2 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Can also try this backwards by starting with all independent variables
# and dropping one at a time.
# The vif(MODEL) tests for multicollinearity in a multiple regression. A
# mean value of 10 is bad and merits exploring the removal of a variable.
library(foreign)
Elect04 <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/2004_Election_Counties.dbf")
m8a <- lm(Bush_pct ~ ASIA + AIAN + UNDER18 + ginirev + unemploy + pctcoled +
pctfemhh + Nader_pct, data = Elect04)
library(car)
## Loading required package: nnet
## Attaching package: 'car'
## The following object(s) are masked from 'package:HH':
##
## logit, vif
m8a_vif <- vif(m8a)
summary(m8a_vif)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.13 1.17 1.45 1.42 1.65 1.69
# Mean VIF = 1.418, range is 1.127 to 1.686, so there isn't much concern
# here about multicollinearity.
CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/co2_LAB1.csv",
header = T)
mauna <- subset(CO2, site == 0)
mauna$cent <- mauna$time - mean(mauna$time)
lm2 <- lm(co2 ~ cent + I(cent^2), mauna)
library(lmtest)
## Loading required package: zoo
## Attaching package: 'zoo'
## The following object(s) are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Null Hypothesis: rho = 0; there is only random error and no
# autocorrelation
dwtest(lm2) # DW = 0.3067, p-value < 2.2e16
##
## Durbin-Watson test
##
## data: lm2
## DW = 0.3067, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# This tells us that there is autocorrelation.
acf(lm2$residuals, main = "Correlogram of lm2 Residuals") # That is some fine seasonality.
# Create a time series using ts()
lm2.ts <- ts(mauna$co2, start = min(mauna$year), frequency = 12)
# start= indicates the first observation time unit. start=min(mauna$year)
# indicates that we should start in the minimum year (1969). frequency=
# indicates the number of observations per unit of time. (In this case we
# have 12.)
head(mauna)
## year month time co2 site cent
## 1 1969 2 1969 324.4 0 -19.42
## 2 1969 3 1969 325.6 0 -19.33
## 3 1969 4 1969 326.7 0 -19.25
## 4 1969 5 1969 327.3 0 -19.17
## 5 1969 6 1970 326.8 0 -19.08
## 6 1969 7 1970 325.9 0 -19.00
# Plotting decompose() will separate a time series into trend, seasonal,
# and random parts.
plot(decompose(lm2.ts))
# cycle() assigns values for each observation of each time unit (the
# seasonal effect).
season <- cycle(lm2.ts) # In this case, we're using months in a time unit of years.
head(season)
## [1] 1 2 3 4 5 6
# Compare to the time() function, which treats observations as fractions
# of the time unit
time <- time(lm2.ts) # In this case,0.083333 for each month
head(time)
## [1] 1969 1969 1969 1969 1969 1969
# Refitting the model with factor(season) as a dummy.
lm3 <- lm(mauna$co2 ~ mauna$cent + I(mauna$cent^2) + factor(season))
summary(lm3) # adj. R^2 = 0.9983
##
## Call:
## lm(formula = mauna$co2 ~ mauna$cent + I(mauna$cent^2) + factor(season))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8937 -0.5283 0.0676 0.4837 1.7820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+02 1.21e-01 2901.81 < 2e-16 ***
## mauna$cent 1.55e+00 2.96e-03 522.78 < 2e-16 ***
## I(mauna$cent^2) 9.92e-03 2.94e-04 33.69 < 2e-16 ***
## factor(season)2 8.01e-01 1.63e-01 4.92 1.2e-06 ***
## factor(season)3 1.92e+00 1.63e-01 11.83 < 2e-16 ***
## factor(season)4 2.30e+00 1.63e-01 14.14 < 2e-16 ***
## factor(season)5 1.63e+00 1.63e-01 10.00 < 2e-16 ***
## factor(season)6 -2.12e-02 1.63e-01 -0.13 0.9
## factor(season)7 -2.18e+00 1.63e-01 -13.38 < 2e-16 ***
## factor(season)8 -3.90e+00 1.63e-01 -23.97 < 2e-16 ***
## factor(season)9 -3.94e+00 1.63e-01 -24.25 < 2e-16 ***
## factor(season)10 -2.79e+00 1.63e-01 -17.16 < 2e-16 ***
## factor(season)11 -1.59e+00 1.63e-01 -9.75 < 2e-16 ***
## factor(season)12 -6.69e-01 1.64e-01 -4.09 5.1e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.718 on 453 degrees of freedom
## Multiple R-squared: 0.998, Adjusted R-squared: 0.998
## F-statistic: 2.13e+04 on 13 and 453 DF, p-value: <2e-16
# Why isn't June significant? I don't know, but the model does fit better.
par(mfrow = c(1, 2))
ts.plot(cbind(lm2.ts, lm3$fitted), lty = 1:2, col = c(1, 3), lwd = c(1, 2),
main = "CO2 Concentration Seasonal Quadratic Model", ylab = "CO2 Concentration (ppm)",
xlab = "Years")
plot(lm3$residuals ~ lm3$fitted.values, main = "Residuals for Seasonal Quadratic Model",
xlab = "Fitted Values (ppm)", ylab = "Residuals")
# The Cochrane-Orcutt Procedure attempts to correct for the
# autocorrelation. It also returns a value for rho. Note that to read
# the output, you just type the object name; don't use the summary()
# function.
library(orcutt)
lm4 <- cochrane.orcutt(lm2)
lm4 # Gives a rho of 0.8466136; the new model has an adj. R^2 of 0.9995
## $Cochrane.Orcutt
##
## Call:
## lm(formula = YB ~ XB - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.732 -0.898 0.516 0.884 2.175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## XB(Intercept) 3.50e+02 5.34e-01 655.21 <2e-16 ***
## XBcent 1.54e+00 3.20e-02 48.28 <2e-16 ***
## XBI(cent^2) 9.77e-03 3.18e-03 3.08 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.18 on 463 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.25e+05 on 3 and 463 DF, p-value: <2e-16
##
##
## $rho
## [1] 0.8466
##
## $number.interaction
## [1] 1
# Load tools and read in data.
library(maptools)
library(spdep)
## Loading required package: boot
## Attaching package: 'boot'
## The following object(s) are masked from 'package:car':
##
## logit
## The following object(s) are masked from 'package:HH':
##
## logit
## The following object(s) are masked from 'package:survival':
##
## aml
## The following object(s) are masked from 'package:lattice':
##
## melanoma
## Loading required package: Matrix
## Loading required package: nlme
## Loading required package: deldir
## deldir 0.0-21
## Loading required package: coda
## Attaching package: 'coda'
## The following object(s) are masked from 'package:HH':
##
## acfplot
library(rgdal)
## rgdal: version: 0.8-5, (SVN revision 449) Geospatial Data Abstraction
## Library extensions to R successfully loaded Loaded GDAL runtime: GDAL
## 1.9.0, released 2011/12/29 Path to GDAL shared files:
## /Library/Frameworks/R.framework/Versions/2.15/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480] Path to
## PROJ.4 shared files:
## /Library/Frameworks/R.framework/Versions/2.15/Resources/library/rgdal/proj
sids <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/sids2/sids2.shp")
class(sids) # Checking to see the object class
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
# Assignment of a projection using a CRS code:
# http://www.spatialreference.org/ref/
proj4string(sids) <- CRS("+proj=longlat +ellps=WGS84")
# Reprojecting to other coordinate systems using EPSG/ESRI codes:
sids_Albers <- spTransform(sids, CRS("+init=epsg:2163")) # Albers Equal Area
sids_SP <- spTransform(sids, CRS("+init=ESRI:102719")) # NC State Plane
par(mfrow = c(3, 1))
plot(sids, axes = T)
title("WGS84")
plot(sids_SP, axes = T)
title("NC State Plane (feet)")
plot(sids_Albers, axes = T)
title("Albers Equal Area Conic USA\n(seems wrong)")
# Find neighbors using Queen's or Rook's case.
library(spdep)
sids_nbq <- poly2nb(sids) # Queen's Case
sids_nbr <- poly2nb(sids, queen = FALSE) # Rook's Case
# Extract the coordinates for the locations:
coords <- coordinates(sids)
dev.off() #clears the screen and the panels
## null device
## 1
par(mfrow = c(2, 1))
# Plot the links for neighbors:
plot(sids_nbq, coords) # Queen's case -- selects all locations that share an edge or a point with the focus location.
plot(sids, add = T) #Add the location outlines
plot(sids_nbr, coords) # Rook's case -- selects all locations that share an edge with the focus location.
plot(sids, add = T) #Add the location outlines
# Set coords using state plane projection:
coords <- coordinates(sids_SP)
# use knn2nb(knearneigh(COORDS, k=##)) to identify nearest neighbors based
# on the coordinates. k= determines the number of neighbors to select for
# each location. k-nearest neighbor identifies the k nearest neighbors
# (by centroid), irrespective of the actual distance. Only the relative
# distance matters.
sids_kn1 <- knn2nb(knearneigh(coords, k = 1))
sids_kn2 <- knn2nb(knearneigh(coords, k = 2))
sids_kn4 <- knn2nb(knearneigh(coords, k = 4))
# Plotting:
dev.off() #clears the screen and the panels
## null device
## 1
plot(sids_SP)
plot(sids_kn2, coords, add = T) # Adds 2 links for each location.
# Distance-based will select all centroids within a certain distance of
# the focus location. This can result in 0 neighbors. Use the k1-nearest
# neighbor as a starting point if you want each location to have at least
# 1 neighbor. Use unlist(nbdists(K-NEAREST, COORDS))
dist <- unlist(nbdists(sids_kn1, coords)) # Using the State Plane version so that the distances are easier to interpret
summary(dist) # The summary gives a good sense of what might be an appropriate threshold. The maximum distance of a k1 neighbor is the highest you can make the minimum threshold without having a location with 0 neighbors.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40100 89800 97600 96300 107000 135000
max_k1 <- max(dist) # Store the max dist as an object.
# Now you can set various levels using dnearneigh(COORDS, d1=#, d2=#*MAX_K1).
# d1 is usually 0. For d2, if you multiply the k1 max by a number
# under 1, you will end up with some locations that have 0 neighbors.
sids_kd1 <- dnearneigh(coords, d1 = 0, d2 = 0.75 * max_k1)
sids_kd2 <- dnearneigh(coords, d1 = 0, d2 = 1 * max_k1)
sids_kd3 <- dnearneigh(coords, d1 = 0, d2 = 1.5 * max_k1)
# This can also be done by raw distance:
sids_ran1 <- dnearneigh(coords, d1 = 0, d2 = 134600)
# Not all neighbors are equal. Some should be weighted more than others.
# For instance, if location 1 has 3 neighbors and location 2 has 5
# neighbors, the neighbors for location 1 need to be weighted differently
# than those for location 2. This is resolved by row standardization,
# which assigns the appropriate weight depending on how many neighbors a
# location has.
# The function nb2listw(NEIGHBORS_LIST) performs row standardization.
sids_nbq_w <- nb2listw(sids_nbq) # Example with Queen's case neighbors
sids_nbq_w
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 490
## Percentage nonzero weights: 4.9
## Average number of links: 4.9
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 100 10000 100 44.65 410.5
sids_nbq_w$neighbours[1:3] # The neighbours section of the weights object gives you the id #s of all a location's neighbors.
## [[1]]
## [1] 2 18 19
##
## [[2]]
## [1] 1 3 18
##
## [[3]]
## [1] 2 10 18 23 25
sids_nbq_w$weights[1:3] # The weights section of the weights object gives you the decimal weight for each neighbor (the weight is always 1/x when x is the number of neighbors).
## [[1]]
## [1] 0.3333 0.3333 0.3333
##
## [[2]]
## [1] 0.3333 0.3333 0.3333
##
## [[3]]
## [1] 0.2 0.2 0.2 0.2 0.2
# Since row standardization gives more weight to the neighbors of locations
# with few neighbors, you may prefer binary weights if looking for global
# characterizations. Binary weights bump up locations with many neighbors
# compared to row standardization.
sids_nbq_wb <- nb2listw(sids_nbq, style = "B")
sids_nbq_wb
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100
## Number of nonzero links: 490
## Percentage nonzero weights: 4.9
## Average number of links: 4.9
##
## Weights style: B
## Weights constants summary:
## n nn S0 S1 S2
## B 100 10000 490 980 10696
sids_nbq_wb$weights[1:3] # Notice how all the neighbors here are given equal weight!
## [[1]]
## [1] 1 1 1
##
## [[2]]
## [1] 1 1 1
##
## [[3]]
## [1] 1 1 1 1 1
# Islands do not have neighbors in a queen's or rook's case. Use k nearest
# neighbor to rectify, or use 'zero.policy=T' as an argument when
# assigning weights:
sids_nbq_w <- nb2listw(sids_nbq, zero.policy = T)
# Inverse distance weighting bases weights on the distance between
# centroids.
dists <- nbdists(sids_nbq, coordinates(sids_SP))
idw <- lapply(dists, function(x) 1/(x/1000)) # The function can be whatever you want; so you could make more distant neighbors worth more! Although that is usually ridiculous...
sids_nbq_idwb <- nb2listw(sids_nbq, glist = idw, style = "B")
summary(unlist(sids_nbq_idwb$weights))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00412 0.00627 0.00764 0.00804 0.00927 0.02490
# To test spatial dependence, you need: (1) a variable of interest and (2)
# a lagged version of that variable (based on the weights determined above)
sids$NWBIR79 #births to non-white mothers
## [1] 19 12 260 145 1197 1237 139 371 844 176 597
## [12] 1369 1058 650 1492 2980 933 222 33 310 491 5
## [23] 76 950 5031 7089 1397 1161 1086 4948 2274 4 2696
## [34] 360 10 1033 6221 3 1305 177 150 941 407 651
## [45] 141 128 483 687 2330 1504 3059 914 1206 1349 62
## [56] 73 1163 406 664 905 576 3073 1453 1729 350 307
## [67] 1151 11631 1203 588 528 264 45 2047 104 2194 79
## [78] 22 1524 277 42 10614 305 1348 1161 1172 169 1227
## [89] 1218 5 2342 1436 3568 6899 487 1023 763 1832 2100
## [100] 841
sids$NWBIR79_lag <- lag(sids_nbq_w, sids$NWBIR79) #lag using row standardized weights
lm1 <- lm(sids$NWBIR79_lag ~ sids$NWBIR79)
# The regression coefficient IS the Moran's I statistic.
summary(lm1)
##
## Call:
## lm(formula = sids$NWBIR79_lag ~ sids$NWBIR79)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1667 -803 -266 644 3036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.21e+03 1.26e+02 9.59 9.2e-16 ***
## sids$NWBIR79 1.50e-01 5.27e-02 2.84 0.0055 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1040 on 98 degrees of freedom
## Multiple R-squared: 0.0761, Adjusted R-squared: 0.0667
## F-statistic: 8.08 on 1 and 98 DF, p-value: 0.00546
plot(y = sids$NWBIR79_lag, x = sids$NWBIR79, ylab = "Births to non-white mothers lag",
xlab = "Births to non-white mothers")
title("The relationship between a variable \n and the lag of itself is Moran’s I")
## Warning: conversion failure on ' and the lag of itself is Moran’s I' in
## 'mbcsToSbcs': dot substituted for <e2>
## Warning: conversion failure on ' and the lag of itself is Moran’s I' in
## 'mbcsToSbcs': dot substituted for <80>
## Warning: conversion failure on ' and the lag of itself is Moran’s I' in
## 'mbcsToSbcs': dot substituted for <99>
## Alternate way of getting Moran's I: Find the Moran's I value using
## moran.test(VARIABLE, listw=WEIGHTSLIST, zero.policy=T)
## VARIABLE is the variable you wish to test for autocorrelation
## listw= is the weight list object
## zero.policy=T is used to protect against islands (units without a
## neighbor)
moran.test(sids$NWBIR79, listw = sids_nbq_w, zero.policy = T)
##
## Moran's I test under randomisation
##
## data: sids$NWBIR79
## weights: sids_nbq_w
##
## Moran I statistic standard deviate = 2.613, p-value = 0.004487
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.149686 -0.010101 0.003739
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.
# Load libraries
library(spdep)
library(classInt)
library(gstat)
# Load soco dataset with load() function Automatically creates an object
# called soco
load("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/soco.rda")
# Find neighbors for each location and save them to an object.
soco_nbq <- poly2nb(soco) # Queen’s neighborhood method
# Create the weight matrix for the neighbors
soco_nbq_w <- nb2listw(soco_nbq)
# Moran's I Test
moran.test(soco$PERPOV, listw = soco_nbq_w, zero.policy = T)
##
## Moran's I test under randomisation
##
## data: soco$PERPOV
## weights: soco_nbq_w
##
## Moran I statistic standard deviate = 26.96, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.4346902 -0.0007215 0.0002609
# Null Hypothesis: There is no spatial autocorrelation for this variable.
# Alt Hypothesis: There is spatial autocorrelation for this variable.
# Simulations of Moran's I statistic using moran.mc()
# listw= is for the weight matrix object
# nsim= is for the # of simulations to include. Always make it 99, 999,
# 9999, etc. because we're adding the real-world Moran's I to the list as
# well. We want a round number in the end to make the p-value intuitive.
socoMoranSim <- moran.mc(soco$PERPOV, listw = soco_nbq_w, nsim = 999)
socoMoranSim
##
## Monte-Carlo simulation of Moran's I
##
## data: soco$PERPOV
## weights: soco_nbq_w
## number of simulations + 1: 1000
##
## statistic = 0.4347, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
names(socoMoranSim) # socoMoranSim$res is the Moran's I result
## [1] "statistic" "parameter" "p.value" "alternative" "method"
## [6] "data.name" "res"
# Plot the Moran's I Simulation Results
hist(socoMoranSim$res, breaks = 50)
# Moran's I Plot Quadrants: Use moran.plot(x variable, weights list,
# labels=as.character(label)). This plot will automatically create labels,
# so you might as well assign something intuitive.
socoMoranPlot <- moran.plot(soco$PPOV, soco_nbq_w, labels = as.character(soco$CNTY_ST),
xlab = "Percent Poverty", ylab = "Lag of Percent Poverty")
# Calculate the local Moran's I statistic using localmoran(DATA, WEIGHTS)
# where WEIGHTS is the list of weighted neighbors.
soco_locm <- localmoran(soco$PPOV, soco_nbq_w)
summary(soco_locm)
## Ii E.Ii Var.Ii Z.Ii
## Min. :-1.778 Min. :-0.000722 Min. :0.0901 Min. :-3.983
## 1st Qu.: 0.012 1st Qu.:-0.000722 1st Qu.:0.1420 1st Qu.: 0.029
## Median : 0.219 Median :-0.000722 Median :0.1658 Median : 0.517
## Mean : 0.589 Mean :-0.000722 Mean :0.1861 Mean : 1.408
## 3rd Qu.: 0.746 3rd Qu.:-0.000722 3rd Qu.:0.1990 3rd Qu.: 1.761
## Max. : 9.335 Max. :-0.000722 Max. :0.9981 Max. :18.977
## Pr(z > 0)
## Min. :0.0000
## 1st Qu.:0.0391
## Median :0.3025
## Mean :0.2900
## 3rd Qu.:0.4886
## Max. :1.0000
## Manually make a Moran plot: Standardize the variable using the scale()
## function; save to a new column
soco$sPPOV <- scale(soco$PPOV)
# Create a lagged variable using the lag.listw(WEIGHTS, SCALED_DATA)
# function. Note: 'lag' variable means the values of the neighbors of the
# variable under consideration.
soco$lag_sPPOV <- lag.listw(soco_nbq_w, soco$sPPOV)
summary(soco$sPPOV)
## V1
## Min. :-2.151
## 1st Qu.:-0.700
## Median :-0.123
## Mean : 0.000
## 3rd Qu.: 0.591
## Max. : 4.062
summary(soco$lag_sPPOV)
## V1
## Min. :-1.9925
## 1st Qu.:-0.5298
## Median :-0.0762
## Mean : 0.0085
## 3rd Qu.: 0.4514
## Max. : 2.6478
# Plot the lag v. the scaled variable.
plot(x = soco$sPPOV, y = soco$lag_sPPOV, main = "Moran Scatterplot PPOV", xlab = "Scaled % Poverty",
ylab = "Scaled Lag of % Poverty")
abline(h = 0, v = 0) # Divides the plot into quadrants
abline(lm(soco$lag_sPPOV ~ soco$sPPOV), lty = 3, lwd = 4, col = "red") # linear model used to find line of best fit.
# Investigate outliers by clicking on one or two and then hitting escape
# (or click finish)
identify(soco$sPPOV, soco$lag_sPPOV, soco$CNTY_ST, cex = 0.8)
## integer(0)
## Quadrant I: high poverty counties surrounded by high poverty counties.
## Quadrant II: low poverty counties surrounded by high poverty counties.
## Quadrant III: low poverty counties surrounded by low poverty counties.
## Quadrant IV: high poverty counties surrounded by low poverty counties.
# Identify the Moran plot quadrant for each observation
soco$quad_sig <- NA # Start by creating a new column in the original data frame.
# Assign the appropriate quadrant based on whether the scaled and lag
# values are positive or negative. If not significant, assign a 5.
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV >= 0) & (soco_locm[, 5] <= 0.05),
"quad_sig"] <- 1
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV <= 0) & (soco_locm[, 5] <= 0.05),
"quad_sig"] <- 2
soco@data[(soco$sPPOV >= 0 & soco$lag_sPPOV <= 0) & (soco_locm[, 5] <= 0.05),
"quad_sig"] <- 3
soco@data[(soco$sPPOV <= 0 & soco$lag_sPPOV >= 0) & (soco_locm[, 5] <= 0.05),
"quad_sig"] <- 4
soco@data[(soco_locm[, 5] > 0.05), "quad_sig"] <- 5
# Presenting the results (from Paul Voss). Set the breaks for the thematic
# map classes
breaks <- seq(1, 5, 1)
# Set the corresponding labels for the thematic map classes
labels <- c("High-High", "Low-Low", "High-Low", "Low-High", "Not Signif.")
# see ?findInterval - This is necessary for making a map
np <- findInterval(soco$quad_sig, breaks)
# Assign colors to each map class
colors <- c("red", "blue", "lightpink", "skyblue2", "white")
plot(soco, col = colors[np]) #colors[np] manually sets the color for each county
mtext("Local Moran's I\n% Poverty in Southern Counties", cex = 1.5, side = 3,
line = 1)
legend("topleft", legend = labels, fill = colors, bty = "n")
# Correlogram
cor10 <- sp.correlogram(soco_nbq, soco$PPOV, order = 10, method = "I")
cor10 # Notice how the values for Moran's I go from significantly positive to significantly negative, crossing 0 around a lag of 8.
## Spatial correlogram for soco$PPOV
## method: Moran's I
## estimate expectation variance standard deviate Pr(I) two sided
## 1 (1387) 5.89e-01 -7.22e-04 2.61e-04 36.54 < 2e-16
## 2 (1387) 4.58e-01 -7.22e-04 1.30e-04 40.20 < 2e-16
## 3 (1387) 3.34e-01 -7.22e-04 8.85e-05 35.60 < 2e-16
## 4 (1387) 2.40e-01 -7.22e-04 7.03e-05 28.69 < 2e-16
## 5 (1387) 1.48e-01 -7.22e-04 5.90e-05 19.39 < 2e-16
## 6 (1387) 8.35e-02 -7.22e-04 5.03e-05 11.88 < 2e-16
## 7 (1387) 3.09e-02 -7.22e-04 4.33e-05 4.81 1.5e-06
## 8 (1387) -1.46e-02 -7.22e-04 3.83e-05 -2.24 0.025
## 9 (1387) -5.41e-02 -7.22e-04 3.47e-05 -9.06 < 2e-16
## 10 (1387) -7.38e-02 -7.22e-04 3.23e-05 -12.85 < 2e-16
##
## 1 (1387) ***
## 2 (1387) ***
## 3 (1387) ***
## 4 (1387) ***
## 5 (1387) ***
## 6 (1387) ***
## 7 (1387) ***
## 8 (1387) *
## 9 (1387) ***
## 10 (1387) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cor10, main = "Correlogram of % Poverty")