Li Xu R-Journal

GEOG 5023: Quantitative Methods In Geography

This is where I organize the quantitative methods and functions I have learned in GEOG 5023. The page will grow as the semester goes on. Click the links below for specific topics.

1. R Markdown Notes
2. Exercise 1 & Lecture 2: Introduction to R
3. Homework 1: Functions in R
4. Exercise 2 & Lecture 4: Intro to Correlation and Regression
5. Homework 2: Hypothesis Testing
6. Exercise 3 & Lecture 5: Multiple Regression


1. R Markdown Notes

I collect useful syntax for creating an Rmd file here.
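
A minimal sketch of the Rmd pieces I use most often (the chunk name and options below are just illustrative):

# Header 1, ## Header 2, *italic*, **bold**, `inline code`

```{r exampleChunk, echo=TRUE}
# code in a chunk is evaluated and its output appears below the chunk
summary(cars)  # 'cars' is a built-in R dataset
```

Inline results can be embedded in a sentence with `r mean(cars$speed)`.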


2. Exercise 1 & Lecture 2: Introduction to R

2.1 Assignment Operator & Combine Function

*Variable names in R are case sensitive.

x <- c(1, 2, 4, 8, 16)
x
## [1]  1  2  4  8 16

2.2 Arithmetic & Logical Operators
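
A few examples of the basic operators, using the vector x from 2.1 (a sketch; output omitted):

x + 2; x * 2      # element-wise addition and multiplication
x^2               # exponentiation
x %% 3            # modulo (remainder after division)
x > 4             # element-wise comparison, returns TRUE/FALSE
x > 4 & x < 20    # logical AND
x == 8 | x == 16  # logical OR
!(x > 4)          # logical NOT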

2.3 Importing Data

Take .csv for example:

CO2 <- read.csv("D:\\GEOG5023\\R Journal\\CO2.csv", header = T)

2.4 Working with Data

names(CO2)
## [1] "Plant"     "Spec"      "Type"      "Treatment" "conc"      "uptake"

2.5 Basic Graphics

hist(CO2$uptake, col = "blue", breaks = seq(0, 60, by = 2), xlab = "Rate (umol/m^2 sec)", 
    main = "CO2 Uptake Rates")

[Plot: histogram of CO2 uptake rates]

boxplot(CO2$uptake ~ CO2$Type, main = "CO2 Uptake Rates", ylab = "Rate (umol/m^2 sec)", 
    col = rainbow(2))

[Plot: boxplots of CO2 uptake rates by Type]

plot(CO2$uptake)

[Plot: CO2 uptake values plotted against observation index]

*plot() can also take two vectors: x values first, then y values (see the next example).
Other parameters: ylim sets the range for the y axis; xlim sets the range for the x axis.

plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate", 
    pch = 2, col = "red")

[Plot: CO2 uptake rate vs. CO2 concentration]

thames2 <- read.csv("D:\\GEOG5023\\R Journal\\thames.csv", sep = ",", 
    header = T)
plot(thames2$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Feildes", 
    type = "l", ylim = c(-1, 100))

[Plot: flow rate time series for Feildes]

*To add another line and a legend to the graph:

plot(thames2$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Feildes", 
    type = "l", ylim = c(-1, 100))
lines(thames2$Redbridge, lty = 2, col = "red")
legend(5, 100, c("Feildes", "Redbridge"), lty = c(1, 2), col = c("black", "red"))

[Plot: flow rates for Feildes and Redbridge, with a legend]

par(mfrow = c(2, 2))
hist(CO2$uptake, col = "blue", breaks = seq(0, 50, by = 5), xlab = "Rate (umol/m^2 sec)", 
    main = "CO2 Uptake Rates")
plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate", 
    pch = 2, col = "red")
boxplot(CO2$uptake ~ CO2$Type, main = "CO2 Uptake Rates", ylab = "Rate (umol/m^2 sec)", 
    col = rainbow(2))
plot(thames2$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Thames Tributaries", 
    type = "l")
lines(thames2$Redbridge, lty = 2, col = "red")
legend(5, 80, c("Feildes", "Redbridge"), lty = c(1, 2), col = c("black", "red"))

[Plot: 2 x 2 panel combining the four graphs above]

2.6 More Notes from Exercise 1

faculty2 <- read.csv("D:/GEOG5023/Exercise 1_Lecture 2/faculty.csv", sep = ",", 
    header = T)
nrow(faculty2)
## [1] 725
ncol(faculty2)
## [1] 11

3. Homework 1: Functions in R

3.1 Using Functions in R

## 'rain' contains actual rainfall data for Boulder, CO (2000-2011)
rain <- c(16, 18, 14, 22, 27, 17, 19, 17, 17, 22, 20, 22)
mean(rain)  #returns the average rainfall from 2000-2011 in Boulder, CO
## [1] 19.25
sum(rain)  #returns the total amount of rainfall during the study period
## [1] 231
length(rain)  #returns the length of the vector, i.e. the number of years of data
## [1] 12
sqrt(rain)  #Square root of rainfall values
##  [1] 4.000 4.243 3.742 4.690 5.196 4.123 4.359 4.123 4.123 4.690 4.472
## [12] 4.690

3.2 Writing Functions in R

Define my own function (standard_deviation) for computing the standard deviation:

standard_deviation <- function(inputData) {
    deviation <- inputData - mean(inputData)
    squareDeviation <- deviation^2
    meanSquareDeviation <- mean(squareDeviation)
    return(sqrt(meanSquareDeviation))
}
sdRain <- standard_deviation(rain)
sdRain
## [1] 3.394
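
Note that standard_deviation() divides by n, i.e. it computes the population standard deviation. R's built-in sd() divides by n - 1 (the sample standard deviation), so the two differ slightly; a quick check:

sd(rain)  # sample SD (divides by n - 1), slightly larger than 3.394
sd(rain) * sqrt((length(rain) - 1)/length(rain))  # rescaled: matches standard_deviation(rain)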

4. Exercise 2 & Lecture 4: Intro to Correlation and Regression

4.1 Simple Linear Regression

rain <- read.csv("D:\\GEOG5023\\Exercise 2_Lecture 4\\LA_RAIN.csv", header = TRUE)
plot(rain)

[Plot: scatterplot matrix of the rain data frame]

lm1 = lm(BSAAM ~ OPSLAKE, rain)
summary(lm1)
## 
## Call:
## lm(formula = BSAAM ~ OPSLAKE, data = rain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17604  -5338    332   3411  20876 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    27015       3219    8.39  1.9e-10 ***
## OPSLAKE         3752        216   17.39  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 8920 on 41 degrees of freedom
## Multiple R-squared: 0.881,   Adjusted R-squared: 0.878 
## F-statistic:  303 on 1 and 41 DF,  p-value: <2e-16

# how to extract the adjusted R-squared and the p-values
summary(lm1)$adj.r.squared
## [1] 0.8778
summary(lm1)$coefficients[7]
## [1] 1.927e-10
summary(lm1)$coefficients[8]
## [1] 1.56e-20
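
Indexing the coefficient matrix by position ([7], [8]) works but is fragile if the model changes; the matrix also carries row and column names, so the same p-values can be pulled out more readably:

summary(lm1)$coefficients["(Intercept)", "Pr(>|t|)"]  # p-value of the intercept
summary(lm1)$coefficients["OPSLAKE", "Pr(>|t|)"]  # p-value of the slope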

4.2 Transforms to make data more linear

# load the library for box-cox function
library(MASS)

bestLambda <- numeric(6)  # record the best power value to perform transform
R2s <- numeric(6)  # record R-square
pValues1 <- numeric(6)  # p-values of intercept
pValues2 <- numeric(6)  # p-values of coefficient

# use a 'for' loop to cycle through all 6 stations
for (i in 1:6) {
    # perform box-cox transformation
    a <- boxcox(rain$BSAAM ~ rain[, i + 1])
    # record the lambda that maximizes the log-likelihood
    bestLambda[i] <- a$x[a$y == max(a$y)]

    # perform linear regression after transformation
    lmi <- lm(rain$BSAAM^(bestLambda[i]) ~ rain[, i + 1])
    R2s[i] <- summary(lmi)$adj.r.squared
    pValues1[i] <- summary(lmi)$coefficients[7]
    pValues2[i] <- summary(lmi)$coefficients[8]
}
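
A slightly safer way to pick the best lambda inside the loop (a sketch): boxcox() returns the grid of lambdas in a$x and the profile log-likelihood in a$y, and which.max() avoids the exact floating-point equality test used above.

bestLambda[i] <- a$x[which.max(a$y)]  # position of the maximum log-likelihood
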
# load library for the ladder function
library(HH)
ladder(BSAAM ~ OPBPC, data = rain, main = "Ladder of Powers")
## Error: could not find function "ladder"
# choose the best powers by looking at the ladder plot (not shown here, since
# HH failed to load in this session)
BSAAM1 = log(rain$BSAAM)  # power=0
OPBPC1 = rain$OPBPC^0.5  # power=0.5
summary(lm(BSAAM1 ~ OPBPC1))  # R-squared increased by 0.01
## 
## Call:
## lm(formula = BSAAM1 ~ OPBPC1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3011 -0.0987 -0.0054  0.0821  0.3337 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.2275     0.0823   124.3  < 2e-16 ***
## OPBPC1        0.2854     0.0230    12.4  1.7e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.145 on 41 degrees of freedom
## Multiple R-squared: 0.79,    Adjusted R-squared: 0.785 
## F-statistic:  154 on 1 and 41 DF,  p-value: 1.74e-15

4.3 Linear Regression with two variables

summary(lm(BSAAM ~ OPRC + OPSLAKE, rain))
## 
## Call:
## lm(formula = BSAAM ~ OPRC + OPSLAKE, data = rain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15991  -6485   -498   4700  19946 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    22891       3278    6.98  2.0e-08 ***
## OPRC            1866        639    2.92   0.0057 ** 
## OPSLAKE         2401        503    4.77  2.5e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 8200 on 40 degrees of freedom
## Multiple R-squared: 0.902,   Adjusted R-squared: 0.897 
## F-statistic:  183 on 2 and 40 DF,  p-value: <2e-16

5. Homework 2: Hypothesis Testing

5.1 Import other data types in R

Take .dbf for example:

# Load the 'foreign' library in order to read dbase files
library(foreign)
# Import the .dbf data
MN <- read.dbf("D:/GEOG5023/Homework 2 Hypothesis Testing/mnmappluto.dbf")

5.2 Dummy Variable

Dummy variables are very useful for simplifying data and for categorical data analysis.

# Remove data with incorrect location information
MN <- MN[MN$YCoord > 0 & MN$XCoord > 0, ]

# Create a dummy variable HD(1=in a historic district, 0=not in a historic
# district)
MN$HD <- ifelse(is.na(MN[, "HistDist"]), 0, 1)

# convert MN$HD to a factor
MN$HD <- as.factor(MN$HD)
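
Once HD is a factor it can go straight into a model; a sketch of a one-dummy regression (output omitted), where the coefficient on HD1 is the difference in mean assessed value between historic-district buildings and the rest:

lm.hd <- lm(AssessTot ~ HD, data = MN)
summary(lm.hd)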


# Draw a map of historic districts.(Red=in a HD, Black=not in a HD)
#'col' changes the color of dots depending upon the value in the 'HD' column
#'pch' sets the symbol to a solid dot
#'cex' scales the dots to 0.5 times the normal size
plot(y = MN$YCoord, x = MN$XCoord, col = MN$HD, pch = 16, cex = 0.5)

[Plot: map of Manhattan buildings; red = in a historic district, black = not]


# split the 'MN' object based on whether a building is in or out of a
# historic district
inHD <- MN[MN$HD == 1, ]
outHD <- MN[MN$HD == 0, ]

5.3 Hypothesis Testing

The two-sample t-test is used to test the null hypothesis that two samples come from distributions with the same mean.

t.test(x = inHD$AssessTot, y = outHD$AssessTot)
## 
##  Welch Two Sample t-test
## 
## data:  inHD$AssessTot and outHD$AssessTot 
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0 
## 95 percent confidence interval:
##  -1724546 -1327117 
## sample estimates:
## mean of x mean of y 
##   1233743   2759574

The %in% operator is useful for selecting rows that match a set of values.

# Get a list of all blocks that contain historic buildings
blocks <- inHD$Block

# Select all buildings (from MN) that are on the same block as historic
# buildings: the line below selects all rows where the Block column
# contains a value in our list of blocks, and saves the result as a new object
HDB <- MN[MN$Block %in% blocks, ]
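
A toy example of what %in% returns (an element-wise membership test):

c(1, 2, 3, 4) %in% c(2, 4, 6)
## [1] FALSE  TRUE FALSE  TRUE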

5.4 Correlation

There are several different methods for computing correlation. The Pearson correlation coefficient summarizes the relationship between two continuous variables, for example temperature and elevation. Kendall's tau and Spearman's rank correlation coefficient measure the association between two variables by sorting each variable and assigning each observation a rank. To test the null hypothesis that rho = 0 we use a t-test. Conceptually, when either the observed correlation r or the number of observations n is large, the t-statistic increases (making it more likely that we will reject the null hypothesis). The function cor.test() computes this t-statistic and returns a confidence interval for rho given the observed data.

cor(x, y)  # x and y are two numeric vectors
cor.test(x, y, method = "pearson")  # Pearson is the default
cor.test(x, y, method = "kendall")
cor.test(x, y, method = "spearman")

5.5 Layout Function

layout() divides the plotting device into as many rows and columns as there are in the matrix mat, with the column widths and row heights specified in the respective arguments.
The numbers in the matrix give the locations of the next N figures: for example, 1, 2 and 3 mark where the 1st, 2nd and 3rd figures go on the output device, and 0 means no figure in that location. The two numbers after the matrix data define the number of rows and columns in the matrix.

layout(matrix(c(1, 2, 0, 0), 1, 2, byrow = TRUE))
plot(y = MN$YCoord, x = MN$XCoord, col = MN$HD, pch = 16, cex = 0.5, main = "Map of Manhattan Buildings", 
    xlab = "X Coordinate", ylab = "Y Coordinate")
plot(y = MN$AssessTot, x = MN$YCoord, main = "Building Value vs Y Coordinate(North-South\n Position)", 
    xlab = "Y Coordinate", ylab = "Building Value")

[Plot: map of Manhattan buildings (left) and building value vs. Y coordinate (right)]

6. Exercise 3 & Lecture 5: Multiple Regression

6.1 Work with .shp files in R

library(sp)
library(maptools)
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)
# READ DATA
USA <- readShapePoly("D:\\GEOG5023\\Exercise 3_Lecture 5\\Maps and Multiple Regression\\USA.shp")
# draw the .shp file
plot(USA)

[Plot: map of US counties from USA.shp]

# Remove counties with no votes
USA <- USA[USA$Total > 0, ]
# List the pieces of the file
slotNames(USA)
# summarize the file
summary(USA)
# summarize the data
summary(USA@data)
par(mfrow = c(1, 2))
display.brewer.all()

# let's make a 7-color 'Spectral' palette
pal7 <- brewer.pal(7, "Spectral")
# to see the colors
display.brewer.pal(7, "Spectral")

[Plot: the 7-color Spectral palette]


USA$BushPct <- USA$Bush/USA$Total
# create categories for display
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
cats7
## style: quantile
## [0.09308,0.4746)  [0.4746,0.5415)  [0.5415,0.5907)  [0.5907,0.6336) 
##              444              444              444              444 
##  [0.6336,0.6801)  [0.6801,0.7421)  [0.7421,0.9283] 
##              444              444              444

SevenColors <- findColours(cats7, pal7)

# draw the map using the specified classes and colors
plot(USA, col = SevenColors)

[Plot: choropleth map of Bush vote percentage in 7 quantile classes]
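
classIntervals() supports several classification styles besides "quantile" (e.g. "equal", "sd", "jenks"); a sketch of an equal-interval alternative:

cats7e <- classIntervals(USA$BushPct, n = 7, style = "equal")
plot(USA, col = findColours(cats7e, pal7))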

6.2 Multiple Regression

lmBush <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lmBush)
## 
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6082 -0.0731  0.0089  0.0779  0.3558 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.77e-01   5.60e-03   138.9   <2e-16 ***
## pctfemhh    -1.01e-02   3.61e-04   -27.9   <2e-16 ***
## homevalu    -7.60e-07   6.01e-08   -12.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256,   Adjusted R-squared: 0.255 
## F-statistic:  533 on 2 and 3105 DF,  p-value: <2e-16
lmBush.resid <- resid(lmBush)
plot(lmBush.resid ~ USA$BushPct)

[Plot: regression residuals vs. BushPct]

## Look at the spatial pattern of the residuals. Add a new column to USA to
## hold the residuals
USA$resid <- resid(lmBush)
pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
plot(USA, col = ThreeColors)

6.3 Model Selection in Multiple Regression

Use regsubsets() to choose the appropriate number of variables in a multiple regression.

# library for regsubsets()
library(leaps)
# read another dataset, '2004_Election_Counties.shp' (same as USA.shp, but
# with more fields)
USA2 <- readShapePoly("D:\\GEOG5023\\Exercise 3_Lecture 5\\Maps and Multiple Regression\\2004_Election_Counties.shp")
# use the data in the attribute table
USA2 = USA2@data
d <- regsubsets(Bush_pct ~ ., nbest = 1, nvmax = 20, data = USA2[, c(13, 16:38)])
d.fit <- summary(d)
par(mfrow = c(1, 2))
plot(1:20, y = d.fit$adjr2, type = "o", xlab = "Number of Parameters", ylab = "Adj. r^2")
abline(v = "10", col = "blue")
plot(1:20, y = d.fit$bic, type = "o", xlab = "Number of Parameters", ylab = "BIC")
abline(v = "10", col = "blue")

[Plot: adjusted R^2 (left) and BIC (right) vs. number of parameters]

# compare the candidate models by plotting the regsubsets object
plot(d)

[Plot: regsubsets variable-selection plot]

# or by inspecting a specific model; the row below shows the best 10-variable
# model (the included variables are starred)
d.fit$outmat[10, ]
##   MDratio      hosp   pcthisp  pcturban  urbrural  pctfemhh  pcincome 
##       " "       "*"       "*"       " "       " "       "*"       " " 
##   pctpoor  pctlt9ed   pcthsed  pctcoled  unemploy  pctwhtcl  homevalu 
##       "*"       " "       " "       "*"       "*"       " "       "*" 
##      rent   popdens   crowded   ginirev SmokecurM  SmokevrM SmokecurF 
##       " "       " "       "*"       "*"       " "       " "       " " 
##  SmokevrF     Obese 
##       "*"       " "

6.4 VIF and Multicollinearity

library(car)
## Loading required package: MASS
## Loading required package: nnet
# the 10 variables above
lm1 <- lm(Bush_pct ~ hosp + pcthisp + pctfemhh + pctpoor + pctcoled + unemploy + 
    homevalu + crowded + ginirev + SmokevrF, USA2)
# calculate VIF to inspect multicollinearity (rule of thumb: VIF > 10 signals a problem)
vif(lm1)
##     hosp  pcthisp pctfemhh  pctpoor pctcoled unemploy homevalu  crowded 
##    1.145    1.797    2.191    4.357    2.192    1.733    2.537    3.024 
##  ginirev SmokevrF 
##    3.180    1.739
plot(resid(lm1))

[Plot: residuals of the 10-variable model]


Created by: Li Xu; Created on: 01/22/2013; Updated on: 02/20/2013