This is a place where I organize the quantitative methods and functions I have learned in GEOG5023. This page will grow as the semester goes on. Click the links below for specific topics.
1. R Markdown Notes
2. Exercise 1 & Lecture 2: Introduction to R
3. Homework 1: Functions in R
4. Exercise 2 & Lecture 4: Intro to Correlation and Regression
5. Homework 2: Hypothesis Testing
6. Exercise 3 & Lecture 5: Multiple Regression
*Variables in R are case sensitive.
x <- c(1, 2, 4, 8, 16)
x
## [1] 1 2 4 8 16
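Because R is case sensitive, asking for X (capitalized) fails even though x exists:
X
## Error: object 'X' not found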
Take a .csv file, for example:
CO2 <- read.csv("D:\\GEOG5023\\R Journal\\CO2.csv", header = T)
names(CO2)
## [1] "Plant" "Spec" "Type" "Treatment" "conc" "uptake"
hist(CO2$uptake, col = "blue", breaks = seq(0, 60, by = 2), xlab = "Rate (umol/m^2 sec)",
main = "CO2 Uptake Rates")
boxplot(CO2$uptake ~ CO2$Type, main = "CO2 Uptake Rates", ylab = "Rate (umol/m^2 sec)",
col = rainbow(2))
plot(CO2$uptake)
*Plotting two variables against each other (a scatterplot):
Other parameters: ylim sets the range of the y axis; xlim sets the range of the x axis.
plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate",
pch = 2, col = "red")
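For example, the same scatterplot with explicit axis ranges (a small sketch; the ranges here are arbitrary):
plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate",
    pch = 2, col = "red", xlim = c(0, 1000), ylim = c(0, 50))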
thames2 <- read.csv("D:\\GEOG5023\\R Journal\\thames.csv", sep = ",",
header = T)
plot(thames2$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Feildes",
type = "l", ylim = c(-1, 100))
*To add another line to the graph and a legend:
plot(thames2$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Feildes",
type = "l", ylim = c(-1, 100))
lines(thames2$Redbridge, lty = 2, col = "red")
legend(5, 100, c("Feildes", "Redbridge"), lty = c(1, 2), col = c("black", "red"))
par(mfrow = c(2, 2)) # split the plotting device into 2 rows and 2 columns for the next four plots
hist(CO2$uptake, col = "blue", breaks = seq(0, 50, by = 5), xlab = "Rate (umol/m^2 sec)",
main = "CO2 Uptake Rates")
plot(CO2$conc, CO2$uptake, xlab = "CO2 Concentration", ylab = "CO2 Uptake Rate",
pch = 2, col = "red")
boxplot(CO2$uptake ~ CO2$Type, main = "CO2 Uptake Rates", ylab = "Rate (umol/m^2 sec)",
col = rainbow(2))
plot(thames2$Feildes, xlab = "Time", ylab = "Flow Rate", main = "Flow Rate for Thames Tributaries",
type = "l")
lines(thames2$Redbridge, lty = 2, col = "red")
legend(5, 80, c("Feildes", "Redbridge"), lty = c(1, 2), col = c("black", "red"))
faculty2 <- read.csv("D:/GEOG5023/Exercise 1_Lecture 2/faculty.csv", sep = ",",
header = T)
nrow(faculty2)
## [1] 725
ncol(faculty2)
## [1] 11
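dim() returns both values at once:
dim(faculty2)
## [1] 725  11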
## 'rain' contains actual rainfall data for Boulder, CO (2000-2011)
rain <- c(16, 18, 14, 22, 27, 17, 19, 17, 17, 22, 20, 22)
mean(rain) #returns the average rainfall from 2000-2011 in Boulder, CO
## [1] 19.25
sum(rain) #returns the total amount of rainfall during the study period
## [1] 231
length(rain) #returns the length of the vector, i.e. the number of years of data
## [1] 12
sqrt(rain) #Square root of rainfall values
## [1] 4.000 4.243 3.742 4.690 5.196 4.123 4.359 4.123 4.123 4.690 4.472
## [12] 4.690
Define my own function, standard_deviation(), for computing the standard deviation:
standard_deviation <- function(inputData) {
deviation <- inputData - mean(inputData)
squareDeviation <- deviation^2
meanSquareDeviation <- mean(squareDeviation)
return(sqrt(meanSquareDeviation))
}
sdRain <- standard_deviation(rain)
sdRain
## [1] 3.394
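Note that this function divides by n (the population standard deviation), whereas R's built-in sd() divides by n - 1 (the sample standard deviation), so the two results differ slightly:
sd(rain) # built-in sample standard deviation
## [1] 3.545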
rain <- read.csv("D:\\GEOG5023\\Exercise 2_Lecture 4\\LA_RAIN.csv", header = TRUE)
plot(rain)
lm1 = lm(BSAAM ~ OPSLAKE, rain)
summary(lm1)
##
## Call:
## lm(formula = BSAAM ~ OPSLAKE, data = rain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17604 -5338 332 3411 20876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27015 3219 8.39 1.9e-10 ***
## OPSLAKE 3752 216 17.39 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8920 on 41 degrees of freedom
## Multiple R-squared: 0.881, Adjusted R-squared: 0.878
## F-statistic: 303 on 1 and 41 DF, p-value: <2e-16
# how to extract the adjusted R-squared and the p-values from the model summary
summary(lm1)$adj.r.squared
## [1] 0.8778
summary(lm1)$coefficients[7]
## [1] 1.927e-10
summary(lm1)$coefficients[8]
## [1] 1.56e-20
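summary(lm1)$coefficients is a matrix with one row per term and four columns (Estimate, Std. Error, t value, Pr(>|t|)), so the single indices 7 and 8 pick out the two p-values by column-major position. Row/column indexing does the same thing more readably:
summary(lm1)$coefficients[1, 4] # p-value of the intercept
summary(lm1)$coefficients[2, 4] # p-value of the OPSLAKE coefficient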
# load the library for box-cox function
library(MASS)
bestLambda <- numeric(6) # record the best power (lambda) for each transformation
R2s <- numeric(6) # record the adjusted R-squared
pValues1 <- numeric(6) # p-values of the intercept
pValues2 <- numeric(6) # p-values of the coefficient
# use a 'for' loop to cycle through all 6 stations
for (i in 1:6) {
# compute the Box-Cox log-likelihood profile for this station
a <- boxcox(rain$BSAAM ~ rain[, i + 1])
# record the lambda (power) that maximizes the log-likelihood
bestLambda[i] <- a$x[a$y == max(a$y)]
# perform linear regression after transformation
lmi <- lm(rain$BSAAM^(bestLambda[i]) ~ rain[, i + 1])
R2s[i] <- summary(lmi)$adj.r.squared
pValues1[i] <- summary(lmi)$coefficients[7]
pValues2[i] <- summary(lmi)$coefficients[8]
}
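To compare the six stations, the recorded vectors can be collected into one table (a sketch, assuming the station columns occupy positions 2 to 7 of the data frame, as in the loop above):
results <- data.frame(station = names(rain)[2:7], lambda = bestLambda, adj.R2 = R2s,
    p.intercept = pValues1, p.slope = pValues2)
results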
# load library for the ladder function
library(HH)
ladder(BSAAM ~ OPBPC, data = rain, main = "Ladder of Powers")
## Error: could not find function "ladder"
# choose the best powers by looking at the ladder plot
BSAAM1 = log(rain$BSAAM) # power=0
OPBPC1 = rain$OPBPC^0.5 # power=0.5
summary(lm(BSAAM1 ~ OPBPC1)) # R-squared increased by 0.01
##
## Call:
## lm(formula = BSAAM1 ~ OPBPC1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3011 -0.0987 -0.0054 0.0821 0.3337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.2275 0.0823 124.3 < 2e-16 ***
## OPBPC1 0.2854 0.0230 12.4 1.7e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.145 on 41 degrees of freedom
## Multiple R-squared: 0.79, Adjusted R-squared: 0.785
## F-statistic: 154 on 1 and 41 DF, p-value: 1.74e-15
summary(lm(BSAAM ~ OPRC + OPSLAKE, rain))
##
## Call:
## lm(formula = BSAAM ~ OPRC + OPSLAKE, data = rain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15991 -6485 -498 4700 19946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22891 3278 6.98 2.0e-08 ***
## OPRC 1866 639 2.92 0.0057 **
## OPSLAKE 2401 503 4.77 2.5e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8200 on 40 degrees of freedom
## Multiple R-squared: 0.902, Adjusted R-squared: 0.897
## F-statistic: 183 on 2 and 40 DF, p-value: <2e-16
Take a .dbf file, for example:
# Load the 'foreign' library in order to read dBase files
library(foreign)
# Import the .dbf data
MN <- read.dbf("D:/GEOG5023/Homework 2 Hypothesis Testing/mnmappluto.dbf")
Dummy variables are very useful for simplifying data and for categorical data analysis.
# Remove data with incorrect location information
MN <- MN[MN$YCoord > 0 & MN$XCoord > 0, ]
# Create a dummy variable HD(1=in a historic district, 0=not in a historic
# district)
MN$HD <- ifelse(is.na(MN[, "HistDist"]), 0, 1)
# convert MN$HD to a factor
MN$HD <- as.factor(MN$HD)
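A quick way to check the dummy coding (a sketch; the counts depend on the data, so no output is shown):
table(MN$HD) # number of buildings outside (0) and inside (1) a historic district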
# Draw a map of historic districts (red = in a HD, black = not in a HD)
#'col' changes the color of dots depending upon the value in the 'HD' column
#'pch' sets the symbol to a solid dot
#'cex' makes the dot .5 the normal size
plot(y = MN$YCoord, x = MN$XCoord, col = MN$HD, pch = 16, cex = 0.5)
# split the 'MN' object based on whether a building is in or out of a
# historic district
inHD <- MN[MN$HD == 1, ]
outHD <- MN[MN$HD == 0, ]
The two-sample t-test is used to test the hypothesis that two samples come from distributions with the same mean.
t.test(x = inHD$AssessTot, y = outHD$AssessTot)
##
## Welch Two Sample t-test
##
## data: inHD$AssessTot and outHD$AssessTot
## t = -15.05, df = 43286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1724546 -1327117
## sample estimates:
## mean of x mean of y
## 1233743 2759574
The %in% operator is useful for selecting rows whose values match a given set.
# Get a list of all blocks that contain historic buildings
blocks <- inHD$Block
# Select all buildings (from MN) that are on the same block as historic
# buildings. The line below selects all rows where the Block column
# contains values in our list of blocks. Save the result as a new object.
HDB <- MN[MN$Block %in% blocks, ]
There are several different methods for computing correlation. The Pearson correlation coefficient is used to summarize the relationship between two continuous variables, for example, the relationship between temperature and elevation. Kendall's tau and Spearman's rank correlation coefficient compute the association between two variables by sorting each variable and assigning each observation a rank. To test the null hypothesis that rho = 0 we use a t-test. Conceptually, when either the observed correlation r or the number of observations n is large, the t-statistic increases (making it more likely that we will reject the null hypothesis). The function cor.test() computes this t-statistic and returns a confidence interval for rho given the observed data.
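The test statistic is t = r * sqrt((n - 2) / (1 - r^2)), which follows a t distribution with n - 2 degrees of freedom under the null hypothesis. In the calls below, x and y stand for two numeric vectors of equal length.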
cor(x, y)
cor.test(x, y, method = "pearson") #Pearson is the default.
cor.test(x, y, method = "kendall")
cor.test(x, y, method = "spearman")
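For example, to test the association between building value and north-south position in the Manhattan data (a sketch; output omitted):
cor.test(MN$YCoord, MN$AssessTot, method = "pearson")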
layout() divides the device into as many rows and columns as there are in the matrix mat, with the column widths and the row heights specified in the respective arguments.
The numbers in the matrix object give the locations of the next N figures: for example, 1, 2 and 3 are the locations of the 1st, 2nd and 3rd figures on the output device, and 0 means no figure in that location. The two numbers after the matrix data give the number of rows and columns in the matrix.
layout(matrix(c(1, 2, 0, 0), 1, 2, byrow = TRUE))
plot(y = MN$YCoord, x = MN$XCoord, col = MN$HD, pch = 16, cex = 0.5, main = "Map of Manhattan Buildings",
xlab = "X Coordinate", ylab = "Y Coordinate")
plot(y = MN$AssessTot, x = MN$YCoord, main = "Building Value vs Y Coordinate(North-South\n Position)",
xlab = "Y Coordinate", ylab = "Building Value")
library(sp)
library(maptools)
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)
# READ DATA
USA <- readShapePoly("D:\\GEOG5023\\Exercise 3_Lecture 5\\Maps and Multiple Regression\\USA.shp")
# draw the .shp file
plot(USA)
# Remove counties with no votes
USA <- USA[USA$Total > 0, ]
# List the pieces of the file
slotNames(USA)
# summarize the file
summary(USA)
# summarize the data
summary(USA@data)
par(mfrow = c(1, 2))
display.brewer.all()
# let's make a 7-color 'Spectral' palette
pal7 <- brewer.pal(7, "Spectral")
# to see the colors
display.brewer.pal(7, "Spectral")
USA$BushPct <- USA$Bush/USA$Total
# create categories for display
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
cats7
## style: quantile
## [0.09308,0.4746) [0.4746,0.5415) [0.5415,0.5907) [0.5907,0.6336)
## 444 444 444 444
## [0.6336,0.6801) [0.6801,0.7421) [0.7421,0.9283]
## 444 444 444
SevenColors <- findColours(cats7, pal7)
# draw the map using the specified class intervals and colors
plot(USA, col = SevenColors)
lmBush <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lmBush)
##
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6082 -0.0731 0.0089 0.0779 0.3558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.77e-01 5.60e-03 138.9 <2e-16 ***
## pctfemhh -1.01e-02 3.61e-04 -27.9 <2e-16 ***
## homevalu -7.60e-07 6.01e-08 -12.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256, Adjusted R-squared: 0.255
## F-statistic: 533 on 2 and 3105 DF, p-value: <2e-16
lmBush.resid <- resid(lmBush)
plot(lmBush.resid ~ USA$BushPct)
## Try to look at spatial pattern of residuals. Add a new column to USA to
## hold the residuals
USA$resid <- resid(lmBush)
pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
plot(USA, col = ThreeColors)
Use regsubsets() to choose the appropriate number of variables in a multiple regression.
# library for regsubsets()
library(leaps)
# read another dataset, '2004_Election_Counties.shp' (same as USA.shp, but with
# more fields)
USA2 <- readShapePoly("D:\\GEOG5023\\Exercise 3_Lecture 5\\Maps and Multiple Regression\\2004_Election_Counties.shp")
# use the data in the attribute table
USA2 = USA2@data
d <- regsubsets(Bush_pct ~ ., nbest = 1, nvmax = 20, data = USA2[, c(13, 16:38)])
d.fit <- summary(d)
par(mfrow = c(1, 2))
plot(1:20, y = d.fit$adjr2, type = "o", xlab = "Number of Parameters", ylab = "Adj. r^2")
abline(v = "10", col = "blue")
plot(1:20, y = d.fit$bic, type = "o", xlab = "Number of Parameters", ylab = "BIC")
abline(v = "10", col = "blue")
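The vertical line at 10 variables was chosen by eye; the same choice can be checked numerically (a sketch):
which.max(d.fit$adjr2) # model size with the highest adjusted R-squared
which.min(d.fit$bic) # model size with the lowest BIC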
# inspect the selected variables by plotting them
plot(d)
# or by printing a specific model; the line below shows the best 10-variable model
# (the variables marked with stars)
d.fit$outmat[10, ]
## MDratio hosp pcthisp pcturban urbrural pctfemhh pcincome
## " " "*" "*" " " " " "*" " "
## pctpoor pctlt9ed pcthsed pctcoled unemploy pctwhtcl homevalu
## "*" " " " " "*" "*" " " "*"
## rent popdens crowded ginirev SmokecurM SmokevrM SmokecurF
## " " " " "*" "*" " " " " " "
## SmokevrF Obese
## "*" " "
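The coefficients of the best 10-variable model can also be extracted directly with leaps' coef() method (a sketch; output omitted):
coef(d, 10)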
library(car)
## Loading required package: MASS
## Loading required package: nnet
# the 10 variables above
lm1 <- lm(Bush_pct ~ hosp + pcthisp + pctfemhh + pctpoor + pctcoled + unemploy +
homevalu + crowded + ginirev + SmokevrF, USA2)
# calculate VIF to inspect multicollinearity (values above about 10 are usually taken to signal a problem)
vif(lm1)
## hosp pcthisp pctfemhh pctpoor pctcoled unemploy homevalu crowded
## 1.145 1.797 2.191 4.357 2.192 1.733 2.537 3.024
## ginirev SmokevrF
## 3.180 1.739
plot(resid(lm1))
Created by: Li Xu; Created on: 01/22/2013; Updated on: 02/20/2013