X Wang - Lab 1

GEOG 5023: Quantitative Methods in Geography

# Load co2 file for Mac
co2 <- read.csv("/Users/xiwang/Dropbox/GEOG 5023 - offline/Labs/co2_LAB1.csv", 
    header = TRUE)
# Summarizes variables for the dataframe
summary(co2)

##       year          month            time           co2           site    
##  Min.   :1969   Min.   : 1.00   Min.   :1969   Min.   :322   Min.   :0.0  
##  1st Qu.:1978   1st Qu.: 4.00   1st Qu.:1979   1st Qu.:337   1st Qu.:0.0  
##  Median :1988   Median : 7.00   Median :1989   Median :351   Median :0.5  
##  Mean   :1988   Mean   : 6.51   Mean   :1989   Mean   :352   Mean   :0.5  
##  3rd Qu.:1998   3rd Qu.: 9.75   3rd Qu.:1998   3rd Qu.:366   3rd Qu.:1.0  
##  Max.   :2007   Max.   :12.00   Max.   :2008   Max.   :389   Max.   :1.0

plot(co2$co2 ~ co2$time, main = "Combined CO2 Concentrations over Time", xlab = "Time (years)", 
    ylab = "CO2 Concentration (ppm)")

plot of chunk unnamed-chunk-1

Question 1: Do the CO2 measurements taken at the two sites differ significantly from one another?

Evaluate data for normal distribution.

# Subset the data based on site
mauna <- subset(co2, site == 0)
jolla <- subset(co2, site == 1)
# Apply different tests for normality
par(mfrow = c(2, 2))
hist(mauna$co2, main = "Distribution of Mauna Loa Data", xlab = "CO2 Concentrations (ppm)")
hist(jolla$co2, main = "Distribution of La Jolla Data", xlab = "CO2 Concentrations (ppm)")
qqnorm(mauna$co2, main = "Mauna Loa Normal Q-Q Plot")
qqline(mauna$co2)
qqnorm(jolla$co2, main = "La Jolla Normal Q-Q Plot")
qqline(jolla$co2)

plot of chunk unnamed-chunk-2

# The histograms for both sites don't show a bell curve distribution. The
# Q-Q plots for both sites don't fit the line Q-Q line. Both tests
# indicate the data is not normally distributed.  The Shapiro-Wilk test
# tests the null hypothesis that the data comes from a
# normally-distributed population.
shapiro.test(mauna$co2)

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$co2 
## W = 0.9581, p-value = 2.935e-10

shapiro.test(jolla$co2)

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$co2 
## W = 0.966, p-value = 6.135e-09

# The p-values for both sites are statistically significant, which leads
# to a conclusion that the data is not normally distributed.

Apply transformations to make the data normally distributed.

library(HH)

## Loading required package: lattice

## Loading required package: grid

## Loading required package: multcomp

## Loading required package: mvtnorm

## Loading required package: survival

## Loading required package: splines

## Loading required package: leaps

## Loading required package: RColorBrewer

## Loading required package: latticeExtra

ladder(co2 ~ time, data = mauna, main = "Ladder of Powers")

plot of chunk unnamed-chunk-3

ladder(co2 ~ time, data = jolla, main = "Ladder of Powers")

plot of chunk unnamed-chunk-3

# Transform CO2 values to -1
mauna.co2.trans <- ((mauna$co2)^-1)
shapiro.test(mauna.co2.trans)

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna.co2.trans 
## W = 0.9599, p-value = 5.806e-10

jolla.co2.trans <- ((jolla$co2)^-1)
shapiro.test(jolla.co2.trans)

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla.co2.trans 
## W = 0.9686, p-value = 1.864e-08

# The p-values for both sites are statistically significant, leading to a
# conclusion that even with transformations, the data is not normally
# distributed.

Apply non-parametric Wilcoxon test to compare data from the two sites.

# The null hypothesis of the Wilcoxon test is that the CO2 concentrations
# measured at the two sites do not differ from each other.
wilcox.test(mauna$co2, jolla$co2, paired = TRUE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mauna$co2 and jolla$co2 
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0

# The p-value is statistically significant, leading to a rejection of the
# null hypothesis and the conclusion that measurements at the two sites
# differ statistically significantly from each other.

Question 2: What is the overall rate of CO2 increase from 1969 to 2008?

Perform linear regression to the combined site data.

lm.1 <- lm(co2 ~ time, data = co2)
# We are testing several hypotheses. For the f-test: the null hypothesis
# is that the variation explained by the model is due to chance. For the
# t-test of the b-paramater: the null hypothesis for the is that b = 0,
# meaning there is no significant relationship between time and CO2
# concentrations.
summary(lm.1)

## 
## Call:
## lm(formula = co2 ~ time, data = co2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.547 -2.362  0.449  2.302  8.334 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.68e+03   1.86e+01    -144   <2e-16 ***
## time         1.52e+00   9.35e-03     163   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.21 on 932 degrees of freedom
## Multiple R-squared: 0.966,   Adjusted R-squared: 0.966 
## F-statistic: 2.66e+04 on 1 and 932 DF,  p-value: <2e-16

# The summary shows that the p-values for both the f-test and the
# b-paramater are statistically significant, so we conclude the variation
# explained by the model is not due to chance and the amount of variation
# explained by the model is not 0. Furthermore, the R-squared value
# indicates the model explains about 96% of variance in the outcome
# (doesn't measure causation).  Look at the residuals to see if they meet
# the assumptions that 1) the residuals should have a mean of 0 and have a
# normal distribution around 0
par(mfrow = c(1, 2), pty = "s")  # squaring plotting region independent of device size
plot(lm.1, which = 1:2)

plot of chunk unnamed-chunk-5

# The residuals don't have a normal distrution around 0 and don't appear
# to have a mean of 0. The model appears to systematically overpredict the
# data.  Plot observations against the regression model to examine fit.
par(mfrow = c(1, 1))
plot(co2$co2 ~ co2$time, main = "Observations vs. Linear Regression Model", 
    xlab = "Time (years)", ylab = "CO2 Concentration (ppm)")
abline(coef(lm.1), col = "red", lwd = 3)

plot of chunk unnamed-chunk-5

# The data shows a slight curvature compared to the regression line, so a
# quadratic regression may provide a better fit.

Perform quadratic regression to the combined site data.

# Center the data.
co2$cent <- co2$time - mean(co2$time)
# Perform quadratic regression.
lm.2 <- lm(co2 ~ cent + I(cent^2), data = co2)
# The null hypothesis for the f-test is that the variation explained by
# the model is due to chance.
summary(lm.2)

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = co2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.005 -2.213  0.429  2.358  6.026 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.51e+02   1.47e-01  2378.9   <2e-16 ***
## cent        1.52e+00   8.74e-03   174.4   <2e-16 ***
## I(cent^2)   1.01e-02   8.70e-04    11.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97,    Adjusted R-squared: 0.97 
## F-statistic: 1.53e+04 on 2 and 931 DF,  p-value: <2e-16

# The p-value is statistically significant, so we reject the null
# hypothesis. The R-value is slightly higher than the linear model.  Check
# residuals.
par(mfrow = c(1, 2), pty = "s")
plot(lm.2, which = 1:2)

plot of chunk unnamed-chunk-6

# Residuals look more evenly distributed around 0.  Plot observations
# against the regression model to examine fit.
par(mfrow = c(1, 1))
plot(co2$co2 ~ co2$cent, main = "Observations vs. Quadratic Regression Model", 
    xlab = "Centered Time (1969-2007)", ylab = "CO2 Concentration (ppm)")
curve(predict(lm.2, data.frame(cent = x), type = "resp"), add = TRUE, col = "purple", 
    lwd = 3)
lm.2.linear <- lm(co2 ~ cent, data = co2)  # linear regression model for centered data
abline(coef(lm.2.linear), col = "red", lwd = 3)
legend("bottomright", c("Quadratic", "Linear"), col = c("purple", "red"), lty = 1)

plot of chunk unnamed-chunk-6

# The quadratic regression model appears to better describe the overall
# rate of CO2 increase because 1) the R-squared value is slightly higher,
# and 2) it appears to fit the data better visually.

Compare regression models.

# The anova can be used to compare the variance for different fitted
# models. The null hypothesis is there is no difference in how well the
# two models explain CO2 concentration over time.
anova(lm.1, lm.2)

## Analysis of Variance Table
## 
## Model 1: co2 ~ time
## Model 2: co2 ~ cent + I(cent^2)
##   Res.Df  RSS Df Sum of Sq   F Pr(>F)    
## 1    932 9608                            
## 2    931 8393  1      1215 135 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The residual sum of squares (RSS) is less for the quadratic model and
# p-value is statistically significant.

Question 3: Does the rate of change appear to be steady, or is it accelerating/decelerating?

The slope of the quadratic function describes the acceleration of the rate of CO2 increase.

# The slope is the derivative of the quadratic function f(x) = 0.00101x^2
# + 1.525x + 350.6, where x is the time deviation from the mean time in
# years, and f(x) is in CO2 concentration in PPM.
D(expression(0.00101 * x^2 + 1.525 * x + 350.6), "x")  # f'(x)

## 0.00101 * (2 * x) + 1.525

# f'(x) = 0.00202x -1.525 (form: y = mx + b), so the change is
# accelerating at m = 0.002 ppm every year.  Find slope at every x and
# find range and median.
co2$slopes <- 0.00202 * co2$cent + 1.525
summary(co2$slopes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.49    1.51    1.52    1.52    1.54    1.56

# The range of slopes is between 1.486 and 1.564, and the median is 1.525,
# which can be interpreted as the average rate of CO2 increase. Note this
# is very close to the slope on the linear model (1.52).

Question 4: Is the rate of increase the same at both sites?

Apply the quadratic regression to centered subset data for the two sites, take the derivatives, find slopes at every x, and compare the results.

# Center data.
mauna$cent <- mauna$time - mean(mauna$time)
jolla$cent <- jolla$time - mean(jolla$time)
# Perform quadratic regression.
lm.2.mauna <- lm(co2 ~ cent + I(cent^2), data = mauna)
lm.2.jolla <- lm(co2 ~ cent + I(cent^2), data = jolla)
par(mfrow = c(1, 2), pty = "s")
plot(co2 ~ cent, mauna, main = "Quadratic Model: Mauna Loa Site", xlab = "Centered Time (1969-2007)", 
    ylab = "CO2 Concentration (ppm)")
curve(predict(lm.2.mauna, data.frame(cent = x), type = "resp"), add = TRUE, 
    col = "purple", lwd = 3)
plot(co2 ~ cent, jolla, main = "Quadratic Model: La Jolla Site", xlab = "Centered Time (1969-2007)", 
    ylab = "CO2 Concentration (ppm)")
curve(predict(lm.2.jolla, data.frame(cent = x), type = "resp"), add = TRUE, 
    col = "purple", lwd = 3)

plot of chunk unnamed-chunk-9

# Take the derivatives.
summary(lm.2.mauna)

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.061 -1.766  0.155  1.836  4.419 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.50e+02   1.54e-01  2273.7   <2e-16 ***
## cent        1.54e+00   9.14e-03   168.9   <2e-16 ***
## I(cent^2)   9.92e-03   9.09e-04    10.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984,   Adjusted R-squared: 0.984 
## F-statistic: 1.43e+04 on 2 and 464 DF,  p-value: <2e-16

summary(lm.2.jolla)

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = jolla)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.30  -2.65   1.23   2.74   5.86 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.51e+02   2.47e-01 1423.31   <2e-16 ***
## cent        1.51e+00   1.46e-02  102.93   <2e-16 ***
## I(cent^2)   1.03e-02   1.46e-03    7.06    6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958,   Adjusted R-squared: 0.958 
## F-statistic: 5.32e+03 on 2 and 464 DF,  p-value: <2e-16

# Mauna Loa: f(x) = 0.0009917x^2 + 1.544x + 350.1 La Jolla: f(x) =
# 0.001029x^2 + 1.506x + 351 Where x is the time deviation from the mean
# time in years, and f(x) is in CO2 concentration in PPM.
D(expression(0.0009917 * x^2 + 1.544 * x + 350.1), "x")  # Mauna Loa f'(x) = 0.0019834x + 1.544

## 0.0009917 * (2 * x) + 1.544

D(expression(0.001029 * x^2 + 1.506 * x + 351), "x")  # La Jolla f'(x) = 0.002058x + 1.506

## 0.001029 * (2 * x) + 1.506

# Find slope at every x and find range and median.
mauna$slopes <- 0.0019834 * mauna$cent + 1.544
jolla$slopes <- 0.002058 * jolla$cent + 1.506
summary(mauna$slopes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.51    1.52    1.54    1.54    1.56    1.58

summary(jolla$slopes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.47    1.49    1.51    1.51    1.53    1.55

# Apply the Shapiro-Wilk test with the null hypothesis that the data comes
# from a normally-distributed population.
shapiro.test(mauna$slopes)

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$slopes 
## W = 0.9547, p-value = 8.884e-11

shapiro.test(jolla$slopes)

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$slopes 
## W = 0.9547, p-value = 8.886e-11

# The p-values for the rate of CO2 increase for both sites are
# statistically significant, which leads to a conclusion that the data is
# not normally distributed.  Apply non-parametric Wilcoxon test to compare
# data from the two sites. The null hypothesis is that the CO2
# concentrations measured at the two sites do not differ from each other.
wilcox.test(mauna$slopes, jolla$slopes, paired = TRUE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mauna$slopes and jolla$slopes 
## V = 109278, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# The p-value is statistically significant, leading to the conclusion that
# rate of CO2 increase at the two sites differs statistically
# significantly. La Jolla (1.506 ppm/year)has a slightly slower rate of
# increase compared to Mauna Loa (1.544 ppm/year), but a higher
# acceleration.