Appendix for Lab 1 Report – Alex Crawford

Loading the Data

CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/co2_LAB1.csv", 
    header = T)
# Subsets to compare stations.
mauna <- subset(CO2, site == 0)
jolla <- subset(CO2, site == 1)

plot(CO2$co2 ~ CO2$time, main = "CO2 Concentration Increases over Time", xlab = "Time (years)", 
    ylab = "CO2 concentration (ppm)")

plot of chunk unnamed-chunk-2

Queestion 1

1. Do the CO2 measurements taken at the two sites differ significantly from one another?

Tests of Normality

par(mfrow = c(2, 2))
hist(mauna$co2, main = "Histogram of Mauna Loa CO2 Concentrations", xlab = "CO2 Concentration (ppm)")
hist(jolla$co2, main = "Histogram of La Jolla CO2 Concentrations", xlab = "CO2 Concentration (ppm)")
qqnorm(mauna$co2, main = "Mauna Loa CO2 QQ-Plot")
qqline(mauna$co2)
qqnorm(jolla$co2, main = "La Jolla CO2 QQ-Plot")
qqline(jolla$co2)

plot of chunk unnamed-chunk-3

# Data histograms shows the data do not form a bell curve.  No straight
# lines means no normality.

# Shapiro-Wilk Tests: Null Hypotheses: The distributions of these data are
# normal.  Alternative Hypotheses: The distributions of these data are not
# normal.
shapiro.test(mauna$co2)  # Significance indicates distribution is not normal.

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$co2 
## W = 0.9581, p-value = 2.935e-10

shapiro.test(jolla$co2)  # Significance indicates distribution is not normal.

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$co2 
## W = 0.966, p-value = 6.135e-09

Transformations

library(HH)

## Loading required package: lattice

## Loading required package: grid

## Loading required package: multcomp

## Loading required package: mvtnorm

## Loading required package: survival

## Loading required package: splines

## Loading required package: leaps

## Loading required package: RColorBrewer

## Loading required package: latticeExtra

ladder(co2 ~ time, mauna)

plot of chunk unnamed-chunk-5

ladder(co2 ~ time, jolla)

plot of chunk unnamed-chunk-5

# No amount of transformation promises to make the data normally
# distributed.  So Wilcox test is perferred.

Wilcox Test of Difference

# If data cannot be normally distributed, use wilcox, which acts like a
# t-test with ranked the data.  Null Hypothesis: The average CO2
# concentration is the same at each site and any variation is due to
# random chance.  Alternative Hypothesis: The average CO2 concentration is
# different at La Jolla from Mauna Loa.
wilcox.test(mauna$co2, jolla$co2, paired = T)  # Therefore, there is justification for using one site over the other.  Since Mauna Loa is considered more favorable, only Mauna Loa will be used for to answer questions 2 & 3.

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mauna$co2 and jolla$co2 
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0

summary(jolla$co2 - mauna$co2)  # Mean difference of 0.9496 ppm

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -5.84   -0.30    1.92    0.95    2.60    4.81

Questions 2 & 3

What is the overall rate of CO2 increase from 1969 to 2008?
Does the rate of change appear to be steady, or is it accelerating/decelerating?

Simple Linear Regression

# Null Hypothesis: the linear regression model does not explain the
# variation of CO2 concentration better than the mean value.  Alt
# Hypothesis: the linear regression model explains the variation of CO2
# concentration better than the mean value.
lm1 <- lm(co2 ~ time, mauna)
summary(lm1)  # Shows an overall rate of 1.544 ppm/yr (which is significant)

## 
## Call:
## lm(formula = co2 ~ time, data = mauna)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.479 -1.878  0.087  1.803  6.080 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.72e+03   2.03e+01    -134   <2e-16 ***
## time         1.54e+00   1.02e-02     151   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.48 on 465 degrees of freedom
## Multiple R-squared: 0.98,    Adjusted R-squared: 0.98 
## F-statistic: 2.28e+04 on 1 and 465 DF,  p-value: <2e-16

plot(lm1$residuals ~ lm1$fitted.values, main = "Residuals for Linear Regression Model", 
    xlab = "Fitted Values (ppm)", ylab = "Residuals")

plot of chunk unnamed-chunk-8

plot(mauna$co2 ~ mauna$time, main = "Fit for Linear Regression Model", xlab = "Time (years)", 
    ylab = "CO2 concentration (ppm)")
abline(coef(lm1), col = "blue", lwd = 3)

plot of chunk unnamed-chunk-8

# The residuals are not symmetrical about 0.  Therefore, there must be
# some bias in the model. By plotting the regression line, it becomes
# apparent that the data arcs and does not follow the line exactly.

Quadratic Regression

# Center the data:
mauna$cent <- mauna$time - mean(mauna$time)
# Perform quadratic regression: Null Hypothesis: the quadratic regression
# model does not explain the variation of y better than the mean y value.
# Alternative Hypothesis: the quadratic regression model explains the
# variation of y better than the mean y value.
lm2 <- lm(co2 ~ cent + I(cent^2), mauna)
summary(lm2)

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.061 -1.766  0.155  1.836  4.419 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.50e+02   1.54e-01  2273.7   <2e-16 ***
## cent        1.54e+00   9.14e-03   168.9   <2e-16 ***
## I(cent^2)   9.92e-03   9.09e-04    10.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984,   Adjusted R-squared: 0.984 
## F-statistic: 1.43e+04 on 2 and 464 DF,  p-value: <2e-16

plot(lm2$residuals ~ lm2$fitted.values, main = "Residuals for Quadratic Regression Model", 
    xlab = "Fitted Values (ppm)", ylab = "Residuals")

plot of chunk unnamed-chunk-10

lm5 <- lm(mauna$co2 ~ mauna$cent)
plot(co2 ~ cent, mauna, main = "CO2 Concentration Regression Models", xlab = "Centered Years (1969-2007)", 
    ylab = "CO2 Concentration (ppm)", cex = 0.6)
curve(predict(lm2, data.frame(cent = x), type = "resp"), add = TRUE, col = "red", 
    lwd = 3)
abline(coef(lm5), col = "blue", lwd = 3)
legend("bottomright", c("Linear", "Quadratic"), lty = c(1, 1), col = c("blue", 
    "red"))

plot of chunk unnamed-chunk-11

## Why is this model better?  1) The residuals visually fit better versus
## fitted values.  2) The R2 value is slightly higher.

Test Significance of Model Differences

# We perform the ANOVA to determine whether the two models are
# statistically different.  Null Hypothesis: The two models explain the
# variation in CO2 concentration equally well.  Alternative Hypothesis:
# One model explains the variation in CO2 concentration better than the
# other.
anova(lm1, lm2)

## Analysis of Variance Table
## 
## Model 1: co2 ~ time
## Model 2: co2 ~ cent + I(cent^2)
##   Res.Df  RSS Df Sum of Sq   F Pr(>F)    
## 1    465 2869                            
## 2    464 2284  1       585 119 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The result is significant, so the quadratic model really is better.

Describing the Slope

# The slope is the derivative of the function 0.009917x^2 + 1.544x + 350.1
# f'(x) = 0.0049585x + 1.544, where x = time deviation from mean time (in
# years) and f'(x) = CO2 concentration (in ppm) Since the coefficeint is
# about m=0.005 for the slope equation, the change is accelerating with
# time by about 5 ppb/yr each year.  To find the range of slopes, use
# f'(x) to determine slope at each point and then take the minimum and
# maximum.
mauna$slopes <- 0.0049585 * (mauna$cent) + 1.544
summary(mauna$slopes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.45    1.50    1.54    1.54    1.59    1.64

# The slope is between 1.448 and 1.640 ppm/yr.  The mean slope is 1.544
# ppm/yr (which, of course, is the same as the linear model reported).

Question 4

Is the rate of increase the same at both sites?

# Center the data:
mauna$cent <- mauna$time - mean(mauna$time)
jolla$cent <- jolla$time - mean(jolla$time)
# Perform a quadratic regression at each site: Null: the quadratic
# regression model does not explain the variation of y better than the
# mean y value.  Alt: the quadratic regression model explains the
# variation of y better than the mean y value.
lm4m <- lm(co2 ~ cent + I(cent^2), mauna)
lm4j <- lm(co2 ~ cent + I(cent^2), jolla)

par(mfrow = c(1, 2))
plot(co2 ~ cent, mauna, main = "Mauna Loa Quadratic Model", xlab = "Centered Years (1969-2007)", 
    ylab = "CO2 Concentration (ppm)", cex = 0.6, ylim = c(320, 390))
curve(predict(lm4m, data.frame(cent = x), type = "resp"), add = TRUE, col = "red", 
    lwd = 3)
plot(co2 ~ cent, jolla, main = "La Jolla Quadratic Model", xlab = "Centered Years (1969-2007)", 
    ylab = "CO2 Concentration (ppm)", cex = 0.6, ylim = c(320, 390))
curve(predict(lm4j, data.frame(cent = x), type = "resp"), add = TRUE, col = "blue", 
    lwd = 3)

plot of chunk unnamed-chunk-16

# plot(lm4m$residuals~lm4m$fitted.values,main='Residuals for Mauna Loa
# Quadratic', xlab='Centred Years (1969-2007)',ylab='Residuals')
# plot(lm4j$residuals~lm4j$fitted.values,main='Residuals for La Jolla
# Quadratic', xlab='Fitted Values (ppm)',ylab='Residuals')

Calculating Slopes


summary(lm4m)  # f(x) = 0.009917x^2 + 1.544x + 350.10 --> f'(x) = 0.0049585x + 1.544

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.061 -1.766  0.155  1.836  4.419 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.50e+02   1.54e-01  2273.7   <2e-16 ***
## cent        1.54e+00   9.14e-03   168.9   <2e-16 ***
## I(cent^2)   9.92e-03   9.09e-04    10.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984,   Adjusted R-squared: 0.984 
## F-statistic: 1.43e+04 on 2 and 464 DF,  p-value: <2e-16

summary(lm4j)  # f(x) = 0.01029x^2 + 1.506x + 350.10 --> f'(x) = 0.005145x + 1.506

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = jolla)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.30  -2.65   1.23   2.74   5.86 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.51e+02   2.47e-01 1423.31   <2e-16 ***
## cent        1.51e+00   1.46e-02  102.93   <2e-16 ***
## I(cent^2)   1.03e-02   1.46e-03    7.06    6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958,   Adjusted R-squared: 0.958 
## F-statistic: 5.32e+03 on 2 and 464 DF,  p-value: <2e-16

mauna$slopes = 0.0049585 * (mauna$cent) + 1.544
jolla$slopes = 0.005145 * (jolla$cent) + 1.506
summary(mauna$slopes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.45    1.50    1.54    1.54    1.59    1.64

summary(jolla$slopes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.41    1.46    1.51    1.51    1.56    1.61

Comparing Rates of Increase

# Null Hypothesis: The data are normally distributed.  Alternative
# Hypothesis: The data are not normally distributed.
shapiro.test(jolla$slopes)

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$slopes 
## W = 0.9547, p-value = 8.884e-11

shapiro.test(mauna$slopes)

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$slopes 
## W = 0.9547, p-value = 8.885e-11

# The data are not normally distributed, so a Wilcox test will be used.
# Null Hypothesis: There is no difference between the rates of CO2
# concentration increase at Mauna Loa and La Jolla Alternative Hypothesis:
# There is a difference between the rates of CO2 concentration increase at
# Mauna Loa and La Jolla
wilcox.test(jolla$slopes, mauna$slopes, paired = T)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  jolla$slopes and mauna$slopes 
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# The wilcox test indicates that there IS a significant difference in the
# slopes.  La Jolla has a slower rate of increase than Mauna Loa.

Alternative Method for Question 4

# Null Hypothesis: the quadratic split regression model does not explain
# the variation of y better than the mean y value.  Alternative
# Hypothesis: the quadratic split regression model explains the variation
# of y better than the mean y value.
CO2$cent <- CO2$time - mean(CO2$time)
lm3 <- lm(co2 ~ cent + I(cent^2) + site, CO2)
summary(lm3)

## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2) + site, data = CO2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.48  -2.11   0.55   2.30   5.55 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.50e+02   1.75e-01 2000.90  < 2e-16 ***
## cent        1.52e+00   8.64e-03  176.51  < 2e-16 ***
## I(cent^2)   1.01e-02   8.60e-04   11.75  < 2e-16 ***
## site        9.50e-01   1.94e-01    4.89  1.2e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.97 on 930 degrees of freedom
## Multiple R-squared: 0.971,   Adjusted R-squared: 0.971 
## F-statistic: 1.04e+04 on 3 and 930 DF,  p-value: <2e-16

# The additional null hypothesis that difference site does not explain a
# part of the variation in y can be rejected.  The rate of increase is not
# the same.

par(mfrow = c(1, 2))
plot(lm3$residuals ~ lm3$fitted.values, main = "Residuals for Quadratic Split Model", 
    xlab = "Fitted Values (ppm)", ylab = "Residuals")
plot(CO2$co2 ~ CO2$cent, pch = CO2$site, cex = 0.6, main = "Fits for Quadratic Split Model", 
    xlab = "Centered Years (1969-2007)", ylab = "CO2 Concentration (ppm)")
lines(predict(lm3) ~ cent, data = CO2, subset = site == 0, col = "red", lwd = 2)
lines(predict(lm3) ~ cent, data = CO2, subset = site == 1, col = "blue", lwd = 2)
legend("bottomright", c("Mauna Loa", "La Jolla"), lty = c(1, 1), col = c("red", 
    "blue"))

plot of chunk unnamed-chunk-20