CO2 <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/co2_LAB1.csv",
header = T)
# Subsets to compare stations.
mauna <- subset(CO2, site == 0)
jolla <- subset(CO2, site == 1)
plot(CO2$co2 ~ CO2$time, main = "CO2 Concentration Increases over Time", xlab = "Time (years)",
ylab = "CO2 concentration (ppm)")
1. Do the CO2 measurements taken at the two sites differ significantly from one another?
Tests of Normality
par(mfrow = c(2, 2))
hist(mauna$co2, main = "Histogram of Mauna Loa CO2 Concentrations", xlab = "CO2 Concentration (ppm)")
hist(jolla$co2, main = "Histogram of La Jolla CO2 Concentrations", xlab = "CO2 Concentration (ppm)")
qqnorm(mauna$co2, main = "Mauna Loa CO2 QQ-Plot")
qqline(mauna$co2)
qqnorm(jolla$co2, main = "La Jolla CO2 QQ-Plot")
qqline(jolla$co2)
# Data histograms shows the data do not form a bell curve. No straight
# lines means no normality.
# Shapiro-Wilk Tests: Null Hypotheses: The distributions of these data are
# normal. Alternative Hypotheses: The distributions of these data are not
# normal.
shapiro.test(mauna$co2) # Significance indicates distribution is not normal.
##
## Shapiro-Wilk normality test
##
## data: mauna$co2
## W = 0.9581, p-value = 2.935e-10
shapiro.test(jolla$co2) # Significance indicates distribution is not normal.
##
## Shapiro-Wilk normality test
##
## data: jolla$co2
## W = 0.966, p-value = 6.135e-09
Transformations
library(HH)
## Loading required package: lattice
## Loading required package: grid
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: splines
## Loading required package: leaps
## Loading required package: RColorBrewer
## Loading required package: latticeExtra
ladder(co2 ~ time, mauna)
ladder(co2 ~ time, jolla)
# No amount of transformation promises to make the data normally
# distributed. So Wilcox test is perferred.
Wilcox Test of Difference
# If data cannot be normally distributed, use wilcox, which acts like a
# t-test with ranked the data. Null Hypothesis: The average CO2
# concentration is the same at each site and any variation is due to
# random chance. Alternative Hypothesis: The average CO2 concentration is
# different at La Jolla from Mauna Loa.
wilcox.test(mauna$co2, jolla$co2, paired = T) # Therefore, there is justification for using one site over the other. Since Mauna Loa is considered more favorable, only Mauna Loa will be used for to answer questions 2 & 3.
##
## Wilcoxon signed rank test with continuity correction
##
## data: mauna$co2 and jolla$co2
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0
summary(jolla$co2 - mauna$co2) # Mean difference of 0.9496 ppm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.84 -0.30 1.92 0.95 2.60 4.81
What is the overall rate of CO2 increase from 1969 to 2008?
Does the rate of change appear to be steady, or is it accelerating/decelerating?
Simple Linear Regression
# Null Hypothesis: the linear regression model does not explain the
# variation of CO2 concentration better than the mean value. Alt
# Hypothesis: the linear regression model explains the variation of CO2
# concentration better than the mean value.
lm1 <- lm(co2 ~ time, mauna)
summary(lm1) # Shows an overall rate of 1.544 ppm/yr (which is significant)
##
## Call:
## lm(formula = co2 ~ time, data = mauna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.479 -1.878 0.087 1.803 6.080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.72e+03 2.03e+01 -134 <2e-16 ***
## time 1.54e+00 1.02e-02 151 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.48 on 465 degrees of freedom
## Multiple R-squared: 0.98, Adjusted R-squared: 0.98
## F-statistic: 2.28e+04 on 1 and 465 DF, p-value: <2e-16
plot(lm1$residuals ~ lm1$fitted.values, main = "Residuals for Linear Regression Model",
xlab = "Fitted Values (ppm)", ylab = "Residuals")
plot(mauna$co2 ~ mauna$time, main = "Fit for Linear Regression Model", xlab = "Time (years)",
ylab = "CO2 concentration (ppm)")
abline(coef(lm1), col = "blue", lwd = 3)
# The residuals are not symmetrical about 0. Therefore, there must be
# some bias in the model. By plotting the regression line, it becomes
# apparent that the data arcs and does not follow the line exactly.
Quadratic Regression
# Center the data:
mauna$cent <- mauna$time - mean(mauna$time)
# Perform quadratic regression: Null Hypothesis: the quadratic regression
# model does not explain the variation of y better than the mean y value.
# Alternative Hypothesis: the quadratic regression model explains the
# variation of y better than the mean y value.
lm2 <- lm(co2 ~ cent + I(cent^2), mauna)
summary(lm2)
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.061 -1.766 0.155 1.836 4.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50e+02 1.54e-01 2273.7 <2e-16 ***
## cent 1.54e+00 9.14e-03 168.9 <2e-16 ***
## I(cent^2) 9.92e-03 9.09e-04 10.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.984
## F-statistic: 1.43e+04 on 2 and 464 DF, p-value: <2e-16
plot(lm2$residuals ~ lm2$fitted.values, main = "Residuals for Quadratic Regression Model",
xlab = "Fitted Values (ppm)", ylab = "Residuals")
lm5 <- lm(mauna$co2 ~ mauna$cent)
plot(co2 ~ cent, mauna, main = "CO2 Concentration Regression Models", xlab = "Centered Years (1969-2007)",
ylab = "CO2 Concentration (ppm)", cex = 0.6)
curve(predict(lm2, data.frame(cent = x), type = "resp"), add = TRUE, col = "red",
lwd = 3)
abline(coef(lm5), col = "blue", lwd = 3)
legend("bottomright", c("Linear", "Quadratic"), lty = c(1, 1), col = c("blue",
"red"))
## Why is this model better? 1) The residuals visually fit better versus
## fitted values. 2) The R2 value is slightly higher.
Test Significance of Model Differences
# We perform the ANOVA to determine whether the two models are
# statistically different. Null Hypothesis: The two models explain the
# variation in CO2 concentration equally well. Alternative Hypothesis:
# One model explains the variation in CO2 concentration better than the
# other.
anova(lm1, lm2)
## Analysis of Variance Table
##
## Model 1: co2 ~ time
## Model 2: co2 ~ cent + I(cent^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 465 2869
## 2 464 2284 1 585 119 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The result is significant, so the quadratic model really is better.
Describing the Slope
# The slope is the derivative of the function 0.009917x^2 + 1.544x + 350.1
# f'(x) = 0.0049585x + 1.544, where x = time deviation from mean time (in
# years) and f'(x) = CO2 concentration (in ppm) Since the coefficeint is
# about m=0.005 for the slope equation, the change is accelerating with
# time by about 5 ppb/yr each year. To find the range of slopes, use
# f'(x) to determine slope at each point and then take the minimum and
# maximum.
mauna$slopes <- 0.0049585 * (mauna$cent) + 1.544
summary(mauna$slopes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.45 1.50 1.54 1.54 1.59 1.64
# The slope is between 1.448 and 1.640 ppm/yr. The mean slope is 1.544
# ppm/yr (which, of course, is the same as the linear model reported).
Is the rate of increase the same at both sites?
# Center the data:
mauna$cent <- mauna$time - mean(mauna$time)
jolla$cent <- jolla$time - mean(jolla$time)
# Perform a quadratic regression at each site: Null: the quadratic
# regression model does not explain the variation of y better than the
# mean y value. Alt: the quadratic regression model explains the
# variation of y better than the mean y value.
lm4m <- lm(co2 ~ cent + I(cent^2), mauna)
lm4j <- lm(co2 ~ cent + I(cent^2), jolla)
par(mfrow = c(1, 2))
plot(co2 ~ cent, mauna, main = "Mauna Loa Quadratic Model", xlab = "Centered Years (1969-2007)",
ylab = "CO2 Concentration (ppm)", cex = 0.6, ylim = c(320, 390))
curve(predict(lm4m, data.frame(cent = x), type = "resp"), add = TRUE, col = "red",
lwd = 3)
plot(co2 ~ cent, jolla, main = "La Jolla Quadratic Model", xlab = "Centered Years (1969-2007)",
ylab = "CO2 Concentration (ppm)", cex = 0.6, ylim = c(320, 390))
curve(predict(lm4j, data.frame(cent = x), type = "resp"), add = TRUE, col = "blue",
lwd = 3)
# plot(lm4m$residuals~lm4m$fitted.values,main='Residuals for Mauna Loa
# Quadratic', xlab='Centred Years (1969-2007)',ylab='Residuals')
# plot(lm4j$residuals~lm4j$fitted.values,main='Residuals for La Jolla
# Quadratic', xlab='Fitted Values (ppm)',ylab='Residuals')
Calculating Slopes
summary(lm4m) # f(x) = 0.009917x^2 + 1.544x + 350.10 --> f'(x) = 0.0049585x + 1.544
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.061 -1.766 0.155 1.836 4.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50e+02 1.54e-01 2273.7 <2e-16 ***
## cent 1.54e+00 9.14e-03 168.9 <2e-16 ***
## I(cent^2) 9.92e-03 9.09e-04 10.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.984
## F-statistic: 1.43e+04 on 2 and 464 DF, p-value: <2e-16
summary(lm4j) # f(x) = 0.01029x^2 + 1.506x + 350.10 --> f'(x) = 0.005145x + 1.506
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = jolla)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.30 -2.65 1.23 2.74 5.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+02 2.47e-01 1423.31 <2e-16 ***
## cent 1.51e+00 1.46e-02 102.93 <2e-16 ***
## I(cent^2) 1.03e-02 1.46e-03 7.06 6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958, Adjusted R-squared: 0.958
## F-statistic: 5.32e+03 on 2 and 464 DF, p-value: <2e-16
mauna$slopes = 0.0049585 * (mauna$cent) + 1.544
jolla$slopes = 0.005145 * (jolla$cent) + 1.506
summary(mauna$slopes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.45 1.50 1.54 1.54 1.59 1.64
summary(jolla$slopes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.41 1.46 1.51 1.51 1.56 1.61
Comparing Rates of Increase
# Null Hypothesis: The data are normally distributed. Alternative
# Hypothesis: The data are not normally distributed.
shapiro.test(jolla$slopes)
##
## Shapiro-Wilk normality test
##
## data: jolla$slopes
## W = 0.9547, p-value = 8.884e-11
shapiro.test(mauna$slopes)
##
## Shapiro-Wilk normality test
##
## data: mauna$slopes
## W = 0.9547, p-value = 8.885e-11
# The data are not normally distributed, so a Wilcox test will be used.
# Null Hypothesis: There is no difference between the rates of CO2
# concentration increase at Mauna Loa and La Jolla Alternative Hypothesis:
# There is a difference between the rates of CO2 concentration increase at
# Mauna Loa and La Jolla
wilcox.test(jolla$slopes, mauna$slopes, paired = T)
##
## Wilcoxon signed rank test with continuity correction
##
## data: jolla$slopes and mauna$slopes
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# The wilcox test indicates that there IS a significant difference in the
# slopes. La Jolla has a slower rate of increase than Mauna Loa.
Alternative Method for Question 4
# Null Hypothesis: the quadratic split regression model does not explain
# the variation of y better than the mean y value. Alternative
# Hypothesis: the quadratic split regression model explains the variation
# of y better than the mean y value.
CO2$cent <- CO2$time - mean(CO2$time)
lm3 <- lm(co2 ~ cent + I(cent^2) + site, CO2)
summary(lm3)
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2) + site, data = CO2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.48 -2.11 0.55 2.30 5.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50e+02 1.75e-01 2000.90 < 2e-16 ***
## cent 1.52e+00 8.64e-03 176.51 < 2e-16 ***
## I(cent^2) 1.01e-02 8.60e-04 11.75 < 2e-16 ***
## site 9.50e-01 1.94e-01 4.89 1.2e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.97 on 930 degrees of freedom
## Multiple R-squared: 0.971, Adjusted R-squared: 0.971
## F-statistic: 1.04e+04 on 3 and 930 DF, p-value: <2e-16
# The additional null hypothesis that difference site does not explain a
# part of the variation in y can be rejected. The rate of increase is not
# the same.
par(mfrow = c(1, 2))
plot(lm3$residuals ~ lm3$fitted.values, main = "Residuals for Quadratic Split Model",
xlab = "Fitted Values (ppm)", ylab = "Residuals")
plot(CO2$co2 ~ CO2$cent, pch = CO2$site, cex = 0.6, main = "Fits for Quadratic Split Model",
xlab = "Centered Years (1969-2007)", ylab = "CO2 Concentration (ppm)")
lines(predict(lm3) ~ cent, data = CO2, subset = site == 0, col = "red", lwd = 2)
lines(predict(lm3) ~ cent, data = CO2, subset = site == 1, col = "blue", lwd = 2)
legend("bottomright", c("Mauna Loa", "La Jolla"), lty = c(1, 1), col = c("red",
"blue"))