#load file for PC #co2<- read.csv(choose.files(),header = TRUE) #load file for Mac #co2<- read.csv(file.choose(),header = TRUE)
# hard code to computer for Knitr
co2 <- read.csv("/Users/kennetheggering/CU stuff/GEOG Quant class/Lab 1/co2_LAB1.csv")
#
#
#
##############################################Begin Question 1###################################################
Question 1 -Do the CO2 measurements at the two sites differ significantly from each other?
# subset the data based on site
mauna <- co2[co2$site == 0, ]
jolla <- co2[co2$site == 1, ]
### are all the data normally distributed?
par(mfrow = c(1, 3))
qqnorm(co2$co2)
qqline(co2$co2)
### are the Mauna data normal
qqnorm(mauna$co2, main = "Mauna Quantile Quantile plot")
qqline(mauna$co2)
### are the Jolla data normal
qqnorm(jolla$co2, main = "Jolla Quantile Quantile plot")
qqline(jolla$co2)
title("\n\nQQplot and QQlines", outer = TRUE)
### based on the qq plots (entire collection, mauna-only collection,
### jolla-only collection) the Data does not appear to be normally
### distributed, so we can throw out the t.test. The data are not normal
### complete histograms of co2 and subsetted data
par(mfrow = c(1, 3))
hist(co2$co2, main = "", xlab = "CO2 at both sites")
hist(mauna$co2, main = "", xlab = "CO2 at Mauna")
hist(jolla$co2, main = "", xlab = "CO2 at Jolla")
title("\n\nCO2 histograms of Mauna and Jolla", outer = TRUE)
# none of the histograms shows a normal distribution of data
### the shapiro test calcs the probability that the sample was drawn from
### a normal population this is a statistical test (not visual)
shapiro.test(co2$co2)
##
## Shapiro-Wilk normality test
##
## data: co2$co2
## W = 0.963, p-value = 1.202e-14
shapiro.test(mauna$co2)
##
## Shapiro-Wilk normality test
##
## data: mauna$co2
## W = 0.9581, p-value = 2.935e-10
shapiro.test(jolla$co2)
##
## Shapiro-Wilk normality test
##
## data: jolla$co2
## W = 0.966, p-value = 6.135e-09
### the data are not normally distributed. Try to transform the data
### log (transform) the data see if it fits does all the data fit a linear
### does the co2 data (when logged) fit a linear model
par(mfrow = c(1, 3))
co2.log <- log(co2$co2)
qqnorm(co2.log)
qqline(co2.log)
### does the Mauna data (when logged) fit a linear
mauna.log <- log(mauna$co2)
qqnorm(mauna.log, main = "Mauna QQ plot of log")
qqline(mauna.log)
### does the Jolla data (when logged) fit a linear?
jolla.log <- log(jolla$co2)
qqnorm(jolla.log, main = "Jolla QQ plot of log")
qqline(jolla.log)
### the log of the data still isn't normal based on the plots above
### Shapiro test for all CO2
shapiro.test(co2.log)
##
## Shapiro-Wilk normality test
##
## data: co2.log
## W = 0.9645, p-value = 2.632e-14
####### the results of the above tests show that the data are not normally
####### distributed (surprise!) Shapiro test for mauna
shapiro.test(mauna.log)
##
## Shapiro-Wilk normality test
##
## data: mauna.log
## W = 0.9594, p-value = 4.756e-10
###### the results of the above tests show that the data are not normally
###### distributed (surprise!) Shapiro test for jolla
shapiro.test(jolla.log)
##
## Shapiro-Wilk normality test
##
## data: jolla.log
## W = 0.9677, p-value = 1.271e-08
####### the results of the above tests show that the data are not normally
####### distributed (surprise!)
##fun fact: a soccer ball has 32 patches
### The wilcox makes no assumptions of normality in the data distribution.
### the wilcox also minimizes the outliers within the data (e.g. bill
### gates) it's a robust statistical technique wilcox answers question 1.
### paired=F because the measurements taken from different locations
wilcox.test(mauna$co2, jolla$co2, paired = F)
##
## Wilcoxon rank sum test with continuity correction
##
## data: mauna$co2 and jolla$co2
## W = 105522, p-value = 0.393
## alternative hypothesis: true location shift is not equal to 0
### the p result of .393 tells us that we don't reject the null. Therefore
### we conclude that the CO2 measurements taken at the two sites do not
### differ significantly from one another
#######################################################end question 1########################################
#
#
#
#
#
#
#
#######################################################begin question 2########################################
# Question 2 - What is the overall rate of CO2 increase from 1969 to 2008?
# Simple regression models
model.simple <- lm(co2 ~ time, data = co2)
summary(model.simple)
##
## Call:
## lm(formula = co2 ~ time, data = co2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.547 -2.362 0.449 2.302 8.334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.68e+03 1.86e+01 -144 <2e-16 ***
## time 1.52e+00 9.35e-03 163 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.21 on 932 degrees of freedom
## Multiple R-squared: 0.966, Adjusted R-squared: 0.966
## F-statistic: 2.66e+04 on 1 and 932 DF, p-value: <2e-16
lm1 <- lm(co2 ~ I(time^2) + site + I(time^2 * time), data = co2)
# par(mfrow = c(2, 2)) plot(lm1) summary(lm1)
model.mauna <- lm(co2 ~ time, data = mauna)
# par(mfrow = c(2, 2)) plot(model.mauna) summary(model.mauna)
model.jolla <- lm(co2 ~ time, data = jolla)
# par(mfrow = c(2, 2)) plot(model.jolla) summary(model.jolla)
dev.off()
par(mfrow = c(2, 2))
plot(model.simple$residuals ~ model.simple$fitted.values, main = "model.simple residuals")
plot(lm1$residuals ~ lm1$fitted.values, main = "lm1 residuals")
plot(model.mauna$residuals ~ model.mauna$fitted.values, main = "model.mauna residuals")
plot(model.jolla$residuals ~ model.jolla$fitted.values, main = "model.jolla residuals")
# no matter how you look at it, the residuals are fitting a parabolic
# line. They do fit better in the polonomial (lm1)
# ???what about the following answer for question 2/3???? Question 2. What
# is the overall rate of CO2 increase from 1969 to 2008? Question 3. Does
# the rate of change appear to be steady, or is it
# accelerating/decelerating?
mauna.first <- mauna[1, 4]
mauna.last <- mauna[467, 4]
jolla.first <- jolla[1, 4]
jolla.last <- jolla[467, 4]
totalmonths <- 467
totalyears <- totalmonths/12
mauna_co2_change_per_year <- (mauna.last - mauna.first)/totalyears ### 1.527 CO2 ppm/year
jolla_co2_change_per_year <- (jolla.last - jolla.first)/totalyears ### 1.474 CO2 ppm/year
#######################################################end question 2########################################
#
#
#
#
#
#
#
#######################################################begin question 3########################################
# Question 3. Does the rate of change appear to be steady, or is it
# accelerating/decelerating? rate of change over first 5 years
mauna1st <- mauna[1, 4]
mauna60th <- mauna[60, 4]
rate_of_mauna_1st5 <- (mauna60th - mauna1st)/5
### rate of change over second 5 years
mauna61st <- mauna[61, 4]
mauna120th <- mauna[120, 4]
rate_of_mauna_2nd5 <- (mauna120th - mauna61st)/5
### rate of change over third 5 years
mauna121st <- mauna[121, 4]
mauna180th <- mauna[180, 4]
rate_of_mauna_3rd5 <- (mauna180th - mauna121st)/5
### rate of change over forth 5 years
mauna181st <- mauna[181, 4]
mauna240th <- mauna[240, 4]
rate_of_mauna_4th5 <- (mauna240th - mauna181st)/5
### rate of change over fifth 5 years
mauna241st <- mauna[241, 4]
mauna300th <- mauna[300, 4]
rate_of_mauna_5th5 <- (mauna300th - mauna241st)/5
### rate of change over sixth 5 years
mauna301st <- mauna[301, 4]
mauna360th <- mauna[360, 4]
rate_of_mauna_6th5 <- (mauna360th - mauna301st)/5
### rate of change over seventh 5 years
mauna361st <- mauna[361, 4]
mauna420th <- mauna[420, 4]
rate_of_mauna_7th5 <- (mauna420th - mauna361st)/5
### rate of change over eighth 5 years
mauna421st <- mauna[421, 4]
mauna467th <- mauna[467, 4]
rate_of_mauna_8th5 <- (mauna467th - mauna421st)/3.1966
#### not a complete 5 years it's 3.9166 years
### rate of change over 1st 5 years
jolla1st <- jolla[1, 4]
jolla60th <- jolla[60, 4]
rate_of_jolla_1st5 <- (jolla60th - jolla1st)/5
### rate of change over 2nd 5 years
jolla61st <- jolla[61, 4]
jolla120th <- jolla[120, 4]
rate_of_jolla_2nd5 <- (jolla120th - jolla61st)/5
### rate of change over 2nd 5 years
jolla121st <- jolla[121, 4]
jolla180th <- jolla[180, 4]
rate_of_jolla_3rd5 <- (jolla180th - jolla121st)/5
### rate of change over forth 5 years
jolla181st <- jolla[181, 4]
jolla240th <- jolla[240, 4]
rate_of_jolla_4th5 <- (jolla240th - jolla181st)/5
### rate of change over fifth 5 years
jolla241st <- jolla[241, 4]
jolla300th <- jolla[300, 4]
rate_of_jolla_5th5 <- (jolla300th - jolla241st)/5
### rate of change over sixth 5 years
jolla301st <- jolla[301, 4]
jolla360th <- jolla[360, 4]
rate_of_jolla_6th5 <- (jolla360th - jolla301st)/5
### rate of change over seventh 5 years
jolla361st <- jolla[361, 4]
jolla420th <- jolla[420, 4]
rate_of_jolla_7th5 <- (jolla420th - jolla361st)/5
### rate of change over eight 5 years
jolla421st <- jolla[421, 4]
jolla467th <- jolla[467, 4]
rate_of_jolla_8th5 <- (jolla467th - jolla421st)/3.9166 ####not a complete 5 years it's 3.9166 years
print(jolla5yearchanges <- c(rate_of_jolla_1st5, rate_of_jolla_2nd5, rate_of_jolla_3rd5,
rate_of_jolla_4th5, rate_of_jolla_5th5, rate_of_jolla_6th5, rate_of_jolla_7th5,
rate_of_jolla_8th5))
## [1] 0.740 1.318 1.334 1.536 1.012 1.884 1.674 1.660
print(mauna5yearchanges <- c(rate_of_mauna_1st5, rate_of_mauna_2nd5, rate_of_mauna_3rd5,
rate_of_mauna_4th5, rate_of_mauna_5th5, rate_of_mauna_6th5, rate_of_mauna_7th5,
rate_of_mauna_8th5))
## [1] 0.986 1.100 1.446 1.626 0.990 1.834 1.640 1.868
?qqline
### question 4: Is the rate of increase the same at both sites?
totalmonths <- 467
totalyears <- totalmonths/12
totalyears
## [1] 38.92
rate_of_mauna_change <- (mauna.last - mauna.first)/totalyears
rate_of_mauna_change ### 1.527 CO2 ppm/year
## [1] 1.527
rate_of_jolla_change <- (jolla.last - jolla.first)/totalyears
rate_of_jolla_change ### 1.474 CO2 ppm/year
## [1] 1.474
plot(mauna$co2 ~ mauna$time, col = "darkgreen", xlab = "Time", ylab = "", pch = 21,
cex = 0.6, main = "Mauna in Green, Jolla in Red")
axis(side = 2)
mtext("Mauna CO2", side = 2, line = 2, col = "darkgreen")
par(new = T)
plot(jolla$co2, axes = F, xlab = "", ylab = "", col = "red", pch = 24, cex = 0.6)
axis(side = 4)
mtext("Jolla CO2", side = 4, line = 2, col = "red")
par(new = F)
### both sets do not appear linear (when zoomed in on the plot) note, it
### appears that La Jolla has a wider spread
### linear regression model for entire co2 dataset
model.simple <- lm(co2 ~ time, data = co2)
plot(co2 ~ time, data = co2)
par(mfrow = c(2, 2))
plot(model.simple)
### Create plots of residuals linear regression model for Mauna co2
### dataset
model.mauna <- lm(co2 ~ time, data = mauna)
plot(model.mauna$resid ~ mauna$time, col = "darkgreen", xlab = "From 1969-2007",
ylab = "", pch = 21, cex = 0.56, main = "Mauna in Green, Jolla in Red")
axis(side = 2)
mtext("Mauna CO2 resids", side = 2, line = 2, col = "darkgreen")
par(new = T)
# linear regression model for Jolla co2 dataset
model.jolla <- lm(co2 ~ time, data = jolla)
plot(model.jolla$resid ~ jolla$time, axes = F, xlab = "From 1969-2007", ylab = "",
col = "red", pch = 24, cex = 0.58)
axis(side = 4)
mtext("Jolla CO2 resids", side = 4, line = 2, col = "red")
par(new = F)
# question 4: Is the rate of increase the same at both sites? Another
# alternative to using predict() is to center the data
par(mfrow = c(1, 3))
co2$cent <- co2$time - mean(co2$time)
lm3 <- lm(co2 ~ cent + I(cent^2), co2)
summary(lm3)
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = co2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.005 -2.213 0.429 2.358 6.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+02 1.47e-01 2378.9 <2e-16 ***
## cent 1.52e+00 8.74e-03 174.4 <2e-16 ***
## I(cent^2) 1.01e-02 8.70e-04 11.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97, Adjusted R-squared: 0.97
## F-statistic: 1.53e+04 on 2 and 931 DF, p-value: <2e-16
plot(y = co2$co2, co2$cent, col = "brown", main = "Centered values of both co2",
ylab = "co2 (ppm)")
mauna$cent <- mauna$time - mean(mauna$time)
lmM <- lm(co2 ~ cent + I(cent^2), mauna)
summary(lmM)
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.061 -1.766 0.155 1.836 4.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50e+02 1.54e-01 2273.7 <2e-16 ***
## cent 1.54e+00 9.14e-03 168.9 <2e-16 ***
## I(cent^2) 9.92e-03 9.09e-04 10.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.984
## F-statistic: 1.43e+04 on 2 and 464 DF, p-value: <2e-16
plot(y = mauna$co2, mauna$cent, col = "yellow", main = "Centered values of Mauna co2",
ylab = "co2 (ppm)")
jolla$cent <- jolla$time - mean(jolla$time)
lmJ <- lm(co2 ~ cent + I(cent^2), jolla)
summary(lmJ)
##
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = jolla)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.30 -2.65 1.23 2.74 5.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+02 2.47e-01 1423.31 <2e-16 ***
## cent 1.51e+00 1.46e-02 102.93 <2e-16 ***
## I(cent^2) 1.03e-02 1.46e-03 7.06 6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958, Adjusted R-squared: 0.958
## F-statistic: 5.32e+03 on 2 and 464 DF, p-value: <2e-16
plot(y = jolla$co2, jolla$cent, col = "lightgreen", main = "Centered values of Jolla co2",
ylab = "co2 (ppm)")
### all the data have a slight curve in them. Meaning that the rate of
### increase is increasing (non-linear)