Eggering – Lab 1 Appendix

Eggering – Lab 1

#load file for PC #co2<- read.csv(choose.files(),header = TRUE) #load file for Mac #co2<- read.csv(file.choose(),header = TRUE)

# hard code to computer for Knitr
co2 <- read.csv("/Users/kennetheggering/CU stuff/GEOG Quant class/Lab 1/co2_LAB1.csv")

#

#

#

##############################################Begin Question 1###################################################

Question 1 -Do the CO2 measurements at the two sites differ significantly from each other?

# subset the data based on site
mauna <- co2[co2$site == 0, ]
jolla <- co2[co2$site == 1, ]
### are all the data normally distributed?
par(mfrow = c(1, 3))
qqnorm(co2$co2)
qqline(co2$co2)
### are the Mauna data normal
qqnorm(mauna$co2, main = "Mauna Quantile Quantile plot")
qqline(mauna$co2)
### are the Jolla data normal
qqnorm(jolla$co2, main = "Jolla Quantile Quantile plot")
qqline(jolla$co2)
title("\n\nQQplot and QQlines", outer = TRUE)

plot of chunk unnamed-chunk-3

### based on the qq plots (entire collection, mauna-only collection,
### jolla-only collection) the Data does not appear to be normally
### distributed, so we can throw out the t.test. The data are not normal
### complete histograms of co2 and subsetted data
par(mfrow = c(1, 3))
hist(co2$co2, main = "", xlab = "CO2 at both sites")
hist(mauna$co2, main = "", xlab = "CO2 at Mauna")
hist(jolla$co2, main = "", xlab = "CO2 at Jolla")
title("\n\nCO2 histograms of Mauna and Jolla", outer = TRUE)

plot of chunk unnamed-chunk-4

# none of the histograms shows a normal distribution of data
### the shapiro test calcs the probability that the sample was drawn from
### a normal population this is a statistical test (not visual)
shapiro.test(co2$co2)
## 
##  Shapiro-Wilk normality test
## 
## data:  co2$co2 
## W = 0.963, p-value = 1.202e-14
shapiro.test(mauna$co2)
## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$co2 
## W = 0.9581, p-value = 2.935e-10
shapiro.test(jolla$co2)
## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$co2 
## W = 0.966, p-value = 6.135e-09
### the data are not normally distributed. Try to transform the data
### log (transform) the data see if it fits does all the data fit a linear
### does the co2 data (when logged) fit a linear model
par(mfrow = c(1, 3))
co2.log <- log(co2$co2)
qqnorm(co2.log)
qqline(co2.log)

### does the Mauna data (when logged) fit a linear
mauna.log <- log(mauna$co2)
qqnorm(mauna.log, main = "Mauna QQ plot of log")
qqline(mauna.log)

### does the Jolla data (when logged) fit a linear?
jolla.log <- log(jolla$co2)
qqnorm(jolla.log, main = "Jolla QQ plot of log")
qqline(jolla.log)

plot of chunk unnamed-chunk-6

### the log of the data still isn't normal based on the plots above
### Shapiro test for all CO2
shapiro.test(co2.log)
## 
##  Shapiro-Wilk normality test
## 
## data:  co2.log 
## W = 0.9645, p-value = 2.632e-14
####### the results of the above tests show that the data are not normally
####### distributed (surprise!) Shapiro test for mauna
shapiro.test(mauna.log)
## 
##  Shapiro-Wilk normality test
## 
## data:  mauna.log 
## W = 0.9594, p-value = 4.756e-10
###### the results of the above tests show that the data are not normally
###### distributed (surprise!) Shapiro test for jolla
shapiro.test(jolla.log)
## 
##  Shapiro-Wilk normality test
## 
## data:  jolla.log 
## W = 0.9677, p-value = 1.271e-08
####### the results of the above tests show that the data are not normally
####### distributed (surprise!)

##fun fact: a soccer ball has 32 patches

### The wilcox makes no assumptions of normality in the data distribution.
### the wilcox also minimizes the outliers within the data (e.g. bill
### gates) it's a robust statistical technique wilcox answers question 1.
### paired=F because the measurements taken from different locations
wilcox.test(mauna$co2, jolla$co2, paired = F)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  mauna$co2 and jolla$co2 
## W = 105522, p-value = 0.393
## alternative hypothesis: true location shift is not equal to 0
### the p result of .393 tells us that we don't reject the null. Therefore
### we conclude that the CO2 measurements taken at the two sites do not
### differ significantly from one another

#######################################################end question 1########################################

#

#

#

#

#

#

#

#######################################################begin question 2########################################

# Question 2 - What is the overall rate of CO2 increase from 1969 to 2008?
# Simple regression models
model.simple <- lm(co2 ~ time, data = co2)
summary(model.simple)
## 
## Call:
## lm(formula = co2 ~ time, data = co2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.547 -2.362  0.449  2.302  8.334 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.68e+03   1.86e+01    -144   <2e-16 ***
## time         1.52e+00   9.35e-03     163   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.21 on 932 degrees of freedom
## Multiple R-squared: 0.966,   Adjusted R-squared: 0.966 
## F-statistic: 2.66e+04 on 1 and 932 DF,  p-value: <2e-16
lm1 <- lm(co2 ~ I(time^2) + site + I(time^2 * time), data = co2)
# par(mfrow = c(2, 2)) plot(lm1) summary(lm1)
model.mauna <- lm(co2 ~ time, data = mauna)
# par(mfrow = c(2, 2)) plot(model.mauna) summary(model.mauna)
model.jolla <- lm(co2 ~ time, data = jolla)
# par(mfrow = c(2, 2)) plot(model.jolla) summary(model.jolla)

dev.off()

par(mfrow = c(2, 2))
plot(model.simple$residuals ~ model.simple$fitted.values, main = "model.simple residuals")
plot(lm1$residuals ~ lm1$fitted.values, main = "lm1 residuals")
plot(model.mauna$residuals ~ model.mauna$fitted.values, main = "model.mauna residuals")
plot(model.jolla$residuals ~ model.jolla$fitted.values, main = "model.jolla residuals")

plot of chunk unnamed-chunk-13

# no matter how you look at it, the residuals are fitting a parabolic
# line. They do fit better in the polonomial (lm1)
# ???what about the following answer for question 2/3???? Question 2. What
# is the overall rate of CO2 increase from 1969 to 2008? Question 3. Does
# the rate of change appear to be steady, or is it
# accelerating/decelerating?
mauna.first <- mauna[1, 4]
mauna.last <- mauna[467, 4]

jolla.first <- jolla[1, 4]
jolla.last <- jolla[467, 4]

totalmonths <- 467
totalyears <- totalmonths/12
mauna_co2_change_per_year <- (mauna.last - mauna.first)/totalyears  ### 1.527 CO2 ppm/year
jolla_co2_change_per_year <- (jolla.last - jolla.first)/totalyears  ### 1.474 CO2 ppm/year

#######################################################end question 2########################################

#

#

#

#

#

#

#

#######################################################begin question 3########################################

# Question 3. Does the rate of change appear to be steady, or is it
# accelerating/decelerating? rate of change over first 5 years
mauna1st <- mauna[1, 4]
mauna60th <- mauna[60, 4]
rate_of_mauna_1st5 <- (mauna60th - mauna1st)/5

### rate of change over second 5 years
mauna61st <- mauna[61, 4]
mauna120th <- mauna[120, 4]
rate_of_mauna_2nd5 <- (mauna120th - mauna61st)/5

### rate of change over third 5 years
mauna121st <- mauna[121, 4]
mauna180th <- mauna[180, 4]
rate_of_mauna_3rd5 <- (mauna180th - mauna121st)/5

### rate of change over forth 5 years
mauna181st <- mauna[181, 4]
mauna240th <- mauna[240, 4]
rate_of_mauna_4th5 <- (mauna240th - mauna181st)/5

### rate of change over fifth 5 years
mauna241st <- mauna[241, 4]
mauna300th <- mauna[300, 4]
rate_of_mauna_5th5 <- (mauna300th - mauna241st)/5

### rate of change over sixth 5 years
mauna301st <- mauna[301, 4]
mauna360th <- mauna[360, 4]
rate_of_mauna_6th5 <- (mauna360th - mauna301st)/5

### rate of change over seventh 5 years
mauna361st <- mauna[361, 4]
mauna420th <- mauna[420, 4]
rate_of_mauna_7th5 <- (mauna420th - mauna361st)/5

### rate of change over eighth 5 years
mauna421st <- mauna[421, 4]
mauna467th <- mauna[467, 4]
rate_of_mauna_8th5 <- (mauna467th - mauna421st)/3.1966
#### not a complete 5 years it's 3.9166 years
### rate of change over 1st 5 years
jolla1st <- jolla[1, 4]
jolla60th <- jolla[60, 4]
rate_of_jolla_1st5 <- (jolla60th - jolla1st)/5

### rate of change over 2nd 5 years
jolla61st <- jolla[61, 4]
jolla120th <- jolla[120, 4]
rate_of_jolla_2nd5 <- (jolla120th - jolla61st)/5

### rate of change over 2nd 5 years
jolla121st <- jolla[121, 4]
jolla180th <- jolla[180, 4]
rate_of_jolla_3rd5 <- (jolla180th - jolla121st)/5

### rate of change over forth 5 years
jolla181st <- jolla[181, 4]
jolla240th <- jolla[240, 4]
rate_of_jolla_4th5 <- (jolla240th - jolla181st)/5

### rate of change over fifth 5 years
jolla241st <- jolla[241, 4]
jolla300th <- jolla[300, 4]
rate_of_jolla_5th5 <- (jolla300th - jolla241st)/5

### rate of change over sixth 5 years
jolla301st <- jolla[301, 4]
jolla360th <- jolla[360, 4]
rate_of_jolla_6th5 <- (jolla360th - jolla301st)/5

### rate of change over seventh 5 years
jolla361st <- jolla[361, 4]
jolla420th <- jolla[420, 4]
rate_of_jolla_7th5 <- (jolla420th - jolla361st)/5

### rate of change over eight 5 years
jolla421st <- jolla[421, 4]
jolla467th <- jolla[467, 4]
rate_of_jolla_8th5 <- (jolla467th - jolla421st)/3.9166  ####not a complete 5 years it's 3.9166 years
print(jolla5yearchanges <- c(rate_of_jolla_1st5, rate_of_jolla_2nd5, rate_of_jolla_3rd5, 
    rate_of_jolla_4th5, rate_of_jolla_5th5, rate_of_jolla_6th5, rate_of_jolla_7th5, 
    rate_of_jolla_8th5))
## [1] 0.740 1.318 1.334 1.536 1.012 1.884 1.674 1.660

print(mauna5yearchanges <- c(rate_of_mauna_1st5, rate_of_mauna_2nd5, rate_of_mauna_3rd5, 
    rate_of_mauna_4th5, rate_of_mauna_5th5, rate_of_mauna_6th5, rate_of_mauna_7th5, 
    rate_of_mauna_8th5))
## [1] 0.986 1.100 1.446 1.626 0.990 1.834 1.640 1.868

?qqline

### question 4: Is the rate of increase the same at both sites?
totalmonths <- 467
totalyears <- totalmonths/12
totalyears
## [1] 38.92

rate_of_mauna_change <- (mauna.last - mauna.first)/totalyears
rate_of_mauna_change  ### 1.527 CO2 ppm/year
## [1] 1.527
rate_of_jolla_change <- (jolla.last - jolla.first)/totalyears
rate_of_jolla_change  ### 1.474 CO2 ppm/year
## [1] 1.474

plot(mauna$co2 ~ mauna$time, col = "darkgreen", xlab = "Time", ylab = "", pch = 21, 
    cex = 0.6, main = "Mauna in Green, Jolla in Red")
axis(side = 2)
mtext("Mauna CO2", side = 2, line = 2, col = "darkgreen")
par(new = T)

plot(jolla$co2, axes = F, xlab = "", ylab = "", col = "red", pch = 24, cex = 0.6)
axis(side = 4)
mtext("Jolla CO2", side = 4, line = 2, col = "red")

plot of chunk unnamed-chunk-19

par(new = F)
### both sets do not appear linear (when zoomed in on the plot) note, it
### appears that La Jolla has a wider spread


### linear regression model for entire co2 dataset
model.simple <- lm(co2 ~ time, data = co2)
plot(co2 ~ time, data = co2)

plot of chunk unnamed-chunk-19

par(mfrow = c(2, 2))
plot(model.simple)

plot of chunk unnamed-chunk-19



### Create plots of residuals linear regression model for Mauna co2
### dataset
model.mauna <- lm(co2 ~ time, data = mauna)
plot(model.mauna$resid ~ mauna$time, col = "darkgreen", xlab = "From 1969-2007", 
    ylab = "", pch = 21, cex = 0.56, main = "Mauna in Green, Jolla in Red")
axis(side = 2)
mtext("Mauna CO2 resids", side = 2, line = 2, col = "darkgreen")
par(new = T)

# linear regression model for Jolla co2 dataset
model.jolla <- lm(co2 ~ time, data = jolla)
plot(model.jolla$resid ~ jolla$time, axes = F, xlab = "From 1969-2007", ylab = "", 
    col = "red", pch = 24, cex = 0.58)
axis(side = 4)
mtext("Jolla CO2 resids", side = 4, line = 2, col = "red")
par(new = F)

plot of chunk unnamed-chunk-19

# question 4: Is the rate of increase the same at both sites? Another
# alternative to using predict() is to center the data
par(mfrow = c(1, 3))

co2$cent <- co2$time - mean(co2$time)
lm3 <- lm(co2 ~ cent + I(cent^2), co2)
summary(lm3)
## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = co2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.005 -2.213  0.429  2.358  6.026 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.51e+02   1.47e-01  2378.9   <2e-16 ***
## cent        1.52e+00   8.74e-03   174.4   <2e-16 ***
## I(cent^2)   1.01e-02   8.70e-04    11.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97,    Adjusted R-squared: 0.97 
## F-statistic: 1.53e+04 on 2 and 931 DF,  p-value: <2e-16
plot(y = co2$co2, co2$cent, col = "brown", main = "Centered values of both co2", 
    ylab = "co2 (ppm)")

mauna$cent <- mauna$time - mean(mauna$time)
lmM <- lm(co2 ~ cent + I(cent^2), mauna)
summary(lmM)
## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = mauna)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.061 -1.766  0.155  1.836  4.419 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.50e+02   1.54e-01  2273.7   <2e-16 ***
## cent        1.54e+00   9.14e-03   168.9   <2e-16 ***
## I(cent^2)   9.92e-03   9.09e-04    10.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984,   Adjusted R-squared: 0.984 
## F-statistic: 1.43e+04 on 2 and 464 DF,  p-value: <2e-16
plot(y = mauna$co2, mauna$cent, col = "yellow", main = "Centered values of Mauna co2", 
    ylab = "co2 (ppm)")

jolla$cent <- jolla$time - mean(jolla$time)
lmJ <- lm(co2 ~ cent + I(cent^2), jolla)
summary(lmJ)
## 
## Call:
## lm(formula = co2 ~ cent + I(cent^2), data = jolla)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.30  -2.65   1.23   2.74   5.86 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.51e+02   2.47e-01 1423.31   <2e-16 ***
## cent        1.51e+00   1.46e-02  102.93   <2e-16 ***
## I(cent^2)   1.03e-02   1.46e-03    7.06    6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958,   Adjusted R-squared: 0.958 
## F-statistic: 5.32e+03 on 2 and 464 DF,  p-value: <2e-16
plot(y = jolla$co2, jolla$cent, col = "lightgreen", main = "Centered values of Jolla co2", 
    ylab = "co2 (ppm)")

plot of chunk unnamed-chunk-20

### all the data have a slight curve in them.  Meaning that the rate of
### increase is increasing (non-linear)