## Lab 1 set a working directory if you want setwd('D:\\Quant\\Lab1')

# load file for PC
co2 <- read.csv("E:/Quant/Assignments/Lab01/co2_LAB1.csv")

# load file for Mac co2<- read.csv(file.choose(),header = TRUE)

# list the variable names (column names)
names(co2)

## [1] "year"  "month" "time"  "co2"   "site"


######################################### Question 1 -Do the CO2
######################################### measurements at the two sites
######################################### differ significantly from each
######################################### other? look at the entire
######################################### collection of co2 measurements
######################################### for normality

# Get variables for the two sites...
mauna <- co2[co2$site == 0, ]
jolla <- co2[co2$site == 1, ]

# Check for normality... For Mauna
hist(mauna$co2)

plot of chunk unnamed-chunk-1

qqnorm(mauna$co2)
qqline(mauna$co2)

plot of chunk unnamed-chunk-1

shapiro.test(mauna$co2)

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$co2 
## W = 0.9581, p-value = 2.935e-10

# H_o = the data is normally distributed p-values = 2.936e-10; Reject the
# null hypothessis. With 95% confidence, the data is NOT normally
# distributed.

# For jolla
hist(jolla$co2)

plot of chunk unnamed-chunk-1

qqnorm(jolla$co2)
qqline(jolla$co2)

plot of chunk unnamed-chunk-1

shapiro.test(jolla$co2)

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$co2 
## W = 0.966, p-value = 6.135e-09

# H_o = the data is normally distributed p-values = 6.137e-9; Reject the
# null hypothessis. With 95% confidence, the data is NOT normally
# distributed.

# We must to a non-parametric test to compare them... Do a
# Mann-Whitney(Wilcoxon) test...
wilcox.test(x = mauna$co2, y = jolla$co2, paired = T)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mauna$co2 and jolla$co2 
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0

# H_o = the difference between the two sites is 0 p-values = 0.393; Accept
# the null hypothessis. With 95% confidence, There is a difference between
# the sites.

######################################### Question 2 - What is the overall
######################################### rate of CO2 increase from 1969
######################################### to 2008?

# linear regression model for entire co2 dataset
model1 <- lm(co2 ~ time, data = co2)
par(mfrow = c(2, 2))
plot(model1)

plot of chunk unnamed-chunk-1

summary(model1)

## 
## Call:
## lm(formula = co2 ~ time, data = co2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.547 -2.362  0.449  2.302  8.334 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.68e+03   1.86e+01    -144   <2e-16 ***
## time         1.52e+00   9.35e-03     163   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.21 on 932 degrees of freedom
## Multiple R-squared: 0.966,   Adjusted R-squared: 0.966 
## F-statistic: 2.66e+04 on 1 and 932 DF,  p-value: <2e-16

# The residuals are not distributed around 0. Try to fit a polynomial
# through the data
model_quad <- lm(co2 ~ time + I(time^2), data = co2)
plot(model_quad)

plot of chunk unnamed-chunk-1

summary(model_quad)

## 
## Call:
## lm(formula = co2 ~ time + I(time^2), data = co2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.005 -2.213  0.429  2.358  6.026 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.73e+04   3.44e+03    10.8   <2e-16 ***
## time        -3.87e+01   3.46e+00   -11.2   <2e-16 ***
## I(time^2)    1.01e-02   8.70e-04    11.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97,    Adjusted R-squared: 0.97 
## F-statistic: 1.53e+04 on 2 and 931 DF,  p-value: <2e-16

# Ftest: N_o = model is due to random chance f-statistic = 1.527e4;
# p-value = 2.2e-16; Reject the null hypothesis With 95% confidence, we
# assume that the model is NOT due to random chance adjusted r^2 = 0.9704
# t-test to understand if the slope is 0 or not: Must center the values
co2$cent <- co2$time - mean(co2$time)
model_cent <- lm(co2$co2 ~ co2$cent + I(co2$cent^2))
summary(model_cent)

## 
## Call:
## lm(formula = co2$co2 ~ co2$cent + I(co2$cent^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.005 -2.213  0.429  2.358  6.026 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.51e+02   1.47e-01  2378.9   <2e-16 ***
## co2$cent      1.52e+00   8.74e-03   174.4   <2e-16 ***
## I(co2$cent^2) 1.01e-02   8.70e-04    11.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97,    Adjusted R-squared: 0.97 
## F-statistic: 1.53e+04 on 2 and 931 DF,  p-value: <2e-16

# T-test on model coefficients: H_o = the model coeficients = 0 for
# coefficient 1 (x): t-statistic: 174.38; p-value: 2e-16; rejct null
# hypothesis for coefficient 2 (x^2): t-statistic: 11.61; p-value : 2e-16;
# reject null hypothesis We can reject the null hypothesis is grater than
# 99.9% confidence that the slopes of the model are not equal to 0.
plot(co2$cent, co2$co2)
# model: y = 350.6 + 1.5x + 0.01x^2

anova(model1, model_quad)

## Analysis of Variance Table
## 
## Model 1: co2 ~ time
## Model 2: co2 ~ time + I(time^2)
##   Res.Df  RSS Df Sum of Sq   F Pr(>F)    
## 1    932 9608                            
## 2    931 8393  1      1215 135 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Using an f-test, we can determine if the two models are similar. H_o is
# that the two models are the same. f-statistic: 134.76; p-value 2.2e-16
# reject null hypothesis and determine that the models are not the same
# conclude that the quadratic model outperforms the liner model based on
# Rsquared values.

# use predict to estimate the starting and ending CO2 values in the model
pre <- predict(model_cent)
totalSamples <- length(predict(model_cent))
# summary(pre)
startingCO2 <- pre[1]
endingCO2 <- pre[totalSamples]
totalYrs <- 2008 - 1969
rateOfIncrease <- (endingCO2 - startingCO2)/totalYrs
rateOfIncrease

##   934 
## 1.518

# Linear rate of increase is about 1.52ppm per year.
# 
# Question 3 - does the rate of change appear to be steady or is it
# accelerating/decelerating?

# answered under question 2 - in summary(model_cent)

######################################### Question 4 - Is the rate of
######################################### increase different between the
######################################### sites?

# create model for Mauna
mauna_lm1 <- lm(co2 ~ time, data = mauna)
par(mfrow = c(2, 2))

plot of chunk unnamed-chunk-1

plot(mauna_lm1)

plot of chunk unnamed-chunk-1

# The residuals are not distributed around 0. Try to fit a polynomial
# through the data
mauna_lm2 <- lm(co2 ~ time + I(time^2), data = mauna)
plot(mauna_lm2)

plot of chunk unnamed-chunk-1

summary(mauna_lm2)

## 
## Call:
## lm(formula = co2 ~ time + I(time^2), data = mauna)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.061 -1.766  0.155  1.836  4.419 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.65e+04   3.60e+03    10.2   <2e-16 ***
## time        -3.79e+01   3.62e+00   -10.5   <2e-16 ***
## I(time^2)    9.92e-03   9.09e-04    10.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984,   Adjusted R-squared: 0.984 
## F-statistic: 1.43e+04 on 2 and 464 DF,  p-value: <2e-16

# Ftest: N_o = model is due to random chance f-statistic = 1.433e4;
# p-value = 2.2e-16; Reject the null hypothesis With 95% confidence, we
# assume that the model is NOT due to random chance adjusted r^2 = 0.984
# t-test to understand if the slope is 0 or not: Must center the values
mauna$cent <- mauna$time - mean(mauna$time)
mauna_cent <- lm(mauna$co2 ~ mauna$cent + I(mauna$cent^2))
summary(mauna_cent)

## 
## Call:
## lm(formula = mauna$co2 ~ mauna$cent + I(mauna$cent^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.061 -1.766  0.155  1.836  4.419 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.50e+02   1.54e-01  2273.7   <2e-16 ***
## mauna$cent      1.54e+00   9.14e-03   168.9   <2e-16 ***
## I(mauna$cent^2) 9.92e-03   9.09e-04    10.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984,   Adjusted R-squared: 0.984 
## F-statistic: 1.43e+04 on 2 and 464 DF,  p-value: <2e-16

# T-test on model coefficients: H_o = the model coeficients = 0 for
# coefficient 1 (x): t-statistic: 168.9; p-value: 2e-16; rejct null
# hypothesis for coefficient 2 (x^2): t-statistic: 10.9; p-value : 2e-16;
# reject null hypothesis We can reject the null hypothesis is grater than
# 99.9% confidence that the slopes of the model are not equal to 0. MODEL:
# y = 350.1 + 1.54x + .01x^2

# create model for jolla
jolla_lm1 <- lm(co2 ~ time, data = jolla)
par(mfrow = c(2, 2))
plot(jolla_lm1)

plot of chunk unnamed-chunk-1

# The residuals are not distributed around 0. Try to fit a polynomial
# through the data
jolla_lm2 <- lm(co2 ~ time + I(time^2), data = jolla)
plot(jolla_lm2)

plot of chunk unnamed-chunk-1

summary(jolla_lm2)

## 
## Call:
## lm(formula = co2 ~ time + I(time^2), data = jolla)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.30  -2.65   1.23   2.74   5.86 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.80e+04   5.76e+03    6.60  1.1e-10 ***
## time        -3.94e+01   5.79e+00   -6.80  3.2e-11 ***
## I(time^2)    1.03e-02   1.46e-03    7.06  6.0e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958,   Adjusted R-squared: 0.958 
## F-statistic: 5.32e+03 on 2 and 464 DF,  p-value: <2e-16

# Ftest: N_o = model is due to random chance f-statistic = 5322; p-value =
# 2.2e-16; Reject the null hypothesis With 95% confidence, we assume that
# the model is NOT due to random chance adjusted r^2 = 0.958 t-test to
# understand if the slope is 0 or not: Must center the values
jolla$cent <- jolla$time - mean(jolla$time)
jolla_cent <- lm(jolla$co2 ~ jolla$cent + I(jolla$cent^2))
summary(jolla_cent)

## 
## Call:
## lm(formula = jolla$co2 ~ jolla$cent + I(jolla$cent^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.30  -2.65   1.23   2.74   5.86 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.51e+02   2.47e-01 1423.31   <2e-16 ***
## jolla$cent      1.51e+00   1.46e-02  102.93   <2e-16 ***
## I(jolla$cent^2) 1.03e-02   1.46e-03    7.06    6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958,   Adjusted R-squared: 0.958 
## F-statistic: 5.32e+03 on 2 and 464 DF,  p-value: <2e-16

# T-test on model coefficients: H_o = the model coeficients = 0 for
# coefficient 1 (x): t-statistic: 102.9; p-value: 2e-16; rejct null
# hypothesis for coefficient 2 (x^2): t-statistic: 7.1; p-value :
# 6.01e-12; reject null hypothesis We can reject the null hypothesis is
# grater than 99.9% confidence that the slopes of the model are not equal
# to 0. MODEL: y = 351.0 + 1.51x + .01x^2

# calculate predicted values for each model
mauna$line <- 350.1 + (1.54 * mauna$cent) + (0.01 * mauna$cent * mauna$cent)
jolla$line <- 351 + (1.51 * jolla$cent) + (0.01 * jolla$cent * jolla$cent)
shapiro.test(mauna$line)

## 
##  Shapiro-Wilk normality test
## 
## data:  mauna$line 
## W = 0.9507, p-value = 2.28e-11

# H_o = the data is normally distributed w-statistic: 0.95; p-values:
# 2.25e-11 Reject the null hypothessis; the data is NOT normally
# distributed.
shapiro.test(jolla$line)

## 
##  Shapiro-Wilk normality test
## 
## data:  jolla$line 
## W = 0.9505, p-value = 2.162e-11

# H_o = the data is normally distributed w-statistic: 0.95; p-values:
# 2.16e-11 Reject the null hypothessis; the data is NOT normally
# distributed.

wilcox.test(x = mauna$line, y = jolla$line, paired = T)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  mauna$line and jolla$line 
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# H_o = the difference between the two models is 0 v-statistic: 0;
# p-values = 2.2e-16; Reject the null hypothessis; there is a difference
# between the models.