## Lab 1 set a working directory if you want setwd('D:\\Quant\\Lab1')
# load file for PC
co2 <- read.csv("E:/Quant/Assignments/Lab01/co2_LAB1.csv")
# load file for Mac co2<- read.csv(file.choose(),header = TRUE)
# list the variable names (column names)
names(co2)
## [1] "year" "month" "time" "co2" "site"
######################################### Question 1 -Do the CO2
######################################### measurements at the two sites
######################################### differ significantly from each
######################################### other? look at the entire
######################################### collection of co2 measurements
######################################### for normality
# Get variables for the two sites...
mauna <- co2[co2$site == 0, ]
jolla <- co2[co2$site == 1, ]
# Check for normality... For Mauna
hist(mauna$co2)
qqnorm(mauna$co2)
qqline(mauna$co2)
shapiro.test(mauna$co2)
##
## Shapiro-Wilk normality test
##
## data: mauna$co2
## W = 0.9581, p-value = 2.935e-10
# H_o = the data is normally distributed p-values = 2.936e-10; Reject the
# null hypothessis. With 95% confidence, the data is NOT normally
# distributed.
# For jolla
hist(jolla$co2)
qqnorm(jolla$co2)
qqline(jolla$co2)
shapiro.test(jolla$co2)
##
## Shapiro-Wilk normality test
##
## data: jolla$co2
## W = 0.966, p-value = 6.135e-09
# H_o = the data is normally distributed p-values = 6.137e-9; Reject the
# null hypothessis. With 95% confidence, the data is NOT normally
# distributed.
# We must to a non-parametric test to compare them... Do a
# Mann-Whitney(Wilcoxon) test...
wilcox.test(x = mauna$co2, y = jolla$co2, paired = T)
##
## Wilcoxon signed rank test with continuity correction
##
## data: mauna$co2 and jolla$co2
## V = 32061, p-value = 1.015e-14
## alternative hypothesis: true location shift is not equal to 0
# H_o = the difference between the two sites is 0 p-values = 0.393; Accept
# the null hypothessis. With 95% confidence, There is a difference between
# the sites.
######################################### Question 2 - What is the overall
######################################### rate of CO2 increase from 1969
######################################### to 2008?
# linear regression model for entire co2 dataset
model1 <- lm(co2 ~ time, data = co2)
par(mfrow = c(2, 2))
plot(model1)
summary(model1)
##
## Call:
## lm(formula = co2 ~ time, data = co2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.547 -2.362 0.449 2.302 8.334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.68e+03 1.86e+01 -144 <2e-16 ***
## time 1.52e+00 9.35e-03 163 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.21 on 932 degrees of freedom
## Multiple R-squared: 0.966, Adjusted R-squared: 0.966
## F-statistic: 2.66e+04 on 1 and 932 DF, p-value: <2e-16
# The residuals are not distributed around 0. Try to fit a polynomial
# through the data
model_quad <- lm(co2 ~ time + I(time^2), data = co2)
plot(model_quad)
summary(model_quad)
##
## Call:
## lm(formula = co2 ~ time + I(time^2), data = co2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.005 -2.213 0.429 2.358 6.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.73e+04 3.44e+03 10.8 <2e-16 ***
## time -3.87e+01 3.46e+00 -11.2 <2e-16 ***
## I(time^2) 1.01e-02 8.70e-04 11.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97, Adjusted R-squared: 0.97
## F-statistic: 1.53e+04 on 2 and 931 DF, p-value: <2e-16
# Ftest: N_o = model is due to random chance f-statistic = 1.527e4;
# p-value = 2.2e-16; Reject the null hypothesis With 95% confidence, we
# assume that the model is NOT due to random chance adjusted r^2 = 0.9704
# t-test to understand if the slope is 0 or not: Must center the values
co2$cent <- co2$time - mean(co2$time)
model_cent <- lm(co2$co2 ~ co2$cent + I(co2$cent^2))
summary(model_cent)
##
## Call:
## lm(formula = co2$co2 ~ co2$cent + I(co2$cent^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.005 -2.213 0.429 2.358 6.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+02 1.47e-01 2378.9 <2e-16 ***
## co2$cent 1.52e+00 8.74e-03 174.4 <2e-16 ***
## I(co2$cent^2) 1.01e-02 8.70e-04 11.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3 on 931 degrees of freedom
## Multiple R-squared: 0.97, Adjusted R-squared: 0.97
## F-statistic: 1.53e+04 on 2 and 931 DF, p-value: <2e-16
# T-test on model coefficients: H_o = the model coeficients = 0 for
# coefficient 1 (x): t-statistic: 174.38; p-value: 2e-16; rejct null
# hypothesis for coefficient 2 (x^2): t-statistic: 11.61; p-value : 2e-16;
# reject null hypothesis We can reject the null hypothesis is grater than
# 99.9% confidence that the slopes of the model are not equal to 0.
plot(co2$cent, co2$co2)
# model: y = 350.6 + 1.5x + 0.01x^2
anova(model1, model_quad)
## Analysis of Variance Table
##
## Model 1: co2 ~ time
## Model 2: co2 ~ time + I(time^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 932 9608
## 2 931 8393 1 1215 135 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Using an f-test, we can determine if the two models are similar. H_o is
# that the two models are the same. f-statistic: 134.76; p-value 2.2e-16
# reject null hypothesis and determine that the models are not the same
# conclude that the quadratic model outperforms the liner model based on
# Rsquared values.
# use predict to estimate the starting and ending CO2 values in the model
pre <- predict(model_cent)
totalSamples <- length(predict(model_cent))
# summary(pre)
startingCO2 <- pre[1]
endingCO2 <- pre[totalSamples]
totalYrs <- 2008 - 1969
rateOfIncrease <- (endingCO2 - startingCO2)/totalYrs
rateOfIncrease
## 934
## 1.518
# Linear rate of increase is about 1.52ppm per year.
#
# Question 3 - does the rate of change appear to be steady or is it
# accelerating/decelerating?
# answered under question 2 - in summary(model_cent)
######################################### Question 4 - Is the rate of
######################################### increase different between the
######################################### sites?
# create model for Mauna
mauna_lm1 <- lm(co2 ~ time, data = mauna)
par(mfrow = c(2, 2))
plot(mauna_lm1)
# The residuals are not distributed around 0. Try to fit a polynomial
# through the data
mauna_lm2 <- lm(co2 ~ time + I(time^2), data = mauna)
plot(mauna_lm2)
summary(mauna_lm2)
##
## Call:
## lm(formula = co2 ~ time + I(time^2), data = mauna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.061 -1.766 0.155 1.836 4.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.65e+04 3.60e+03 10.2 <2e-16 ***
## time -3.79e+01 3.62e+00 -10.5 <2e-16 ***
## I(time^2) 9.92e-03 9.09e-04 10.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.984
## F-statistic: 1.43e+04 on 2 and 464 DF, p-value: <2e-16
# Ftest: N_o = model is due to random chance f-statistic = 1.433e4;
# p-value = 2.2e-16; Reject the null hypothesis With 95% confidence, we
# assume that the model is NOT due to random chance adjusted r^2 = 0.984
# t-test to understand if the slope is 0 or not: Must center the values
mauna$cent <- mauna$time - mean(mauna$time)
mauna_cent <- lm(mauna$co2 ~ mauna$cent + I(mauna$cent^2))
summary(mauna_cent)
##
## Call:
## lm(formula = mauna$co2 ~ mauna$cent + I(mauna$cent^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.061 -1.766 0.155 1.836 4.419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50e+02 1.54e-01 2273.7 <2e-16 ***
## mauna$cent 1.54e+00 9.14e-03 168.9 <2e-16 ***
## I(mauna$cent^2) 9.92e-03 9.09e-04 10.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.22 on 464 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.984
## F-statistic: 1.43e+04 on 2 and 464 DF, p-value: <2e-16
# T-test on model coefficients: H_o = the model coeficients = 0 for
# coefficient 1 (x): t-statistic: 168.9; p-value: 2e-16; rejct null
# hypothesis for coefficient 2 (x^2): t-statistic: 10.9; p-value : 2e-16;
# reject null hypothesis We can reject the null hypothesis is grater than
# 99.9% confidence that the slopes of the model are not equal to 0. MODEL:
# y = 350.1 + 1.54x + .01x^2
# create model for jolla
jolla_lm1 <- lm(co2 ~ time, data = jolla)
par(mfrow = c(2, 2))
plot(jolla_lm1)
# The residuals are not distributed around 0. Try to fit a polynomial
# through the data
jolla_lm2 <- lm(co2 ~ time + I(time^2), data = jolla)
plot(jolla_lm2)
summary(jolla_lm2)
##
## Call:
## lm(formula = co2 ~ time + I(time^2), data = jolla)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.30 -2.65 1.23 2.74 5.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.80e+04 5.76e+03 6.60 1.1e-10 ***
## time -3.94e+01 5.79e+00 -6.80 3.2e-11 ***
## I(time^2) 1.03e-02 1.46e-03 7.06 6.0e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958, Adjusted R-squared: 0.958
## F-statistic: 5.32e+03 on 2 and 464 DF, p-value: <2e-16
# Ftest: N_o = model is due to random chance f-statistic = 5322; p-value =
# 2.2e-16; Reject the null hypothesis With 95% confidence, we assume that
# the model is NOT due to random chance adjusted r^2 = 0.958 t-test to
# understand if the slope is 0 or not: Must center the values
jolla$cent <- jolla$time - mean(jolla$time)
jolla_cent <- lm(jolla$co2 ~ jolla$cent + I(jolla$cent^2))
summary(jolla_cent)
##
## Call:
## lm(formula = jolla$co2 ~ jolla$cent + I(jolla$cent^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.30 -2.65 1.23 2.74 5.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+02 2.47e-01 1423.31 <2e-16 ***
## jolla$cent 1.51e+00 1.46e-02 102.93 <2e-16 ***
## I(jolla$cent^2) 1.03e-02 1.46e-03 7.06 6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.55 on 464 degrees of freedom
## Multiple R-squared: 0.958, Adjusted R-squared: 0.958
## F-statistic: 5.32e+03 on 2 and 464 DF, p-value: <2e-16
# T-test on model coefficients: H_o = the model coeficients = 0 for
# coefficient 1 (x): t-statistic: 102.9; p-value: 2e-16; rejct null
# hypothesis for coefficient 2 (x^2): t-statistic: 7.1; p-value :
# 6.01e-12; reject null hypothesis We can reject the null hypothesis is
# grater than 99.9% confidence that the slopes of the model are not equal
# to 0. MODEL: y = 351.0 + 1.51x + .01x^2
# calculate predicted values for each model
mauna$line <- 350.1 + (1.54 * mauna$cent) + (0.01 * mauna$cent * mauna$cent)
jolla$line <- 351 + (1.51 * jolla$cent) + (0.01 * jolla$cent * jolla$cent)
shapiro.test(mauna$line)
##
## Shapiro-Wilk normality test
##
## data: mauna$line
## W = 0.9507, p-value = 2.28e-11
# H_o = the data is normally distributed w-statistic: 0.95; p-values:
# 2.25e-11 Reject the null hypothessis; the data is NOT normally
# distributed.
shapiro.test(jolla$line)
##
## Shapiro-Wilk normality test
##
## data: jolla$line
## W = 0.9505, p-value = 2.162e-11
# H_o = the data is normally distributed w-statistic: 0.95; p-values:
# 2.16e-11 Reject the null hypothessis; the data is NOT normally
# distributed.
wilcox.test(x = mauna$line, y = jolla$line, paired = T)
##
## Wilcoxon signed rank test with continuity correction
##
## data: mauna$line and jolla$line
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# H_o = the difference between the two models is 0 v-statistic: 0;
# p-values = 2.2e-16; Reject the null hypothessis; there is a difference
# between the models.