rm(list=ls(all=TRUE)) #reset environment
setwd('~/Dropbox/Statistics_study/STOR_664/Homework/HW1')
data13 <- read.table('Ex13.dat')
names(data13) <- c('year', 'wintime')
plot(wintime ~ year, data13)
From this plot, it seems that there is a negative linear relationship between year and winning time. Need to exclude the outlier – first point
\(Y_i=\beta_0+\beta_1(x_i-\bar{x})+\epsilon_i\), \(\epsilon_i\sim N(0,\sigma^2)\)
data13=data13[2:25,]
names(data13) <- c('year', 'wintime')
data13.lm <- lm(wintime ~ year, data = data13)
summary(data13.lm)
##
## Call:
## lm(formula = wintime ~ year, data = data13)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9352 -3.3310 0.1004 3.2048 6.8426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 851.8252 48.6785 17.50 2.13e-14 ***
## year -0.3206 0.0249 -12.87 1.02e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.904 on 22 degrees of freedom
## Multiple R-squared: 0.8828, Adjusted R-squared: 0.8774
## F-statistic: 165.7 on 1 and 22 DF, p-value: 1.023e-11
n <- length(data13$year)
x_bar <- mean(data13$year)
sum_xi_sq <- sum(data13$year^2)
\(n\) = 24
\(\bar{x}\) = 1954.3333333
\(\sum{x_i^2}\) = 9.169062410^{7}
#data get from the summary of data13.lm
alpha_hat = summary(data13.lm)$coefficients[1,1] #estimator of alpha
beta1_hat = summary(data13.lm)$coefficients[2,1] #estimator of beta_1
beta0_hat =mean(data13$wintime)# or= alpha_hat + beta1_hat * x_bar estimator of beta_0
s <- summary(data13.lm)$sigma #residual standard error (s)
alpha_hat_sd = summary(data13.lm)$coefficients[1,2] #alpha hat standart error
beta1_hat_sd = summary(data13.lm)$coefficients[2,2] #beta1 hat standard error
beta0_hat_sd <- s/sqrt(n) # beta0 hat standard error
s_sq <- s^2 #estimate for sigma^2
#beta1_hat_sd <- s/sqrt(sum_xi_sq - n * x_bar^2) #another way to calculate beta1 hat standard error
The estimate for \(\alpha\) is: \(\hat{\alpha}\) = 851.8252523
The estimate for \(\beta_1\) is: \(\hat{\beta_1}\) = -0.3205562
\(\hat{\beta_1}\) standard error is 0.0249047
Residual standard error is: \(s\) = 3.9040243
The Estimate for \(\sigma^2\) is: \(\hat{\sigma^2} = s^2\) = 15.241406
The estimate for \(\beta_0\) is: \(\hat{\beta_0} = \bar{y}\) = 225.3516667
\(\hat{\beta_0}\) standard error is: \(\frac{\sigma}{\sqrt{n}}\) = 0.7969056
So the regression function is: \(\hat{Y_i}\)=851.8252523+ (-0.3205562)* \(x_i\)
y_hat <- alpha_hat + beta1_hat * data13$year
plot(data13$year, y_hat, type = 'l', xlab = 'year', ylab = "winning time")
points(data13$year, data13$wintime, col = 'red')
e <- data13$wintime - y_hat
plot(e ~ data13$year, xlab = "year", ylab = "residuals")
abline(0,0)
There seems to be some correlations between residuals in the neighboring years. So the error terms for different years might not be uncorrelated.
#pred <- predict(data13.lm, data.frame(data13$year <- 2008), se.fit = T)
#The above doesn't work, why?
new_x <- 2008
y_star_hat <- beta0_hat + beta1_hat * (new_x - x_bar) #point prediction at year 2008
a = 0.05
t = qt(1-a/2, n-2)
y_star_hat_sd <- s * sqrt( 1/n + (new_x - x_bar)^2/(sum_xi_sq - n * x_bar^2) )
up_limit_y_star_hat <- y_star_hat + t * y_star_hat_sd
low_limit_y_star_hat <- y_star_hat - t * y_star_hat_sd
pred_sd <- s * sqrt(1 + 1/n + (new_x - x_bar)^2/(sum_xi_sq - n * x_bar^2) )
up_limit_pred <- y_star_hat + t * pred_sd
low_limit_pred <- y_star_hat - t * pred_sd
\(\hat{y^*} \pm t_(n-2;1-\frac{\alpha}{2}) s \sqrt{\frac{1}{n}+\frac{(x^*-\bar{x})^2}{\sum{(x_i - \bar{x})^2}}}\)
\(\hat{y^*} \pm t_(n-2;97.5\%) s \sqrt{\frac{1}{n}+\frac{(x^*-\bar{x})^2}{\sum{(x_i - \bar{x})^2}}}\)
Which is 208.1484862 \(\pm\) 3.2271381
Which is from 204.921348 to 211.3756243.
\(\hat{y^*} \pm t_(n-2;97.5\%) s \sqrt{\frac{1}{n}+\frac{(x^*-\bar{x})^2}{\sum{(x_i - \bar{x})^2}} + 1}\)
Which is 208.1484862 \(\pm\) 8.7159015
Which is from 199.4325847 to 216.8643876.
pred_CI <- predict(data13.lm, data.frame(year = 2008), se.fit = T,interval = "confidence",level = 0.95)
pred_CI
## $fit
## fit lwr upr
## 1 208.1485 204.9213 211.3756
##
## $se.fit
## [1] 1.556092
##
## $df
## [1] 22
##
## $residual.scale
## [1] 3.904024
pred_PI <- predict(data13.lm, data.frame(year = 2008), se.fit = T,interval = "prediction", level = 0.95)
pred_PI
## $fit
## fit lwr upr
## 1 208.1485 199.4326 216.8644
##
## $se.fit
## [1] 1.556092
##
## $df
## [1] 22
##
## $residual.scale
## [1] 3.904024