rm(list=ls(all=TRUE)) #reset environment
setwd('~/Dropbox/Statistics_study/STOR_664/Homework/HW1')
data14 <- read.table('Ex14.dat', header = F)
names(data14) <- c('Obs', 'x', 'y')
data14[1:5,] #show the first 5 obersevations
## Obs x y
## 1 1 0.05 0.05
## 2 2 0.05 0.10
## 3 3 0.25 0.25
## 4 4 0.25 0.35
## 5 5 0.50 0.75
summary(data14)
## Obs x y
## Min. : 1.00 Min. :0.050 Min. :0.050
## 1st Qu.: 6.25 1st Qu.:0.500 1st Qu.:0.875
## Median :11.50 Median :1.250 Median :2.200
## Mean :11.50 Mean :1.407 Mean :2.217
## 3rd Qu.:16.75 3rd Qu.:2.100 3rd Qu.:3.482
## Max. :22.00 Max. :2.500 Max. :4.070
n <- length(data14$x) #number of observations
x_bar <- mean(data14$x)
y_bar <- mean(data14$y)
data14.lm <- lm(y ~ x, data=data14)
summary(data14.lm) # the summary of the regression result
##
## Call:
## lm(formula = y ~ x, data = data14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54587 -0.13209 -0.01313 0.09678 0.60146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03804 0.10036 -0.379 0.709
## x 1.60313 0.06070 26.412 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2474 on 20 degrees of freedom
## Multiple R-squared: 0.9721, Adjusted R-squared: 0.9707
## F-statistic: 697.6 on 1 and 20 DF, p-value: < 2.2e-16
plot(data14$x,data14$y, xlab = "potassium/carbon ratio (K/C)", ylab = "the amount of carbon monoxide (CO) desorbed" )
abline(-0.03804, 1.60313)
# σ2
s <- summary(data14.lm)$sigma #residual standard error
sigma_sq_hat <- s^2 #estimator of σ2
a <- 1-0.95
chi_up <- qchisq(1-a/2, n-2)
chi_low <- qchisq(a/2, n-2)
up_sigma_sq <- s^2 * (n-2)/chi_low #up limit of sigma square
low_sigma_sq <- s^2 * (n-2)/chi_up #low limit of sigma square
rdm_chisq <-rchisq(1000000,n-2)
sigma_sq_dist <- s^2 * (n-2)/rdm_chisq
plot(density(sigma_sq_dist), main='')
#hist(sigma_sq_dist, breaks=1000, xlim = range(0, 0.5), main = '')
abline(v=s^2, col = 'red')
abline(v=up_sigma_sq, col = 'red',lty='dotted')
abline(v=low_sigma_sq, col = 'red',lty='dotted')
title('Distribution of σ2 with 95% C.I.')
#beta0
beta0_hat <- y_bar #estimator of beta0
beta0_sd <- s/sqrt(n) #the standard error of beta0
t <- qt(1-a/2, n-2)
ul_beta0 <- beta0_hat + t * beta0_sd #up limit of beta0
ll_beta0 <- beta0_hat - t * beta0_sd #low limit of beta0
rdm_tdist <-rt(1000000, n-2)
beta0_dist <- beta0_hat + rdm_tdist * beta0_sd
plot(density(beta0_dist), main='')
abline(v=beta0_hat, col = 'red')
abline(v=ul_beta0, col = 'red',lty='dotted')
abline(v=ll_beta0, col = 'red',lty='dotted')
title('Distribution of β0 with 95% C.I.')
#beta1
beta1_hat <- summary(data14.lm)$coefficients[2,1] #estimator of beta1, get directly from summary(data14.lm)
beta1_sd <- summary(data14.lm)$coefficients[2,2] #the standard error of beta1, get directly from summary(data14.lm)
ul_beta1 <- beta1_hat + t * beta1_sd #up limit of beta1
ll_beta1 <- beta1_hat - t * beta1_sd #low limit of beta1
beta1_dist <- beta1_hat + rdm_tdist * beta1_sd
plot(density(beta1_dist), main='')
abline(v=beta1_hat, col = 'red')
abline(v=ul_beta1, col = 'red',lty='dotted')
abline(v=ll_beta1, col = 'red',lty='dotted')
title('Distribution of β1 with 95% C.I.')
y_hat <- alpha_hat + beta1_hat * data14$x # a list of yi hat
e <- data14$y - y_hat #residual
plot(data14$x, e, xlab="x", ylab='residual', main='Residual Plot')
abline(0,0)
hist(e,breaks=50, ylab='counts', xlab='residuals', main='Hisogram of Residuals')
e_sorted <- sort(e)
d <- e_sorted/s
z <- c(1:n)
z <- (z-0.5)/n
z <-qnorm(z)
plot(d~z, xlab='Expected', ylab='observed', main = 'Normal Probability plot of residuals')
abline(0,1)
#Acturall can just use this:
# qqnorm(e);qqline(e)
shapiro.test(residuals(data14.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(data14.lm)
## W = 0.9401, p-value = 0.1986
ks.test(residuals(data14.lm), 'pnorm')
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(data14.lm)
## D = 0.3448, p-value = 0.00769
## alternative hypothesis: two-sided
For all the hypothesis test, we use \(\alpha\)=0.05 as the cutoff.
\(H_0\): Residuals are normally distributed.
So from the result of Shapiro-Wilk normality test, p-value = 0.1986, can’t reject \(H_0\).
But from the result of One-sample Kolmogorov-Smirnov test, p-value = 0.00769, so need to reject \(H_0\).
norm.test <- function(y, x, nsim=1000)
{
nreg <- lm(y ~ x)
q1<-qqnorm(nreg$resid,plot=F)
c1<-cor(q1$x,q1$y)
u1<-pnorm(sort(nreg$resid),mean=0, sd=summary(nreg)$sigma)
d1<-max(c((1:n)/n-u1,u1-(0:(n-1))/n))
w1<-sum((u1-((1:n)-0.5)/n)^2)+1/(12*n)
a1<--sum((2*(1:n)-1)*log(u1)+(2*n+1-2*(1:n))*log(1-u1))/n-n
count<-rep(0,4)
names(count) <- c("L-G", "K-S", "C-V", "A-D")
for(j in 1:nsim)
{
temp1<-rnorm(n)
nreg<-lm(temp1~x)
q2<-qqnorm(nreg$resid,plot=F)
c2<-cor(q2$x,q2$y)
if(c2<c1)count[1]<-count[1]+1
u2<-pnorm(sort(nreg$resid),mean=0,sd=summary(nreg)$sigma)
d2<-max(c((1:n)/n-u2,u2-(0:(n-1))/n))
w2<-sum((u2-((1:n)-0.5)/n)^2)+1/(12*n)
a2<--sum((2*(1:n)-1)*log(u2)+(2*n+1-2*(1:n))*log(1-u2))/n-n
if(d2>d1)count[2]<-count[2]+1
if(w2>w1)count[3]<-count[3]+1
if(a2>a1)count[4]<-count[4]+1
}
return(count/nsim)
}
norm.test(data14$y, data14$x)
## L-G K-S C-V A-D
## 0.105 0.143 0.112 0.095
From this simulation result, all p-value is larger than 0.05, so we can’t reject the null-hypothesis that residuals are normally distributed
data14.lm2 <- lm(y ~ factor(x), data14)
anova(data14.lm)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 42.692 42.692 697.61 < 2.2e-16 ***
## Residuals 20 1.224 0.061
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(data14.lm, data14.lm2)
## Analysis of Variance Table
##
## Model 1: y ~ x
## Model 2: y ~ factor(x)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 20 1.2240
## 2 16 1.1241 4 0.099833 0.3552 0.8366
For the top anova test, \(H_0\) is “\(\beta_1 = 0\)”“, but p value = 2.2e-16, so reject \(\beta_1 = 0\), i.e. conclusion: the regression model is good.
For the bottom anova test, \(H_0\) is “linear regression model is correct”, the p value = 0.8366, so can’t reject \(H_0\).
In conclusion, both anova tests show that the linear regression model is good.
y_star_pred_interval <- predict(data14.lm, data.frame(x= 2.0), se.fit = T, interval = "prediction", level = 0.90)
y_star_pred_interval
## $fit
## fit lwr upr
## 1 3.168222 2.727573 3.608871
##
## $se.fit
## [1] 0.06385905
##
## $df
## [1] 20
##
## $residual.scale
## [1] 0.2473809
The 90% prediction interval of \(y^*\) is: 2.7275734 to 3.6088709
y_star_hat_CI <- predict(data14.lm, data.frame(x= 2.0), se.fit = T, interval = "confidence", level = 0.90)
y_star_hat_CI
## $fit
## fit lwr upr
## 1 3.168222 3.058083 3.278361
##
## $se.fit
## [1] 0.06385905
##
## $df
## [1] 20
##
## $residual.scale
## [1] 0.2473809
The 90% prediction interval of \(\hat{y^*}\) is: 3.0580833 to 3.278361