rm(list=ls(all=TRUE))
protein <- read.table('~/Dropbox/Statistics_study/STOR_664/Homework/HW2/HW2_C_19_data1.txt',header=F)
names(protein) <- c('obs','percent','l1','l2','l3','l4','l5','l6')
attach(protein)
pro.lm <- lm(percent ~ l1 + l2 + l3 + l4 + l5 + l6)
summary(pro.lm)
##
## Call:
## lm(formula = percent ~ l1 + l2 + l3 + l4 + l5 + l6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39798 -0.12660 -0.00822 0.07745 0.38705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.074230 9.899022 2.331 0.03232 *
## l1 0.028124 0.082118 0.342 0.73618
## l2 0.001667 0.087162 0.019 0.98497
## l3 0.234909 0.077400 3.035 0.00748 **
## l4 -0.240445 0.063218 -3.803 0.00142 **
## l5 0.011839 0.006126 1.932 0.07014 .
## l6 -0.035584 0.045530 -0.782 0.44522
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2203 on 17 degrees of freedom
## Multiple R-squared: 0.9821, Adjusted R-squared: 0.9758
## F-statistic: 155.9 on 6 and 17 DF, p-value: 6.654e-14
# residual sum of square: (n-p) * s^2
(length(l1) - 7) * (summary(pro.lm)$sigma)^2
## [1] 0.8253352
abs(summary(pro.lm)$coefficients[,1]) - 2 * summary(pro.lm)$coefficients[,2]
## (Intercept) l1 l2 l3 l4
## 3.2761868173 -0.1361126288 -0.1726581135 0.0801083561 0.1140087892
## l5 l6
## -0.0004136855 -0.0554751355
So need to drop l1, l2, l5, l6.
pro2.lm <- lm(percent ~ l3 + l4 )
summary(pro2.lm)
##
## Call:
## lm(formula = percent ~ l3 + l4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52215 -0.09417 0.02566 0.13763 0.41405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.174314 1.308664 23.82 < 2e-16 ***
## l3 0.240405 0.009640 24.94 < 2e-16 ***
## l4 -0.217101 0.009568 -22.69 2.97e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2477 on 21 degrees of freedom
## Multiple R-squared: 0.9721, Adjusted R-squared: 0.9695
## F-statistic: 366.2 on 2 and 21 DF, p-value: < 2.2e-16
# residual sum of square: (n-p) * s^2
(length(l3) - 3) * (summary(pro2.lm)$sigma)^2
## [1] 1.288837
beta_hat_l3 <- summary(pro2.lm)$coefficients[2,1]
beta_hat_l4 <- summary(pro2.lm)$coefficients[3,1]
pro3.lm <- lm(percent ~ offset(beta_hat_l3 * l3 + beta_hat_l4 * l4))
anova(pro3.lm, pro.lm)
## Analysis of Variance Table
##
## Model 1: percent ~ offset(beta_hat_l3 * l3 + beta_hat_l4 * l4)
## Model 2: percent ~ l1 + l2 + l3 + l4 + l5 + l6
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 23 1.28884
## 2 17 0.82534 6 0.4635 1.5912 0.21
So it’s not significant, which means can’t reject H0. So still believe the model fitted in part (b)
pred.data <- read.table('~/Dropbox/Statistics_study/STOR_664/Homework/HW2/HW2_C_19_data2.txt',header=F)
names(pred.data) <- c('obs','l1','l2','l3','l4','l5','l6')
pred.data = pred.data[, 4:5]
pred_inte <- data.frame(1:4)
for (i in 1:6){
pred_inte[[i]] <- predict(pro2.lm, data.frame(l3 = pred.data[i,1], l4 = pred.data[i,2]), interval = "prediction", se.fit=T, level = 0.90)
}
names(pred_inte)=c('sample1', 'sample2','sample3','sample4','sample5','sample6')
#the point prediction, lower bound, upper bound of each sample is:
pred_inte[1,]
## sample1 sample2
## 1 9.801225, 9.365855, 10.236595 8.358796, 7.909020, 8.808573
## sample3 sample4
## 1 9.444302, 9.007920, 9.880684 11.83122, 11.34155, 12.32089
## sample5 sample6
## 1 10.025695, 9.589644, 10.461745 10.98612, 10.52057, 11.45166
#Bonferroni procedures:
k=6
p=3
n=length(l1)
alpha=.1
t=qt(1-alpha/(2*k), n-p)
simult_pi = data.frame(1:k,1:k,1:k)*0
names(simult_pi) = c('fit', 'lower','upper')
for (i in 1:k) {
simult_pi$fit[i] = pred_inte[[i]]$fit[1]
simult_pi$lower[i] = pred_inte[[i]]$fit[1] - t * summary(pro2.lm)$sigma * sqrt( (pred_inte[[i]]$se.fit / summary(pro2.lm)$sigma)^2 + 1)
simult_pi$upper[i] = pred_inte[[i]]$fit[1] + t * summary(pro2.lm)$sigma * sqrt( (pred_inte[[i]]$se.fit / summary(pro2.lm)$sigma)^2 + 1)
}
simult_pi
## fit lower upper
## 1 9.801225 9.143050 10.45940
## 2 8.358796 7.678842 9.03875
## 3 9.444302 8.784598 10.10401
## 4 11.831218 11.090953 12.57148
## 5 10.025695 9.366491 10.68490
## 6 10.986116 10.282327 11.68991
#Scheffe procedures
q=k # q is number of observations for prediction!
G = sqrt(q * qf(1-alpha, q, n-p))
simult_pi_scheffe = data.frame(1:k,1:k,1:k)*0
names(simult_pi_scheffe) = c('fit', 'lower','upper')
for (i in 1:k) {
simult_pi_scheffe$fit[i] = pred_inte[[i]]$fit[1]
simult_pi_scheffe$lower[i] = pred_inte[[i]]$fit[1] - G * summary(pro2.lm)$sigma * sqrt( (pred_inte[[i]]$se.fit / summary(pro2.lm)$sigma)^2 + 1)
simult_pi_scheffe$upper[i] = pred_inte[[i]]$fit[1] + G * summary(pro2.lm)$sigma * sqrt( (pred_inte[[i]]$se.fit / summary(pro2.lm)$sigma)^2 + 1)
}
simult_pi_scheffe
## fit lower upper
## 1 9.801225 8.908454 10.693997
## 2 8.358796 7.436483 9.281109
## 3 9.444302 8.549456 10.339148
## 4 11.831218 10.827097 12.835338
## 5 10.025695 9.131528 10.919861
## 6 10.986116 10.031472 11.940761
In this case, G > t, which means the Scheffe predication intervals are wider than the Bonferroni predication intervals, so we choose the narrower one - the Bonferroni method.