# set global options
knitr::opts_chunk$set(
echo = TRUE, # show code
warning = FALSE,
message = FALSE
)
Disk drive service. The staff of a service center for electronic equipment includes three technicians who specialize in repairing three widely used makes of disk drives for desktop computers. It was desired to study the effects of tecinician (factor \(A\) ) and make of disk drive (factor \(B\) ) on the service time. The data that follow show the number of minutes required to complete the repair job in a study where each technician was randomly assigned to five jobs on each make of disk drive.
The dataset used in this question is “disk drive service.txt”.
\[ \begin{array}{ccccc} & & \text { Factor } B \text { (make of drive) } \\ \begin{array}{c} \text { Factor } A \\ \text { (technician) } \end{array} & \begin{array}{l} \\ \end{array} & \begin{array}{c} j=1 \\ \text { Make } 1 \end{array} & \begin{array}{c} j=2 \\ \text { Make } 2 \end{array} & \begin{array}{c} j=3 \\ \text { Make } 3 \end{array} \\ \hline i=1 & \text { Technician 1 } & 62 & 57 & 59 \\ & & 48 & 45 & 53 \\ & & \ldots & \ldots & \ldots \\ & & 69 & 44 & 47 \\ i=2 & \text { Technician 2 } & 51 & 61 & 55 \\ & & 57 & 58 & 58 \\ & & \ldots & \ldots & \ldots \\ & & 39 & 51 & 49 \\ i=3 & \text { Technician 3 } & 59 & 58 & 47 \\ & & 65 & 63 & 56 \\ & & \ldots & \ldots & \ldots \\ & & 70 & 60 & 50 \end{array} \]
- Compute the least squares estimates for \(\mu_{\cdot \cdot}, \alpha_{i}, \beta_{j}, ( \alpha \beta )_{i j}\).
#First download and format data set
setwd("~/Master's Program/Classes/Winter 2022/STA 106")
Disk_df <- read.table(file = "disk drive service.txt")
colnames(Disk_df) <- c("Minutes_to_Repair","i","j","k")
View(Disk_df)
# Establish values of a, b, and n
a = 3 # number of different factor levels factor A ("i") may take
b = 3 # number of different factor levels factor B ("j") may take
n = 5 # the number of observations for each unique factor level combinations (or #treatments)
n_tot = length(Disk_df$Minutes_to_Repair) # total sample size
n_tot
## [1] 45
# establish overall means, factor level means
#mu_dot_dot = Ybar_dot_dot_dot
mu_dot_dot = mean(Disk_df$Minutes_to_Repair)
#the main effect of factor A at i levels
alpha_i_hat <- rep(NA,a)
for (i in 1:a){
alpha_i_hat[i] = mean(Disk_df$Minutes_to_Repair[Disk_df$i == i])-mu_dot_dot
}
#the main effect of factor B at j levels
beta_j_hat <- rep(NA, b)
for (j in 1:b){
beta_j_hat[j] = mean(Disk_df$Minutes_to_Repair[Disk_df$j == j]) - mu_dot_dot
}
Overall mean= 55.82222
Main effect of factor A = -0.4888889, -0.5555556, 1.0444444, for i from 1 to 3 respectively
Main effect of factor B = 0.3111111, 0.7777778, -1.0888889, for j from 1 to 3 respectively
Interaction estimate of (AB)ij
#matrix of interaction effects
inter_ij_hat=matrix(0,nrow = a,ncol = b)
for(i in 1:a)
{
for(j in 1:b)
{
inter_ij_hat[i,j] = mean(Disk_df$Minutes_to_Repair[Disk_df$i==i & Disk_df$j==j])-
mean(Disk_df$Minutes_to_Repair[Disk_df$i==i])-
mean(Disk_df$Minutes_to_Repair[Disk_df$j==j])+
mean(Disk_df$Minutes_to_Repair)
}
}
rownames(inter_ij_hat)=c("i=1 Technician 1","i=2 Technician 2",
"i=3 Technician 3")
colnames(inter_ij_hat)=c("j=1 Drive Make 1","j=2 Drive Make 2",
"j=3 Drive Make 3")
inter_ij_hat
## j=1 Drive Make 1 j=2 Drive Make 2 j=3 Drive Make 3
## i=1 Technician 1 4.155556 -8.311111 4.155556
## i=2 Technician 2 -7.177778 5.155556 2.022222
## i=3 Technician 3 3.022222 3.155556 -6.177778
library('knitr')
library(kableExtra)
kable(mu_dot_dot,
"html",
digits=2,
col.names = NULL,
caption = "Overall mean")%>%kable_styling()
| 55.82 |
kable(alpha_i_hat ,
"html",
digits=2,
col.names = NULL,
caption = "Factor A main effects")%>%kable_styling()
| -0.49 |
| -0.56 |
| 1.04 |
kable(beta_j_hat ,
"html",
digits=2,
col.names = NULL,
caption = "Factor B main effects")%>%kable_styling()
| 0.31 |
| 0.78 |
| -1.09 |
kable(inter_ij_hat,
"html",
digits=2,
caption = "Interaction effects")%>%kable_styling()
| j=1 Drive Make 1 | j=2 Drive Make 2 | j=3 Drive Make 3 | |
|---|---|---|---|
| i=1 Technician 1 | 4.16 | -8.31 | 4.16 |
| i=2 Technician 2 | -7.18 | 5.16 | 2.02 |
| i=3 Technician 3 | 3.02 | 3.16 | -6.18 |
interaction.plot(x.factor=Disk_df$i,trace.factor=Disk_df$j,response = Disk_df$Minutes_to_Repair,xlab= c('Technician (factor A)'),trace.label = c('Drive Make (factor B)'))
The non-parallel lines on the interaction graph appear to indicate presence of interaction and therefore a need to parameterize in terms of treatment means
SSA = n*b*sum(alpha_i_hat^2)
SSB = n*a*sum(beta_j_hat^2)
SSAB = n*sum(inter_ij_hat^2)
# fitted value
y_hat = rep(0,n_tot)
for(g in 1:n_tot)
{
y_hat[g] = mu_dot_dot+alpha_i_hat[Disk_df$i[g]]+beta_j_hat[Disk_df$j[g]]+inter_ij_hat[Disk_df$i[g],Disk_df$j[g]]
}
# residuals
e = Disk_df$Minutes_to_Repair-y_hat
SSE = sum(e^2)
SSTO = sum((Disk_df$Minutes_to_Repair-mu_dot_dot)^2)
AnovaTable = matrix(0,nrow = 5,ncol = 3)
AnovaTable[1,1] = SSA
AnovaTable[1,2] = a-1
AnovaTable[1,3] = SSA/(a-1)
AnovaTable[2,1] = SSB
AnovaTable[2,2] = b-1
AnovaTable[2,3] = SSB/(b-1)
AnovaTable[3,1] = SSAB
AnovaTable[3,2] = (a-1)*(b-1)
AnovaTable[3,3] = SSAB/((a-1)*(b-1))
AnovaTable[4,1] = SSE
AnovaTable[4,2] = a*b*(n-1)
AnovaTable[4,3] = SSE/(a*b*(n-1))
AnovaTable[5,1] = SSTO
AnovaTable[5,2] = n*a*b-1
AnovaTable[5,3] = '-'
AnovaTable = as.data.frame(AnovaTable)
rownames(AnovaTable) = c('Factor A','Factor B','Interaction AB','Error','Total')
colnames(AnovaTable) = c('SS','df','MS')
knitr::kable(AnovaTable,
"html",
digits=2,
align = "c",
caption = "ANOVA Table")%>%kable_styling(full_width = F)
| SS | df | MS | |
|---|---|---|---|
| Factor A | 24.5777777777778 | 2 | 12.2888888888889 |
| Factor B | 28.3111111111111 | 2 | 14.1555555555556 |
| Interaction AB | 1215.28888888889 | 4 | 303.822222222222 |
| Error | 1872.4 | 36 | 52.0111111111111 |
| Total | 3140.57777777778 | 44 |
|
MSAB = SSAB/((a-1)*(b-1))
MSE = SSE/((a*b)*(n-1))
F_stat = (MSAB/MSE)
#By P-Value
pf(F_stat, df1 = ((a-1)*(b-1)), df2 = ((a*b)*(n-1)), lower.tail = FALSE)
## [1] 0.0009941068
#By Critical Region
help(qf)
F_crit = qf((1-.01), df1 = ((a-1)*(b-1)), df2 = ((a*b)*(n-1)))
F_stat < F_crit
## [1] FALSE
Null hypothesis: All interactions for every combination of factor levels are equal to zero Alternative hypothesis: Not all interactions for every combination of factor levels are equal to zero
Decision rule if the p-value of the F_statistic is < alpha, reject the null hypothesis.
p-value of test statistic is 0.0009941068 which is < alpha. Thus we reject the null hypothesis that all interactions for every combination of factor levels are equal to zero. Therefore, interactions are significant to consider.
#Testing Significance of Factor A
MSA = (SSA/(a-1))
MSB = (SSB/(b-1))
F_stat = MSA/MSE
alpha = .01
# By
p_value = pf(F_stat, df1=2, df2=36, lower.tail=FALSE)
if (p_value < alpha){
print("P_value < .01. Reject Null Hypothesis")
} else {print("P_value > .01. Fail to Reject Null Hypothesis")
}
## [1] "P_value > .01. Fail to Reject Null Hypothesis"
# By Critical Region
F_crit = qf((1-.01), df1 = 2, df2 = 36)
F_stat < F_crit
## [1] TRUE
if (F_stat < F_crit){
print("F_stat < F_crit. Fail to Reject Null Hypothesis")
} else {print("F_stat > F_crit. Reject Null Hypothesis")
}
## [1] "F_stat < F_crit. Fail to Reject Null Hypothesis"
# F statistic does is not greater than the critical value
#Testing Significance of Factor B
F_stat = MSB/MSE
df1 = b-1
p_value = pf(F_stat, df1=(b-1), df2=36, lower.tail=FALSE)
if (p_value < alpha){
print("P_value < .01. Reject Null Hypothesis")
} else {print("P_value > .01. Fail to Reject Null Hypothesis")
}
## [1] "P_value > .01. Fail to Reject Null Hypothesis"
#Critical Region
F_crit = qf((1-.01), df1 = 2, df2 = 36)
F_stat < F_crit
## [1] TRUE
if (F_stat < F_crit){
print("F_stat < F_crit. Fail to Reject Null Hypothesis")
} else {print("F_stat > F_crit. Reject Null Hypothesis")
}
## [1] "F_stat < F_crit. Fail to Reject Null Hypothesis"
For the main effect of Factor A we utilize an F test which is calculated by MSA/MSE Null hypothesis is that main effect of factor A is equal to zero at all factor levels i. Alternative is that main effect of factor A is NOT equal to zero at all factor levels i. If the p-value of the test statistic is < alpha where alpha = .01 we reject the null hypothesis that is the main effect of factor A is equal to zero at all factor levels i. Given our p-value is NOT less than alpha we fail to reject the null hypothesis that the main effect of factor A is equal to zero at all factor levels i.
For the main effect of Factor B we utilize an F test which is calculated by MSB/MSE Null hypothesis is that main effect of factor B is equal to zero at all factor levels j. Alternative is that main effect of factor B is NOT equal to zero at all factor levels j. If the p-value of the test statistic is < alpha where alpha = .01 we reject the null hypothesis that is the main effect of factor B is equal to zero at all factor levels i. Given our p-value is NOT less than alpha we fail to reject the null hypothesis that the main effect of factor B is equal to zero at all factor levels i.
By formal statistical testing in part 4 we show that by conducting an F test of MSAB/MSE that the p-value is less than are alpha=.01. Therefore we may reject the null hypothesis that the interaction term for all combination of factor levels is 0. By formal statistical testing in part 5 we show that the main effect of factor A at all factor levels and the main effect of factor B at all factor levels are both equal 0. We do this by an F test of the respective mean squares of treatment means due to factor A or B respectively.
More detailed analyses: - (7) Estimate \(\mu_{11}\) with a 99 percent confidence interval. Interpret your interval estimate.
Y_hat_11 <- mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1])
std_dev_Y_hat_11 = (MSE/(b*5))
T_critical = qt((1-(.01/2)),df=((a*b)*(n-1)))
Upperbound = (Y_hat_11 + T_critical*std_dev_Y_hat_11)
Lowerbound = (Y_hat_11 - T_critical*std_dev_Y_hat_11)
CI = c(Lowerbound,Upperbound)
print("99% CI")
## [1] "99% CI"
print(CI)
## [1] 50.37044 69.22956
The 99% confdience interval for the point estimate of Ybar_1,1 is (50.37044, 69.22956) with a point estimate of 59.8. We can thus say with 99% confidence that the true average response for the minutes to repair a disk drive with make 1 by technician 1 is between 50.37044 & 69.22956 minutes. Or if we were to repeat the experiment an infinite number of times 99% of the confidence intervals we generate would contain the true or population mean response for minutes to repair of a disk with make 1 by technician 1.
Ybar_22 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2])
Ybar_21 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1])
D_hat = Ybar_22 - Ybar_21
std_dev_Dhat <- sqrt(MSE*(2/(b*n)))
#Since this is a pairwise difference we can use Tukey, Scheffe, or Bonferroni
#Determine the most efficient procedure
#Bonferoni
B = qt((1-(.01/2)), df=(3*3*(n-1)), lower.tail=TRUE)
#Scheffe
S = sqrt((a-1)*qf((1-(.01/2)),df1=(a*b-1),df2=(a*b*(n-1))))
#Tukey
Tu = (1/sqrt(2))*qtukey(1-.01,nmeans=a*b,df=a*b*(n-1))
#Scheffe is the most efficent procedure for comparison therefore we will use it
Upperbound = (D_hat + S*std_dev_Dhat)
Lowerbound = (D_hat - S*std_dev_Dhat)
CI = c(Lowerbound,Upperbound)
print("Point Estimate")
## [1] "Point Estimate"
print(D_hat)
## [1] 12.8
print("99% CI")
## [1] "99% CI"
print(CI)
## [1] 5.907889 19.692111
This is a pair-wise comparison so we can assess the efficiency of using either Tukey, Bonferroni, or Scheffe. Scheffe was the most efficient so we use that. The point estimate for this difference is 12.8 with a 99% confidence interval of (5.907889, 19.692111).
#Establish Bonferroni multiplier
B = qt((1-(.05/6)),(a*b*(n-1)))
# Family 1 i=1 differences in j
D_hat1_2 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 2])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat1_2 + std_dev_Dhat*B, D_hat1_2 - std_dev_Dhat*B)
## [1] 18.612583 5.387417
D_hat1_3 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 3])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat1_3 + std_dev_Dhat*B, D_hat1_3 - std_dev_Dhat*B)
## [1] 8.012583 -5.212583
D_hat3_2 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 3]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 2])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat3_2 + std_dev_Dhat*B, D_hat3_2 - std_dev_Dhat*B)
## [1] 17.212583 3.987417
#Family 2 i=2 differences in j
D_hat2_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat2_1 + std_dev_Dhat*B, D_hat2_1 - std_dev_Dhat*B)
## [1] -6.187417 -19.412583
D_hat2_3 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 3])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat2_3 + std_dev_Dhat*B, D_hat2_3 - std_dev_Dhat*B)
## [1] 11.612583 -1.612583
D_hat3_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 3])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat3_1 + std_dev_Dhat*B, D_hat3_1 - std_dev_Dhat*B)
## [1] -1.187417 -14.412583
#Family 3 i=3 differences in j
D_hat3_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 3]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 1])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat3_1 + std_dev_Dhat*B, D_hat3_1 - std_dev_Dhat*B)
## [1] -3.987417 -17.212583
D_hat3_2 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 3]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 2])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat3_2 + std_dev_Dhat*B, D_hat3_2 - std_dev_Dhat*B)
## [1] -4.587417 -17.812583
D_hat2_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 2]) - mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 1])
std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
c(D_hat2_1 + std_dev_Dhat*B, D_hat2_1 - std_dev_Dhat*B)
## [1] 7.212583 -6.012583
The difference in minutes to repair for technician 1 for repairing drive make 1 minus drive make 2 was between (18.612583 , 5.387417). For drive make 1 minus drive make 3 between (8.012583, -5.212583) (not significant) and for drive make 3 minus drive make 2 between (17.212583, 3.987417)
The difference in minutes to repair for technician 2 for repairing drive make 1 minus drive make 2 was between (-6.187417, -19.412583). For drive make 2 minus drive make 3 between (11.612583, -1.612583) (not significant) and for drive make 1 minus drive make 3 between (-1.187417, -14.412583)
The difference in minutes to repair for technician 3 for repairing drive make 3 minus drive make 1 was between (-3.987417, -17.212583). For drive make 3 minus drive make 2 between (-4.587417, -17.812583) and for drive make 2 minus drive make 3 between (7.212583, -6.012583) (not significant).
The only statistically non-significant differences in minutes to repair found were: for technician 1, the difference between drive make 1 minus drive make 3 between (8.012583, -5.212583), for technician 2 the difference between drive make 2 minus drive make 3 between (11.612583, -1.612583), and for technician 3 the difference between drive make 2 minus drive make 3 between (7.212583, -6.012583).
#Essentially were tasked with evaluating a confidence interval for a linear #combination where we have
#10*mu_hat_ij for each combination of i and j
#Compute the Bonferroni critical value, g=1 b/cuz we are interested in a singular
#linear combination
B=qt((1-(.01/2)),df=(a*b*(n-1)))
#Compute the standard error for this linear combination
s_Lhat <- sqrt((MSE/n)*10^2+10^2+10^2+10^2+10^2+10^2+10^2+10^2+10^2)
#Compute the point estimate (essentially sum of 10*each treatment mean)
L_hat <- (10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 2])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 3])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 3])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 1])+ 10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 2])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 3]))
L_hat
## [1] 5024
Confidence_interval = c(L_hat - B*s_Lhat, L_hat + B*s_Lhat)
print("99% Confidence Interval")
## [1] "99% Confidence Interval"
print(Confidence_interval)
## [1] 4907.34 5140.66
The expected total amount of minutes is 5024 minute with a 99% Confidence Interval between 4907.34 and 5140.66. We can thus be 99% confident that the true amount of minutes to repair these 90 disk drives with the specified allocation of 10 drives of each make to each technician is between 4907.34 minutes and 5140.66 minutes in a given week.