# set global options
knitr::opts_chunk$set(
  echo = TRUE, # show code 
  warning = FALSE,  
  message = FALSE
  
) 

Case Study.

Disk drive service. The staff of a service center for electronic equipment includes three technicians who specialize in repairing three widely used makes of disk drives for desktop computers. It was desired to study the effects of tecinician (factor \(A\) ) and make of disk drive (factor \(B\) ) on the service time. The data that follow show the number of minutes required to complete the repair job in a study where each technician was randomly assigned to five jobs on each make of disk drive.

The dataset used in this question is “disk drive service.txt”.

  • Column 1: the number of minutes (outcome)
  • Column 2: i (factor A levels)
  • Column 3: j (factor B levels)
  • Column 4: k

\[ \begin{array}{ccccc} & & \text { Factor } B \text { (make of drive) } \\ \begin{array}{c} \text { Factor } A \\ \text { (technician) } \end{array} & \begin{array}{l} \\ \end{array} & \begin{array}{c} j=1 \\ \text { Make } 1 \end{array} & \begin{array}{c} j=2 \\ \text { Make } 2 \end{array} & \begin{array}{c} j=3 \\ \text { Make } 3 \end{array} \\ \hline i=1 & \text { Technician 1 } & 62 & 57 & 59 \\ & & 48 & 45 & 53 \\ & & \ldots & \ldots & \ldots \\ & & 69 & 44 & 47 \\ i=2 & \text { Technician 2 } & 51 & 61 & 55 \\ & & 57 & 58 & 58 \\ & & \ldots & \ldots & \ldots \\ & & 39 & 51 & 49 \\ i=3 & \text { Technician 3 } & 59 & 58 & 47 \\ & & 65 & 63 & 56 \\ & & \ldots & \ldots & \ldots \\ & & 70 & 60 & 50 \end{array} \]

    1. Compute the least squares estimates for \(\mu_{\cdot \cdot}, \alpha_{i}, \beta_{j}, ( \alpha \beta )_{i j}\).
#First download and format data set
setwd("~/Master's Program/Classes/Winter 2022/STA 106")
Disk_df <- read.table(file = "disk drive service.txt")
colnames(Disk_df) <- c("Minutes_to_Repair","i","j","k")
View(Disk_df)

# Establish values of a, b, and n
a = 3 # number of different factor levels factor A ("i") may take
b = 3 # number of different factor levels factor B ("j") may take 
n = 5 # the number of observations for each unique factor level combinations (or #treatments)
n_tot = length(Disk_df$Minutes_to_Repair) # total sample size
n_tot
## [1] 45
# establish overall means, factor level means
#mu_dot_dot = Ybar_dot_dot_dot
mu_dot_dot = mean(Disk_df$Minutes_to_Repair)

#the main effect of factor A at i levels
alpha_i_hat <- rep(NA,a)
for (i in 1:a){
  alpha_i_hat[i] = mean(Disk_df$Minutes_to_Repair[Disk_df$i == i])-mu_dot_dot
}


#the main effect of factor B at j levels
beta_j_hat <- rep(NA, b)
for (j in 1:b){
  beta_j_hat[j] = mean(Disk_df$Minutes_to_Repair[Disk_df$j == j]) - mu_dot_dot
}

Overall mean= 55.82222

Main effect of factor A = -0.4888889, -0.5555556, 1.0444444, for i from 1 to 3 respectively

Main effect of factor B = 0.3111111, 0.7777778, -1.0888889, for j from 1 to 3 respectively

Interaction estimate of (AB)ij

#matrix of interaction effects
inter_ij_hat=matrix(0,nrow = a,ncol = b)
for(i in 1:a)
{
  for(j in 1:b)
  {
    inter_ij_hat[i,j] = mean(Disk_df$Minutes_to_Repair[Disk_df$i==i & Disk_df$j==j])-
      mean(Disk_df$Minutes_to_Repair[Disk_df$i==i])-
      mean(Disk_df$Minutes_to_Repair[Disk_df$j==j])+
      mean(Disk_df$Minutes_to_Repair)
  }
}

rownames(inter_ij_hat)=c("i=1 Technician 1","i=2 Technician 2", 
                         "i=3 Technician 3")
colnames(inter_ij_hat)=c("j=1 Drive Make 1","j=2 Drive Make 2",
                         "j=3 Drive Make 3")

inter_ij_hat
##                  j=1 Drive Make 1 j=2 Drive Make 2 j=3 Drive Make 3
## i=1 Technician 1         4.155556        -8.311111         4.155556
## i=2 Technician 2        -7.177778         5.155556         2.022222
## i=3 Technician 3         3.022222         3.155556        -6.177778
library('knitr')
library(kableExtra)

kable(mu_dot_dot, 
             "html",
             digits=2,
             col.names = NULL,
             caption = "Overall mean")%>%kable_styling()
Overall mean
55.82
kable(alpha_i_hat , 
             "html",
             digits=2,
             col.names = NULL,
             caption = "Factor A main effects")%>%kable_styling() 
Factor A main effects
-0.49
-0.56
1.04
kable(beta_j_hat , 
             "html",
             digits=2,
             col.names = NULL,
             caption = "Factor B main effects")%>%kable_styling() 
Factor B main effects
0.31
0.78
-1.09
kable(inter_ij_hat, 
             "html",
             digits=2,
             caption = "Interaction effects")%>%kable_styling() 
Interaction effects
j=1 Drive Make 1 j=2 Drive Make 2 j=3 Drive Make 3
i=1 Technician 1 4.16 -8.31 4.16
i=2 Technician 2 -7.18 5.16 2.02
i=3 Technician 3 3.02 3.16 -6.18
    1. Draw an interaction plot based on the estimated treatment means. Comment on the plot in terms of interaction effects, factor A and B main effects.
interaction.plot(x.factor=Disk_df$i,trace.factor=Disk_df$j,response = Disk_df$Minutes_to_Repair,xlab= c('Technician (factor A)'),trace.label = c('Drive Make (factor B)'))

The non-parallel lines on the interaction graph appear to indicate presence of interaction and therefore a need to parameterize in terms of treatment means

    1. Obtain the ANOVA table.
SSA = n*b*sum(alpha_i_hat^2)
SSB = n*a*sum(beta_j_hat^2)
SSAB = n*sum(inter_ij_hat^2)

# fitted value
y_hat = rep(0,n_tot)

for(g in 1:n_tot)
{
    y_hat[g] = mu_dot_dot+alpha_i_hat[Disk_df$i[g]]+beta_j_hat[Disk_df$j[g]]+inter_ij_hat[Disk_df$i[g],Disk_df$j[g]]
}

# residuals
e = Disk_df$Minutes_to_Repair-y_hat

SSE = sum(e^2)
SSTO = sum((Disk_df$Minutes_to_Repair-mu_dot_dot)^2)
AnovaTable = matrix(0,nrow = 5,ncol = 3)
AnovaTable[1,1] = SSA
AnovaTable[1,2] = a-1
AnovaTable[1,3] = SSA/(a-1)
AnovaTable[2,1] = SSB
AnovaTable[2,2] = b-1
AnovaTable[2,3] = SSB/(b-1)
AnovaTable[3,1] = SSAB
AnovaTable[3,2] = (a-1)*(b-1)
AnovaTable[3,3] = SSAB/((a-1)*(b-1))
AnovaTable[4,1] = SSE
AnovaTable[4,2] = a*b*(n-1)
AnovaTable[4,3] = SSE/(a*b*(n-1))
AnovaTable[5,1] = SSTO
AnovaTable[5,2] = n*a*b-1
AnovaTable[5,3] = '-'

AnovaTable = as.data.frame(AnovaTable)
rownames(AnovaTable) = c('Factor A','Factor B','Interaction AB','Error','Total')
colnames(AnovaTable) = c('SS','df','MS')

 
knitr::kable(AnovaTable, 
            "html",
             digits=2,
             align = "c",
             caption = "ANOVA Table")%>%kable_styling(full_width = F)
ANOVA Table
SS df MS
Factor A 24.5777777777778 2 12.2888888888889
Factor B 28.3111111111111 2 14.1555555555556
Interaction AB 1215.28888888889 4 303.822222222222
Error 1872.4 36 52.0111111111111
Total 3140.57777777778 44
    1. Test whether or not interaction effects are present at 0.01 significance level. You need to state the null and alternative hypotheses, compute the test statistic, state the decision rule and make your conclusion.
  • MSAB = SSAB/((a-1)*(b-1))
    MSE = SSE/((a*b)*(n-1))
    F_stat = (MSAB/MSE)
    #By P-Value
    pf(F_stat, df1 = ((a-1)*(b-1)), df2 = ((a*b)*(n-1)), lower.tail = FALSE)
    ## [1] 0.0009941068
    #By Critical Region
    help(qf)
    F_crit = qf((1-.01), df1 = ((a-1)*(b-1)), df2 = ((a*b)*(n-1)))
    F_stat < F_crit
    ## [1] FALSE

    Null hypothesis: All interactions for every combination of factor levels are equal to zero Alternative hypothesis: Not all interactions for every combination of factor levels are equal to zero

    Decision rule if the p-value of the F_statistic is < alpha, reject the null hypothesis.

    p-value of test statistic is 0.0009941068 which is < alpha. Thus we reject the null hypothesis that all interactions for every combination of factor levels are equal to zero. Therefore, interactions are significant to consider.

      1. Test whether or not main effects for technician and make of drive are present. Use \(\alpha=.01\) in each case and state the alternatives, decision rule, and conclusion. Is it meaningful here to test for main factor effects? Explain.
    #Testing Significance of Factor A
    MSA = (SSA/(a-1))
    MSB = (SSB/(b-1))
    F_stat = MSA/MSE
    alpha = .01
    # By 
    p_value = pf(F_stat, df1=2, df2=36, lower.tail=FALSE)
    if (p_value < alpha){
      print("P_value < .01. Reject Null Hypothesis")
      } else {print("P_value > .01. Fail to Reject Null Hypothesis")
        }
    ## [1] "P_value > .01. Fail to Reject Null Hypothesis"
    # By Critical Region
    F_crit = qf((1-.01), df1 = 2, df2 = 36)
    F_stat < F_crit
    ## [1] TRUE
    if (F_stat < F_crit){
      print("F_stat < F_crit. Fail to Reject Null Hypothesis")
      } else {print("F_stat > F_crit. Reject Null Hypothesis")
        }
    ## [1] "F_stat < F_crit. Fail to Reject Null Hypothesis"
    # F statistic does is not greater than the critical value
    
    #Testing Significance of Factor B
    F_stat = MSB/MSE
    df1 = b-1
    p_value = pf(F_stat, df1=(b-1), df2=36, lower.tail=FALSE)
    if (p_value < alpha){
      print("P_value < .01. Reject Null Hypothesis")
      } else {print("P_value > .01. Fail to Reject Null Hypothesis")
        }
    ## [1] "P_value > .01. Fail to Reject Null Hypothesis"
    #Critical Region
    F_crit = qf((1-.01), df1 = 2, df2 = 36)
    F_stat < F_crit
    ## [1] TRUE
    if (F_stat < F_crit){
      print("F_stat < F_crit. Fail to Reject Null Hypothesis")
      } else {print("F_stat > F_crit. Reject Null Hypothesis")
        }
    ## [1] "F_stat < F_crit. Fail to Reject Null Hypothesis"

    For the main effect of Factor A we utilize an F test which is calculated by MSA/MSE Null hypothesis is that main effect of factor A is equal to zero at all factor levels i. Alternative is that main effect of factor A is NOT equal to zero at all factor levels i. If the p-value of the test statistic is < alpha where alpha = .01 we reject the null hypothesis that is the main effect of factor A is equal to zero at all factor levels i. Given our p-value is NOT less than alpha we fail to reject the null hypothesis that the main effect of factor A is equal to zero at all factor levels i.

    For the main effect of Factor B we utilize an F test which is calculated by MSB/MSE Null hypothesis is that main effect of factor B is equal to zero at all factor levels j. Alternative is that main effect of factor B is NOT equal to zero at all factor levels j. If the p-value of the test statistic is < alpha where alpha = .01 we reject the null hypothesis that is the main effect of factor B is equal to zero at all factor levels i. Given our p-value is NOT less than alpha we fail to reject the null hypothesis that the main effect of factor B is equal to zero at all factor levels i.

      1. Do the results in parts (4) and (5) confirm your graphic analysis in part (2)? Yes, in the analysis of the interaction plot we see non-parallel slopes, this indicates that the average response at factor A level i is dependent on the factor B level j – or that the response of Factor A at a given value of i interacts with Factor B at a given value of j to produce an average response at factor levels ij. We also known since interaction is present we cannot represent the model with just factor level means, we need to model the treatment level means.

    By formal statistical testing in part 4 we show that by conducting an F test of MSAB/MSE that the p-value is less than are alpha=.01. Therefore we may reject the null hypothesis that the interaction term for all combination of factor levels is 0. By formal statistical testing in part 5 we show that the main effect of factor A at all factor levels and the main effect of factor B at all factor levels are both equal 0. We do this by an F test of the respective mean squares of treatment means due to factor A or B respectively.

    More detailed analyses: - (7) Estimate \(\mu_{11}\) with a 99 percent confidence interval. Interpret your interval estimate.

    Y_hat_11 <- mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1])
    
    std_dev_Y_hat_11 = (MSE/(b*5))
    T_critical = qt((1-(.01/2)),df=((a*b)*(n-1)))
    
    Upperbound = (Y_hat_11 + T_critical*std_dev_Y_hat_11)
    Lowerbound = (Y_hat_11 - T_critical*std_dev_Y_hat_11)
    CI = c(Lowerbound,Upperbound)
    print("99% CI")
    ## [1] "99% CI"
    print(CI)
    ## [1] 50.37044 69.22956

    The 99% confdience interval for the point estimate of Ybar_1,1 is (50.37044, 69.22956) with a point estimate of 59.8. We can thus say with 99% confidence that the true average response for the minutes to repair a disk drive with make 1 by technician 1 is between 50.37044 & 69.22956 minutes. Or if we were to repeat the experiment an infinite number of times 99% of the confidence intervals we generate would contain the true or population mean response for minutes to repair of a disk with make 1 by technician 1.

      1. Estimate \(D=\mu_{22}-\mu_{21}\) with a 99 percent confidence interval. Interpret your interval estimate.
    Ybar_22 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2])
    Ybar_21 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1])
    D_hat = Ybar_22 - Ybar_21
    std_dev_Dhat <- sqrt(MSE*(2/(b*n)))
    
    #Since this is a pairwise difference we can use Tukey, Scheffe, or Bonferroni
    #Determine the most efficient procedure
    
    #Bonferoni
      B = qt((1-(.01/2)), df=(3*3*(n-1)), lower.tail=TRUE)
      
      #Scheffe
      S = sqrt((a-1)*qf((1-(.01/2)),df1=(a*b-1),df2=(a*b*(n-1))))
      
      #Tukey
      Tu = (1/sqrt(2))*qtukey(1-.01,nmeans=a*b,df=a*b*(n-1))
    
    
    #Scheffe is the most efficent procedure for comparison therefore we will use it
      
      
    Upperbound = (D_hat + S*std_dev_Dhat)
    Lowerbound = (D_hat - S*std_dev_Dhat)
    CI = c(Lowerbound,Upperbound)
    print("Point Estimate")
    ## [1] "Point Estimate"
    print(D_hat)
    ## [1] 12.8
    print("99% CI")
    ## [1] "99% CI"
    print(CI)
    ## [1]  5.907889 19.692111

    This is a pair-wise comparison so we can assess the efficiency of using either Tukey, Bonferroni, or Scheffe. Scheffe was the most efficient so we use that. The point estimate for this difference is 12.8 with a 99% confidence interval of (5.907889, 19.692111).

      1. The nature of the interaction effects is to be studied by making, for each technician, all three pairwise comparisons among the disk drive makes in order to identify, if possible, the make of disk drive for which the technician’s mean service time is lowest. The family confidence coefficient for each set of three pairwise comparisons is to be 95 percent. Use the Bonferroni procedure to make all required pairwise comparisons. Summarize your findings.
    #Establish Bonferroni multiplier
    
    B = qt((1-(.05/6)),(a*b*(n-1)))
    
    # Family 1 i=1 differences in j
    D_hat1_2 =  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 2])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat1_2 + std_dev_Dhat*B, D_hat1_2 - std_dev_Dhat*B)
    ## [1] 18.612583  5.387417
    D_hat1_3 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 3])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat1_3 + std_dev_Dhat*B, D_hat1_3 - std_dev_Dhat*B)
    ## [1]  8.012583 -5.212583
    D_hat3_2 =  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 3]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 2])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat3_2 + std_dev_Dhat*B, D_hat3_2 - std_dev_Dhat*B)
    ## [1] 17.212583  3.987417
    #Family 2 i=2 differences in j
    D_hat2_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat2_1 + std_dev_Dhat*B, D_hat2_1 - std_dev_Dhat*B)
    ## [1]  -6.187417 -19.412583
    D_hat2_3 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 3])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat2_3 + std_dev_Dhat*B, D_hat2_3 - std_dev_Dhat*B)
    ## [1] 11.612583 -1.612583
    D_hat3_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 3])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat3_1 + std_dev_Dhat*B, D_hat3_1 - std_dev_Dhat*B)
    ## [1]  -1.187417 -14.412583
    #Family 3 i=3 differences in j
    D_hat3_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 3]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 1])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat3_1 + std_dev_Dhat*B, D_hat3_1 - std_dev_Dhat*B)
    ## [1]  -3.987417 -17.212583
    D_hat3_2 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 3]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 2])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat3_2 + std_dev_Dhat*B, D_hat3_2 - std_dev_Dhat*B)
    ## [1]  -4.587417 -17.812583
    D_hat2_1 = mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 2]) -  mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 1])
    std_dev_Dhat = sqrt((MSE)*(2/(b*n)))
    c(D_hat2_1 + std_dev_Dhat*B, D_hat2_1 - std_dev_Dhat*B)
    ## [1]  7.212583 -6.012583

    The difference in minutes to repair for technician 1 for repairing drive make 1 minus drive make 2 was between (18.612583 , 5.387417). For drive make 1 minus drive make 3 between (8.012583, -5.212583) (not significant) and for drive make 3 minus drive make 2 between (17.212583, 3.987417)

    The difference in minutes to repair for technician 2 for repairing drive make 1 minus drive make 2 was between (-6.187417, -19.412583). For drive make 2 minus drive make 3 between (11.612583, -1.612583) (not significant) and for drive make 1 minus drive make 3 between (-1.187417, -14.412583)

    The difference in minutes to repair for technician 3 for repairing drive make 3 minus drive make 1 was between (-3.987417, -17.212583). For drive make 3 minus drive make 2 between (-4.587417, -17.812583) and for drive make 2 minus drive make 3 between (7.212583, -6.012583) (not significant).

    The only statistically non-significant differences in minutes to repair found were: for technician 1, the difference between drive make 1 minus drive make 3 between (8.012583, -5.212583), for technician 2 the difference between drive make 2 minus drive make 3 between (11.612583, -1.612583), and for technician 3 the difference between drive make 2 minus drive make 3 between (7.212583, -6.012583).

      1. The service center currently services 30 disk drives of each of the three makes per week, with each technician servicing 10 machines of each make. Estimate the expected total amount of service time required per week to service the 90 disk drives; use a 99 percent confidence interval.
    #Essentially were tasked with evaluating a confidence interval for a linear #combination where we have 
    #10*mu_hat_ij for each combination of i and j
    
    #Compute the Bonferroni critical value, g=1 b/cuz we are interested in a singular
    #linear combination
    
    B=qt((1-(.01/2)),df=(a*b*(n-1)))
    
    #Compute the standard error for this linear combination
    s_Lhat <- sqrt((MSE/n)*10^2+10^2+10^2+10^2+10^2+10^2+10^2+10^2+10^2)
    
    #Compute the point estimate (essentially sum of 10*each treatment mean)
    L_hat <- (10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 1])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 2])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 1 & Disk_df$j == 3])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 1])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 2])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 2 & Disk_df$j == 3])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 1])+ 10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 2])+10*mean(Disk_df$Minutes_to_Repair[Disk_df$i == 3 & Disk_df$j == 3]))
    
    L_hat
    ## [1] 5024
    Confidence_interval = c(L_hat - B*s_Lhat, L_hat + B*s_Lhat)
    print("99% Confidence Interval")
    ## [1] "99% Confidence Interval"
    print(Confidence_interval)
    ## [1] 4907.34 5140.66

    The expected total amount of minutes is 5024 minute with a 99% Confidence Interval between 4907.34 and 5140.66. We can thus be 99% confident that the true amount of minutes to repair these 90 disk drives with the specified allocation of 10 drives of each make to each technician is between 4907.34 minutes and 5140.66 minutes in a given week.