Data/Task Description

Three exam scores (Exam 1, Exam 2, Final Exam) from a previous semester of a Math 120 course are to be analyzed. In particular, we want 95% simultaneous confidence intervals for the mean score of each exam. The only complication is missing data (e.g. a student missed an exam and never rescheduled).

To remedy this problem we will use the EM algorithm, which fills in the missing values with their conditional expectations at each step and ultimately yields the maximum likelihood estimates (MLEs) of \(\mu\) and \(\Sigma\).
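
Concretely, write each student's score vector as \(Y = (Y_O, Y_M)\), with \(O\) the observed and \(M\) the missing components. At each E-step the missing block is replaced by its conditional moments given the observed block, evaluated at the current parameter values (these are exactly the quantities computed by listOfE.x and listOfS.x in the appendix):

\[ E[Y_M \mid y_O] = \mu_M + \Sigma_{MO}\Sigma_{OO}^{-1}(y_O - \mu_O) \\ E[Y_M Y_M^T \mid y_O] = \Sigma_{MM} - \Sigma_{MO}\Sigma_{OO}^{-1}\Sigma_{OM} + E[Y_M \mid y_O]\,E[Y_M \mid y_O]^T \]

The M-step then updates \(\hat{\mu} = \bar{X}^*\) and \(\hat{\Sigma} = S^* - \hat{\mu}\hat{\mu}^T\), where \(\bar{X}^*\) and \(S^*\) are the averages of the completed first and second moments over all students.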

EM Algorithm Results

We initialize our EM algorithm with the following starting values:

\[ \mu_{initial} = (75.33793, 78.77305, 78.02055)^T \\ \Sigma_{initial} = \begin{bmatrix} 193.86961 & 76.58623 & 84.08576 \\ 76.58623 & 60.20492 & 41.89544\\ 84.08576 & 41.89544 & 57.77959\\ \end{bmatrix} \]
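
Starting values like these could come, for example, from the available-case means and pairwise covariances of the raw scores. A minimal sketch (the data frame name exam.scores is hypothetical; any table of scores with NA for missing entries would do):

#hypothetical: exam.scores holds the three score columns, NA where missing
mu.init  <- colMeans(exam.scores, na.rm = TRUE)             #available-case means
sig.init <- cov(exam.scores, use = "pairwise.complete.obs") #pairwise covariances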

## [1] "Iter         mu1         mu3           sig.11    sig.13           ||grad||"
## [1] "------------------------------------------------------------------------------------------------"
## [1] " 0        75.33793       78.02055      193.86961        84.08576              NA"
## [1] " 1        75.35330       78.13977      197.64865        87.71555           1.37566"
## [1] " 2        75.35212       78.14707      197.58252        87.82320           0.17547"
## [1] " 3        75.35198       78.14771      197.57540        87.83149           0.02086"
## $mu
## [1] 75.35195 78.69630 78.14778
## 
## $sigma
##           [,1]     [,2]     [,3]
## [1,] 197.57479 83.63317 87.83247
## [2,]  83.63317 64.84842 47.27422
## [3,]  87.83247 47.27422 59.88321
## 
## $iterations
## [1] 8

As the console output above shows, the algorithm converged in 8 iterations; the final estimates of \(\mu\) and \(\Sigma\) are listed under $mu and $sigma. These estimates supply the values needed for the simultaneous confidence intervals.
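
Using the EM.miss function from the appendix, the run above corresponds to a call of the following form (the data frame name and the maxit/tolerance values here are illustrative, not taken from the output):

#illustrative call; exam.scores, maxit, and tolerance are assumptions
fit <- EM.miss(exam.scores, mu.init, sig.init, maxit = 100, tolerance = 1e-4)
fit$mu         #estimated mean vector
fit$sigma      #estimated covariance matrix
fit$iterations #number of EM iterations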

Simultaneous Confidence Intervals

##    Exam Type            Lower            Upper
## 1     Exam 1 72.0841315300859 78.6197608750794
## 2     Exam 2 76.8241485389077 80.5684548145653
## 3 Final Exam 76.3487297605574 79.9468376885862

Above, the simultaneous confidence intervals can be seen overlaid graphically on each exam score distribution and listed numerically in the table output.
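
One standard construction for such intervals is the Hotelling \(T^2\) projection interval \(\hat{\mu}_i \pm \sqrt{\tfrac{p(n-1)}{n-p}F_{p,n-p}(0.95)}\sqrt{\hat{\sigma}_{ii}/n}\). A minimal sketch of the computation, assuming the EM estimates above and treating the sample size n as given (n does not appear in the output):

#sketch: T^2 simultaneous CIs from EM estimates; n is an assumption here
simultaneous.ci <- function(mu.hat, Sigma.hat, n, level = 0.95) {
  p <- length(mu.hat)
  crit <- p * (n - 1) / (n - p) * qf(level, p, n - p) #T^2 critical value
  half <- sqrt(crit * diag(Sigma.hat) / n)            #interval half-widths
  data.frame(Lower = mu.hat - half, Upper = mu.hat + half)
}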

3D Plot of the Confidence Volume
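
The 95% confidence region for \(\mu\) is the ellipsoid \(\{\mu : n(\hat{\mu}-\mu)^T\hat{\Sigma}^{-1}(\hat{\mu}-\mu) \le \tfrac{p(n-1)}{n-p}F_{p,n-p}(0.95)\}\). A sketch of how it could be rendered in 3D (the rgl package and the sample size n are assumptions):

#sketch: draw the 95% confidence ellipsoid for mu with rgl; n is an assumption
library(rgl)
p    <- 3
crit <- p * (n - 1) / (n - p) * qf(0.95, p, n - p)
ell  <- ellipse3d(Sigma.hat / n, centre = as.numeric(mu.hat), t = sqrt(crit))
plot3d(ell, col = "steelblue", alpha = 0.4,
       xlab = "Exam 1", ylab = "Exam 2", zlab = "Final Exam")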

Appendix: EM Algorithm Code

#gradient helper: stacks the lower triangle of a symmetric matrix A into a
#column vector, scaling off-diagonal entries by -1 and diagonal entries by
#-1/2 (the derivative pattern for the distinct elements of Sigma)
diff.sig <- function(A) {
  row.list = list()
  row.list[[1]] = (-1/2)*A[1,1]
  for(i in 2:nrow(A)) {
    current.row = A[i,]
    row.list[[i]] = c(-1*current.row[1:(i-1)], (-1/2)*current.row[i])
  }
  return(as.matrix(unlist(row.list)))
}

#E-step helper: for each student, returns E[Y | y_obs] as a p x 1 matrix.
#Observed entries are kept as-is; missing entries are replaced by the
#conditional mean mu_M + Sigma_MO Sigma_OO^{-1} (y_obs - mu_O)
listOfE.x <- function(data, mu, sigma, list.observ) {
  list.E.x = list()
  n = nrow(data)
  p = ncol(data)
  num.variables = c(1:p)
  for(i in 1:n) {
    E.yobs = matrix(-100, nrow=p) #placeholder values, overwritten below

    obs = list.observ[[i]]
    y.obs = as.numeric(data[i,obs])
    E.yobs[obs, ] = y.obs
    if(length(obs) != p) { #incomplete data in this row
      mis = setdiff(num.variables, obs)
      mu.miss = matrix(mu[mis], ncol=1)
      sigma.OO = matrix(sigma[obs,obs], nrow=length(obs), ncol=length(obs))
      sigma.MO = matrix(sigma[mis,obs], nrow=length(mis), ncol=length(obs))
      sigOO.inv = solve(sigma.OO)
      #conditional mean of the missing block given the observed block
      E.ymiss = mu.miss + sigma.MO %*% sigOO.inv %*% matrix(y.obs - mu[obs],ncol=1)
      E.yobs[mis, ] = as.numeric(E.ymiss)
    }
    list.E.x[[i]] = E.yobs
  } #end for loop
  return(list.E.x)
}

#averages the completed first moments: Xbar* = (1/n) sum_i E[Y_i | y_obs,i]
E.Xbar.star <- function(listOfE.x, n, p) {
  val = matrix(0,nrow=p)
  for(i in 1:n) {
    val = val + listOfE.x[[i]]
  }
  return(val/n)
}

#E-step helper: for each student, returns E[Y Y^T | y_obs] as a p x p matrix.
#For the missing block it uses
#  E[Y_M Y_M^T | y_obs] = Sigma_MM - Sigma_MO Sigma_OO^{-1} Sigma_OM
#                         + E[Y_M | y_obs] E[Y_M | y_obs]^T
listOfS.x <- function(data, mu, sigma, list.observ, list.Eyobs) {
  list.S.x = list()
  n = nrow(data)
  p = ncol(data)
  num.variables = c(1:p)
  for(i in 1:n) {
    obs = list.observ[[i]]
    y.obs = as.numeric(data[i,obs])
    y.obs.mat = matrix(y.obs, ncol=1)
    E.yobs.yobst = y.obs.mat %*% t(y.obs.mat)
    if(length(obs) != p) { #incomplete data in this row

      #start from the outer product of the completed mean vector
      E.yobs = list.Eyobs[[i]]
      mat = E.yobs %*% t(E.yobs)

      #conditional mean of the missing block
      mis = setdiff(num.variables, obs)
      mu.miss = matrix(mu[mis], ncol=1)
      sigma.OO = matrix(sigma[obs,obs], nrow=length(obs), ncol=length(obs))
      sigma.MO = matrix(sigma[mis, obs], nrow=length(mis), ncol=length(obs))
      sigma.MM = matrix(sigma[mis,mis], nrow=length(mis), ncol=length(mis))
      sigOO.inv = solve(sigma.OO)
      E.ymiss = mu.miss + sigma.MO %*% sigOO.inv %*% matrix(y.obs - mu[obs],ncol=1)

      #lower-right block: conditional covariance plus mean outer product
      low.right = sigma.MM - sigma.MO %*% sigOO.inv %*% t(sigma.MO) + (E.ymiss %*% t(E.ymiss))

      #replace with low.right
      mat[mis,mis] = low.right
      E.S = mat
    } else {
      E.S = E.yobs.yobst
    }
    list.S.x[[i]] = E.S
  } #end for loop
  return(list.S.x)
}

#averages the completed second moments: S* = (1/n) sum_i E[Y_i Y_i^T | y_obs,i]
E.S.star <- function(listOfS.x, n, p) {
  val = matrix(0,nrow=p, ncol=p)
  for(i in 1:n) {
    val = val + listOfS.x[[i]]
  }
  return(val/n)
}

EM.miss <- function(data, mu.init, sig.init, maxit, tolerance) {

  mu = mu.init
  sigma = sig.init

  header = paste0("Iter", "         mu1", "         mu3", "           sig.11", "    sig.13", "           ||grad||")
  print(header)
  print("------------------------------------------------------------------------------------------------")
  print(sprintf('%2.0f    %12.5f   %12.5f   %12.5f    %12.5f    %12.5f', 0,  mu[1], mu[3], sigma[1,1], sigma[1,3], NA))

  #record which variables are observed in each row
  list.observed = list()
  p = ncol(data)
  num.variables = c(1:p)
  for(i in 1:nrow(data)) {
    list.observed[[i]] = setdiff(num.variables, which(is.na(data[i,])))
  }

  for(it in 1:maxit) {
    #E-step: compute Xbar.star and S.star from the conditional moments
    list.Ex = listOfE.x(data, mu, sigma, list.observed)
    mu.hat = E.Xbar.star(list.Ex, nrow(data), ncol(data))

    list.Sx = listOfS.x(data, mu, sigma, list.observed, list.Ex)
    S.star = E.S.star(list.Sx, nrow(data), ncol(data))
    Sig.hat = S.star - (mu.hat %*% t(mu.hat))

    #check for convergence: calculate the gradient of the log-likelihood at
    #the current parameters, then its norm, and compare against the tolerance
    n = nrow(data)
    mu.matrix = matrix(mu, ncol=1)
    siginv = solve(sigma)
    gradM = n * siginv %*% (mu.matrix - mu.hat)

    A = S.star - mu.matrix%*%t(mu.hat) - mu.hat%*%t(mu.matrix) + mu.matrix%*%t(mu.matrix)
    B = siginv %*% (sigma - A) %*% siginv
    gradS = n * diff.sig(B)

    grad = matrix(c(gradM, gradS), ncol=1)
    grad.norm = norm(grad, type='2')

    #M-step: replace mu and sigma with mu.hat and Sig.hat
    mu = as.numeric(mu.hat)
    sigma = Sig.hat
    if (it %in% c(1:3, 33:35)) { #print only selected iterations
      print(sprintf('%2.0f    %12.5f   %12.5f   %12.5f    %12.5f      %12.5f', it,  mu[1], mu[3], sigma[1,1], sigma[1,3], grad.norm))
    }

    if (grad.norm < tolerance) break
  }#end iteration for loop
  return(list(mu = mu, sigma = sigma, iterations = it))
}