Three exam scores (Exam 1, Exam 2, Final Exam) from a Math 120 course are to be analyzed from a previous semester. In particular, 95% simultaneous confidence intervals for the mean score of each exam. The only issue is that we have missing data (e.g. a student was missing on the exam data and never rescheduled).
To remedy this problem we will use the EM Algorithm to impute these missing values and ultimately compute our MLE estimates for \(\mu\) and \(\Sigma\).
We initialize our EM algorithm with the following starting values:
\[ \mu_{initial} = (75.33793, 78.77305, 78.02055)^T \\ \Sigma_{initial} = \begin{bmatrix} 193.86961 & 76.58623 & 84.08576 \\ 76.58623 & 60.20492 & 41.89544\\ 84.08576 & 41.89544 & 57.77959\\ \end{bmatrix} \]
## [1] "Iter mu1 mu3 sig.11 sig.13 ||grad||"
## [1] "------------------------------------------------------------------------------------------------"
## [1] " 0 75.33793 78.02055 193.86961 84.08576 NA"
## [1] " 1 75.35330 78.13977 197.64865 87.71555 1.37566"
## [1] " 2 75.35212 78.14707 197.58252 87.82320 0.17547"
## [1] " 3 75.35198 78.14771 197.57540 87.83149 0.02086"
## $mu
## [1] 75.35195 78.69630 78.14778
##
## $sigma
## [,1] [,2] [,3]
## [1,] 197.57479 83.63317 87.83247
## [2,] 83.63317 64.84842 47.27422
## [3,] 87.83247 47.27422 59.88321
##
## $iterations
## [1] 8
As we can see in the console output above, our algorithm took 8 iterations to converge and the estimates for \(\mu\) and \(\Sigma\) can be seen above. These estimates will provide us the necessary values for our simultaneous confidence intervals.
## Exam Type Lower Upper
## 1 Exam 1 72.0841315300859 78.6197608750794
## 2 Exam 2 76.8241485389077 80.5684548145653
## 3 Final Exam 76.3487297605574 79.9468376885862
Above we can see the simultaneous confidence intervals graphically overlayed in each of the exam score distributions and numerically in the table console output
#gradient functions used
diff.sig <- function(A) {
row.list = list()
row.list[[1]] = (-1/2)*A[1,1]
for(i in 2:nrow(A)) {
current.row = A[i,]
for (j in 1:i) {
row.list[[i]] = c(-1*current.row[1:(j-1)], (-1/2)*current.row[j])
}
}
return(as.matrix(unlist(row.list)))
}
#functions used
listOfE.x <- function(data, mu, sigma, list.observ) {
list.E.x = list()
n = nrow(data)
p = ncol(data)
num.variables = c(1:p)
for(i in 1:n) {
E.yobs = matrix(-100, nrow=p)
obs = list.observ[[i]]
y.obs = as.numeric(data[i,obs])
E.yobs[obs, ] = y.obs
if(length(obs) != p) { #incomplete data in this row
mis = setdiff(num.variables, obs)
mu.miss = matrix(mu[mis], ncol=1)
sigma.OO = matrix(sigma[obs,obs], nrow=length(obs), ncol=length(obs))
sigma.MO = matrix(sigma[mis,obs], nrow=length(mis), ncol=length(obs))
sigma.MM = matrix(sigma[mis,mis], nrow=length(mis), ncol=length(mis))
sigOO.inv = solve(sigma.OO)
E.ymiss = mu.miss + sigma.MO %*% sigOO.inv %*% matrix(y.obs - mu[obs],ncol=1)
E.ymiss = as.numeric(E.ymiss)
E.yobs[mis, ] = E.ymiss
}
list.E.x[[i]] = E.yobs
} #end for loop
return(list.E.x)
}
E.Xbar.star <- function(listOfE.x, n, p) {
val = matrix(0,nrow=p)
for(i in 1:n) {
val = val + listOfE.x[[i]]
}
return(val/n)
}
listOfS.x <- function(data, mu, sigma, list.observ, list.Eyobs) {
list.S.x = list()
n = nrow(data)
p = ncol(data)
num.variables = c(1:p)
for(i in 1:n) {
obs = list.observ[[i]]
y.obs = as.numeric(data[i,obs])
y.obs.mat = matrix(y.obs, ncol=1)
E.yobs.yobst = y.obs.mat %*% t(y.obs.mat)
if(length(obs) != p) { #incomplete data in this row
#jth column equal
E.yobs = list.Eyobs[[i]]
mat = matrix(-20, nrow=p, ncol=p)
for(j in 1:p) {
mat[,j] = E.yobs[j,] * E.yobs
}
#calculate E.ymiss
mis = setdiff(num.variables, obs)
mu.miss = matrix(mu[mis], ncol=1)
sigma.OO = matrix(sigma[obs,obs], nrow=length(obs), ncol=length(obs))
sigma.MO = matrix(sigma[mis, obs], nrow=length(mis), ncol=length(obs))
sigma.MM = matrix(sigma[mis,mis], nrow=length(mis), ncol=length(mis))
sigOO.inv = solve(sigma.OO)
E.ymiss = mu.miss + sigma.MO %*% sigOO.inv %*% matrix(y.obs - mu[obs],ncol=1)
#low right
low.right = sigma.MM - sigma.MO %*% sigOO.inv %*% t(sigma.MO) + (E.ymiss %*% t(E.ymiss))
#replace with low.right
mat[mis,mis] = low.right
E.S = mat
} else {
E.S = E.yobs.yobst
}
list.S.x[[i]] = E.S
} #end for loop
return(list.S.x)
}
E.S.star <- function(listOfS.x, n, p) {
val = matrix(0,nrow=p, ncol=p)
for(i in 1:n) {
val = val + listOfS.x[[i]]
}
return(val/n)
}
EM.miss <- function(data, mu.init, sig.init, maxit, tolerance) {
mu = mu.init
sigma = sig.init
header = paste0("Iter", " mu1", " mu3", " sig.11", " sig.13", " ||grad||")
print(header)
print("------------------------------------------------------------------------------------------------")
print(sprintf('%2.0f %12.5f %12.5f %12.5f %12.5f %12.5f', 0, mu[1], mu[3], sigma[1,1], sigma[1,3], NA))
list.observed = list()
p = ncol(data)
num.variables = c(1:p)
for(i in 1:nrow(data)) {
observed = setdiff(num.variables, which(is.na(data[i,])))
list.observed[[i]] = observed
}
for(it in 1:maxit) {
#E-step: Compute Xbar.star and S.star
blah = listOfE.x(data, mu, sigma, list.observed)
mu.hat = E.Xbar.star(blah, nrow(data), ncol(data))
blahS = listOfS.x(data, mu, sigma, list.observed, blah)
S.star = E.S.star(blahS, nrow(data), ncol(data))
Sig.hat = S.star - (mu.hat %*% t(mu.hat))
#check for convergence
#calculate gradient of log-likelihood. Calcualte norm. Check less than tolerance
#gradient calculation
n = nrow(data)
mu.matrix = matrix(mu, ncol=1)
siginv = solve(sigma)
gradM = n * siginv %*% (mu.matrix - mu.hat)
A = S.star - mu.matrix%*%t(mu.hat) - mu.hat%*%t(mu.matrix) + mu.matrix%*%t(mu.matrix)
B = siginv %*% (sigma - A) %*% siginv
gradS = (n) * diff.sig(B)
grad = matrix(c(gradM, gradS), ncol=1)
grad.norm = norm(grad, type='2')
#M-step: Replace mu and sigma with mu.hat and sigma.hat
mu = as.numeric(mu.hat)
sigma = Sig.hat
if (it == 1 || it == 2 || it == 3 || it == 33 || it == 34 || it == 35) {
print(sprintf('%2.0f %12.5f %12.5f %12.5f %12.5f %12.5f', it, mu[1], mu[3], sigma[1,1], sigma[1,3], grad.norm))
}
if (grad.norm < tolerance) break
}#end iteration for loop
return(list(mu = mu, sigma = sigma, iterations = it))
}