This R Markdown document presents the code for the penalized mixture of logistic regressions. First, a simple simulation example is presented. Then, the code is detailed: the internal functions are introduced and the main functions of the algorithm are shown.

Simulation Example

We present the use of our EM algorithm to perform penalized mixture of logistic regressions, with a simulation example.

library(mvtnorm)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(matrixcalc)
library(glasso)
library(MASS)
library(brglm2)
library(fields)    # provides image.plot, used below to display the estimated precision matrices

The example presented here is based on a simulated data set with 3 clusters: two clusters share the same mean, but the three regression vectors are all different. The simulation parameters are:

K      = 3
p      = 40
N      = 500

mu1    = c(rep(1, p/2), rep(2, p/2)) 
mu2    = c(rep(1, p))
mu3    = c(rep(1, p/2), rep(2, p/2))
mu     = as.matrix(data.frame(mu_1 = mu1, mu_2 = mu2, mu_3 = mu3))

beta1  = c(-2, 1, 0.5, 1, rep(0, 32), -1, -0.2, 0.5, 0.2)
beta2  = c(rep(0, 5), 1, -0.5, -0.5, -1, rep(0, 22), 1, -0.2, 1, -0.5, rep(0, 5))
beta3  = c(rep(0, 10), -2, 0.5, -2, -1, 1.5, 1, 2, -1, rep(0, 22))
beta   = as.matrix(data.frame(beta_1 = beta1, beta_2 = beta2, beta_3 = beta3))

SIGMA1 = 2*diag(p)/3
SIGMA2 = 2*diag(p)/3
SIGMA3 = toeplitz(c(1, 0.4, 0.1, 0.1, rep(0, 36)))
SIGMA  = list(SIGMA1, SIGMA2, SIGMA3)
pik    = c(0.3, 0.3, 0.4)

The data are simulated according to the mixture of regressions model.
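Explicitly, each observation is drawn from the following hierarchical model (written out here for reference, with the same notation as the simulation parameters above: pik, mu, SIGMA and beta):

$$
Z_i \sim \mathcal{M}(1; \pi_1, \ldots, \pi_K), \qquad
X_i \mid Z_i = k \sim \mathcal{N}_p(\mu_k, \Sigma_k), \qquad
\mathbb{P}(Y_i = 1 \mid X_i, Z_i = k) = \frac{\exp(X_i^\top \beta_k)}{1 + \exp(X_i^\top \beta_k)}.
$$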

set.seed(1)
classe = c()
pY     = c()
X      = matrix(NA, ncol = p, nrow = N)

for (i in 1:N) {
  k         = sample(1:K, 1, prob = pik)
  classe[i] = k
  X[i, ]    = rmvnorm(1, mean = mu[, k], sigma = SIGMA[[k]])
  pY[i]     = exp(X[i, ] %*% beta[, k]) / (1 + exp(X[i, ] %*% beta[, k]))
}
# Draw the binary responses in one pass from the simulated probabilities
U = runif(N)
Y = as.numeric(U < pY)
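A quick sanity check of the simulated data (a minimal sketch, not needed by the rest of the procedure): the cluster sizes should be roughly N * pik and the binary response should not be degenerate.

table(classe)   # simulated cluster sizes
mean(Y)         # proportion of ones in the simulated response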

The estimation procedure, including the selection of the tuning parameters, can be run with:

t0      = Sys.time()
res    = procedure_selec_model(Y = Y, X = X, Mmin = 2, Mmax = 2, length.lambda = 5, length.rho = 5)
t1      = Sys.time()

The estimated parameters are stored in the object “mod”, the model retained with the selected tuning parameters:

mod    = res$res
mod$pik
mod$clustering
image.plot(mod$THETA[[1]])

Then, predictions can be made for a new simulated data set:

classet = c()
pYt     = c()
Xt      = matrix(NA, ncol = p, nrow = N)

for (i in 1:N) {
  k          = sample(1:K, 1, prob = pik)
  classet[i] = k
  Xt[i, ]    = rmvnorm(1, mean = mu[, k], sigma = SIGMA[[k]])
  pYt[i]     = exp(Xt[i, ] %*% beta[, k]) / (1 + exp(Xt[i, ] %*% beta[, k]))
}
# Draw the binary responses in one pass from the simulated probabilities
Ut = runif(N)
Yt = as.numeric(Ut < pYt)

pred   = prediction(Xt, mod, logit = TRUE) 
boxplot(pred$Yhnew ~ Yt)
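Beyond the boxplot, a rough quantitative check can be made on this simulated test set (a minimal sketch; the 0.5 threshold is an arbitrary choice and the estimated cluster labels may be permuted with respect to the simulated ones):

mean((pred$Yhnew > 0.5) == Yt)   # accuracy when thresholding the predicted probabilities at 0.5
table(pred$classif, classet)     # estimated clusters versus simulated clusters, up to label switching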

EM algorithm

In this part, all the functions used to perform the penalized mixture of logistic regressions are presented.

Internal functions

EM_converged: to check the convergence of the EM algorithm through the relative increase of the log-likelihood.

EM_converged <- function(loglik, previous_loglik, threshold = 1e-4) {

  converged = 0;
  decrease  = 0;
  if (!(previous_loglik == -Inf)) {
    if (loglik - previous_loglik < -1e-3) # allow for a little imprecision
        {
        print(c("******likelihood decreased from ",previous_loglik, " to ", loglik), quote = FALSE)
        decrease = 1;
        }

    delta_loglik = abs(loglik - previous_loglik);
    avg_loglik   = (abs(loglik) + abs(previous_loglik) + threshold)/2;
    bb = ((delta_loglik/avg_loglik) < threshold)
    if (bb) {converged = 1}
    }

  res <- NULL
  res$converged <- converged
  res$decrease  <- decrease
  return(res)

}
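The criterion is the relative change of the log-likelihood, |loglik - previous_loglik| divided by the average of their absolute values. A minimal usage sketch with made-up log-likelihood values:

EM_converged(loglik = -100.000, previous_loglik = -100.001)$converged  # 1: relative change below the threshold
EM_converged(loglik =  -90.000, previous_loglik = -100.000)$converged  # 0: the likelihood is still increasing markedly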

NR_logit_w: to compute the logistic regression coefficients using a Newton-Raphson method, when no LASSO penalization is performed.

NR_logit_w <- function(y, X, beta = NULL, w = rep(1, length(y))/length(y), maxiter = 100, epsilon = 1e-5){
    options(warn = -1)
    if (is.null(beta)){
        beta = coefficients(lm(y ~ X-1, weights = w))
    }
    beta0 = beta
    options(warn = 0)
    err  = 2*epsilon
    iter = 0
    yw   = y*w
    ll   = NULL
    while ((err > epsilon) & (iter < maxiter)){
        iter = iter + 1
        beta.old = beta
        e  = exp(X %*% beta)
        p  = e/(1+e) 
        pr = prod(c(p * (1 - p) * w))
        if (pr < 1e-30 | is.na(pr)){
            beta = coefficients(lm(y ~ X-1, weights = w))
            warning("Completely separable pb...")
            break
        }
        V    = diag(c(p*(1-p)*w))
        mu   = p*w
        beta = beta + solve((t(X) %*% V %*% X), t(X) %*% (yw-mu))
        ll   = c(ll, t(yw) %*% X %*% beta - matrix(w, 1, length(y)) %*% log(1 + exp(X %*% beta)))
        err  = mean(abs(beta-beta.old)/abs(beta.old))
    }
    return(list(beta = beta, ll = ll, beta0 = beta0))
}

NR_logit_w_lasso: to force a penalization of the regression coefficient estimates when necessary (e.g., in case of complete separation).

NR_logit_w_lasso <- function(y, X, beta = NULL, w = rep(1, length(y))/length(y), maxiter = 10, epsilon = 1e-5, lambda = .1){
    if (is.null(beta)){
        beta = coefficients(lm(y ~ X-1, weights = w))
        kp   = which(abs(beta) > lambda)
    } else
    {kp  = which(!(beta == 0))}
    err  = 2*epsilon
    iter = 0
    yw   = y*w
    ll   = NULL
    
    while ((err > epsilon) & (iter < maxiter)){
        iter     = iter + 1
        beta.old = beta
        e        = exp(X %*% beta)
        p        = e/(1+e) 
        pr       = prod(c(p * (1 - p) * w))
        if (pr<1e-30 | is.na(pr)){
            beta   = coefficients(lm(y ~ X-1, weights = w))
            warning("Completely separable pb...")
            break
        }
        V        = diag(c(p*(1-p)*w))
        mu       = p*w
        score1   = t(X) %*% (yw - mu) + lambda*sign(beta)
        score2   = t(X) %*% V %*% X
        kp       = which(abs(beta) > lambda) 
        beta[kp] = beta[kp] + solve(score2[kp, kp], score1[kp])
        ll       = c(ll, t(yw) %*% X %*% beta - matrix(w, 1, length(y)) %*% log(1 + exp(X %*% beta)))
        err      = mean(abs(beta[kp] - beta.old[kp])/abs(beta.old[kp]))
    }
    beta[-kp]  = 0 * beta[-kp]
    return(list(beta = beta, ll = ll))
}

Main functions

The algorithm is initialized with repeated k-means clusterings followed by a few iterations of E and M steps. For this initialization step, 3 functions are necessary: an M-step function adapted to the initialization, and the E-step and M-step functions used later in the EM algorithm.

Maximization step used for the initialization:

MstepMMbinit = function(X, espt, rho = rho, THETA = THETA, maxiter = 10, epsilon = 1e-5){
  n = dim(X)[1]
  p = dim(X)[2]
  M = dim(espt)[2]
  
  # Z : pi (proportion of each cluster)
  pik     = sapply(1:M, function(k) {(1/n) * sum(espt[, k])}) 
  
  # X : Gaussian parameters
  # mu: weighted mean of the covariates in each cluster
  mu      = sapply(1:M, function(k) sapply(1:p, function(j) sum(espt[, k] * X[, j]) / sum(espt[, k]))) 
  # SIGMA: weighted covariance matrix of each cluster (graphical lasso when rho != 0)
  SIGMA   = lapply(1:M, function(k) Reduce('+', lapply (1:n, function(i) sapply(1:p, function(j) sapply(1:p, function(l) (X[i, j] - mu[j, k]) * (X[i, l] - mu[l, k]))) * espt[i, k])) / sum(espt[, k]))
  if (sum(abs(rho)) != 0){
    SIGMA = lapply(1:M, function(k) glasso(SIGMA[[k]], rho = rho[k], wi.init = THETA)$w)
    THETA = lapply(1:M, function(k) glasso(SIGMA[[k]], rho = rho[k], wi.init = THETA)$wi)
  } else {THETA = lapply(1:M, function(k) solve(SIGMA[[k]]))}
  
  # llik 
  llik = sum(sapply(1:n, function(i) log(sum(sapply(1:M, function(k) pik[k]* dmvnorm(X[i, ], mean = mu[, k], sigma = SIGMA[[k]]))))))
  
  ddltheta   = sapply(1:M, function(m) sum(abs(THETA[[m]])>0))
  npar.THETA = sum(sapply(1:M, function(m) (ddltheta[m]+p)/2))
  
  npar = M*p + M - 1 + npar.THETA
  BIC  = (-2 * llik[length(llik)]) + (npar * log(n))
  
  return(list(pik = pik, mu = mu, SIGMA = SIGMA, THETA = THETA, llik = llik, BIC = BIC))
}

Maximization step: to estimate the parameters

MstepMMb = function(Y, X, beta = beta, espt, Intercept = FALSE, lasso = lasso, lambda = lambda, rho = rho, THETA = THETA, maxiter = 10, epsilon = 1e-5){
  n = dim(X)[1]
  p = dim(X)[2]
  M = dim(espt)[2]
  
  classetemp = apply(espt, 1, which.max)
  
  if (Intercept) {X1 = cbind(1, X)}
  else {X1 = X}
  
  # Z : pi (proportion of each cluster)
  pik    = sapply(1:M, function(k) {(1/n) * sum(espt[, k])}) 
  
  # X : Gaussian parameters
  # mu: weighted mean of the covariates in each cluster
  mu     = sapply(1:M, function(k) sapply(1:p, function(j) sum(espt[, k] * X[ ,j]) / sum(espt[, k]))) 
  # SIGMA: weighted covariance matrix of each cluster (graphical lasso when rho != 0)
  SIGMA  = lapply(1:M, function(k) Reduce('+', lapply (1:n, function(i) sapply(1:p, function(j) sapply(1:p, function(l) (X[i, j] - mu[j, k]) * (X[i, l] - mu[l, k]))) * espt[i, k])) / sum(espt[, k]))
  if (sum(abs(rho)) != 0){
    SIGMA = lapply(1:M, function(k) glasso(SIGMA[[k]], rho = rho[k], wi.init = THETA)$w)
    THETA = lapply(1:M, function(k) glasso(SIGMA[[k]], rho = rho[k], wi.init = THETA)$wi)
  } else {THETA = lapply(1:M, function(k) solve(SIGMA[[k]]))}
  
  # Y : logistic regression parameters
  # beta: weighted Newton-Raphson fit, or weighted glmnet fit when lasso = TRUE
  if (lasso == FALSE) {
    if (Intercept) {X1 = cbind(1, X)}
    else {X1 = X}
    beta = sapply(1:M, function(k) NR_logit_w(Y, X1, beta = beta[, k], w = espt[, k])$beta) 
    for (k in 1:M){
      if (glm(Y[which(classetemp == k)]~., data = as.data.frame(X[which(classetemp == k), ]), family = binomial, method = "detect_separation")$separation == TRUE) {
        beta[, k] = NR_logit_w_lasso(Y, X1, beta = beta[, k], w = espt[, k], lambda = .5)$beta
      } else { beta[, k] = NR_logit_w(Y, X1, beta = beta[, k], w = espt[, k])$beta }
    }
  } else {
    if (Intercept) {
      beta = sapply(1:M, function(k) as.matrix(coef(glmnet(X, Y, family = "binomial", weights = espt[, k], standardize = TRUE, lambda = lambda[k]))))
    } else {
      beta = sapply(1:M, function(k) as.matrix(coef(glmnet(X, Y, family = "binomial", weights = espt[, k], standardize = TRUE, lambda = lambda[k], intercept = FALSE))))
      beta = as.matrix(beta[-1, ])
    }
  }
  
  # llik 
  pbx  = sapply(1:M, function(k) sapply(1:n, function(i) exp(sum(X1[i, ] * beta[, k]))/ (1 + exp(sum(X1[i, ] * beta[, k])))))
  llik = sum(sapply(1:n, function(i) log(sum(sapply(1:M, function(k) pik[k]* dbinom(Y[i], size = 1, prob = pbx[i, k]) * dmvnorm(X[i, ], mean = mu[, k], sigma = SIGMA[[k]]))))))
  
  return(list(pik = pik, mu = mu, SIGMA = SIGMA, THETA = THETA, beta = beta, llik = llik, lambda = lambda, rho = rho))
}

Expectation step: to compute the posterior probabilities

EstepMMb = function(Y, X, pik, mu, SIGMA, beta, M, Intercept = FALSE){
  if (Intercept) {X1 = cbind(1, X)}
  else {X1 = X}
  
  n = dim(X1)[1]
  p = dim(X1)[2]
  
  if (M == 1) {
    espt = as.matrix(rep(1, nrow(X)))
  } else {
    pbx  = sapply(1:M, function(k) sapply(1:n, function(i) exp(sum(X1[i, ] * beta[, k]))/ (1 + exp(sum(X1[i, ] * beta[, k])))))
    espt = sapply(1:M, function(k) sapply(1:n, function(i) pik[k] * dbinom(Y[i], size = 1, prob = pbx[i, k]) * dmvnorm(X[i, ], mean = mu[, k], sigma = SIGMA[[k]]) / sum(sapply(1:M, function(z) pik[z] * dbinom(Y[i], size = 1, prob = pbx[i, z]) * dmvnorm(X[i, ], mean = mu[, z], sigma = SIGMA[[z]]) ))))
  }
  return(list(espt = espt))
} 
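As a quick sanity check of the E step (a minimal sketch using the true simulation parameters from the first section, not part of the estimation procedure), the posterior probabilities sum to one over the clusters:

esp = EstepMMb(Y, X, pik = pik, mu = mu, SIGMA = SIGMA, beta = beta, M = K)$espt
range(rowSums(esp))                        # each row sums to one, up to numerical error
table(apply(esp, 1, which.max), classe)    # MAP assignment versus simulated labels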

Initialization function: chooses the initial parameters among repeated k-means clusterings, each followed by parameter estimation with a few iterations of E and M steps.

init.stepMMb = function(Y, X, param.init, M, maxrep = 10, Intercept = FALSE, lasso = TRUE, lambda = rep(0, M), rho = rep(0, M)) {
  if (Intercept) {X1 = cbind(1, X)}
  else {X1 = X}
  n = dim(X1)[1]
  p = dim(X1)[2]
  
  pii0   = mui0 = SIGMAi0 = THETAi0 = betai0 = espti0 = classes0 = list()
  lliki0 = c()
  
  if (param.init == 0) { 
    for (it in 1:maxrep){
      # kmeans :
      resclas        = kmeans(X, centers = M)
      pii0[[it]]     = resclas$size/n
      mui0[[it]]     = t(resclas$centers)
      # For a cluster containing a single individual: remove it from the data
      Ngr  = 0
      Kout = NULL
      Iout = NULL
      for (k in 1:M) {
        if (length(which(resclas$cluster == k)) == 1) {
          Ngr     = Ngr + 1
          Iout    = which(resclas$cluster == k)
          Xout    = X[Iout, ]
          X       = X[-Iout, ]
          Y       = Y[-Iout]
          n       = n - 1
          Kout    = k
          classes = resclas$cluster[-Iout]
        }
      } 
      if (Ngr == 0) {
        classes = resclas$cluster
        Kout    = M+1
        Iout    = NULL
      }
      
      
      SIGMAi0[[it]]  = lapply(1:M, function(k) var(X[classes == k, ]))
      THETAi0[[it]]  = list()
      for (k in 1:M) { 
        if (is.singular.matrix(SIGMAi0[[it]][[k]])) {
          THETAi0[[it]][[k]] = glasso(SIGMAi0[[it]][[k]], rho = 0.1)$wi
        } else { THETAi0[[it]][[k]]  = solve(SIGMAi0[[it]][[k]]) }
      } 
      
      
      if (lasso == FALSE) {
        lambdai = rep(0, M) ; rhoi = rep(0, M)
        if (M > 1) {
          for (k in 1:M){ if (p*(M+1) > length(which(classes == k))) {lambdai[k] = 0.01; rhoi[k] = 0.01}}
        }
        if (sum(abs(lambdai)) != 0 | sum(abs(rhoi)) != 0) {lasso = TRUE}
      } else {lambdai = lambda; rhoi = rho;
        for (k in 1:M){ if (p*(M+1) > length(which(classes == k))) {lambdai[k] = lambda[k]*5; rhoi[k] = rho[k]*5}}
      }

      
      if (Intercept) {
        betai0[[it]] = sapply(1:M, function(k) as.matrix(coef(glmnet(X[classes == k,], Y[classes == k], family = "binomial", standardize = TRUE, lambda = lambdai[k])))) 
      } else {
        betai0[[it]] = sapply(1:M, function(k) as.matrix(coef(glmnet(X[classes == k,], Y[classes == k], family = "binomial", standardize = TRUE, lambda = lambdai[k], intercept = FALSE)))) 
      }
      
      
      # Repetition of the EM algorithm
      
      resE  = EstepMMb(Y, X, pik = pii0[[it]], mu = as.matrix(mui0[[it]]), SIGMA = SIGMAi0[[it]], beta = betai0[[it]], M = M, Intercept = Intercept)
      resM  = MstepMMb(Y, X, beta = betai0[[it]], espt = resE$espt, Intercept = Intercept, lasso = lasso, lambda = lambdai, rho = rhoi, THETA = THETAi0[[it]])
      
      repet = 1
      while (repet <= maxrep) {
        resE = EstepMMb(Y, X, pik = resM$pik, mu = resM$mu, SIGMA = resM$SIGMA, beta = resM$beta, M = M, Intercept = Intercept)
        resM  = MstepMMb(Y, X, beta = resM$beta, espt = resE$espt, Intercept = Intercept, lasso = lasso, lambda = lambdai, rho = rhoi, THETA = resM$THETA)
        repet = repet + 1
      }
      
      pii0[[it]]     = resM$pik
      mui0[[it]]     = resM$mu
      SIGMAi0[[it]]  = resM$SIGMA
      THETAi0[[it]]  = resM$THETA
      betai0[[it]]   = resM$beta
      espti0[[it]]   = resE$espt
      lliki0[it]     = resM$llik
      classes0[[it]] = sapply(1:n, function(i) which.max(resE$espt[i, ]))
    }
    
    choix    = which.max(lliki0)
    pii      = pii0[[choix]]     
    mui      = mui0[[choix]]     
    SIGMAi   = SIGMAi0[[choix]]  
    THETAi   = THETAi0[[choix]]
    betai    = betai0[[choix]]   
    espti    = espti0[[choix]]
    llik     = lliki0[choix]
    classifi = classes0[[choix]]
    
  } else {
    pii      = param.init[1]
    mui      = param.init[2]
    SIGMAi   = param.init[3]
    THETAi   = param.init[4]
    betai    = param.init[5]
    
    pbxi     = sapply(1:M, function(k) sapply(1:n, function(i) exp(sum(X1[i, ] * betai[, k]))/ (1 + exp(sum(X1[i, ] * betai[, k])))))
    espti    = sapply(1:M, function(k) sapply(1:n, function(i) pii[k] * dbinom(Y[i], size = 1, prob = pbxi[i, k]) * dmvnorm(X[i, ], mean = mui[, k], sigma = SIGMAi[[k]]) / sum(sapply(1:M, function(z) pii[z] * dbinom(Y[i], size = 1, prob = pbxi[i, z]) * dmvnorm(X[i, ], mean = mui[, z], sigma = SIGMAi[[z]]) ))))
    
    llik     = sum(sapply(1:n, function(i) log(sum(sapply(1:M, function(k) pii[k]* dbinom(Y[i], size = 1, prob = pbxi[i, k]) * dmvnorm(X[i, ], mean = mui[, k], sigma = SIGMAi[[k]]))))))
    
    classifi = sapply(1:n, function(i) which.max(espti[i, ]))
  }
  
  return(list(pik = pii, mu = mui, SIGMA = SIGMAi, THETA = THETAi, beta = betai, llik = llik, classifi = classifi, Iout = Iout, M = M, lambda = lambdai, rho = rhoi))
}

Main function to run the algorithm

Function to run the EM algorithm to perform penalized mixture of logistic regressions, using all the functions defined previously:

EM.MMbinom = function(Y, X, M, param.init = 0, maxit = 500, keep = TRUE, eps = 10^-5, Intercept = FALSE, lasso = TRUE, lambda = rep(0, M), rho = rep(0, M), param.init.res = NULL, first.step = NULL) {
  # Y          : binary vector 
  # X          : numeric matrix
  # M          : number of clusters
  # param.init : initial parameters c(pik, mu, SIGMA, beta)
  # maxit      : maximum number of EM iterations
  # keep       : boolean, keep results of each iteration or not
  # Intercept  : boolean, consider an intercept in the model or not
  # lasso      : boolean, perform variable selection or not
  # lambda, rho: tuning parameters for the regularization
  
  n  = dim(X)[1]
  p  = dim(X)[2]
  Md = M
  
  if (is.null(first.step)){
    if (is.null(param.init.res)){
      print("initializing")
      res.init = init.stepMMb(Y, X, param.init, M = M, maxrep = 1, Intercept = Intercept, lasso = TRUE, lambda = lambda, rho = rho)
      if (is.null(res.init$Iout)) {
        Xn = X ; Yn = Y ; Iout = res.init$Iout
      } else {
        Xn     = X[-c(res.init$Iout), ]
        Yn     = Y[-res.init$Iout]
        n      = nrow(Xn)
        lambda = res.init$lambda 
        rho    = res.init$rho
        Iout   = res.init$Iout
      }
      
    } else {
      print("Use previous computations to initialize")
      res.init = param.init.res
      Xn = X ; Yn = Y ; Iout = NULL   # no initialization step here: keep the full data set
    }
    pik      = res.init$pik
    mu       = res.init$mu
    SIGMA    = res.init$SIGMA
    THETA    = res.init$THETA
    beta     = res.init$beta
    llik     = res.init$llik
    M        = res.init$M
    print(paste("Iter", 1, ", llik = ",  res.init$llik))
    resM.old = res.init
    resE     = EstepMMb(Yn, Xn, pik = resM.old$pik, mu = resM.old$mu, SIGMA = resM.old$SIGMA, beta = resM.old$beta, M = M, Intercept = Intercept)
    resM     = MstepMMb(Yn, Xn, beta = beta, espt = resE$espt, Intercept = Intercept, lasso = lasso, lambda = lambda, rho = rho, THETA = THETA)
    print(paste("Iter", 2, ", llik = ",  resM$llik))
  } else {
    resE = first.step$resE
    resM = first.step$resM
    llik = resM$llik
    Iout = NULL
  }
  
  resE.old   = resE
  resM.old   = resM
  if (keep  == TRUE) {
    pik        = rbind(pik, resM$pik)
    mu         = list(mu, resM$mu)
    SIGMA      = list(SIGMA, resM$SIGMA)
    THETA      = list(THETA, resM$THETA)
    beta       = list(beta, resM$beta)
    tau        = list(NULL, resE$espt)
    classif    = list(res.init$classifi, sapply(1:n, function(i) which.max(resE$espt[i, ])))
    npar.beta  = sum(abs(beta[[2]])>0)
    ddltheta   = sapply(1:M, function(m) sum(abs(THETA[[2]][[m]])>0))
    npar.THETA = sum(sapply(1:M, function(m) (ddltheta[m] + p)/2))
  } else {
    pik        = resM$pik
    mu         = resM$mu
    SIGMA      = resM$SIGMA
    THETA      = resM$THETA
    beta       = resM$beta
    classif    = sapply(1:n, function(i) which.max(resE$espt[i, ]))
    npar.beta  = sum(abs(beta)>0)
    ddltheta   = sapply(1:M, function(m) sum(abs(THETA[[m]]) > 0))
    npar.THETA = sum(sapply(1:M, function(m) (ddltheta[m]+p)/2))
    tau        = resE$espt
  }
  
  llik     = cbind(llik, resM$llik)
  epsilon  = 1
  it       = 2
  
  print("EM algorithm")
  while (EM_converged(loglik = resM$llik, previous_loglik = llik[length(llik)-1])$converged == 0 & it < maxit){
    it       = it + 1
    
    resE     = EstepMMb(Yn, Xn, pik = resM.old$pik, mu = resM.old$mu, SIGMA = resM.old$SIGMA, beta = resM.old$beta, M = M, Intercept = Intercept)
    resM     = MstepMMb(Yn, Xn, beta = resM.old$beta, espt = resE$espt, Intercept = Intercept, lasso = lasso, lambda = lambda, rho = rho, THETA = resM.old$THETA)
    print(paste("Iter", it, ", llik = ",  resM$llik))
    
    epsilon  = sum((unlist(resM)-unlist(resM.old))^2)/sum(unlist(resM.old)^2)
    
    resE.old = resE
    resM.old = resM
    
    lambda   = resM$lambda
    rho      = resM$rho
    
    if (keep == TRUE){
      pik           = rbind(pik, resM$pik)
      mu[[it]]      = resM$mu
      SIGMA[[it]]   = resM$SIGMA
      THETA[[it]]   = resM$THETA
      beta[[it]]    = resM$beta
      npar.beta     = sum(abs(beta[[it]])>0)
      ddltheta      = sapply(1:M, function(m) sum(abs(THETA[[it]][[m]])>0))
      npar.THETA    = sum(sapply(1:M, function(m) (ddltheta[m]+p)/2))
      classif[[it]] = sapply(1:n, function(i) which.max(resE$espt[i, ]))
      tau[[it]]     = resE$espt
    } else {
      pik         = resM$pik
      mu          = resM$mu
      SIGMA       = resM$SIGMA
      THETA       = resM$THETA
      ddltheta    = sapply(1:M, function(m) sum(abs(THETA[[m]])>0))
      npar.THETA  = sum(sapply(1:M, function(m) (ddltheta[m]+p)/2))
      beta        = resM$beta
      npar.beta   = sum(abs(beta)>0)
      classif     = sapply(1:n, function(i) which.max(resE$espt[i, ]))
      tau         = resE$espt
    }
    llik = c(llik, resM$llik)
  }  
  npar = npar.beta + 2*M -1 + npar.THETA 
  BIC  = (-2 * llik[length(llik)]) + (npar * log(n))
  AIC  = (2 * npar) - (2 * llik[length(llik)])
  
  # class order
  if (keep == TRUE) {
    if (M == 1) {ordre = lapply(1:it, function(nit) 1)}
    else {
      detSigma     = sapply(1:it, function(nit) sapply(1:M, function(k) det(SIGMA[[nit]][[k]])))
      ordre        = lapply(1:it, function(nit) order(detSigma[nit, ]))
    }
    clustering = lapply(1:it, function(nit) sapply(1:length(classif[[nit]]), function(i) which(ordre[[nit]] == classif[[nit]][i])))
    pik            = lapply(1:it, function(nit) pik[nit, ][ordre[[nit]]])
    mu             = lapply(1:it, function(nit) mu[[nit]][, ordre[[nit]]])
    beta           = lapply(1:it, function(nit) beta[[nit]][, ordre[[nit]]])
    SIGMA          = lapply(1:it, function(nit) lapply(1:M, function(i) SIGMA[[nit]][[ordre[[nit]][i]]]))
    THETA          = lapply(1:it, function(nit) lapply(1:M, function(i) THETA[[nit]][[ordre[[nit]][i]]]))
    tau            = lapply(2:it, function(nit) lapply(1:M, function(i) tau[[nit]][[ordre[[nit]][i]]]))
  } else {
    detSigma       = sapply(1:M, function(k) det(SIGMA[[k]]))
    ordre          = order(detSigma)
    clustering = sapply(1:length(classif), function(i) which(ordre == classif[i]))
    pik            = pik[ordre]
    mu             = as.matrix(mu[, ordre])
    beta           = as.matrix(beta[, ordre])
    SIGMA          = lapply(1:M, function(i) SIGMA[[ordre[i]]])
    tau            = sapply(1:M, function(i) tau[,ordre[[i]]])
    if (sum(abs(rho)) != 0) {THETA = lapply(1:M, function(i) THETA[[ordre[i]]])}
  }  
  
  MAP  = sapply(1:n, function(i) which.max(tau[i, ]))
  zikt = matrix(0, nrow = n, ncol = M) ; for (i in 1:n) {zikt[i, MAP[i]] = 1}
  ICL  = BIC - 2 * sum(sapply(1:n, function(i) sum(sapply(1:M, function(k) zikt[i, k] %*% log(tau[i, k])), na.rm = TRUE)))
  
  reg = list(M = M, lambda = lambda, rho = rho)
  
  return(list(pik = pik, mu = mu, SIGMA = SIGMA, THETA = THETA, beta = beta, llik = llik, BIC = BIC, AIC = AIC, ICL = ICL, clustering = clustering, reg = reg, tau = tau, Iout = Iout, Md = Md, Mnew = M))
}
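The function can also be called directly on the simulated data with fixed tuning parameters, bypassing the selection procedure presented below. A minimal sketch (the object res_em and the lambda and rho values are illustrative choices, not part of the original analysis):

res_em = EM.MMbinom(Y, X, M = 3, maxit = 50, keep = FALSE, Intercept = TRUE,
                    lasso = TRUE, lambda = rep(0.05, 3), rho = rep(0.05, 3))
res_em$BIC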

Function for the selection of the tuning parameters

This function performs the selection of the tuning parameters. The procedure is as follows. A grid of possible values for the lambda and rho coefficients is built for each cluster. A model is fitted for each point of the grid with a small number of EM iterations. The BIC is then used to select the 5 best models, which are re-estimated with a larger number of iterations, and the best model according to the BIC is finally retained.
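As a small illustration of how the grid is built inside the function (the bounds used here are made-up numbers; in the function, lambda.max and rho.max are derived from ridge regression coefficients and empirical covariances of each cluster):

lambda.max       = c(0.20, 0.35)   # hypothetical cluster-specific upper bounds for a 2-cluster model
grid.lambda      = sapply(lambda.max, function(x) seq(0, x, length.out = 5))
grid.lambda.full = expand.grid(as.data.frame(grid.lambda))   # every combination across clusters
dim(grid.lambda.full)                                        # 5^2 = 25 candidate lambda vectors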

procedure_selec_model = function(Y, X, Mmin = 2, Mmax = 5, length.lambda = 5, length.rho = 20, Intercept = TRUE) {
  # Y             : binary vector 
  # X             : numeric matrix
  # Mmin          : minimum number of clusters to test
  # Mmax          : maximum number of clusters to test
  # length.lambda : size of the lambda grid to consider
  # length.rho    : size of the rho grid to consider
  # Intercept     : boolean, consider an intercept in the model or not
  
  resEM = resEM.2 = list()
  it    = 1
  
  lambda.min = 0
  rho.min    = 0
  
  for (m in Mmin:Mmax) {
    ## Initialization: GMM
    mi       = m
    res.init = init.stepMMb(Y, X, param.init=0, M = m, maxrep = 1, Intercept = Intercept, lasso = FALSE)
    pik      = res.init$pik
    mu       = res.init$mu
    SIGMA    = res.init$SIGMA
    THETA    = res.init$THETA
    beta     = res.init$beta
    llik     = res.init$llik
    m        = res.init$M
    if (is.null(res.init$Iout)) {
      Xn = X ; Yn = Y
    } else {
      Xn       = X[-c(res.init$Iout), ]
      Yn       = Y[-res.init$Iout]
    }
    
    print(paste("Iter", 1, ", llik = ",  res.init$llik))
    resM.old = res.init
    
    resE     = EstepMMb(Yn, Xn, pik = resM.old$pik, mu = resM.old$mu, SIGMA = resM.old$SIGMA, beta = resM.old$beta, M = m, Intercept = Intercept)
    classifi =  apply(resE$espt, 1, which.max)
    
    ## Looking for lambda and rho in a large grid, few iterations of EM algorithm
    lambda.max = rep(0, m)
    rho.max    = rep(0, m)
    
    for (k in 1:m){
      beta.ridge    = lm.ridge(Yn[classifi == k] ~ Xn[classifi == k, ], lambda = 0.01)
      lambda.max[k] = max(abs(beta.ridge$coef))/sum(classifi == k) * 100
      S             = cov(Xn[classifi == k, ])
      rho.max[k]    = max(abs(S))/sum(classifi == k) * 10
    }
    
    grid.lambda = sapply(lambda.max, function(x) seq(0, x, length.out = length.lambda)) ## a regular grid is sufficient as long as the number of regularization values stays moderate
    grid.rho    = sapply(rho.max, function(x) seq(0, x, length.out = length.rho))       ## a regular grid is sufficient as long as the number of regularization values stays moderate
    
    grid.rho.full    = expand.grid(as.data.frame(grid.rho))
    grid.lambda.full = expand.grid(as.data.frame(grid.lambda))
    
    ## Reduce the size of grid.rho.full
    it.init = 1
    resMstep.init = list()
    tot = dim(grid.rho.full)[1]
    
    for (r in 2:dim(grid.rho.full)[1]){
      print(paste('Rho ', it.init, 'over', tot))
      resMstep.init[[it.init]] = MstepMMbinit(Xn, espt = resE$espt, rho = grid.rho.full[r, ], THETA = THETA, maxiter = 5, epsilon = 1e-5)
      it.init = it.init + 1
    }
    BICrho         = sapply(1:length(resMstep.init), function(i) resMstep.init[[i]]$BIC)
    ind.rho        = order(BICrho)[1:3]
    grid.rho.small = as.matrix(grid.rho.full[ind.rho, ])
    
    ## For each regularization parameter, run the EM algorithm
    for (r in 1:dim(grid.rho.small)[1]){
      for (l in 1:dim(grid.lambda.full)[1]){
        print(paste('Model', it, 'over', 3*(length.lambda)^m))
        resM        = MstepMMb(Yn, Xn, beta = beta, espt = resE$espt, Intercept = Intercept, lasso = TRUE, lambda = grid.lambda.full[l, ], rho = grid.rho.small[r, ], THETA = THETA)
        first.step  = list(resE = resE, resM = resM)
        resEM[[it]] = EM.MMbinom(Yn, Xn, M = m, param.init = 0, maxit = 5, eps = 10^-5, keep = FALSE, Intercept = Intercept, 
                                 lasso = TRUE, lambda = grid.lambda.full[l, ], rho = grid.rho.small[r, ], first.step = first.step)
        assign("resEMcurrent", resEM, envir = .GlobalEnv)
        it = it+1
      }
    }
  }
  
  print('Selection of best models by BIC')
  BIC      = sapply(1:length(resEM), function(i) resEM[[i]]$BIC)
  order.bic = order(BIC)[1:5]

  ## Run again for the 5 best models the EM algorithm with large number of iterations
  it2 = 1
  for (ind in order.bic){
    print(paste('Model',it2))
    reg = resEM[[ind]]$reg
    resEM.2[[it2]] = EM.MMbinom(Yn, Xn, M = reg$M, param.init = 0, maxit = 50, eps = 10^-5, keep = FALSE, Intercept = Intercept,
                                lasso = TRUE, lambda = reg$lambda, rho = reg$rho)
    it2 = it2 + 1
    assign("resEM2current", resEM.2, envir = .GlobalEnv)
  }

  BIC     = sapply(1:length(resEM.2), function(i) resEM.2[[i]]$BIC)
  bestRes = which.min(BIC)
  res     = resEM.2[[bestRes]]
  
  return(list(res = res, resEM.2 = resEM.2))
}

Prediction

Once the model is estimated and selected, the binary response can be predicted for new data with the prediction function:

prediction = function(Xnew, mod, logit = TRUE) {
  # Xnew  : numeric matrix, new data to predict
  # mod   : model estimated with procedure_selec_model
  # logit : boolean, return predictions on the probability scale (TRUE) or on the linear predictor scale (FALSE)
  
  M     = mod$Mnew
  pik   = mod$pik
  mu    = mod$mu
  SIGMA = mod$SIGMA
  beta  = mod$beta
  n     = dim(Xnew)[1]
  
  if (nrow(beta) == ncol(Xnew)){
    X1new = Xnew
  } else {X1new = cbind(1, Xnew) }
  
  # computation of the posterior probability
  PZnew = sapply(1:M, function(k) sapply(1:dim(Xnew)[1], function(i) pik[k] * dmvnorm(Xnew[i, ], mean = mu[, k], sigma = SIGMA[[k]]) / sum(sapply(1:M, function(z) pik[z] * dmvnorm(Xnew[i, ], mean = mu[, z], sigma = SIGMA[[z]]) ))))
 
  knew  = sapply(1:n, function(i) which.max(PZnew[i, ]))
  
  # weighted sum of each model
  Yhnew = c()
  for (ind in 1:n) {
    if (logit == FALSE) {
      Yhnew[ind] = sum(sapply(1:M, function(k) PZnew[ind, k] * X1new[ind, ] * beta[, k])) 
    } else {
      Yhnew[ind] = sum(sapply(1:M, function(k) PZnew[ind, k] * (exp(X1new[ind, ] %*% beta[, k])/ (1 + exp(X1new[ind, ] %*% beta[, k]))))) 
    }
  }
  
  return(list(Yhnew = Yhnew, classif = knew))
}