---
title: "STAT5060 Assignment1"
author: "LI Xinyu 1155184414"
date: '2022-10-24'
output: html_notebook
---

## Question 1: GLM with count data
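
The setup: counts are drawn from a Poisson GLM with a log link,

$$
y_i \sim \mathrm{Poisson}(\mu_i), \qquad
\log \mu_i = \beta_0 + \beta_1 x_{1i} + \beta_2 x_{2i} + \beta_3 x_{3i},
$$

where $x_{1i} \sim U(0,1)$ and $(x_{2i}, x_{3i})$ are bivariate normal with unit variances and correlation 0.3, matching the simulation code below.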

```{r}
library(MASS)

# Define the simulation function
glm_count <- function(n, beta, iters){
  
  beta_est <- matrix(NA, iters, 4)
  for (t in 1:iters){
    # Generate the data
    x1 <- runif(n, 0, 1)
    cov_mat <- matrix(c(1, 0.3, 0.3, 1), nrow = 2, ncol = 2)
    x23 <- mvrnorm(n, rep(0, 2), cov_mat)
    x2 <- x23[, 1]
    x3 <- x23[, 2]
    x <- rbind(rep(1, n), x1, x2, x3)
    mu <- exp(beta %*% x)
    
    # Draw the responses (rpois is vectorized over mu)
    y <- rpois(n, mu)
    
    # Store the results
    beta_est[t, ] <- glm(y ~ x1 + x2 + x3, family = poisson)$coefficients
  }
  return(beta_est)
}

# Initiate the setting
n <- 500
beta <- c(-1, 1, 1, 1)

# Start
beta_est <- glm_count(n, beta, iters = 10)
```

The Bias and RMSE of the parameter estimates are summarized below.
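
For each component $j$, these are computed over the `iters` Monte Carlo replications as

$$
\mathrm{Bias}_j = \frac{1}{\mathrm{iters}} \sum_{t=1}^{\mathrm{iters}} \hat\beta_j^{(t)} - \beta_j,
\qquad
\mathrm{RMSE}_j = \sqrt{ \frac{1}{\mathrm{iters}} \sum_{t=1}^{\mathrm{iters}} \big( \hat\beta_j^{(t)} - \beta_j \big)^2 }.
$$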

```{r}
# Calculate the Bias and RMSE
Bias <- colMeans(beta_est) - beta
dist <- beta_est - matrix(beta, nrow(beta_est), length(beta), byrow = TRUE)
RMSE <- sqrt(colMeans(dist^2))  # root of the *mean* squared error, not the plain 2-norm

# Print
cat("Bias:\n", Bias, '\n')
cat("RMSE:\n", RMSE, '\n')
```

Output from one run:

```
Bias:
 -0.05306205 0.004322885 -0.006351053 0.03050441 
RMSE:
 0.3258033 0.3204383 0.1323827 0.174261 
```

## Question 2: Extended GLM with nominal data
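
The responses follow a baseline-category (multinomial) logit model,

$$
\Pr(y_i = k \mid x_i) = \frac{\exp(x_i^\top \beta_k)}{\sum_{l=1}^{4} \exp(x_i^\top \beta_l)},
\qquad k = 1, \dots, 4,
$$

with $\beta_4 = 0$ for identifiability. Releveling the response so that category 4 comes first makes `multinom` use it as the reference, so the fitted coefficients estimate $\beta_1, \beta_2, \beta_3$ directly.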

```{r}
library(nnet)
library(extraDistr)

# Define the simulation function
glm_nominal <- function(n, beta, iters){
  
  beta_est <- matrix(NA, iters, (nrow(beta) - 1) * ncol(beta))
  for (i in 1:iters){
    # Generate the data
    x1 <- runif(n, 0, 1)
    x2 <- rnorm(n, 0, 1)
    x <- rbind(rep(1, n), x1, x2)

    # Category probabilities: softmax of beta %*% x, column by column
    pi <- apply(exp(beta %*% x), 2, function(p){p / sum(p)})
    y <- apply(pi, 2, function(p){rcat(1, prob = p)})
    
    # Put category 4 first so multinom treats it as the reference level
    # (consistent with beta4 = 0 in the data-generating model)
    y <- factor(as.character(y),
                levels = c('4', '1', '2', '3'),
                labels = c('category4', 'category1', 'category2', 'category3'))
    
    # Store the results: the 3 x 3 coefficient matrix is flattened
    # column-major (all intercepts, then the x1 and x2 coefficients),
    # matching the ordering of beta_vec used below
    beta_est[i, ] <- summary(multinom(y ~ x1 + x2, trace = FALSE))$coefficients
  }
  return(beta_est)
}

# Initiate the setting
beta1 <- c(-1, -1, 1)
beta2 <- c(-1, -1, -1)
beta3 <- c(1, -1, 1)
beta4 <- rep(0, 3)
beta <- rbind(beta1, beta2, beta3, beta4)

# Start
beta_est <- glm_nominal(n = 500, beta, iters = 10)
```

The Bias and RMSE of the parameter estimates are as follows:

```{r}
# Calculate the Bias and RMSE
beta_vec <- as.vector(beta[1:3,])  # column-major: intercepts, then x1, then x2
Bias <- colMeans(beta_est) - beta_vec
dist <- beta_est - matrix(beta_vec, nrow(beta_est), length(beta_vec), byrow = TRUE)
RMSE <- sqrt(colMeans(dist^2))

# Print (rows = coefficient vectors beta1..beta3, columns = covariates)
cat("Bias:\n")
print(data.frame(intercept = Bias[1:3], x1 = Bias[4:6], x2 = Bias[7:9],
                 row.names = c("beta1", "beta2", "beta3")))
cat("\nRMSE:\n")
print(data.frame(intercept = RMSE[1:3], x1 = RMSE[4:6], x2 = RMSE[7:9],
                 row.names = c("beta1", "beta2", "beta3")))
```

Output from one run:

```
Bias:
        intercept          x1          x2
beta1 -0.11584195 -0.02726638 0.010007544
beta2 -0.02267826  0.08881272 0.048615939
beta3  0.05862205 -0.09126364 0.009007726

RMSE:
      intercept        x1        x2
beta1 1.3264619 3.1356427 0.6980622
beta2 1.2548290 2.0378704 0.6572526
beta3 0.5514944 0.9489469 0.4309279
```

## Question 3: GLMM with longitudinal binary data

To keep the panel structure of the data, I use a three-dimensional array to store the generated covariates.
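
The data-generating model is a logistic GLMM with a random intercept,

$$
\operatorname{logit} \Pr(y_{tj} = 1 \mid u_j) = \beta_0 + \beta_1 x_{1,tj} + \beta_2 x_{2,tj} + u_j,
\qquad u_j \sim N(0, \sigma^2),
$$

for time points $t = 1, \dots, T$ (called `TT` in the code) and subjects $j = 1, \dots, n$; each subject keeps the same intercept $u_j$ across all of its time points.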

```{r}
library(glmm)

# Define the simulation function; the number of time points is named TT
# to avoid masking R's built-in T (TRUE)
glmm_bernoulli <- function(n, TT, beta, sig2, iters){
  
  params_est <- matrix(0, iters, 4)
  
  for (i in 1:iters){
    
    # Generate the data (rows = time points, columns = subjects)
    x1 <- matrix(rbinom(n * TT, 1, prob = 0.4), TT)
    x2 <- matrix(rnorm(n * TT, 0, 1), TT)
    x0 <- matrix(1, TT, n)
    u <- rnorm(n, 0, sqrt(sig2))  # rnorm takes the SD, not the variance
    x <- array(c(x0, x1, x2), dim = c(TT, n, 3))
    
    # Linear predictor: fixed effects plus the subject-specific intercept;
    # filling byrow makes column j constant at u[j], so each subject keeps
    # the same random intercept across its time points
    logit_pi <- apply(x, c(1, 2), function(v){beta %*% v}) +
      matrix(u, TT, n, byrow = TRUE)
    pi <- exp(logit_pi) / (1 + exp(logit_pi))
    y <- apply(pi, c(1, 2), function(p){rbinom(1, 1, prob = p)})
    sub <- as.factor(col(y))  # subject index, flattened column-major like y
    
    dat <- data.frame(y = as.vector(y), x1 = as.vector(x1),
                      x2 = as.vector(x2), subject = sub)
    
    # Fit the GLMM and estimate the parameters
    model_fit <- glmm(y ~ x1 + x2, random = list(y ~ 0 + subject),
                      varcomps.names = c("subject"), data = dat,
                      family.glmm = bernoulli.glmm, m = 100, doPQL = TRUE)
    
    # Store the results
    params_est[i, 1:3] <- coefficients(model_fit)
    params_est[i, 4] <- varcomps(model_fit)
  }
  
  return(params_est)
}

# Initiate the setting
n <- 500
TT <- 4
sig2 <- 1
beta <- c(-1, 0.5, -0.5)

# Start
params_est <- glmm_bernoulli(n, TT, beta, sig2, iters = 10)
```

The Bias and RMSE of the parameter estimates are as follows:

```{r}
# Calculate the Bias and RMSE
params <- c(beta, sig2)
Bias <- colMeans(params_est) - params
dist <- params_est - matrix(params, nrow(params_est), length(params), byrow = TRUE)
RMSE <- sqrt(colMeans(dist^2))

# Print
cat("Bias:\n", Bias, '\n')
cat("RMSE:\n", RMSE, '\n')
```

Output from one run:

```
Bias:
 0.1708627 -0.1025265 0.07979331 -0.7529994 
RMSE:
 0.5719067 0.417621 0.3233055 2.423903 
```

## Question 4: Reanalyze Example 3.3

### Read data into R

According to the given instructions, the insomnia data set of Example 3.3 can be downloaded online. Here, the value of response (1,2,3,4) represents different lengths of time to falling asleep of a subject (1 - "\<20", 2 - "20-30", 3 - "30-60", 4 - "\>60").

```{r}
# Read data
# https://users.stat.ufl.edu/~aa/cat/data/Insomnia.dat
insomnia <- read.table("insomnia.txt", header = T)
insomnia
```
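
As a quick sanity check, the numeric codes can be mapped to the interval labels above (a small sketch; `response_lab` is a new helper column, not part of the original data):

```{r}
# Hypothetical helper column: label the response codes with the
# time-to-fall-asleep intervals (in minutes) described above
insomnia$response_lab <- factor(insomnia$response, levels = 1:4,
                                labels = c("<20", "20-30", "30-60", ">60"),
                                ordered = TRUE)
table(insomnia$response_lab, insomnia$occasion)
```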

### Using `clm` and `clmm` functions

Here I use the `clm` and `clmm` functions of the `ordinal` package to replicate the results.
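
Both functions fit the cumulative logit (proportional odds) model. In the `ordinal` parameterization the regression part enters with a minus sign,

$$
\operatorname{logit} \Pr(Y_{it} \le j) = \theta_j - \left( \beta_1\, \mathrm{treat}_i + \beta_2\, \mathrm{occasion}_t + \beta_3\, \mathrm{treat}_i \times \mathrm{occasion}_t + u_i \right),
\qquad u_i \sim N(0, \sigma^2),
$$

where the $\theta_j$ are the threshold coefficients reported in the summaries, and $u_i \equiv 0$ for the `clm` fit.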

```{r}
# import libraries
library(ordinal)

# #1: Estimate with the random intercept (clmm function)
model_clmm <- clmm(factor(response) ~ (1|case) + treat*occasion, data = insomnia, link = "logit")

# print
cat('Cumulative Logit Model with the Random Intercept\nbeta:',model_clmm$beta,'\n\n')

# #2: Estimate without the random intercept (clm function)
model_clm <- clm(factor(response) ~ treat*occasion, data = insomnia, link = "logit")

# print
cat('Cumulative Logit Model without the Random Intercept\nbeta:',model_clm$beta,'\n\n')
```

Output:

```
Cumulative Logit Model with the Random Intercept
beta: -0.04830713 -1.571161 -1.057957 

Cumulative Logit Model without the Random Intercept
beta: -0.03361002 -1.038076 -0.7077589 
```

Here are the detailed summaries of the two models. Including the random intercept noticeably changes the fit: the occasion and interaction effects grow in magnitude, and the subject-level variance is estimated at 3.04. Note that the reported "Std.Dev." (1.744) is the estimated standard deviation of the random intercept itself, not a standard error, so it does not by itself test whether the random effect differs from 0.

```{r}
# Summary #1
summary(model_clmm)
```

```
Cumulative Link Mixed Model fitted with the Laplace approximation

formula: factor(response) ~ (1 | case) + treat * occasion
data:    insomnia

Random effects:
 Groups Name        Variance Std.Dev.
 case   (Intercept) 3.043    1.744   
Number of groups:  case 239 

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
treat          -0.04831    0.35018  -0.138  0.89028    
occasion       -1.57116    0.28475  -5.518 3.43e-08 ***
treat:occasion -1.05796    0.38006  -2.784  0.00537 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Threshold coefficients:
    Estimate Std. Error z value
1|2  -3.3986     0.3517  -9.664
2|3  -1.4451     0.2799  -5.163
3|4   0.5457     0.2587   2.109
```

```{r}
# Summary #2
summary(model_clm)
```

```
formula: factor(response) ~ treat * occasion
data:    insomnia

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
treat          -0.03361    0.23775  -0.141   0.8876    
occasion       -1.03808    0.24099  -4.308 1.65e-05 ***
treat:occasion -0.70776    0.33394  -2.119   0.0341 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Threshold coefficients:
    Estimate Std. Error z value
1|2  -2.2671     0.2048 -11.071
2|3  -0.9515     0.1812  -5.251
3|4   0.3517     0.1746   2.014
```
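
A likelihood-ratio comparison of the two fits gives a direct check on the random intercept (a minimal sketch; since $\sigma^2 = 0$ lies on the boundary of the parameter space, the naive $\chi^2_1$ p-value is conventionally halved):

```{r}
# LR test for the random intercept; the null value sigma^2 = 0 lies on the
# boundary, so the chi-squared(1) p-value is halved (50:50 mixture result)
LR <- 2 * (as.numeric(logLik(model_clmm)) - as.numeric(logLik(model_clm)))
p_boundary <- 0.5 * pchisq(LR, df = 1, lower.tail = FALSE)
cat("LR statistic:", LR, "\np-value (boundary-corrected):", p_boundary, "\n")
```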
