Basics template: A simple template for Accessing ROC on CDM item-fits

Load the pacakges and Data

# Load the GDINA package
library(GDINA)
## GDINA R Package (version 2.8.7; 2021-05-28)
## For tutorials, see https://wenchao-ma.github.io/GDINA
# Load the pROC package for ROC curves
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load the data set
# In here, I use the sim10GDINA data for illustration
data(sim10GDINA)

Run the GDINA model

# Fit the data using te GDINA model
fit <- GDINA(dat=sim10GDINA$simdat,Q=sim10GDINA$simQ,model="GDINA",verbose=0)
# Print out the fit summaries
fit
## Call:
## GDINA(dat = sim10GDINA$simdat, Q = sim10GDINA$simQ, model = "GDINA", 
##     verbose = 0)
## 
##   GDINA version 2.8.7 (2021-05-28) 
## ===============================================
## Data
## -----------------------------------------------
## # of individuals    groups    items         
##             1000         1       10
## ===============================================
## Model
## -----------------------------------------------
## Fitted model(s)       = GDINA 
## Attribute structure   = saturated 
## Attribute level       = Dichotomous 
## ===============================================
## Estimation
## -----------------------------------------------
## Number of iterations  = 45 
## 
## For the final iteration:
##   Max abs change in item success prob. = 0.0001 
##   Max abs change in mixing proportions = 0.0000 
##   Change in -2 log-likelihood          = 0.0004 
##   Converged?                           = TRUE 
## 
## Time used             = 0.0425 secs
summary(fit)
## 
## Test Fit Statistics
## 
## Loglik = -5918.21 
## 
## AIC    = 11926.42  | penalty [2 * p]  = 90.00 
## BIC    = 12147.27  | penalty [log(n) * p]  = 310.85 
## CAIC   = 12192.27  | penalty [(log(n) + 1) * p]  = 355.85 
## SABIC  = 12004.35  | penalty [log((n + 2)/24) * p]  = 167.93 
## 
## No. of parameters (p)  = 45 
##   No. of estimated item parameters =  38 
##   No. of fixed item parameters =  0 
##   No. of distribution parameters =  7 
## 
## Attribute Prevalence
## 
##    Level0 Level1
## A1 0.5027 0.4973
## A2 0.4974 0.5026
## A3 0.4784 0.5216

Calculate ROC & F1_Score for Item #1

# Extract the responses for each item, in here we only take Item #1 as an example
response <- sim10GDINA[["simdat"]][,1]
# Extract the item prediction
item1_prob <- as.data.frame(fit$LC.prob[1,])
# Extract the individual's pattern
person_pattern <- as.data.frame(personparm(fit))
# Merge the individual's pattern into one column
person_pattern_one <- paste(person_pattern$A1,person_pattern$A2,person_pattern$A3)
# Remove the white space
person_pattern_one <- gsub(" ","",person_pattern_one,fixed=TRUE)
# Match the person pattern to items' prediction
pre_prob <- vector()
for (i in 1:1000) {
  pre_prob[i] <- item1_prob[which(person_pattern_one[i]==rownames(item1_prob)),]
  if (pre_prob[i]>=0.5){
  pre_prob[i] <- 1 } else {
   pre_prob[i] <- 0
  }
}
# Run the ROC curve based on the response and prediction data
par(pty="s")
roc_ob <- roc(response,pre_prob,plot=TRUE,legacy.axes=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# AUC=0.8841 for item_1
# Load the MLmetrics package for F1_Score function
library("MLmetrics")
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
# Calculate the F1 Score 
F1Score <- F1_Score(pre_prob,response)
F1Score # F1 score for item #1 equals to 0.874
## [1] 0.873807

Item fit statistics

# Use itemfit() function to calculate item-level fit statistics
ift <- itemfit(fit)
summary(ift)
## 
## Item-level fit statistics
##         z.prop pvalue[z.prop] max[z.r] pvalue.max[z.r] adj.pvalue.max[z.r]
## Item 1  0.0598         0.9523   0.4465          0.6552              1.0000
## Item 2  0.0108         0.9914   0.5339          0.5934              1.0000
## Item 3  0.0063         0.9950   1.4319          0.1522              1.0000
## Item 4  0.0343         0.9726   1.9893          0.0467              0.4200
## Item 5  0.0957         0.9237   1.9893          0.0467              0.4200
## Item 6  0.0372         0.9703   1.4319          0.1522              1.0000
## Item 7  0.1669         0.8674   1.5507          0.1210              1.0000
## Item 8  0.0858         0.9316   1.8360          0.0664              0.5972
## Item 9  0.0721         0.9425   1.8360          0.0664              0.5972
## Item 10 0.1683         0.8663   0.6386          0.5231              1.0000
##         max[z.logOR] pvalue.max[z.logOR] adj.pvalue.max[z.logOR]
## Item 1        0.4503              0.6525                  1.0000
## Item 2        0.5347              0.5929                  1.0000
## Item 3        1.4320              0.1522                  1.0000
## Item 4        1.9482              0.0514                  0.4625
## Item 5        1.9482              0.0514                  0.4625
## Item 6        1.4320              0.1522                  1.0000
## Item 7        1.5333              0.1252                  1.0000
## Item 8        1.8159              0.0694                  0.6244
## Item 9        1.8159              0.0694                  0.6244
## Item 10       0.6214              0.5343                  1.0000
# Item fit relative
mc <- modelcomp(fit)
mc
## 
## Item-level model selection:
## 
## test statistic: Wald 
## Decision rule: simpler model + largest p value rule at 0.05 alpha level.
## Adjusted p values were based on holm correction.
## 
##         models pvalues adj.pvalues
## Item 1   GDINA                    
## Item 2   GDINA                    
## Item 3   GDINA                    
## Item 4    RRUM  0.3338           1
## Item 5    DINA  0.7991           1
## Item 6    DINO  0.8077           1
## Item 7    ACDM  0.6123           1
## Item 8    RRUM    0.32           1
## Item 9     LLM  0.9021           1
## Item 10   RRUM  0.5674           1
# Classification rate
CA(fit,what="MAP")
## Classification Accuracy 
## 
## Test level accuracy =  0.7809 
## 
## Pattern level accuracy: 
## 
##    000    100    010    001    110    101    011    111 
## 0.7866 0.6970 0.7278 0.8073 0.7819 0.8131 0.7693 0.8491 
## 
## Attribute level accuracy: 
## 
##     A1     A2     A3 
## 0.9016 0.9002 0.9321

Calculate ROC & F1_Score for item#2

# Extract the responses for each item, in here we only take Item #1 as an example
response_item2 <- sim10GDINA[["simdat"]][,2]
# Extract the item prediction
item2_prob <- as.data.frame(fit$LC.prob[2,])
# Match the person pattern to items' prediction
pre_prob_2 <- vector()
for (i in 1:1000) {
  pre_prob_2[i] <- item1_prob[which(person_pattern_one[i]==rownames(item2_prob)),]
  if (pre_prob_2[i]>=0.5){
  pre_prob_2[i] <- 1 } else {
   pre_prob_2[i] <- 0
  }
}
# Run the ROC curve based on the response and prediction data
par(pty="s")
roc_ob_item2 <- roc(response_item2,pre_prob_2,auc=TRUE, ci=TRUE,legacy.axes=TRUE, plot=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# AUC=0.505 for item_2
# Load the MLmetrics package for F1_Score function
library("MLmetrics")
# Calculate the F1 Score 
F1Score <- F1_Score(pre_prob_2,response_item2)
F1Score # F1 score for item #1 equals to 0.5194
## [1] 0.5193798
# Compare the item#1 and item#2 in one plot
roc.test(roc_ob,roc_ob_item2)
## 
##  DeLong's test for two ROC curves
## 
## data:  roc_ob and roc_ob_item2
## D = 20.225, df = 1685.2, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8841414   0.5045738

Question for Dr.Ma: Which item absolute/relative fit statistics is the best for us to use for the comparison with AUC or F1 scores?

Test level fit

# Extract the test level fit 
modelfit(fit)
## Test-level Model Fit Evaluation
## 
## Relative fit statistics: 
##  -2 log likelihood =  11836.42  ( number of parameters =  45 )
##  AIC  =  11926.42  BIC =  12147.27 
##  CAIC =  12192.27  SABIC =  12004.35 
## 
## Absolute fit statistics: 
##  M2 =  6.8584  df =  10  p =  0.7387 
##  RMSEA2 =  0  with  90 % CI: [ 0 , 0.025 ]
##  SRMSR =  0.0205
# Calculate the test-level AUC
# Extract the responses for each item
response <- sim10GDINA[["simdat"]]

Question for Dr. Ma: To build a ROC for test level, are we need to extract all the predicted cells from the response matrix?