# Load the GDINA package
library(GDINA)
## GDINA R Package (version 2.8.7; 2021-05-28)
## For tutorials, see https://wenchao-ma.github.io/GDINA
# Load the pROC package for ROC curves
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the data set
# In here, I use the sim10GDINA data for illustration
data(sim10GDINA)
# Fit the data using te GDINA model
fit <- GDINA(dat=sim10GDINA$simdat,Q=sim10GDINA$simQ,model="GDINA",verbose=0)
# Print out the fit summaries
fit
## Call:
## GDINA(dat = sim10GDINA$simdat, Q = sim10GDINA$simQ, model = "GDINA",
## verbose = 0)
##
## GDINA version 2.8.7 (2021-05-28)
## ===============================================
## Data
## -----------------------------------------------
## # of individuals groups items
## 1000 1 10
## ===============================================
## Model
## -----------------------------------------------
## Fitted model(s) = GDINA
## Attribute structure = saturated
## Attribute level = Dichotomous
## ===============================================
## Estimation
## -----------------------------------------------
## Number of iterations = 45
##
## For the final iteration:
## Max abs change in item success prob. = 0.0001
## Max abs change in mixing proportions = 0.0000
## Change in -2 log-likelihood = 0.0004
## Converged? = TRUE
##
## Time used = 0.0425 secs
summary(fit)
##
## Test Fit Statistics
##
## Loglik = -5918.21
##
## AIC = 11926.42 | penalty [2 * p] = 90.00
## BIC = 12147.27 | penalty [log(n) * p] = 310.85
## CAIC = 12192.27 | penalty [(log(n) + 1) * p] = 355.85
## SABIC = 12004.35 | penalty [log((n + 2)/24) * p] = 167.93
##
## No. of parameters (p) = 45
## No. of estimated item parameters = 38
## No. of fixed item parameters = 0
## No. of distribution parameters = 7
##
## Attribute Prevalence
##
## Level0 Level1
## A1 0.5027 0.4973
## A2 0.4974 0.5026
## A3 0.4784 0.5216
# Extract the responses for each item, in here we only take Item #1 as an example
response <- sim10GDINA[["simdat"]][,1]
# Extract the item prediction
item1_prob <- as.data.frame(fit$LC.prob[1,])
# Extract the individual's pattern
person_pattern <- as.data.frame(personparm(fit))
# Merge the individual's pattern into one column
person_pattern_one <- paste(person_pattern$A1,person_pattern$A2,person_pattern$A3)
# Remove the white space
person_pattern_one <- gsub(" ","",person_pattern_one,fixed=TRUE)
# Match the person pattern to items' prediction
pre_prob <- vector()
for (i in 1:1000) {
pre_prob[i] <- item1_prob[which(person_pattern_one[i]==rownames(item1_prob)),]
if (pre_prob[i]>=0.5){
pre_prob[i] <- 1 } else {
pre_prob[i] <- 0
}
}
# Run the ROC curve based on the response and prediction data
par(pty="s")
roc_ob <- roc(response,pre_prob,plot=TRUE,legacy.axes=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# AUC=0.8841 for item_1
# Load the MLmetrics package for F1_Score function
library("MLmetrics")
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
# Calculate the F1 Score
F1Score <- F1_Score(pre_prob,response)
F1Score # F1 score for item #1 equals to 0.874
## [1] 0.873807
# Use itemfit() function to calculate item-level fit statistics
ift <- itemfit(fit)
summary(ift)
##
## Item-level fit statistics
## z.prop pvalue[z.prop] max[z.r] pvalue.max[z.r] adj.pvalue.max[z.r]
## Item 1 0.0598 0.9523 0.4465 0.6552 1.0000
## Item 2 0.0108 0.9914 0.5339 0.5934 1.0000
## Item 3 0.0063 0.9950 1.4319 0.1522 1.0000
## Item 4 0.0343 0.9726 1.9893 0.0467 0.4200
## Item 5 0.0957 0.9237 1.9893 0.0467 0.4200
## Item 6 0.0372 0.9703 1.4319 0.1522 1.0000
## Item 7 0.1669 0.8674 1.5507 0.1210 1.0000
## Item 8 0.0858 0.9316 1.8360 0.0664 0.5972
## Item 9 0.0721 0.9425 1.8360 0.0664 0.5972
## Item 10 0.1683 0.8663 0.6386 0.5231 1.0000
## max[z.logOR] pvalue.max[z.logOR] adj.pvalue.max[z.logOR]
## Item 1 0.4503 0.6525 1.0000
## Item 2 0.5347 0.5929 1.0000
## Item 3 1.4320 0.1522 1.0000
## Item 4 1.9482 0.0514 0.4625
## Item 5 1.9482 0.0514 0.4625
## Item 6 1.4320 0.1522 1.0000
## Item 7 1.5333 0.1252 1.0000
## Item 8 1.8159 0.0694 0.6244
## Item 9 1.8159 0.0694 0.6244
## Item 10 0.6214 0.5343 1.0000
# Item fit relative
mc <- modelcomp(fit)
mc
##
## Item-level model selection:
##
## test statistic: Wald
## Decision rule: simpler model + largest p value rule at 0.05 alpha level.
## Adjusted p values were based on holm correction.
##
## models pvalues adj.pvalues
## Item 1 GDINA
## Item 2 GDINA
## Item 3 GDINA
## Item 4 RRUM 0.3338 1
## Item 5 DINA 0.7991 1
## Item 6 DINO 0.8077 1
## Item 7 ACDM 0.6123 1
## Item 8 RRUM 0.32 1
## Item 9 LLM 0.9021 1
## Item 10 RRUM 0.5674 1
# Classification rate
CA(fit,what="MAP")
## Classification Accuracy
##
## Test level accuracy = 0.7809
##
## Pattern level accuracy:
##
## 000 100 010 001 110 101 011 111
## 0.7866 0.6970 0.7278 0.8073 0.7819 0.8131 0.7693 0.8491
##
## Attribute level accuracy:
##
## A1 A2 A3
## 0.9016 0.9002 0.9321
# Extract the responses for each item, in here we only take Item #1 as an example
response_item2 <- sim10GDINA[["simdat"]][,2]
# Extract the item prediction
item2_prob <- as.data.frame(fit$LC.prob[2,])
# Match the person pattern to items' prediction
pre_prob_2 <- vector()
for (i in 1:1000) {
pre_prob_2[i] <- item1_prob[which(person_pattern_one[i]==rownames(item2_prob)),]
if (pre_prob_2[i]>=0.5){
pre_prob_2[i] <- 1 } else {
pre_prob_2[i] <- 0
}
}
# Run the ROC curve based on the response and prediction data
par(pty="s")
roc_ob_item2 <- roc(response_item2,pre_prob_2,auc=TRUE, ci=TRUE,legacy.axes=TRUE, plot=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# AUC=0.505 for item_2
# Load the MLmetrics package for F1_Score function
library("MLmetrics")
# Calculate the F1 Score
F1Score <- F1_Score(pre_prob_2,response_item2)
F1Score # F1 score for item #1 equals to 0.5194
## [1] 0.5193798
# Compare the item#1 and item#2 in one plot
roc.test(roc_ob,roc_ob_item2)
##
## DeLong's test for two ROC curves
##
## data: roc_ob and roc_ob_item2
## D = 20.225, df = 1685.2, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.8841414 0.5045738
Question for Dr.Ma: Which item absolute/relative fit statistics is the best for us to use for the comparison with AUC or F1 scores?
# Extract the test level fit
modelfit(fit)
## Test-level Model Fit Evaluation
##
## Relative fit statistics:
## -2 log likelihood = 11836.42 ( number of parameters = 45 )
## AIC = 11926.42 BIC = 12147.27
## CAIC = 12192.27 SABIC = 12004.35
##
## Absolute fit statistics:
## M2 = 6.8584 df = 10 p = 0.7387
## RMSEA2 = 0 with 90 % CI: [ 0 , 0.025 ]
## SRMSR = 0.0205
# Calculate the test-level AUC
# Extract the responses for each item
response <- sim10GDINA[["simdat"]]
Question for Dr. Ma: To build a ROC for test level, are we need to extract all the predicted cells from the response matrix?