Init

#global options
options(
  digits = 2,
  contrasts = c("contr.treatment", "contr.treatment")
)

#packages
library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  mirt,
  future, furrr
)

## Loading required package: stats4
## Loading required package: lattice

#ggplot2
theme_set(theme_bw())

Functions

#get factor loadings from mirt fit
get_loadings = function(x) {
  x@Fit$`F` %>% as.vector()
}

#internal function to fit mirt
fit_mirt = function(items, mirt_args) {

  local_mirt_fit = rlang::exec(
    .fn = mirt::mirt,
    data = items,
    !!!mirt_args
  )
  
  #get scores
  local_scores = mirt::fscores(local_mirt_fit, full.scores.SE = T)
  
  #return fit and scores
  list(
    fit = local_mirt_fit,
    scores = local_scores,
    reliability = empirical_rxx(local_scores)
  )
}

#make a list of items indexes with one removed per set
make_leave_one_out_sets = function(item_idx) {
  #make a list of items indexes with one removed per set
  purrr::map(
    seq(item_idx),
    function(i) {
      item_idx[-i]
    }
  )
}


#make forwards item selection sets
make_forward_sets = function(
    all_items, 
    current_items = NULL, 
    full_fit,
    min_items = 3,
    start_method = "highest_loading"
    ) {
  #make a list of items indexes with one removed per set
  #if we have none, it's easy
  if (is.null(current_items)) {
    #if we have no items, we can either try all 3-way combinations, which is often not possible (explosion)
    #or we can begin with a best guess of the 3 items with highest loadings from full analysis (fast)
    if (start_method == "highest_loading") {
      #get highest loading items
      item_loadings = tibble(
        item = 1:length(get_loadings(full_fit)),
        loading = get_loadings(full_fit)
      )
      start_items = item_loadings %>% arrange(loading) %>% tail(min_items) %>% pull(item)
      sets = list(start_items)
      message(str_glue("Starting with the {min_items} items with highest loadings: {str_c(start_items, collapse = ', ')}"))
    } else if (start_method == "combination") {
      #if we have no items, we can either try all 3-way combinations, which is often not possible (explosion)
      #or we can begin with a best guess of the 3 items with highest loadings from full analysis (fast)
      sets = combn(1:ncol(all_items), m = min_items, simplify = F)
    }
 
  } else {
    #if we have some, we need to add one different item to each set
    sets = purrr::map(
      setdiff(seq(ncol(all_items)), current_items),
      function(i) {
        c(current_items, i)
      }
    )
  }
  
  sets
}


#backwards drop function (drop 1 item)
backwards_drop = function(
    all_items,
    current_selection = NULL,
    criterion_vars, 
    criterion_cors_full, 
    selection_method,
    full_fit,
    full_reliability,
    mirt_args = NULL,
    save_fits = T
    ) {

  #if no current selection, select all
  if (is.null(current_selection)) {
    current_selection = seq(ncol(all_items))
  }
  
  #main loop
  reductions = furrr::future_map_dfr(
    make_leave_one_out_sets(current_selection),
    function(item_set) {
      
      #fit mirt
      fit = fit_mirt(all_items[, item_set, drop = F], mirt_args)
      
      #get scores and their full set cors
      cors = wtd.cors(
        bind_cols(
          score = fit$scores[, 1],
          criterion_vars
        )
      )
  
      #mean fraction of criterion cors
      item_set = list(item_set)
      criterion_cors = cors[-1, 1]
      criterion_cors_frac = criterion_cors / criterion_cors_full[-1, 1]
      mean_criterion_cors_frac = mean(criterion_cors_frac)
      reliability_frac = (fit$reliability / full_reliability) %>% unname()
      
      #determine what to maximize
      criterion_value = switch(selection_method,
             "rc" = mean(c(criterion_cors_frac, reliability_frac)),
             "r" = reliability_frac,
             "c" = mean(criterion_cors_frac)
             )

      y = tibble(
        items_in_scale = length(item_set[[1]]),
        item_set = item_set,
        criterion_cors_frac = criterion_cors_frac,
        mean_criterion_cors_frac = mean_criterion_cors_frac,
        reliability = fit$reliability,
        reliability_frac = reliability_frac,
        criterion_value = criterion_value
      )
  
      #add criterion cors 
      for (i in seq_along(criterion_cors)) {
        y[[str_glue("r_{rownames(cors)[i+1]}")]] = criterion_cors[i]
      }
      
      #save fits and scores?
      if (save_fits) {
        y$fit = list(fit)
        y$scores = list(fit$scores)
      }
      
      y
    }
  )
  
  #return results
  reductions
}

#forwards pick function
forwards_pick = function(
    items, 
    current_items,
    criterion_vars, 
    criterion_cors_full, 
    selection_method,
    full_fit,
    full_reliability,
    mirt_args = NULL,
    save_fits = T
) {

  #main loop
  additions = furrr::future_map_dfr(
    make_forward_sets(items, current_items, full_fit = full_fit),
    function(item_set) {
    
      #fit mirt
      fit = fit_mirt(items[, item_set, drop = F], mirt_args)
      
      #get scores and their full set cors
      cors = wtd.cors(
        bind_cols(
          score = fit$scores[, 1],
          criterion_vars
        )
      )
  
      #mean fraction of criterion cors
      item_set = list(item_set)
      criterion_cors = cors[-1, 1]
      criterion_cors_frac = criterion_cors / criterion_cors_full[-1, 1]
      mean_criterion_cors_frac = mean(criterion_cors_frac)
      reliability_frac = (fit$reliability / full_reliability) %>% unname()
      
      #determine what to maximize
      criterion_value = switch(selection_method,
             "rc" = mean(c(criterion_cors_frac, reliability_frac)),
             "r" = reliability_frac,
             "c" = mean(criterion_cors_frac)
             )
 
      y = tibble(
        items_in_scale = length(item_set[[1]]),
        item_set = item_set,
        criterion_cors_frac = criterion_cors_frac,
        mean_criterion_cors_frac = mean_criterion_cors_frac,
        reliability = fit$reliability,
        reliability_frac = reliability_frac,
        criterion_value = criterion_value
      )
      
      #add criterion cors 
      for (i in seq_along(criterion_cors)) {
        y[[str_glue("r_{rownames(cors)[i+1]}")]] = criterion_cors[i]
      }
      
      #save fits and scores?
      if (save_fits) {
        y$fit = list(fit)
        y$scores = list(fit$scores)
      }
      
      y
    }
  )
  
  #return results
  additions
}

#make loadings based item list
make_item_list_for_loadings_method = function(x, max_items) {
  #return a list with vectors of first 3 items, then first 4, then first 5, until max
  purrr::map(
    3:max_items,
    function(last_item) {
      x[1:last_item]
    }
  )
}

#use highest loadings
highest_loadings_method = function(
    items, 
    max_items,
    criterion_vars, 
    criterion_cors_full, 
    full_fit,
    full_reliability,
    mirt_args = NULL,
    save_fits = T,
    selection_method
    ) {
  
  #get loadings
  item_loadings = tibble(
    item = 1:length(get_loadings(full_fit)),
    loading = get_loadings(full_fit)
  ) %>% arrange(-loading)
  
  #fit models with items from 3 to max
  #main loop
  additions = furrr::future_map(
    make_item_list_for_loadings_method(item_loadings$item, max_items),
    function(item_set) {

      #fit mirt
      fit = fit_mirt(items[, item_set, drop = F], mirt_args)
      
      #get scores and their full set cors
      cors = wtd.cors(
        bind_cols(
          score = fit$scores[, 1],
          criterion_vars
        )
      )
  
      #mean fraction of criterion cors
      item_set = list(item_set)
      criterion_cors = cors[-1, 1]
      criterion_cors_frac = criterion_cors / criterion_cors_full[-1, 1]
      mean_criterion_cors_frac = mean(criterion_cors_frac)
      reliability_frac = (fit$reliability / full_reliability) %>% unname()
      
      #determine what to maximize
      criterion_value = switch(selection_method,
             "rc" = mean(c(criterion_cors_frac, reliability_frac)),
             "r" = reliability_frac,
             "c" = mean(criterion_cors_frac)
             )
 
      y = tibble(
        items_in_scale = length(item_set[[1]]),
        item_set = item_set,
        criterion_cors_frac = criterion_cors_frac,
        mean_criterion_cors_frac = mean_criterion_cors_frac,
        reliability = fit$reliability,
        reliability_frac = reliability_frac,
        criterion_value = criterion_value
      )
      
      #add criterion cors 
      for (i in seq_along(criterion_cors)) {
        y[[str_glue("r_{rownames(cors)[i+1]}")]] = criterion_cors[i]
      }
      
      #save fits and scores?
      if (save_fits) {
        y$fit = list(fit)
        y$scores = list(fit$scores)
      }
      
      y
    }
  )
  
  additions
}

#abbreviation scale
abbreviate_scale = function(
    items, 
    criterion_vars = NULL, 
    item_target = 10,
    method = "forwards", 
    selection_method = "rc",
    mirt_args = NULL,
    save_fits = T
    ) {
  
  #start timer
  tictoc::tic()
  
  #stop timer on exit
  on.exit({
    tictoc::toc()
  })
  
  #check methods
  assert_that(is.data.frame(items) | is.matrix(items))
  items = as.matrix(items) #make sure its a matrix to avoid some dropping issues
  assert_that(method %in% c("backwards", "forwards", "max_loading"))
  if (selection_method == "cr") selection_method = "rc"
  assert_that(selection_method %in% c("rc", "r", "c", "cr"))
  message(str_glue("Abbreviating scale using {mapvalues(selection_method, from = c('c', 'rc', 'r'), to = c('correlation with criterion variable(s)', 'average of correlation with creiterion variable(s) and reliability', 'reliability'), warn_missing = F)} method"))
  message(str_glue("Using the {method} method"))
  
  assert_that(item_target >= 3)
  assert_that(item_target < ncol(items))

  #use default mirt args if none given
  if (is.null(mirt_args)) {
    mirt_args = list(
      model = 1,
      itemtype = "2PL",
      technical = list(NCYCLES = 5000),
      verbose = F
    )
  }
  
  #fit full scale mirt
  full_fit = rlang::exec(
    .fn = mirt::mirt,
    data = items,
    !!!mirt_args
  )
  
  #get scores and their full set cors
  full_scores = mirt::fscores(full_fit, full.scores.SE = T)

  #reliability
  full_reliability = empirical_rxx(full_scores) %>% unname()
  
  #make set of criterion vars
  if (is.null(criterion_vars)) {
    criterion_vars = tibble(
      full_score = full_scores[, 1]
    )
  }
  
  #criterion cors with full set
  full_cors = wtd.cors(
    bind_cols(
      score = full_scores[, 1],
      criterion_vars
    )
  )
  
  #main loop
  #prep a list
  item_set_results_all = list()
 
  if (method == "backwards") {
    #drop first, then based on results, drop a second etc. until desired size is reached
    items_to_drop = ncol(items) - item_target
    items_to_drop_seq = seq(ncol(items) - item_target)
    for (i in items_to_drop_seq) {
      message(str_glue("removing item {i} out of {items_to_drop} ({ncol(items) - i} remaining)"))
      
      #if the first round, just drop one item
      #dont need to use the prior item set
      if (i == 1) {

        item_set_results_all[[i]] = backwards_drop(
          #begin with all items
          all_items = items,
          current_selection = seq(ncol(items)),
          criterion_vars = criterion_vars,
          criterion_cors_full = full_cors,
          selection_method = selection_method,
          mirt_args = mirt_args,
          save_fits = save_fits,
          full_fit = full_fit,
          full_reliability = full_reliability
        )
        
        next
        
      } else {
     
        #best prior set
        best_prior_i = which.max(item_set_results_all[[i - 1]]$criterion_value)
        best_prior_set = item_set_results_all[[i - 1]]$item_set[[best_prior_i]]
        
        #use prior best set
        item_set_results_all[[i]] = backwards_drop(
          all_items = items,
          current_selection = best_prior_set,
          criterion_vars = criterion_vars,
          criterion_cors_full = full_cors,
          selection_method = selection_method,
          mirt_args = mirt_args,
          save_fits = save_fits,
          full_fit = full_fit,
          full_reliability = full_reliability
        )
        
        next
      }
      
    }
  } else if (method == "forwards") {

    #pick first, then based on results, pick a second etc. until desired size is reached
    items_to_pick = item_target
    items_to_pick_seq = seq(item_target - 2) #because we start with 3 items (minimum for IRT)
    for (i in items_to_pick_seq) {

      if (i != 1) {
        message(str_glue("adding item {i + 2} out of {items_to_pick}"))
      }
      
      #if the first round, try each item, one at a time
      #dont need to use the prior item set
      if (i == 1) {
        item_set_results_all[[i]] = forwards_pick(
          #begin with all items
          items = items,
          current_items = c(),
          criterion_vars = criterion_vars,
          criterion_cors_full = full_cors,
          selection_method = selection_method,
          mirt_args = mirt_args,
          save_fits = save_fits,
          full_fit = full_fit,
          full_reliability = full_reliability
        )
        
        next
        
      } else {
        
        #best prior set
        best_prior_i = which.max(item_set_results_all[[i - 1]]$criterion_value)
        best_prior_set = item_set_results_all[[i - 1]]$item_set[[best_prior_i]]
        
        #use prior best set
        item_set_results_all[[i]] = forwards_pick(
          items = items,
          current_items = best_prior_set,
          criterion_vars = criterion_vars,
          criterion_cors_full = full_cors,
          selection_method = selection_method,
          mirt_args = mirt_args,
          save_fits = save_fits,
          full_fit = full_fit,
          full_reliability = full_reliability
        )
        
        next
      }
      
    }
  }
  
  #loadings method
  if (method == "max_loading") {
    item_set_results_all = highest_loadings_method(
      items = items,
      max_items = item_target,
      criterion_vars = criterion_vars,
      criterion_cors_full = full_cors,
      mirt_args = mirt_args,
      save_fits = save_fits,
      full_fit = full_fit,
      full_reliability = full_reliability,
      selection_method = selection_method
    )
  }
 
  #return results
  full_results = item_set_results_all %>% ldf_to_df(by_name = "set") %>% as_tibble()
  
  list(
    full_results = full_results,
    best_sets = full_results %>% 
      filter(criterion_value == max(criterion_value), .by = set)
  )
  
}


GG_scale_abbreviation = function(x, vars = c("reliability_frac", "mean_criterion_cors_frac", "r_full_score", "reliability")) {
  x$best_sets %>%
    #display both reliability and criterion value
    pivot_longer(
      cols = all_of(vars),
      names_to = "criterion",
      values_to = "value"
    ) %>%
    ggplot(aes(x = items_in_scale, y = value, color = criterion)) +
    geom_point() +
    geom_line() +
    scale_x_continuous(breaks = seq(0, ncol(d_sim), 5)) +
    scale_y_continuous(limits = c(NA, 1)) +
    theme_minimal() +
    labs(
      x = "Number of items in scale",
      y = "Criterion value"
    )
}

#helper function
get_best_item_set = function(x) {
  x$best_sets %>%
    tail(1) %>% 
    pull(item_set) %>% 
    extract2(1) %>%
    sort()
}

Example

Simulation 1

25 items
select 10

The function can optimize for correlation with an arbitrary variable, including the full score (default). Or it can try to optimize the reliability. Here we use the first option.

#no multi-threading
plan(sequential)

#simulate some data
set.seed(1)

n <- 1000
n_items = 25

#simulate items
d_sim = simdata(
  N = n,
  itemtype = "2PL",
  d = runif(n_items, -2, 2),
  a = runif(n_items, 0.5, 2)
)

#run a global fit
sim_fit = mirt(
  data = d_sim,
  model = 1,
  itemtype = "2PL",
  verbose = F
)

#abbreviate scale
res_c_forwards = abbreviate_scale(
  items = d_sim,
  item_target = 10,
  selection_method = "c",
  method = "forwards"
)

## Abbreviating scale using correlation with criterion variable(s) method

## Using the forwards method

## Starting with the 3 items with highest loadings: 13, 21, 7

## adding item 4 out of 10

## adding item 5 out of 10

## adding item 6 out of 10

## adding item 7 out of 10

## adding item 8 out of 10

## adding item 9 out of 10

## adding item 10 out of 10

## 15.36 sec elapsed

res_c_backwards = abbreviate_scale(
  items = d_sim,
  item_target = 10,
  selection_method = "c",
  method = "backwards"
)

## Abbreviating scale using correlation with criterion variable(s) method

## Using the backwards method

## removing item 1 out of 15 (24 remaining)

## removing item 2 out of 15 (23 remaining)

## removing item 3 out of 15 (22 remaining)

## removing item 4 out of 15 (21 remaining)

## removing item 5 out of 15 (20 remaining)

## removing item 6 out of 15 (19 remaining)

## removing item 7 out of 15 (18 remaining)

## removing item 8 out of 15 (17 remaining)

## removing item 9 out of 15 (16 remaining)

## removing item 10 out of 15 (15 remaining)

## removing item 11 out of 15 (14 remaining)

## removing item 12 out of 15 (13 remaining)

## removing item 13 out of 15 (12 remaining)

## removing item 14 out of 15 (11 remaining)

## removing item 15 out of 15 (10 remaining)

## 62.685 sec elapsed

#plot results
res_c_forwards %>% 
  GG_scale_abbreviation()

res_c_backwards %>%
  GG_scale_abbreviation()

#compare results
res_c_forwards$best_sets %>%
  tail(1)

res_c_backwards$best_sets %>%
  tail(1)

#almost the same, but not entirely, why?
#compare the items in best sets
res_c_backwards %>% 
    get_best_item_set()

##  [1]  4  6  7  9 13 17 18 20 21 23

res_c_forwards %>%
    get_best_item_set()

##  [1]  4  6  7  8  9 13 18 20 21 23

symdiff(
  res_c_backwards %>% 
    get_best_item_set(),
  
  res_c_forwards %>% 
    get_best_item_set()
)

## [1] 17  8

Simulation 2

100 items
select 20

We try optimizing for reliability, full score correlation, and just picking the items with highest loadings.

#no multi-threading
plan(multisession(workers = 7))

#simulate some data
set.seed(1)

n <- 1000
n_items = 100

#simulate items
d_sim = simdata(
  N = n,
  itemtype = "2PL",
  d = runif(n_items, -2, 2),
  a = runif(n_items, 0.5, 2)
)

#run a global fit
sim_fit = mirt(
  data = d_sim,
  model = 1,
  itemtype = "2PL",
  verbose = F
)

#abbreviate scale
res_c_forwards = abbreviate_scale(
  items = d_sim,
  item_target = 20,
  selection_method = "c",
  method = "forwards"
)

## Abbreviating scale using correlation with criterion variable(s) method

## Using the forwards method

## Starting with the 3 items with highest loadings: 7, 80, 18

## adding item 4 out of 20

## adding item 5 out of 20

## adding item 6 out of 20

## adding item 7 out of 20

## adding item 8 out of 20

## adding item 9 out of 20

## adding item 10 out of 20

## adding item 11 out of 20

## adding item 12 out of 20

## adding item 13 out of 20

## adding item 14 out of 20

## adding item 15 out of 20

## adding item 16 out of 20

## adding item 17 out of 20

## adding item 18 out of 20

## adding item 19 out of 20

## adding item 20 out of 20

## 187.505 sec elapsed

#plot results
res_c_forwards %>% 
  GG_scale_abbreviation()

#compare results
res_c_forwards$best_sets %>%
  tail(1)

#alternatively, we can optimize for reliability
res_r_forwards = abbreviate_scale(
  items = d_sim,
  item_target = 20,
  selection_method = "r",
  method = "forwards"
)

## Abbreviating scale using reliability method

## Using the forwards method

## Starting with the 3 items with highest loadings: 7, 80, 18

## adding item 4 out of 20

## adding item 5 out of 20

## adding item 6 out of 20

## adding item 7 out of 20

## adding item 8 out of 20

## adding item 9 out of 20

## adding item 10 out of 20

## adding item 11 out of 20

## adding item 12 out of 20

## adding item 13 out of 20

## adding item 14 out of 20

## adding item 15 out of 20

## adding item 16 out of 20

## adding item 17 out of 20

## adding item 18 out of 20

## adding item 19 out of 20

## adding item 20 out of 20

## 168.722 sec elapsed

#plot results
res_r_forwards %>% 
  GG_scale_abbreviation()

#compare results
res_r_forwards$best_sets %>%
  tail(1)

#highest loading method
res_max_loading = abbreviate_scale(
  items = d_sim,
  item_target = 20,
  selection_method = "c",
  method = "max_loading"
)

## Abbreviating scale using correlation with criterion variable(s) method

## Using the max_loading method

## 4.736 sec elapsed

#plot results
res_max_loading %>% 
  GG_scale_abbreviation()

#compare results
res_max_loading$best_sets %>%
  tail(1)

#a single plot of results at 20 items from each method
combined_results = bind_rows(
  res_max_loading$best_sets %>%
    mutate("method" = "max_loading"),
  
  res_r_forwards$best_sets %>%
    mutate("method" = "reliability"),
  
  res_c_forwards$best_sets %>%
    mutate("method" = "r full score")
) %>% 
  select(reliability, r_full_score, method, items_in_scale) %>% 
  pivot_longer(
    cols = c("reliability", "r_full_score"),
    names_to = "criterion",
    values_to = "value"
  )

combined_results %>% 
  ggplot(aes(items_in_scale, value, color = method)) +
  geom_line() +
  facet_wrap("criterion", scales = "free_y")

Simulation 3

100 items
select 25
-0.5 correlation between item difficulty and loading

#simulate some data
set.seed(1)

n <- 1000
n_items = 100
max_items = 25

#simulate norms to use for item pars
item_pars = MASS::mvrnorm(
  n = n_items,
  mu = c(0, 0),
  Sigma = matrix(c(1, -0.5, -0.5, 1), nrow = 2)
)

#simulate items
d_sim = simdata(
  N = n,
  itemtype = "2PL",
  d = item_pars[, 1],
  a = item_pars[, 2] %>% rescale(0.5, 2)
)

#run a global fit
sim_fit = mirt(
  data = d_sim,
  model = 1,
  itemtype = "2PL",
  verbose = F
)

#abbreviate scale
res_c_forwards = abbreviate_scale(
  items = d_sim,
  item_target = max_items,
  selection_method = "c",
  method = "forwards"
)

## Abbreviating scale using correlation with criterion variable(s) method

## Using the forwards method

## Starting with the 3 items with highest loadings: 50, 61, 70

## adding item 4 out of 25

## adding item 5 out of 25

## adding item 6 out of 25

## adding item 7 out of 25

## adding item 8 out of 25

## adding item 9 out of 25

## adding item 10 out of 25

## adding item 11 out of 25

## adding item 12 out of 25

## adding item 13 out of 25

## adding item 14 out of 25

## adding item 15 out of 25

## adding item 16 out of 25

## adding item 17 out of 25

## adding item 18 out of 25

## adding item 19 out of 25

## adding item 20 out of 25

## adding item 21 out of 25

## adding item 22 out of 25

## adding item 23 out of 25

## adding item 24 out of 25

## adding item 25 out of 25

## 269.946 sec elapsed

#plot results
res_c_forwards %>% 
  GG_scale_abbreviation()

#compare results
res_c_forwards$best_sets %>%
  tail(1)

#alternatively, we can optimize for reliability
res_r_forwards = abbreviate_scale(
  items = d_sim,
  item_target = max_items,
  selection_method = "r",
  method = "forwards"
)

## Abbreviating scale using reliability method

## Using the forwards method

## Starting with the 3 items with highest loadings: 50, 61, 70

## adding item 4 out of 25

## adding item 5 out of 25

## adding item 6 out of 25

## adding item 7 out of 25

## adding item 8 out of 25

## adding item 9 out of 25

## adding item 10 out of 25

## adding item 11 out of 25

## adding item 12 out of 25

## adding item 13 out of 25

## adding item 14 out of 25

## adding item 15 out of 25

## adding item 16 out of 25

## adding item 17 out of 25

## adding item 18 out of 25

## adding item 19 out of 25

## adding item 20 out of 25

## adding item 21 out of 25

## adding item 22 out of 25

## adding item 23 out of 25

## adding item 24 out of 25

## adding item 25 out of 25

## 272.19 sec elapsed

#plot results
res_r_forwards %>% 
  GG_scale_abbreviation()

#compare results
res_r_forwards$best_sets %>%
  tail(1)

#max both r and c
res_rc_forwards = abbreviate_scale(
  items = d_sim,
  item_target = max_items,
  selection_method = "rc",
  method = "forwards"
)

## Abbreviating scale using average of correlation with creiterion variable(s) and reliability method

## Using the forwards method

## Starting with the 3 items with highest loadings: 50, 61, 70

## adding item 4 out of 25

## adding item 5 out of 25

## adding item 6 out of 25

## adding item 7 out of 25

## adding item 8 out of 25

## adding item 9 out of 25

## adding item 10 out of 25

## adding item 11 out of 25

## adding item 12 out of 25

## adding item 13 out of 25

## adding item 14 out of 25

## adding item 15 out of 25

## adding item 16 out of 25

## adding item 17 out of 25

## adding item 18 out of 25

## adding item 19 out of 25

## adding item 20 out of 25

## adding item 21 out of 25

## adding item 22 out of 25

## adding item 23 out of 25

## adding item 24 out of 25

## adding item 25 out of 25

## 275.849 sec elapsed

#plot results
res_rc_forwards %>% 
  GG_scale_abbreviation()

#compare results
res_rc_forwards$best_sets %>%
  tail(1)

#highest loading method
res_max_loading = abbreviate_scale(
  items = d_sim,
  item_target = max_items,
  selection_method = "c",
  method = "max_loading"
)

## Abbreviating scale using correlation with criterion variable(s) method

## Using the max_loading method

## 6.479 sec elapsed

#plot results
res_max_loading %>% 
  GG_scale_abbreviation()

#compare results
res_max_loading$best_sets %>%
  tail(1)

#a single plot of results at 20 items from each method
combined_results = bind_rows(
  res_max_loading$best_sets %>%
    mutate("method" = "max_loading"),
  
  res_r_forwards$best_sets %>%
    mutate("method" = "reliability"),
  
  res_c_forwards$best_sets %>%
    mutate("method" = "r full score"),
  
  res_rc_forwards$best_sets %>%
    mutate("method" = "r full score and reliability")
) %>% 
  select(reliability, r_full_score, method, items_in_scale) %>% 
  pivot_longer(
    cols = c("reliability", "r_full_score"),
    names_to = "criterion",
    values_to = "value"
  )

combined_results %>% 
  ggplot(aes(items_in_scale, value, color = method)) +
  geom_line() +
  facet_wrap("criterion", scales = "free_y")

When item parameter correlations are present (common), optimizing for both full scale correlation and reliability is a good idea. It gets almost the same full score correlation and reliability as the pure strategies. The highest loading method is not as good in this case.

These functions will be implemented in Kirkegaard package shortly and easily available for use.

Scale abbreviation methods