Setup

library(tidyverse)
library(here)
library(lubridate)
library(ggthemes)
library(langcog)
library(mirt)
library(knitr)
##
set.seed(134) # for reproducibility

Read in data

pilot_data_filtered <- read_csv(file = here::here('data/preprocessed_data/all_preprocessed_data.csv')) %>%
  mutate(word1 = str_replace(word1, ' ', '.')) %>%
  mutate(word2 = str_replace(word2, ' ', '.')) %>%
  mutate(item_pair = paste0(word1,'_',word2))
clip_items <- pilot_data_filtered %>%
  filter(cohort=='prolific') %>%
  distinct(word1, word2, wordPairing) %>%
  mutate(item_pair = paste0(word1,'_',word2)) %>%
  mutate(item_type = 'clip')
daivt_items <- read_csv(file = here::here('data/preprocessed_data/all_preprocessed_data.csv')) %>%
  mutate(word1 = str_replace(word1, ' ', '.')) %>%
  mutate(word2 = str_replace(word2, ' ', '.')) %>%
  mutate(item_pair = paste0(word1,'_',word2)) %>%
  anti_join(clip_items) 

dist_per_daivt = daivt_items %>% group_by(word1) %>% summarize(num_dist = length(unique(word2)))

Select data to model

clip items from all participants

data_to_model <-  pilot_data_filtered %>%
  right_join(clip_items %>% select(word1, word2, item_pair)) 

2439 participants

length(unique(data_to_model$pid))
## [1] 2439

135 words

length(unique(data_to_model$word1))
## [1] 135
d_wide_all<- data_to_model %>%
  ungroup() %>%
  select(sub_id, item_pair, correct) %>% # pid  = sub_id
  arrange(item_pair) %>%
  ungroup() %>%
  pivot_wider(names_from=item_pair, values_from=correct, values_fn = ~mean(.x)) %>%
  ungroup()

d_mat_all <- d_wide_all %>%
  select(-sub_id) %>%
  data.frame %>%
  data.matrix 

rownames(d_mat_all) <- d_wide_all$sub_id

Just fit one 2PL model with some priors

start.dim <- dim(d_mat_all)[2]-1
  mm = (
  'F = 1-%d,
  PRIOR = (1-%d, a1, norm, .2, 1),
  PRIOR = (1-%d, d, norm, 0, 2)'
  )
  mm = mirt.model(sprintf(mm,start.dim,start.dim,start.dim))
m <- mirt(d_mat_all, model = mm,itemtype = '2PL',guess=0.5) 
## 
Iteration: 1, Log-Lik: -43205.502, Max-Change: 24.74307
Iteration: 2, Log-Lik: -32288.617, Max-Change: 12.63906
Iteration: 3, Log-Lik: -31070.567, Max-Change: 6.76570
Iteration: 4, Log-Lik: -30822.267, Max-Change: 2.34785
Iteration: 5, Log-Lik: -30748.847, Max-Change: 0.59453
Iteration: 6, Log-Lik: -30719.452, Max-Change: 0.12690
Iteration: 7, Log-Lik: -30705.609, Max-Change: 0.09802
Iteration: 8, Log-Lik: -30698.245, Max-Change: 0.07444
Iteration: 9, Log-Lik: -30694.133, Max-Change: 0.05731
Iteration: 10, Log-Lik: -30691.742, Max-Change: 0.04385
Iteration: 11, Log-Lik: -30690.338, Max-Change: 0.03402
Iteration: 12, Log-Lik: -30689.502, Max-Change: 0.02636
Iteration: 13, Log-Lik: -30688.364, Max-Change: 0.00563
Iteration: 14, Log-Lik: -30688.337, Max-Change: 0.00437
Iteration: 15, Log-Lik: -30688.322, Max-Change: 0.00278
Iteration: 16, Log-Lik: -30688.309, Max-Change: 0.00292
Iteration: 17, Log-Lik: -30688.305, Max-Change: 0.00151
Iteration: 18, Log-Lik: -30688.303, Max-Change: 0.00137
Iteration: 19, Log-Lik: -30688.299, Max-Change: 0.00218
Iteration: 20, Log-Lik: -30688.299, Max-Change: 0.00160
Iteration: 21, Log-Lik: -30688.299, Max-Change: 0.00105
Iteration: 22, Log-Lik: -30688.299, Max-Change: 0.00080
Iteration: 23, Log-Lik: -30688.299, Max-Change: 0.00056
Iteration: 24, Log-Lik: -30688.299, Max-Change: 0.00044
Iteration: 25, Log-Lik: -30688.299, Max-Change: 0.00056
Iteration: 26, Log-Lik: -30688.299, Max-Change: 0.00014
Iteration: 27, Log-Lik: -30688.299, Max-Change: 0.00008
co <- coef(m,simplify=TRUE, IRTpars = FALSE) # Get coeeficients
co <- tibble::rownames_to_column(as.data.frame(co$items),'item_pair')
coefs_trimmed <- co %>%
  rename(slope = a1) %>%
  filter(slope>0) %>% # make sure all items have positive slopes
  left_join(clip_items) %>%
  group_by(word1) %>%
  mutate(average_slope = mean(slope), both_items = length(unique(word2))) %>%
  filter(both_items==2)

Get rid of problematic items and trim to positive slope items

problems = read_csv(here::here('data/problematic_items_0329.csv')) %>%
filter(Eliminate=='yes')

Look at cliip items I decided were ok

kept = read_csv(here::here('data/problematic_items_0329.csv')) %>%
filter(Eliminate=='no') %>%
  rename(word1 = `Picture Name`) %>%
  left_join(clip_items)

Coefficients after all problems eliminated

after_all_problems <- coefs_trimmed %>%
  filter(!word1 %in% problems$`Picture Name`) %>%
  filter(!word2 %in% problems$`Picture Name`)

Look at what was eliminated

eliminated_targ <- coefs_trimmed %>%
  filter(word1 %in% problems$`Picture Name`)

eliminated_dist <- coefs_trimmed %>%
  filter(word2 %in% problems$`Picture Name`)

Get rid of one last item

coefs_trimmed_final <- after_all_problems %>%
  ungroup() %>%
  filter(slope != min(slope)) %>% # get rid of worst item
  group_by(word1) %>%
  mutate(both_dists = length(unique(word2))) %>%
  filter(both_dists == 2) 

Save it out

save=TRUE

if (save==TRUE){
write_csv(coefs_trimmed_final, file = here::here(paste0('data/preprocessed_data/trimmed_clip_items', today(), '.csv')))
}

Look at removed items

removed <- co %>%
  rename(slope = a1) %>%
  filter(slope<.4) %>%
  left_join(clip_items) 
removed_pairs <- co %>%
  left_join(clip_items) %>%
  filter(word1 %in% removed$word1)

Get distal distractors

Join back full set of potential items

load(file = here::here('data/preprocessed_data/item_corr_with_aoa.RData'))

Candidate distal pairs – not problematic and not duplicates and same aoa range between target and distractors

candidate_distal_pairs <- item_corr_with_aoa %>%
  filter(!Word1 %in% problems$`Picture Name`) %>%#not problematic
  filter(!Word2 %in% problems$`Picture Name`) %>% #not problematic
  filter(abs(diff_aoa)<=3) %>% # keep estimated aoa same
  filter(Word1 %in% coefs_trimmed_final$word1) %>% # for items in set
  filter(!Word2 %in% coefs_trimmed_final$word2) %>% # wed on't have distractors for
  filter(!Word1 == Word2) # can't be duplicates

length(unique(candidate_distal_pairs$Word1))
## [1] 114

Can’t actually sample cross-animacy for every distractor because not enough animals…

possibilities_per_item <- candidate_distal_pairs %>%
group_by(Word1) %>%
summarize(num_items = length(unique(Word2)))

hist(possibilities_per_item$num_items)

Iteratively sample distal items based on minimum clip correlation

candidate_distal_pairs_iter = candidate_distal_pairs
count_word = 0

for (word in unique(candidate_distal_pairs$Word1)){
  print(word)
  count_word = count_word +1
  
  get_pair <- function(word,candidate_distal_pairs_iter) {
    this_word <- candidate_distal_pairs_iter %>%
    filter(Word1==word) %>%
    slice_min(n=1, order_by=cor) # by minimm clip correlation
  }
  
  this_word  = try(get_pair(word, candidate_distal_pairs_iter))
  
  candidate_distal_pairs_iter <- candidate_distal_pairs_iter %>%
    filter(Word2 != this_word$Word2)
  
  if (count_word==1){
  kept_words = this_word 
  }
  else {
    kept_words <- kept_words %>%
    full_join(this_word)
  }
}
## [1] "acorn"
## [1] "aloe"
## [1] "antenna"
## [1] "artichoke"
## [1] "bamboo"
## [1] "barrel"
## [1] "blender"
## [1] "blower"
## [1] "bobsled"
## [1] "bouquet"
## [1] "buffet"
## [1] "bulldozer"
## [1] "cake"
## [1] "candlestick"
## [1] "caramel"
## [1] "carousel"
## [1] "carrot"
## [1] "cassette"
## [1] "cheese"
## [1] "cloak"
## [1] "clothespin"
## [1] "coaster"
## [1] "cork"
## [1] "cornbread"
## [1] "corset"
## [1] "cymbal"
## [1] "dumpling"
## [1] "elbow"
## [1] "fan"
## [1] "flan"
## [1] "foam"
## [1] "footbath"
## [1] "fox"
## [1] "freezer"
## [1] "fruitcake"
## [1] "gondola"
## [1] "grate"
## [1] "gutter"
## [1] "hamster"
## [1] "headdress"
## [1] "hedgehog"
## [1] "hoe"
## [1] "honey"
## [1] "hopscotch"
## [1] "horn"
## [1] "kimono"
## [1] "koala"
## [1] "latch"
## [1] "locker"
## [1] "lollipop"
## [1] "mandolin"
## [1] "map"
## [1] "marshmallow"
## [1] "milkshake"
## [1] "mulch"
## [1] "net"
## [1] "oatmeal"
## [1] "oil"
## [1] "omelet"
## [1] "otter"
## [1] "pajamas"
## [1] "parsley"
## [1] "pie"
## [1] "pistachio"
## [1] "pitcher"
## [1] "potato"
## [1] "prism"
## [1] "prune"
## [1] "puddle"
## [1] "pump"
## [1] "rice"
## [1] "saddle"
## [1] "sandbag"
## [1] "sauerkraut"
## [1] "scaffolding"
## [1] "scoop"
## [1] "scrabble"
## [1] "seagull"
## [1] "ship"
## [1] "shower"
## [1] "silverware"
## [1] "sink"
## [1] "ski"
## [1] "sloth"
## [1] "snail"
## [1] "sorbet"
## [1] "spatula"
## [1] "sprinkler"
## [1] "squash"
## [1] "squirrel"
## [1] "stew"
## [1] "stretcher"
## [1] "stump"
## [1] "sunflower"
## [1] "swing"
## [1] "swordfish"
## [1] "taillight"
## [1] "tapestry"
## [1] "teabag"
## [1] "teapot"
## [1] "telescope"
## [1] "thermos"
## [1] "tongue"
## [1] "treasure"
## [1] "trumpet"
## [1] "tulip"
## [1] "turbine"
## [1] "turkey"
## [1] "turtle"
## [1] "tuxedo"
## [1] "typewriter"
## [1] "watermelon"
## [1] "waterwheel"
## [1] "whistle"

Check we did this right

distal_pairs_to_keep = kept_words

Should be empty

distal_pairs_to_keep %>%
  filter(Word2 %in% coefs_trimmed_final$word2)
## # A tibble: 0 × 12
## # … with 12 variables: Word1 <chr>, Word2 <chr>, cor <dbl>,
## #   AoA_Est_Word1 <dbl>, Animacy_Word_1 <dbl>, Audio_Path_Word1 <chr>,
## #   Audio_File_Word_1 <chr>, AoA_Est_Word2 <dbl>, Animacy_Word_2 <dbl>,
## #   Audio_Path_Word2 <chr>, Audio_File_Word_2 <chr>, diff_aoa <dbl>

Should be 114

distal_pairs_to_keep %>%
  filter(Word1 %in% coefs_trimmed_final$word1)
## # A tibble: 114 × 12
##    Word1     Word2        cor AoA_Est_Word1 Animacy_Word_1 Audio_Path_Word1     
##    <chr>     <chr>      <dbl>         <dbl>          <dbl> <chr>                
##  1 acorn     net        0.589          5.95              0 /Users/brialong/Docu…
##  2 aloe      banner     0.533          8.95              0 /Users/brialong/Docu…
##  3 antenna   backgammon 0.657          7.05              0 /Users/brialong/Docu…
##  4 artichoke kimono     0.520         10                 0 /Users/brialong/Docu…
##  5 bamboo    apron      0.601          9.28              0 /Users/brialong/Docu…
##  6 barrel    anteater   0.663          7.72              0 /Users/brialong/Docu…
##  7 blender   trampoline 0.622          6.84              0 /Users/brialong/Docu…
##  8 blower    gondola    0.638          8.32              0 /Users/brialong/Docu…
##  9 bobsled   cilantro   0.596          9.38              0 /Users/brialong/Docu…
## 10 bouquet   papaya     0.631          8.72              0 /Users/brialong/Docu…
## # … with 104 more rows, and 6 more variables: Audio_File_Word_1 <chr>,
## #   AoA_Est_Word2 <dbl>, Animacy_Word_2 <dbl>, Audio_Path_Word2 <chr>,
## #   Audio_File_Word_2 <chr>, diff_aoa <dbl>

Clean up dataframes

distal_pairs_to_keep <- distal_pairs_to_keep %>%
  select(Word1, Word2, cor) %>%
  rename(word1 = Word1, word2 = Word2) %>%
  mutate(item_pair = paste0(word1, '_', word2)) %>%
  arrange(-cor) 
distal_pairs_to_keep <- distal_pairs_to_keep %>%
  select(-cor) %>%
  mutate(wordPairing = 'distal')

Merge and write out to csv

Join data

out_data <-  coefs_trimmed_final %>%
  select(word1, word2, item_pair, wordPairing) %>%
  full_join(distal_pairs_to_keep)  %>%
  arrange(word1)

Get back item cor and aoa estimates and file paths

out_data_with_cor <- out_data %>%
  left_join(item_corr_with_aoa, by=c('word1' = 'Word1','word2' = 'Word2'))

Sanity check, should be empty, it is

check <- out_data %>%
  filter(word1 %in% problems$`Picture Name`) %>%
  filter(word2 %in% problems$`Picture Name`)

Plot out the clip correlations by item type

ggplot(data = out_data_with_cor, aes(x=wordPairing, y=cor, color=wordPairing)) +
  geom_point() +
  geom_line(aes(group=word1)) +
  theme_few() +
  facet_wrap(~as.factor(round(AoA_Est_Word1)), nrow=3) +
  ylab('clip correlation') +
  xlab('pairing category') +
  ggtitle('Target-Distractor CLIP correlations by (rounded) estimated AoA')

Write out the data

write_csv(out_data, file = here::here(paste0('data/preprocessed_data/clip_items_with_distal', today(), '.csv')))