This is the analysis of the younger longitudinal data from Wordbank. I have included all items that at least one child is producing. The output restricted to items that 5 or more participants produced is available here.

1 Prep Data

Load the libraries.

options(max.print=500)
library(wordbankr) # WB data
library(patchwork)
library(GGally)
library(tidyverse) # tidy
library(mirt) # IRT models
library(knitr) # some formatting, tables, etc
library(sirt) # additional IRT functions

Load the data: the data are saved locally because Wordbank will be updated sometime soon.

Inst <- read_rds("Wordbank_Data/Inst_WS_Eng.rds")
Admin <- read_rds("Wordbank_Data/Admin_WS_Eng.rds")
N_total = nrow(Admin) # making sure things add up later
N_long = nrow(filter(Admin, longitudinal==TRUE)) # making sure things add up later
Item <- read_rds("Wordbank_Data/Item_WS_Eng.rds")

Select longitudinal data for younger participants.

Younger <- Admin %>%
  filter(longitudinal==TRUE) %>%
  filter(age < 20)

N_younger = nrow(Younger) # making sure things add up later

Select the complexity items for younger participants and re-code them.

Younger_Complex <- Younger %>% # combine admin, inst and item. 
  left_join(.,Inst, by="data_id") %>% # left_join because we want to keep cases that are not in the older data
  left_join(., Item, by="num_item_id") %>%
  filter(type == "combine" | # to drop non-combiners
           type == "complexity" # to calculate complexity scores.
         ) %>%
  mutate(
    out = ifelse(value=="complex" | value=="sometimes" | value=="produces", yes=1, 
                 no = ifelse(value=="often", yes=2, no =0))
  ) 

N_complexity_items = nrow(filter(Item, type == "combine" | 
           type == "complexity"))

nrow(Younger_Complex) == (N_younger)*N_complexity_items # sanity check
## [1] TRUE

The complexity category variable, coded by WB, is either morphology or syntax for each item on the complexity scale. This will be useful for labeling items later (so we can have, e.g., morphology761 instead of the definition), but I'll also fill in "combine" for the combining item so that it shows up in the complexity category variable as well.

Younger_Complex$complexity_category <- ifelse(Younger_Complex$complexity_category == "", yes=Younger_Complex$type, no=Younger_Complex$complexity_category)

Make a list of grammar items that at least one participant is producing.

# Make list of grammar items that at least 1 participant produces (items no one produces can't be used). 
gramm_atleast_1 <- Younger_Complex %>%
  group_by(definition) %>%
  mutate(
    ttl = sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  filter(ttl > 0 ) %>%
  dplyr::select("definition", "num_item_id")

Make wide format for IRT models.

# Long to Wide
Younger_Complex_short_with_ids_all <- Younger_Complex %>% 
  filter(num_item_id %in% gramm_atleast_1$num_item_id) %>% # select complexity items
  dplyr::select(data_id, value, out, complexity_category, num_item_id) %>% # select relevant variables
  mutate(
    label = str_c(complexity_category, num_item_id) # make new label
  ) %>%
  pivot_wider(id_cols=data_id, names_from = "label", values_from="out") %>% # wide
  dplyr::select(starts_with(c("data_id", "combine", "morphology", "syntax"))) #select relevant variables
 
Younger_Complex_short_with_ids <- Younger_Complex_short_with_ids_all %>%  
drop_na() # drop rows of participants missing data

Younger_Complex_short_grammatical <- Younger_Complex_short_with_ids %>%
  filter(combine760 > 0) %>% # drop participants who are not yet combining words
  dplyr::select(starts_with(c("morphology", "syntax", "data_id"))) 

# Sanity Check
N_NA_Younger <- nrow(Younger_Complex_short_with_ids_all) - nrow(Younger_Complex_short_with_ids %>%
  dplyr::select(starts_with(c("data_id", "combine", "morphology", "syntax"))) %>%
  drop_na()) # Number of NAs
N_nog_younger <- nrow(filter(Younger_Complex_short_with_ids, combine760 == 0)) # Number of children not combining words
nrow(Younger_Complex_short_grammatical) == N_younger - N_NA_Younger - N_nog_younger
## [1] TRUE

2 Example Analysis

In order to illustrate the analysis, both to myself and to readers, I've run it once below. In this section, I create a subset of vocabulary items (constrained so that they are sampled proportionally to the number of items in each subsection of the CDI), combine it with the grammar items, and run the analysis on that dataset. In the following section, I repeat this analysis 100 times.

2.1 Make Vocab Dataset

Make vocab data: start by selecting all vocabulary items.

Younger_Vocab <- Younger  %>%
  left_join(.,Inst, by="data_id") %>%
  left_join(., Item, by="num_item_id") %>%
  filter(type == "word"
         ) %>%
  mutate(
    out = ifelse(value=="produces", yes=1, no =0)
    ) 

N_vocab = nrow(filter(Item, type == "word")) 

3 Test run on one list of 28 items

Create a list of 28 vocabulary items, matching the 28 complexity items that meet the at-least-1 criterion.

set.seed(6531)

f1 <- 33.5/nrow(filter(Item, lexical_category == "nouns" | lexical_category == "predicates")) # sampling fraction; nouns and predicates only

# Make list of words that are all 0 (can't use these). 
atleast_1 <- Younger_Vocab %>%
  filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
  group_by(definition) %>%
  mutate(
    ttl = sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  filter(ttl > 0)


DF <- Item %>%
  filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
  group_by(category) %>% # sample proportionally within each CDI category
  slice_sample(prop= f1, replace=FALSE) %>%
  ungroup()

Check that this new list is similar to the original data set in terms of content.

Item %>%
  filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
  group_by(category) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/474
  )
DF %>%
  group_by(category) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/28
  )

Select the subset of vocabulary items defined in the list above. Check the correlation between the total of the selected items and the total of the non-selected items.

total_28 <- Younger_Vocab %>%
  filter(item_id %in% DF$item_id) %>%
  group_by(data_id) %>%
  mutate(
    total_28=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("data_id", "total_28")

total_n28 <- Younger_Vocab %>%
  filter(lexical_category == "predicates" | lexical_category=="nouns") %>%
  filter(!item_id %in% DF$item_id) %>%
  group_by(data_id) %>%
  mutate(
    total_others=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("data_id", "total_others")

corr = full_join(total_28, total_n28, by="data_id") %>%
  drop_na()

cor(corr$total_28, corr$total_others) # correlation between selected and non-selected totals
## [1] 0.9445435

Format data set for IRT

Younger_Vocab_short_with_ids_28 <- Younger_Vocab %>% 
  filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
  filter(item_id %in% DF$item_id) %>%
  dplyr::select(data_id, value, out, definition) %>%
  pivot_wider(id_cols=data_id, names_from = "definition", values_from="out")  %>% 
  drop_na() # no item-level missing data
full_younger_28 <- left_join(
  Younger_Complex_short_grammatical, Younger_Vocab_short_with_ids_28, by="data_id"
) %>%
  dplyr::select(-c("data_id")) 


nrow(full_younger_28) == N_younger - N_NA_Younger - N_nog_younger # Still okay -- not missing item-level data from vocab.
## [1] TRUE
full_younger_28 %>%
  pivot_longer(everything(), names_to="item", values_to="out")  %>%
  mutate(
    type = ifelse(str_detect(item, "morphology"), yes="morphology", no = 
                   ifelse(str_detect(item, "syntax"), yes="syntax", no = "word"))
  ) %>%
  group_by(type, item) %>%
  summarise(
    sum = sum(out)
  ) %>%
  ungroup(item) %>%
  kable()
## `summarise()` has grouped output by 'type'. You can override using the `.groups`
## argument.
type        item               sum
morphology  morphology761       22
morphology  morphology762       18
morphology  morphology763       22
morphology  morphology764        8
morphology  morphology765        1
morphology  morphology766        5
morphology  morphology767       11
morphology  morphology768       17
morphology  morphology769       12
morphology  morphology770        3
morphology  morphology772        2
syntax      syntax773            5
syntax      syntax774           10
syntax      syntax775            3
syntax      syntax777            1
syntax      syntax778            3
syntax      syntax779            2
syntax      syntax780           13
syntax      syntax781            8
syntax      syntax782            3
syntax      syntax784            5
syntax      syntax785            1
syntax      syntax786            1
syntax      syntax789            1
syntax      syntax790            4
syntax      syntax791            5
syntax      syntax792            5
syntax      syntax793            3
word        ant                 21
word        backyard             9
word        bubbles            128
word        candy               31
word        cheerios            66
word        chicken (animal)    41
word        clock               48
word        dish                15
word        door                90
word        dryer                3
word        eat                 68
word        empty               17
word        eye                143
word        flag                11
word        hard                 4
word        hit                 13
word        meat                20
word        necklace            19
word        owl                 46
word        paint                3
word        pancake             18
word        push                25
word        ride                22
word        sing                18
word        sticky               6
word        swim                15
word        tired               13
word        toothbrush          49

3.1 DETECT

Items that are all 0s (produced by no one) need to be removed before fitting. The models below were fit once with the commented-out code, saved, and are re-loaded from disk here.

#m1 <- mirt(full_younger_28, 1, "2PL", technical=list(NCYCLES=1000))
#m2 <- mirt(full_younger_28, 1, "spline", technical=list(NCYCLES=2000))
#m3 <- mirt(full_younger_28, 1, "2PL", dentype = "EH")



#write_rds(m1, "IRT_output/m1_young_full.rds")
#write_rds(m2,  "IRT_output/m2_young_full.rds")
#write_rds(m3,  "IRT_output/m3_young_full.rds")


m1 <-  readRDS("IRT_output/m1_young_full.rds")
m2 <-  readRDS("IRT_output/m2_young_full.rds")
m3 <-  readRDS("IRT_output/m3_young_full.rds")

Save latent variables

fscores = fscores(m1)[,1]
fscores_spline = fscores(m2)[,1]
fscores_EH = fscores(m3, use_dentype_estimate=TRUE)[,1]

Plot latent variables

tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  ggpairs()

Make dataset of total (raw) vocabulary and grammar

vocab_and_grammar <- Younger_Complex %>% # data frame defined above
  filter(type=="complexity") %>%  # select complexity items only
  group_by(data_id) %>%  # sort by participant
  mutate(
    total = sum(out) # make total score for complexity
  ) %>%
  slice(1) %>% # select 1 row per participant
  ungroup() %>%
  dplyr::select("data_id", "production", "total") # this already contains production score from Admin dataset
vglv <- bind_cols(Younger_Complex_short_grammatical, TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  dplyr::select("data_id", "TwoPL", "Spline", "EH") %>%
  left_join(vocab_and_grammar, by="data_id") %>%
  pivot_longer(-c("data_id", "production", "total"), names_to="model", values_to="LV")

vocab<-ggplot(vglv, aes(x=production, y=LV)) + geom_point() + geom_smooth(method="gam", se=FALSE) + labs(x = "Vocabulary") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
grammar<-ggplot(vglv, aes(x=total, y=LV)) + geom_point() + geom_smooth(method="gam", se=FALSE) + labs(x = "Grammar") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()

vocab/grammar

ggsave(filename="plots/wordbank/LV_plot_younger_full.png", last_plot(), dpi=500)  

3.1.1 Confirmatory

full2 = data.frame(full_younger_28)

dtct <- c(rep(1, 28), rep(2, 28)) # item clusters: first 28 columns = grammar, last 28 = vocabulary

conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.316    0.316
## ASSI           0.461    0.461
## RATIO          0.697    0.697
## MADCOV100      0.453    0.453
## MCOV100       -0.027   -0.027
conf <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.029    0.029
## ASSI           0.079    0.079
## RATIO          0.117    0.117
## MADCOV100      0.246    0.246
## MCOV100       -0.008   -0.008
conf <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.302    0.302
## ASSI           0.366    0.366
## RATIO          0.626    0.626
## MADCOV100      0.481    0.481
## MCOV100        0.136    0.136

4 Analysis over 100 subsets

Now write a function that selects different subsets of 28 vocabulary items (sampled from the items that at least one participant produces; any items that end up all 0 after restricting to combiners are dropped) and runs DETECT on each of them. I'm sticking with the IRT approach, rather than the sum-score approach, because use_sum_score=TRUE seems to deflate DETECT and I'm not sure why.

dim_sim <- function(items, iter){
     # Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(), detect_s =numeric(), assi_s =numeric(), ratio_s=numeric(), detect_EH =numeric(), assi_EH =numeric(), ratio_EH=numeric() )
  
for(i in 1:iter){
  ## Make list of XX items
  book <- atleast_1 %>% # choose items (from dataset with 5 or more, nouns and predicates only)
  filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
  slice_sample(n=items, replace=FALSE) # select items
 
  ##Select relevant items
  Vocab2 <- Younger_Vocab %>%
     filter(item_id %in% book$item_id) %>% # filter items selected above
     dplyr::select(data_id, value, out, definition) %>% # select variables
     pivot_wider(id_cols=data_id, names_from = "definition", values_from="out")  # make wide
  
  # Make data set of non-selected items (for calculating correlation coefficients)
   Vocab3 <- Younger_Vocab %>%
     filter( source_name != "Byers" & data_id != "135056")%>% # missing item-level data
     filter(!(item_id %in% book$item_id)) %>%
     filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
     dplyr::select(data_id, value, out, definition) %>%
     pivot_wider(id_cols=data_id, names_from = "definition", values_from="out") %>%
    drop_na()# Make vocabulary dataframe
   
   # calculate correlation between included and excluded items
   # (drop the data_id column and sum across the remaining item columns)
    r = cor(rowSums(Vocab2[, -1]), rowSums(Vocab3[, -1]))
  
  # Combine selected vocabulary items with grammar items
  full <- left_join(Younger_Complex_short_with_ids, Vocab2, by="data_id") %>%
  filter(combine760 > 0) %>%
  dplyr::select(-c("data_id", "combine760")) %>%
  dplyr::select(where(~sum(.) != 0))
  
  # run IRT models
  m1 <- mirt(full, 1, "2PL", verbose=FALSE)
  m_s <- mirt(full, 1, "spline", verbose=FALSE) # default EM cycles are probably too few (cf. NCYCLES=2000 above), but more takes very long
  m_EH <- mirt(full, 1, "2PL", dentype = "EH", verbose=FALSE)
  
  # save factor scores
  fscores <-fscores(m1)[,1]
  fscores_s <-fscores(m_s)[,1]
  fscores_EH <-fscores(m_EH)[,1]
  
  # run DETECT
  fulldf <- data.frame(full)
  dtct <- c(rep(1, 28), rep(2, items)) # 28 grammar items (matching the single run above), then the sampled vocabulary items
  conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
  conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
  conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)


  #combine results with previous iterations  
  df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
                 detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
                 detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1], 
                 corr = r)
  out = bind_rows(out, df)
  
  
}

return(out)
}

Run the sim 100 times and save the output. The call is commented out here because otherwise the output from every run is printed and bloats the R Markdown document (plus it takes a while to run); instead, the saved results are loaded from disk.

#sim29<- dim_sim(28, 100)

#write_rds(sim29, "Dim_Sim_Output/younger_full.rds")

sim29 <- read_rds("Dim_Sim_Output/younger_full.rds")

Correlations between each vocabulary subset and the remaining (non-selected) vocabulary items.

sim29 %>%
  dplyr::select("corr") %>%
  summarise(
    min = min(corr), 
    max = max(corr), 
    mean = mean(corr)
  )

Plot

library(ggpirate)
sim29 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  ggplot(aes(x=LV, y= value)) + geom_pirate(aes(color=LV)) + ylim(.05,.6) + theme_minimal() + scale_x_discrete("IRT Model", labels = c("b" = "2PL","EH" = "Empirical Histogram", "s" = "Spline")) + geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
  annotate("text", x = "b", y = .1, label = "Essential Unidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 52 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).

ggsave(filename="plots/wordbank/detect_plot_younger_full.png", last_plot(), dpi=500)  
## Saving 7 x 5 in image
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 52 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
sim29 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  group_by(LV) %>%
  summarise(
    Less_20 = sum(value < .20),
    mean = mean(value)
  )
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Dutch_Netherlands.1252  LC_CTYPE=Dutch_Netherlands.1252   
## [3] LC_MONETARY=Dutch_Netherlands.1252 LC_NUMERIC=C                      
## [5] LC_TIME=Dutch_Netherlands.1252    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] ggpirate_0.1.2  sirt_3.11-21    knitr_1.37      mirt_1.36.1    
##  [5] lattice_0.20-45 forcats_0.5.1   stringr_1.4.0   dplyr_1.0.7    
##  [9] purrr_0.3.4     readr_2.1.2     tidyr_1.2.0     tibble_3.1.6   
## [13] tidyverse_1.3.1 GGally_2.1.2    ggplot2_3.3.5   patchwork_1.1.2
## [17] wordbankr_0.3.1
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-155          fs_1.5.2              lubridate_1.8.0      
##  [4] RColorBrewer_1.1-3    httr_1.4.2            Deriv_4.1.3          
##  [7] tools_4.1.0           backports_1.4.1       bslib_0.3.1          
## [10] utf8_1.2.2            R6_2.5.1              vegan_2.5-7          
## [13] DBI_1.1.2             mgcv_1.8-39           colorspace_2.0-3     
## [16] permute_0.9-7         withr_2.5.0           tidyselect_1.1.2     
## [19] gridExtra_2.3         compiler_4.1.0        polycor_0.8-1        
## [22] cli_3.0.1             rvest_1.0.2           quantreg_5.88        
## [25] TAM_3.7-16            SparseM_1.81          xml2_1.3.3           
## [28] labeling_0.4.2        sass_0.4.0            scales_1.2.0         
## [31] mvtnorm_1.1-3         pbapply_1.5-0         digest_0.6.29        
## [34] rmarkdown_2.13        quantregGrowth_1.4-0  pkgconfig_2.0.3      
## [37] htmltools_0.5.2       highr_0.9             dbplyr_2.1.1         
## [40] fastmap_1.1.0         rlang_1.0.2           readxl_1.3.1         
## [43] rstudioapi_0.13       farver_2.1.0          jquerylib_0.1.4      
## [46] generics_0.1.3        jsonlite_1.8.0        dcurver_0.9.2        
## [49] magrittr_2.0.1        Matrix_1.4-0          Rcpp_1.0.8.3         
## [52] munsell_0.5.0         fansi_1.0.2           lifecycle_1.0.1      
## [55] stringi_1.7.6         yaml_2.3.5            MASS_7.3-55          
## [58] plyr_1.8.6            grid_4.1.0            parallel_4.1.0       
## [61] crayon_1.5.0          haven_2.4.3           splines_4.1.0        
## [64] hms_1.1.1             pillar_1.8.0          admisc_0.26          
## [67] reprex_2.0.1          GPArotation_2014.11-1 glue_1.4.2           
## [70] evaluate_0.15         renv_0.15.4           modelr_0.1.8         
## [73] vctrs_0.3.8           tzdb_0.2.0            MatrixModels_0.5-0   
## [76] cellranger_1.1.0      gtable_0.3.0          reshape_0.8.8        
## [79] assertthat_0.2.1      CDM_7.5-15            xfun_0.30            
## [82] broom_0.7.12          cluster_2.1.2         ellipsis_0.3.2