In this file, I examine the dimensionality of vocabulary and grammar at 24 months.

1 Prep Data

Load the libraries, read in the data, and drop the missing items plus one participant with missing complexity data.

library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_24 <- read_csv("full_24.csv") 

missing_items <- c(13, 14, 545) # alligator, animal, orange 

full_24 <- full_24 %>%
  filter(!(item_id %in% missing_items)) %>%
  filter(PartID != "LUCID_77") # missing complexity

Get the number of distinct participants at this age.

n_part  <- full_24 %>%
  group_by(PartID) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  count()
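
As a quick cross-check, n_distinct() gives the same count directly:

n_distinct(full_24$PartID)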

Complexity items that at least one child is producing.

gramm_atleast_1 <- full_24 %>%
  filter(section %in% c("complexity")) %>%
  group_by(item) %>%
  mutate(
    n = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item) %>%
  filter(n > 0) %>%
  dplyr::select("item")
Complex <- full_24 %>%
  filter(section == "combine" | # to drop non-combiners
           section == "complexity" # to calculate complexity scores.
         ) 

Complex_with_ids<- Complex %>%
  dplyr::select("PartID", "item", "out") %>% 
  pivot_wider(id_cols="PartID", names_from="item", values_from="out")

Complex_with_ids_gramm <- Complex_with_ids %>%
  filter(combine > 0) %>% # select only participants combining words
  dplyr::select(-"combine") %>% # remove combine variable
  dplyr::select("PartID", gramm_atleast_1$item) # select items that 1 or more participants are producing. 

# get number of distinct participants not combining words
n_no_g <- Complex %>%
  group_by(PartID) %>%
  filter(section=="combine") %>%
  filter(out==0) %>%
  ungroup() %>%
  count()
  

# get the list of distinct complexity items (taken from one participant)
n_item <- full_24 %>% 
  filter(PartID == "CLCL_001") %>%
  dplyr::select("section", "item", "item_id") %>%
  filter(section=="complexity")
  

nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_1) + 1
## [1] TRUE

Get a table of the number of non-zero responses to each item on the complexity scale, by sample.

full_24 %>%
  filter(section %in% c("complexity")) %>%
  group_by(df, item) %>%
  summarise(
    mean = sum(out)
  ) %>%
  pivot_wider(id_cols=item, names_from=df, values_from=mean) %>%
  kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
item    CLCL   LUCID
q01       68      43
q02       58      35
q03       72      51
q04       74      36
q05       50      24
q06       26      12
q07       15       7
q08       23      18
q09       21      16
q10       33      22
q11       44      31
q12       29      26
q13       30      16
q14       27      18
q15       48      33
q16       68      41
q17       68      39
q18       20      20
q19       21      13
q20       30      22
q21       17      11
q22       25      16
q23        6       4
q24        8       9
q25       29      21
q26       47      27
q27       16      17
q28       14       8
q29       53      35
q30       44      26
q31       32      24
q32       41      32
q33       34      25
q34       12      10
q35       20      11
q36       14      12
q37       25      20

2 Test run on one list of 37 items

2.1 Processing vocab

Vocab items that at least one participant is producing.

vocab_atleast_1  <- full_24 %>%
  filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
  filter(class=="Vocab") %>%
  group_by(item_id) %>%
  mutate(
    count = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item_id) %>%
  filter(count>0) %>%
  dplyr::select("item_id")
Item <- filter(full_24, PartID=="CLCL_001") # make list of items

open_class <- c(
  "action.words", # sections that are open-class
  "animal.names", 
  "body.parts", 
  "clothing", 
  "descriptive.words", 
  "food.and.drink", 
  "furniture.and.rooms", 
  "small.household.items", 
  "toys", 
  "vehicles", 
  "outside.things"
)

Item <- filter(Item, (item_id %in% vocab_atleast_1$item_id) & (section %in% open_class)) # keep open-class items produced by at least one participant
set.seed(245)

f1 <- 43/nrow(Item) # sampling proportion for the short vocab list

DF <- Item %>%
  group_by(section) %>%
  slice_sample(prop= f1, replace=FALSE) %>%
  slice(-8) %>% # trim the sample by dropping the 8th sampled row within each section
  ungroup()
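
As a quick check, the trimmed sample should contain the expected number of items:

nrow(DF) # expected to be 37, matching the length of the complexity scale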

n_open = nrow(filter(Item, section %in% open_class))
Item %>%
  filter(section %in% open_class) %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/n_open
  )
DF %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/37
  )
total_37 <- full_24 %>%
  filter(item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_37=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_37")

total_n37 <- full_24 %>%
  filter(section %in% open_class) %>%
  filter(!item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_others=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_others")

corr = full_join(total_37, total_n37, by="PartID") %>%
  drop_na()

cor(corr$total_37, corr$total_others) # .96
## [1] 0.9682885
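
As a quick visual check of the same relationship:

corr %>%
  ggplot(aes(x = total_37, y = total_others)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_minimal()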

Make the short version of the vocab scale for IRT.

Vocab_short_with_ids <- full_24 %>% 
  filter(item_id %in% DF$item_id) %>%
  mutate(
    id = paste("Vocab", as.character(item_id), by="_")
  ) %>%
  dplyr::select(PartID, id, out) %>%
  pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 
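
A sanity check in the same spirit as the earlier ones: one column per sampled vocab item plus PartID.

ncol(Vocab_short_with_ids) == nrow(DF) + 1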

3 Combine

full <- left_join(
  Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
) 

full_noid <- full %>%
  dplyr::select(-"PartID")
#m1 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=1000))
#saveRDS(m1, "IRT_models/24mo/m1.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # doesn't improve at 2000 iterations and is close to converging.
#saveRDS(m2, "IRT_models/24mo/m2.rds")
#m3 <- mirt(full_noid, 1, "2PL", dentype = "EH", technical=list(NCYCLES=2000))
#saveRDS(m3, "IRT_models/24mo/m3.rds")

m1 <-  readRDS("IRT_models/24mo/m1.rds")
m2 <-  readRDS("IRT_models/24mo/m2.rds")
m3 <-  readRDS("IRT_models/24mo/m3.rds")
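
As an optional descriptive comparison, mirt's anova() method reports AIC/BIC (and a likelihood-ratio test where applicable) for pairs of fitted models; these models are not nested in a simple way, so treat this as a rough guide rather than a formal test.

anova(m1, m2) # 2PL vs. spline item response functions
anova(m1, m3) # normal vs. empirical-histogram latent density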

3.1 Factor scores and DETECT

fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  ggpairs()
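
For reference, the same comparison as a plain correlation matrix:

cor(cbind(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH))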

grammar <- full_24 %>%
  filter(section=="complexity") %>%
  group_by(PartID) %>%
  mutate(
    Complex = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  dplyr::select("PartID", grammar="Complex")

vocab <- full_24 %>%
  filter(class=="Vocab") %>%
  group_by(PartID) %>%
  mutate(
    Vocab = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  dplyr::select("PartID", vocab="Vocab")

vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
  bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
  left_join(vg, by="PartID") %>%
  pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")

vocab <- ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") +
  labs(x = "Vocabulary") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
grammar <- ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") +
  labs(x = "Grammar") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()

vocab/grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37))

conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.520    0.520
## ASSI           0.352    0.352
## RATIO          0.598    0.598
## MADCOV100      0.869    0.869
## MCOV100        0.076    0.076
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.433    0.433
## ASSI           0.287    0.287
## RATIO          0.522    0.522
## MADCOV100      0.828    0.828
## MCOV100        0.158    0.158
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.530    0.530
## ASSI           0.341    0.341
## RATIO          0.598    0.598
## MADCOV100      0.887    0.887
## MCOV100        0.143    0.143
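
For readability, a small ad-hoc helper (detect_label, defined here) that labels a DETECT value using the cutoffs marked in the plot further below: below .2 essentially unidimensional, .2 to .4 mild, .4 to 1 moderate, above 1 strong multidimensionality.

detect_label <- function(d) {
  cut(d,
      breaks = c(-Inf, .2, .4, 1, Inf),
      labels = c("essentially unidimensional", "mild", "moderate", "strong"))
}
detect_label(c(0.520, 0.433, 0.530)) # the three confirmatory DETECT estimates above
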
dim_sim <- function(items, iter){
  # Make tibble for output of loop (corr column declared up front as well)
  out = tibble(
    detect = numeric(), assi = numeric(), ratio = numeric(),
    detect_s = numeric(), assi_s = numeric(), ratio_s = numeric(),
    detect_EH = numeric(), assi_EH = numeric(), ratio_EH = numeric(),
    corr = numeric()
  )
  
for(i in 1:iter){
  # Make list of vocab items
  book <- Item %>% 
  filter(section %in% open_class) %>% #select open-class items
  filter(item_id %in% vocab_atleast_1$item_id) %>% # keep items that at least one participant produces
  slice_sample(n= items, replace=FALSE) # select random sample of items
 
  # Select items from vocab dataset
  Vocab2<- full_24 %>%
     filter(item_id %in% book$item_id) %>% 
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
     pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

  # Make data frame with non-selected items for calculating correlation coefficients.
  Vocab3 <-  full_24 %>%
     filter(section %in% open_class) %>%
     filter(!(item_id %in% book$item_id)) %>%
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
    pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 
  
  # Correlation
  r = cor(rowSums(Vocab2[, -1]), rowSums(Vocab3[, -1])) # drop the PartID column and sum all remaining item columns
  
  # IRT models
  full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
  dplyr::select(-"PartID") 
  m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
  m_s <- mirt(full, 1, "spline",  technical=list(NCYCLES=2000), verbose=FALSE) # possibly too few iterations, but this step is very slow
  m_EH <- mirt(full, 1, "2PL",  technical=list(NCYCLES=2000), dentype = "EH", verbose=FALSE)
  
  # Extract latent variables
  fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
  fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
  fscores_EH  <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
  
  # DETECT
  fulldf <- data.frame(full)
  dtct <- c(rep(1, 37), rep(2, items))
  conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
  conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
  conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)

  # save results
  df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
                 detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
                 detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1], 
                 corr = r)
  out = bind_rows(out, df)
  
  
}

return(out)
}
#sim37 <- dim_sim(37, 100)
#write_rds(sim37, "Dim_Sim_Output/sim_24.rds")
sim40 <- read_rds("Dim_Sim_Output/sim_24.rds")
sim40 %>%
  dplyr::select("corr") %>%
  summarise(
    min = min(corr), 
    max = max(corr), 
    mean = mean(corr)
  )
library(ggpirate)
sim40 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  ggplot(aes(x=LV, y=value)) + geom_pirate(aes(color=LV)) + ylim(.05,.6) + theme_minimal() + scale_x_discrete("IRT Model", labels = c("b" = "2PL","EH" = "Empirical Histogram", "s" = "Spline")) + geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
  annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).

ggsave(filename="plots/detect_plot_24.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
sim40 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  group_by(LV) %>%
  summarise(
    Less_40 = sum(value < .40), # number of runs with DETECT below .40
    mean = mean(value)
  )
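
The same reshaping can also summarise the ASSI and RATIO indices alongside DETECT:

sim40 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(run = row_number()) %>%
  pivot_longer(!c(run, corr),
               names_to = c("measure", "LV"),
               names_sep = "_",
               values_to = "value") %>%
  group_by(measure, LV) %>%
  summarise(mean = mean(value), .groups = "drop")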