In these files, I look at the dimensionality of vocabulary and grammar for items produced by 5 or more children.

1 Prep Data

Load the libraries, read in the data, and drop the missing items.

library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_30 <- read_csv("full_30.csv") 

missing_items <- c(13, 14, 545) # alligator, animal, orange

full_30 <- full_30 %>%
  filter(!(item_id %in% missing_items)) 

Get the number of distinct participants in this age group.

n_part  <- full_30 %>%
  group_by(PartID) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  count()
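A quick cross-check of that count using n_distinct() (a small sketch; it should match n_part):

full_30 %>%
  summarise(n = n_distinct(PartID)) # count unique participant IDs in one step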

Grammar (complexity) items that at least 5 children are producing.

gramm_atleast_5 <- full_30 %>%
  filter(section %in% c("complexity")) %>%
  group_by(item) %>%
  mutate(
    n = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item) %>%
  filter(n > 4) %>%
  dplyr::select("item")
Complex <- full_30 %>%
  filter(section == "combine" | # to drop non-combiners
           section == "complexity" # to calculate complexity scores.
         ) 

Complex_with_ids <- Complex %>%
  dplyr::select("PartID", "item", "out") %>% 
  pivot_wider(id_cols="PartID", names_from="item", values_from="out")

Complex_with_ids_gramm <- Complex_with_ids %>%
  filter(combine > 0) %>% # select only participants combining words
  dplyr::select(-"combine") %>% # remove combine variable
  dplyr::select("PartID", gramm_atleast_5$item) # select items that 5 or more participants are producing. 

# get number of distinct participants not combining words
n_no_g <- Complex %>%
  group_by(PartID) %>%
  filter(section=="combine") %>%
  filter(out==0) %>%
  ungroup() %>%
  count()
  

# get the list of distinct complexity items
n_item <- full_30 %>% 
  filter(PartID == "CLCL_001") %>%
  dplyr::select("section", "item", "item_id") %>%
  filter(section=="complexity")
  

nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_5) + 1
## [1] TRUE

Get a table of the number of non-zero responses for each item on the complexity scale, split by dataset.

full_30 %>%
  filter(section %in% c("complexity")) %>%
  group_by(df, item) %>%
  summarise(
    n_producing = sum(out)
  ) %>%
  pivot_wider(id_cols=item, names_from=df, values_from=n_producing) %>%
  kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
item CLCL LUCID
q01 85 63
q02 76 59
q03 90 64
q04 91 57
q05 70 57
q06 68 48
q07 52 41
q08 61 52
q09 68 54
q10 58 49
q11 73 63
q12 66 64
q13 65 54
q14 61 41
q15 80 64
q16 87 68
q17 81 66
q18 61 54
q19 62 54
q20 76 56
q21 56 48
q22 71 51
q23 45 40
q24 49 52
q25 73 57
q26 81 64
q27 60 52
q28 50 40
q29 85 65
q30 73 62
q31 73 61
q32 78 60
q33 78 63
q34 42 42
q35 46 47
q36 62 50
q37 65 55

2 Test run on one list of 37 items

2.1 Processing vocab

Vocab items that at least 5 participants are producing.

vocab_atleast_5  <- full_30 %>%
  filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
  filter(class=="Vocab") %>%
  group_by(item_id) %>%
  mutate(
    count = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item_id) %>%
  filter(count>4) %>%
  filter(count < 170) %>% # drop items that every participant produces (can't have all 1s)
  dplyr::select("item_id")
Item <- filter(full_30, PartID=="CLCL_001") # make list of items

open_class <- c(
  "action.words", # sections that are open-class
  "animal.names", 
  "body.parts", 
  "clothing", 
  "descriptive.words", 
  "food.and.drink", 
  "furniture.and.rooms", 
  "small.household.items", 
  "toys", 
  "vehicles", 
  "outside.things"
)

Item <- filter(Item, (item_id %in% vocab_atleast_5$item_id) & (section %in% open_class)) # select open-class items with >= 5 participants
items_to_drop <- full_30 %>%
  filter(section %in% c("complexity")) %>%
  group_by(item) %>%
  mutate(
    n = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item) %>%
  filter(n == 0) %>%
  dplyr::select("item")
set.seed(245)

f1 <- 43/nrow(Item) # sampling proportion used to draw the 37-item list

DF <- Item %>%
  group_by(section) %>%
  slice_sample(prop= f1, replace=FALSE) %>%
  slice(-8) %>% # drop the 8th row of each section's sample (rows are in random order, so this removes a random item) to trim the list
  ungroup()

n_open = nrow(filter(Item, section %in% open_class))
Item %>%
  filter(section %in% open_class) %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/n_open
  )
DF %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/37
  )
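To compare the stratification directly, a small sketch that joins the pool and sampled-list proportions into one table (prop_pool and prop_sample are illustrative names):

# proportion of each open-class section in the eligible item pool vs. in the sampled list DF
pool_props <- Item %>%
  count(section) %>%
  mutate(prop_pool = n/sum(n)) %>%
  dplyr::select(section, prop_pool)
sample_props <- DF %>%
  count(section) %>%
  mutate(prop_sample = n/sum(n)) %>%
  dplyr::select(section, prop_sample)
left_join(pool_props, sample_props, by="section")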
total_37 <- full_30 %>%
  filter(item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_37=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_37")

total_n37 <- full_30 %>%
  filter(section %in% open_class) %>%
  filter(!item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_others=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_others")

corr = full_join(total_37, total_n37, by="PartID") %>%
  drop_na()

cor(corr$total_37, corr$total_others) # .97
## [1] 0.9710773
Vocab_short_with_ids <- full_30 %>% 
  filter(item_id %in% DF$item_id) %>%
  mutate(
    id = paste("Vocab", as.character(item_id), by="_")
  ) %>%
  dplyr::select(PartID, id, out) %>%
  pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

3 Combine

full <- left_join(
  Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
) 

full_noid <- full %>%
  dplyr::select(-"PartID")
#m1 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=1000))
#saveRDS(m1, "IRT_models/30mo/m1.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # doesn't improve at 2000  iterations and it's close. 
#saveRDS(m2, "IRT_models/30mo/m2.rds")
#m3 <- mirt(full_noid, 1, "2PL", dentype = "EH")
#saveRDS(m3, "IRT_models/30mo/m3.rds")

m1 <-  readRDS("IRT_models/30mo/m1.rds")
m2 <-  readRDS("IRT_models/30mo/m2.rds")
m3 <-  readRDS("IRT_models/30mo/m3.rds")
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  ggpairs()
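As a numeric companion to the pairs plot, a quick sketch of the correlations among the three sets of factor scores:

cor(cbind(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH)) # correlation matrix of the three scorings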

grammar <- full_30 %>%
  filter(section=="complexity") %>%
  group_by(PartID) %>%
  mutate(
    Complex = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  dplyr::select("PartID", grammar="Complex")

vocab <- full_30 %>%
  filter(class=="Vocab") %>%
  group_by(PartID) %>%
  mutate(
    Vocab = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  dplyr::select("PartID", vocab="Vocab")

vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
  bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
  left_join(vg, by="PartID") %>%
  pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")

p_vocab <- ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Vocabulary") +
  facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
p_grammar <- ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Grammar") +
  facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()

p_vocab / p_grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37)) # cluster labels: the 37 complexity items, then the 37 vocabulary items

conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.324    0.324
## ASSI           0.298    0.298
## RATIO          0.474    0.474
## MADCOV100      0.683    0.683
## MCOV100        0.042    0.042
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.285    0.285
## ASSI           0.265    0.265
## RATIO          0.438    0.438
## MADCOV100      0.650    0.650
## MCOV100        0.062    0.062
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.337    0.337
## ASSI           0.288    0.288
## RATIO          0.487    0.487
## MADCOV100      0.692    0.692
## MCOV100        0.137    0.137
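To compare the three confirmatory runs at a glance, a small sketch collecting the unweighted DETECT, ASSI, and RATIO values (it assumes the same detect.summary layout used in dim_sim() below):

# side-by-side summary of the confirmatory DETECT analyses
tibble(
  model  = c("2PL", "Spline", "EH"),
  DETECT = c(conf$detect.summary[1, 1], conf_spline$detect.summary[1, 1], conf_EH$detect.summary[1, 1]),
  ASSI   = c(conf$detect.summary[2, 1], conf_spline$detect.summary[2, 1], conf_EH$detect.summary[2, 1]),
  RATIO  = c(conf$detect.summary[3, 1], conf_spline$detect.summary[3, 1], conf_EH$detect.summary[3, 1])
)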
dim_sim <- function(items, iter){
     # Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
             detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
             detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric(),
             corr=numeric())
  
for(i in 1:iter){
  # Make list of vocab items
  book <- Item %>% 
  filter(section %in% open_class) %>% #select open-class items
  filter(item_id %in% vocab_atleast_5$item_id) %>% # select items that at least 5 participants are producing
  slice_sample(n= items, replace=FALSE) # select random sample of items
 
  # Select items from vocab dataset
  Vocab2<- full_30 %>%
     filter(item_id %in% book$item_id) %>% 
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
     pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

  # Make data frame with non-selected items for calculating correlation coefficients.
  Vocab3 <-  full_30 %>%
     filter(section %in% open_class) %>%
     filter(!(item_id %in% book$item_id)) %>%
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
    pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 
  
  # Correlation
  r = cor(rowSums(Vocab2[,-1]), rowSums(Vocab3[,-1])) # drop the PartID column, then sum each participant's items
  
  # IRT models
  full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
  dplyr::select(-"PartID") 
  m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
  m_s <- mirt(full, 1, "spline",  technical=list(NCYCLES=2000), verbose=FALSE) # likely few iterations, but this takes so much time
  m_EH <- mirt(full, 1, "2PL",  technical=list(NCYCLES=2000), dentype = "EH",verbose=FALSE)
  
  # Extract latent variables
  fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
  fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
  fscores_EH  <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
  
  # DETECT
  fulldf <- data.frame(full)
  dtct <- c(rep(1, 37), rep(2, items))
  conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
  conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
  conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)

  # save results
  df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
                 detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
                 detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1], 
                 corr = r)
  out = bind_rows(out, df)
  
  
}

return(out)
}
#sim30 <- dim_sim(37, 100)
#write_rds(sim30, "Dim_Sim_Output/full_40_30.rds")
sim30 <- read_rds("Dim_Sim_Output/full_40_30.rds")
sim30 %>%
  dplyr::select("corr") %>%
  summarise(
    min = min(corr), 
    max = max(corr), 
    mean = mean(corr)
  )
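A quick look at how the sampled-vs-remaining vocabulary correlations are distributed across the simulation runs (a small sketch):

# histogram of the per-run correlations between the sampled items and the remaining open-class items
ggplot(sim30, aes(x = corr)) +
  geom_histogram(bins = 20) +
  labs(x = "Correlation with remaining open-class items", y = "Number of runs") +
  theme_minimal()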
library(ggpirate)
sim30 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  ggplot(aes(x=LV, y= value)) + geom_pirate(aes(color=LV)) + ylim(.05,.8) + theme_minimal() +
  scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
  geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
  annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .20, label = "Mild Multidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .75, label = "Moderate Multidimensionality", vjust = -0.5)
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 1 rows containing missing values (geom_hline).

ggsave(filename="plots/detect_plot_30.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Removed 1 rows containing missing values (geom_hline).
sim30 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  group_by(LV) %>%
  summarise(
    Less_40 = sum(value < .40), # number of runs with DETECT below .40
    mean = mean(value)
  )