In these files, I examine the dimensionality of vocabulary and grammar, using items produced by at least one child.
Load the libraries.
library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_21 <- read_csv("full_21.csv")
missing_items <- c(13, 14, 545) # alligator, animal, orange
full_21 <- full_21 %>%
filter(!(item_id %in% missing_items)) %>%
filter(PartID != "LUCID_12") # excluded: missing some observations
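A quick sanity check (not in the original script) that the exclusions took effect; it only assumes item_id and PartID are the identifier columns used above.
# confirm the dropped items and the excluded participant are gone
stopifnot(
  !any(full_21$item_id %in% missing_items),
  !any(full_21$PartID == "LUCID_12")
)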
Get the number of distinct participants in this age group.
n_part <- full_21 %>%
group_by(PartID) %>%
slice(1) %>%
ungroup(PartID) %>%
count()
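The same count can be cross-checked with n_distinct(); this is just a sketch of an equivalent one-liner, not part of the original pipeline.
# should print TRUE if the group_by/slice/count approach matches a direct distinct count
n_distinct(full_21$PartID) == n_part$n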
Complexity (grammar) items that at least one child is producing.
gramm_atleast_1 <- full_21 %>%
filter(section %in% c("complexity")) %>%
group_by(item) %>%
mutate(
n = sum(out)
) %>%
slice(1) %>%
ungroup(item) %>%
filter(n > 0) %>%
dplyr::select("item")
Complex <- full_21 %>%
filter(section == "combine" | # to drop non-combiners
section == "complexity" # to calculate complexity scores.
)
Complex_with_ids <- Complex %>%
dplyr::select("PartID", "item", "out") %>%
pivot_wider(id_cols="PartID", names_from="item", values_from="out")
Complex_with_ids_gramm <- Complex_with_ids %>%
filter(combine > 0) %>% # select only participants combining words
dplyr::select(-"combine") %>% # remove combine variable
dplyr::select("PartID", gramm_atleast_1$item) # select items that 5 or more participants are producing.
# get number of distinct participants not combining words
n_no_g <- Complex %>%
group_by(PartID) %>%
filter(section=="combine") %>%
filter(out==0) %>%
ungroup() %>%
count()
# get number of distinct items
n_item <- full_21 %>%
filter(PartID == "CLCL_005") %>%
dplyr::select("section", "item", "item_id") %>%
filter(section=="complexity")
nrow(Complex_with_ids_gramm) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_1) + 1
## [1] TRUE
Get a table of the number of non-zero responses for each item on the complexity scale, by dataset.
full_21 %>%
filter(section %in% c("complexity")) %>%
group_by(df, item) %>%
summarise(
n_produced = sum(out)
) %>%
pivot_wider(id_cols=item, names_from=df, values_from=n_produced) %>%
kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
| item | CLCL | LUCID |
|---|---|---|
| q01 | 40 | 26 |
| q02 | 36 | 23 |
| q03 | 45 | 27 |
| q04 | 45 | 17 |
| q05 | 18 | 6 |
| q06 | 4 | 3 |
| q07 | 2 | 1 |
| q08 | 4 | 3 |
| q09 | 8 | 7 |
| q10 | 9 | 5 |
| q11 | 20 | 9 |
| q12 | 5 | 7 |
| q13 | 6 | 3 |
| q14 | 10 | 8 |
| q15 | 23 | 18 |
| q16 | 46 | 23 |
| q17 | 28 | 23 |
| q18 | 10 | 5 |
| q19 | 4 | 2 |
| q20 | 5 | 5 |
| q21 | 2 | 1 |
| q22 | 3 | 3 |
| q23 | 0 | 1 |
| q24 | 5 | 2 |
| q25 | 7 | 6 |
| q26 | 12 | 9 |
| q27 | 5 | 3 |
| q28 | 5 | 2 |
| q29 | 15 | 10 |
| q30 | 13 | 12 |
| q31 | 6 | 5 |
| q32 | 18 | 12 |
| q33 | 9 | 8 |
| q34 | 3 | 1 |
| q35 | 4 | 4 |
| q36 | 1 | 1 |
| q37 | 6 | 5 |
Vocabulary items that at least one participant is producing.
vocab_atleast_1 <- full_21 %>%
filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
filter(class=="Vocab") %>%
group_by(item_id) %>%
mutate(
count = sum(out)
) %>%
slice(1) %>%
ungroup(item_id) %>%
filter(count>0) %>%
dplyr::select("item_id")
Item <- filter(full_21, PartID=="CLCL_005") # make list of items
open_class <- c(
"action.words", # sections that are open-class
"animal.names",
"body.parts",
"clothing",
"descriptive.words",
"food.and.drink",
"furniture.and.rooms",
"small.household.items",
"toys",
"vehicles",
"outside.things"
)
Item <- filter(Item, (item_id %in% vocab_atleast_1$item_id) & (section %in% open_class)) # select open-class items that at least one participant is producing
set.seed(75321179)
f1 <- 43/nrow(Item) # sampling proportion (slightly oversampled, then trimmed to 37 items below)
DF <- Item %>%
group_by(section) %>%
slice_sample(prop= f1, replace=FALSE) %>%
ungroup() %>%
slice(-7) # drop one row of the random sample so exactly 37 items remain
n_open = nrow(filter(Item, section %in% open_class))
Item %>%
filter(section %in% open_class) %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/n_open
)
DF %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/37
)
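To compare the two sets of proportions directly, they can be joined side by side. This is just a sketch; the column names Prop_full and Prop_37 are illustrative, not from the original output.
full_props <- Item %>%
  filter(section %in% open_class) %>%
  count(section) %>%
  mutate(Prop_full = n / n_open)
sample_props <- DF %>%
  count(section) %>%
  mutate(Prop_37 = n / 37)
left_join(full_props, sample_props, by = "section", suffix = c("_full", "_37")) %>%
  dplyr::select(section, Prop_full, Prop_37)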
total_37 <- full_21 %>%
filter(item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_37=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_37")
total_n37 <- full_21 %>%
filter(section %in% open_class) %>%
filter(!item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_others=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_others")
corr = full_join(total_37, total_n37, by="PartID") %>%
drop_na()
cor(corr$total_37, corr$total_others) # .98
## [1] 0.9761915
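An interval around this correlation was not reported in the original analysis; as an optional check, base R's cor.test() gives one.
# quick interval check, assuming the default Pearson correlation is what we want
cor.test(corr$total_37, corr$total_others)$conf.int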
Make a short version of the vocabulary measure for IRT.
Vocab_short_with_ids <- full_21 %>%
filter(item_id %in% DF$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), by="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
full <- left_join(
Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
)
full_noid <- full %>%
dplyr::select(-"PartID")
#m1 <- mirt(full_noid, 1, technical=list(NCYCLES=2000), "2PL")
#saveRDS(m1, "IRT_models/21mo/m1_full.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=2000)) # doesn't improve at 2000 iterations and it's close.
#saveRDS(m2, "IRT_models/21mo/m2_full.rds")
#m3 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH")
#saveRDS(m3, "IRT_models/21mo/m3_full.rds")
m1 <- readRDS("IRT_models/21mo/m1_full.rds")
m2 <- readRDS("IRT_models/21mo/m2_full.rds")
m3 <- readRDS("IRT_models/21mo/m3_full.rds")
Get factor scores
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
Scatterplot matrix
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
ggpairs()
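The same pairwise relations can be checked numerically; a small sketch complementing the ggpairs() panel, not part of the original output.
# correlation matrix of the three sets of factor scores
cor(cbind(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH)) %>%
  round(3)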
Make a data frame with total vocabulary and complexity scores, to compare the raw scores to the latent variables.
grammar <- full_21 %>%
filter(section=="complexity") %>%
group_by(PartID) %>%
mutate(
Complex = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", grammar="Complex")
vocab <- full_21 %>%
filter(class=="Vocab") %>%
group_by(PartID) %>%
mutate(
Vocab = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", vocab="Vocab")
vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
left_join(vg, by="PartID") %>%
pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")
p_vocab <- ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Vocabulary") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
p_grammar <- ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Grammar") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
p_vocab / p_grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37))
conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.312 0.312
## ASSI 0.235 0.235
## RATIO 0.464 0.464
## MADCOV100 0.672 0.672
## MCOV100 -0.027 -0.027
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.329 0.329
## ASSI 0.247 0.247
## RATIO 0.494 0.494
## MADCOV100 0.665 0.665
## MCOV100 0.010 0.010
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.312 0.312
## ASSI 0.229 0.229
## RATIO 0.467 0.467
## MADCOV100 0.669 0.669
## MCOV100 0.002 0.002
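For a side-by-side view of the three conditioning scores, the summary tables can be combined. This is a minimal sketch; it assumes, as dim_sim() below does, that conf.detect() stores the unweighted statistics in the first column of $detect.summary, with rows in the order printed above.
tibble(
  stat = c("DETECT", "ASSI", "RATIO", "MADCOV100", "MCOV100"),
  TwoPL = conf$detect.summary[1:5, 1],
  Spline = conf_spline$detect.summary[1:5, 1],
  EH = conf_EH$detect.summary[1:5, 1]
)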
dim_sim <- function(items, iter){
# Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
  detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
  detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric())
for(i in 1:iter){
# Make list of vocab items
book <- Item %>%
filter(section %in% open_class) %>% #select open-class items
filter(item_id %in% vocab_atleast_1$item_id) %>% # select items that at least 1 participant is producing
slice_sample(n= items, replace=FALSE) # select random sample of items
# Select items from vocab dataset
Vocab2<- full_21 %>%
filter(item_id %in% book$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), by="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Make data frame with non-selected items for calculating correlation coefficients.
Vocab3 <- full_21 %>%
filter(section %in% open_class) %>%
filter(!(item_id %in% book$item_id)) %>%
mutate(
id = paste("Vocab", as.character(item_id), by="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Correlation
r = cor(rowSums(Vocab2[,-1]), rowSums(Vocab3[,-1])) # drop the PartID column; use all item columns
# IRT models
full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
dplyr::select(-"PartID")
m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
m_s <- mirt(full, 1, "spline", technical=list(NCYCLES=2000), verbose=FALSE) # likely too few iterations, but this already takes a long time
m_EH <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH", verbose=FALSE)
# Extract latent variables
fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
# DETECT
fulldf <- data.frame(full)
dtct <- c(rep(1, 37), rep(2, items))
conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)
# save results
df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1],
corr = r)
out = bind_rows(out, df)
}
return(out)
}
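Before running the full 100 iterations, a short smoke test (an arbitrary two iterations, not used in the analysis) confirms the function runs end to end; it is left commented out here because even a couple of iterations are slow.
#test_run <- dim_sim(items = 37, iter = 2)
#test_run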
#sim21 <- dim_sim(37, 100)
#write_rds(sim21, "Dim_Sim_Output/sim_21_full.rds")
sim21 <- read_rds("Dim_Sim_Output/sim_21_full.rds")
sim21 %>%
dplyr::select("corr") %>%
summarise(
min = min(corr),
max = max(corr),
mean = mean(corr)
)
library(ggpirate)
sim21 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
ggplot(aes(x=LV, y= value)) + geom_pirate(aes(color=LV)) + ylim(.05,.6) + theme_minimal() +
scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 1 rows containing missing values (geom_hline).
ggsave(filename="plots/detect_plot_21.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Removed 1 rows containing missing values (geom_hline).
sim21 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
group_by(LV) %>%
summarise(
Less_20 = sum(value < .20),
mean = mean(value)
)
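The same reshaped data can also be summarised as the proportion of runs falling below the two cut-offs marked in the plot (.20 and .40); this is a small extension of the summary above rather than part of the original output.
sim21 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(run = row_number()) %>%
  pivot_longer(!c(run, corr),
               names_to = c("measure", "LV"),
               names_sep = "_",
               values_to = "value") %>%
  filter(measure == "detect") %>%
  group_by(LV) %>%
  summarise(
    prop_below_20 = mean(value < .20), # essential unidimensionality cut-off
    prop_below_40 = mean(value < .40)  # mild multidimensionality cut-off
  )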