In these files, I look at the dimensionality of vocabulary and grammar for items produced by 5 or more children.

1 Prep Data

Load the libraries, read in the data, and drop the missing items.

library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_30 <- read_csv("full_30.csv") 

missing_items <- c(13, 14, 545) # alligator, animal, orange

full_30 <- full_30 %>%
  filter(!(item_id %in% missing_items)) 

Get the number of distinct participants in this age group.

n_part  <- full_30 %>%
  group_by(PartID) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  count()
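A quick cross-check of that count using n_distinct() (a small sketch; it should match n_part):

full_30 %>%
  summarise(n = n_distinct(PartID)) # count unique participant IDs in one step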

Grammar (complexity) items that at least 5 children are producing.

gramm_atleast_5 <- full_30 %>%
  filter(section %in% c("complexity")) %>%
  group_by(item) %>%
  mutate(
    n = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item) %>%
  filter(n > 4) %>%
  dplyr::select("item")
Complex <- full_30 %>%
  filter(section == "combine" | # to drop non-combiners
           section == "complexity" # to calculate complexity scores.
         ) 

Complex_with_ids <- Complex %>%
  dplyr::select("PartID", "item", "out") %>% 
  pivot_wider(id_cols="PartID", names_from="item", values_from="out")

Complex_with_ids_gramm <- Complex_with_ids %>%
  filter(combine > 0) %>% # select only participants combining words
  dplyr::select(-"combine") %>% # remove combine variable
  dplyr::select("PartID", gramm_atleast_5$item) # select items that 5 or more participants are producing. 

# get number of distinct participants not combining words
n_no_g <- Complex %>%
  group_by(PartID) %>%
  filter(section=="combine") %>%
  filter(out==0) %>%
  ungroup() %>%
  count()
  

# get the list of distinct complexity items
n_item <- full_30 %>% 
  filter(PartID == "CLCL_001") %>%
  dplyr::select("section", "item", "item_id") %>%
  filter(section=="complexity")
  

nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_5) + 1
## [1] TRUE

Get a table of the number of non-zero responses for each item on the complexity scale, split by dataset.

full_30 %>%
  filter(section %in% c("complexity")) %>%
  group_by(df, item) %>%
  summarise(
    n_producing = sum(out)
  ) %>%
  pivot_wider(id_cols=item, names_from=df, values_from=n_producing) %>%
  kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
item CLCL LUCID
q01 85 63
q02 76 59
q03 90 64
q04 91 57
q05 70 57
q06 68 48
q07 52 41
q08 61 52
q09 68 54
q10 58 49
q11 73 63
q12 66 64
q13 65 54
q14 61 41
q15 80 64
q16 87 68
q17 81 66
q18 61 54
q19 62 54
q20 76 56
q21 56 48
q22 71 51
q23 45 40
q24 49 52
q25 73 57
q26 81 64
q27 60 52
q28 50 40
q29 85 65
q30 73 62
q31 73 61
q32 78 60
q33 78 63
q34 42 42
q35 46 47
q36 62 50
q37 65 55

2 Test run on one list of 37 items

2.1 Processing vocab

Vocab items that at least 5 participants are producing.

vocab_atleast_5  <- full_30 %>%
  filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
  filter(class=="Vocab") %>%
  group_by(item_id) %>%
  mutate(
    count = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item_id) %>%
  filter(count>4) %>%
  filter(count < 170) %>% # drop items that every participant produces (can't have all 1s)
  dplyr::select("item_id")
Item <- filter(full_30, PartID=="CLCL_001") # make list of items

open_class <- c(
  "action.words", # sections that are open-class
  "animal.names", 
  "body.parts", 
  "clothing", 
  "descriptive.words", 
  "food.and.drink", 
  "furniture.and.rooms", 
  "small.household.items", 
  "toys", 
  "vehicles", 
  "outside.things"
)

Item <- filter(Item, (item_id %in% vocab_atleast_5$item_id) & (section %in% open_class)) # select open-class items with >= 5 participants
items_to_drop <- full_30 %>%
  filter(section %in% c("complexity")) %>%
  group_by(item) %>%
  mutate(
    n = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item) %>%
  filter(n == 0) %>%
  dplyr::select("item")
set.seed(245)

f1 <- 43/nrow(Item) # sampling proportion used to draw the 37-item list

DF <- Item %>%
  group_by(section) %>%
  slice_sample(prop= f1, replace=FALSE) %>%
  slice(-8) %>% # drop the 8th row of each section's sample (rows are in random order, so this removes a random item) to trim the list
  ungroup()

n_open = nrow(filter(Item, section %in% open_class))
Item %>%
  filter(section %in% open_class) %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/n_open
  )
DF %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/37
  )
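To compare the stratification directly, a small sketch that joins the pool and sampled-list proportions into one table (prop_pool and prop_sample are illustrative names):

# proportion of each open-class section in the eligible item pool vs. in the sampled list DF
pool_props <- Item %>%
  count(section) %>%
  mutate(prop_pool = n/sum(n)) %>%
  dplyr::select(section, prop_pool)
sample_props <- DF %>%
  count(section) %>%
  mutate(prop_sample = n/sum(n)) %>%
  dplyr::select(section, prop_sample)
left_join(pool_props, sample_props, by="section")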
total_37 <- full_30 %>%
  filter(item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_37=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_37")

total_n37 <- full_30 %>%
  filter(section %in% open_class) %>%
  filter(!item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_others=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_others")

corr = full_join(total_37, total_n37, by="PartID") %>%
  drop_na()

cor(corr$total_37, corr$total_others) # .97
## [1] 0.9710773
Vocab_short_with_ids <- full_30 %>% 
  filter(item_id %in% DF$item_id) %>%
  mutate(
    id = paste("Vocab", as.character(item_id), by="_")
  ) %>%
  dplyr::select(PartID, id, out) %>%
  pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

3 Combine

full <- left_join(
  Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
) 

full_noid <- full %>%
  dplyr::select(-"PartID")
#m1 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=1000))
#saveRDS(m1, "IRT_models/30mo/m1.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # doesn't improve at 2000  iterations and it's close. 
#saveRDS(m2, "IRT_models/30mo/m2.rds")
#m3 <- mirt(full_noid, 1, "2PL", dentype = "EH")
#saveRDS(m3, "IRT_models/30mo/m3.rds")

m1 <-  readRDS("IRT_models/30mo/m1.rds")
m2 <-  readRDS("IRT_models/30mo/m2.rds")
m3 <-  readRDS("IRT_models/30mo/m3.rds")
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  ggpairs()
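As a numeric companion to the pairs plot, a quick sketch of the correlations among the three sets of factor scores:

cor(cbind(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH)) # correlation matrix of the three scorings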

grammar <- full_30 %>%
  filter(section=="complexity") %>%
  group_by(PartID) %>%
  mutate(
    Complex = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  dplyr::select("PartID", grammar="Complex")

vocab <- full_30 %>%
  filter(class=="Vocab") %>%
  group_by(PartID) %>%
  mutate(
    Vocab = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(PartID) %>%
  dplyr::select("PartID", vocab="Vocab")

vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
  bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
  left_join(vg, by="PartID") %>%
  pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")

p_vocab <- ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Vocabulary") +
  facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
p_grammar <- ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Grammar") +
  facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()

p_vocab / p_grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37)) # cluster labels: the 37 complexity items, then the 37 vocabulary items

conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.324    0.324
## ASSI           0.298    0.298
## RATIO          0.474    0.474
## MADCOV100      0.683    0.683
## MCOV100        0.042    0.042
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.285    0.285
## ASSI           0.265    0.265
## RATIO          0.438    0.438
## MADCOV100      0.650    0.650
## MCOV100        0.062    0.062
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.337    0.337
## ASSI           0.288    0.288
## RATIO          0.487    0.487
## MADCOV100      0.692    0.692
## MCOV100        0.137    0.137
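To compare the three confirmatory runs at a glance, a small sketch collecting the unweighted DETECT, ASSI, and RATIO values (it assumes the same detect.summary layout used in dim_sim() below):

# side-by-side summary of the confirmatory DETECT analyses
tibble(
  model  = c("2PL", "Spline", "EH"),
  DETECT = c(conf$detect.summary[1, 1], conf_spline$detect.summary[1, 1], conf_EH$detect.summary[1, 1]),
  ASSI   = c(conf$detect.summary[2, 1], conf_spline$detect.summary[2, 1], conf_EH$detect.summary[2, 1]),
  RATIO  = c(conf$detect.summary[3, 1], conf_spline$detect.summary[3, 1], conf_EH$detect.summary[3, 1])
)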
dim_sim <- function(items, iter){
     # Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
             detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
             detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric(),
             corr=numeric())
  
for(i in 1:iter){
  # Make list of vocab items
  book <- Item %>% 
  filter(section %in% open_class) %>% #select open-class items
  filter(item_id %in% vocab_atleast_5$item_id) %>% # select items that at least 5 participants are producing
  slice_sample(n= items, replace=FALSE) # select random sample of items
 
  # Select items from vocab dataset
  Vocab2<- full_30 %>%
     filter(item_id %in% book$item_id) %>% 
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
     pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

  # Make data frame with non-selected items for calculating correlation coefficients.
  Vocab3 <-  full_30 %>%
     filter(section %in% open_class) %>%
     filter(!(item_id %in% book$item_id)) %>%
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
    pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 
  
  # Correlation
  r = cor(rowSums(Vocab2[,-1]), rowSums(Vocab3[,-1])) # drop the PartID column, then sum each participant's items
  
  # IRT models
  full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
  dplyr::select(-"PartID") 
  m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
  m_s <- mirt(full, 1, "spline",  technical=list(NCYCLES=2000), verbose=FALSE) # likely few iterations, but this takes so much time
  m_EH <- mirt(full, 1, "2PL",  technical=list(NCYCLES=2000), dentype = "EH",verbose=FALSE)
  
  # Extract latent variables
  fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
  fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
  fscores_EH  <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
  
  # DETECT
  fulldf <- data.frame(full)
  dtct <- c(rep(1, 37), rep(2, items))
  conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
  conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
  conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)

  # save results
  df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
                 detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
                 detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1], 
                 corr = r)
  out = bind_rows(out, df)
  
  
}

return(out)
}
#sim30 <- dim_sim(37, 100)
#write_rds(sim30, "Dim_Sim_Output/full_40_30.rds")
sim30 <- read_rds("Dim_Sim_Output/full_40_30.rds")
sim30 %>%
  dplyr::select("corr") %>%
  summarise(
    min = min(corr), 
    max = max(corr), 
    mean = mean(corr)
  )
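A quick look at how the sampled-vs-remaining vocabulary correlations are distributed across the simulation runs (a small sketch):

# histogram of the per-run correlations between the sampled items and the remaining open-class items
ggplot(sim30, aes(x = corr)) +
  geom_histogram(bins = 20) +
  labs(x = "Correlation with remaining open-class items", y = "Number of runs") +
  theme_minimal()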
library(ggpirate)
sim30 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  ggplot(aes(x=LV, y= value)) + geom_pirate(aes(color=LV)) + ylim(.05,.8) + theme_minimal() +
  scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
  geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
  annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .20, label = "Mild Multidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .75, label = "Moderate Multidimensionality", vjust = -0.5)
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 1 rows containing missing values (geom_hline).

ggsave(filename="plots/detect_plot_30.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Removed 1 rows containing missing values (geom_hline).
sim30 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  group_by(LV) %>%
  summarise(
    Less_40 = sum(value < .40), # number of runs with DETECT below .40
    mean = mean(value)
  )