In this file, I look at the dimensionality of vocabulary and grammar at 24 months.
Load the libraries.
library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_24 <- read_csv("full_24.csv")
missing_items <- c(13, 14, 545) # alligator, animal, orange
full_24 <- full_24 %>%
filter(!(item_id %in% missing_items)) %>%
filter(PartID != "LUCID_77") # missing complexity
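Quick sanity check (an added step, not in the original pipeline): confirm that the long-format columns used throughout this file are present.
stopifnot(all(c("PartID", "df", "section", "class", "item", "item_id", "out") %in% names(full_24))) # columns referenced later in this script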
Get the number of distinct participants at this age.
n_part <- full_24 %>%
group_by(PartID) %>%
slice(1) %>%
ungroup(PartID) %>%
count()
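For reference (not in the original script), n_distinct() gives the same count directly; the n_part_alt name here is only illustrative.
n_part_alt <- full_24 %>% summarise(n = n_distinct(PartID)) # counts distinct participants without group_by()/slice()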
Complexity items that at least one child is producing.
gramm_atleast_1 <- full_24 %>%
filter(section %in% c("complexity")) %>%
group_by(item) %>%
mutate(
n = sum(out)
) %>%
slice(1) %>%
ungroup(item) %>%
filter(n > 0) %>%
dplyr::select("item")
Complex <- full_24 %>%
filter(section == "combine" | # to drop non-combiners
section == "complexity" # to calculate complexity scores.
)
Complex_with_ids<- Complex %>%
dplyr::select("PartID", "item", "out") %>%
pivot_wider(id_cols="PartID", names_from="item", values_from="out")
Complex_with_ids_gramm <- Complex_with_ids %>%
filter(combine > 0) %>% # select only participants combining words
dplyr::select(-"combine") %>% # remove combine variable
dplyr::select("PartID", gramm_atleast_1$item) # select items that 1 or more participants are producing.
# get number of distinct participants not combining words
n_no_g <- Complex %>%
group_by(PartID) %>%
filter(section=="combine") %>%
filter(out==0) %>%
ungroup() %>%
count()
# get number of distinct items
n_item <- full_24 %>%
filter(PartID == "CLCL_001") %>%
dplyr::select("section", "item", "item_id") %>%
filter(section=="complexity")
nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_1) + 1
## [1] TRUE
Get a table of the number of non-zero responses for each item on the complexity scale, by dataset.
full_24 %>%
filter(section %in% c("complexity")) %>%
group_by(df, item) %>%
summarise(
n_nonzero = sum(out)
) %>%
pivot_wider(id_cols=item, names_from=df, values_from=n_nonzero) %>%
kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
| item | CLCL | LUCID |
|---|---|---|
| q01 | 68 | 43 |
| q02 | 58 | 35 |
| q03 | 72 | 51 |
| q04 | 74 | 36 |
| q05 | 50 | 24 |
| q06 | 26 | 12 |
| q07 | 15 | 7 |
| q08 | 23 | 18 |
| q09 | 21 | 16 |
| q10 | 33 | 22 |
| q11 | 44 | 31 |
| q12 | 29 | 26 |
| q13 | 30 | 16 |
| q14 | 27 | 18 |
| q15 | 48 | 33 |
| q16 | 68 | 41 |
| q17 | 68 | 39 |
| q18 | 20 | 20 |
| q19 | 21 | 13 |
| q20 | 30 | 22 |
| q21 | 17 | 11 |
| q22 | 25 | 16 |
| q23 | 6 | 4 |
| q24 | 8 | 9 |
| q25 | 29 | 21 |
| q26 | 47 | 27 |
| q27 | 16 | 17 |
| q28 | 14 | 8 |
| q29 | 53 | 35 |
| q30 | 44 | 26 |
| q31 | 32 | 24 |
| q32 | 41 | 32 |
| q33 | 34 | 25 |
| q34 | 12 | 10 |
| q35 | 20 | 11 |
| q36 | 14 | 12 |
| q37 | 25 | 20 |
Vocabulary items that at least one participant is producing.
vocab_atleast_1 <- full_24 %>%
filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
filter(class=="Vocab") %>%
group_by(item_id) %>%
mutate(
count = sum(out)
) %>%
slice(1) %>%
ungroup(item_id) %>%
filter(count>0) %>%
dplyr::select("item_id")
Item <- filter(full_24, PartID=="CLCL_001") # make list of items
open_class <- c(
"action.words", # sections that are open-class
"animal.names",
"body.parts",
"clothing",
"descriptive.words",
"food.and.drink",
"furniture.and.rooms",
"small.household.items",
"toys",
"vehicles",
"outside.things"
)
Item <- filter(Item, (item_id %in% vocab_atleast_1$item_id) & (section %in% open_class)) # select open-class items produced by at least one participant
set.seed(245)
f1 <- 43/nrow(Item) # sampling proportion corresponding to 43 items
DF <- Item %>%
group_by(section) %>%
slice_sample(prop= f1, replace=FALSE) %>%
slice(-8) %>% # remove one sampled item so the short form has 37 items
ungroup()
n_open = nrow(filter(Item, section %in% open_class))
Item %>%
filter(section %in% open_class) %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/n_open
)
DF %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/37
)
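As a check, in the spirit of the nrow()/ncol() checks above, the sampled short form should contain the 37 items assumed by the denominators and column counts below.
nrow(DF) == 37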
total_37 <- full_24 %>%
filter(item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_37=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_37")
total_n37 <- full_24 %>%
filter(section %in% open_class) %>%
filter(!item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_others=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_others")
corr = full_join(total_37, total_n37, by="PartID") %>%
drop_na()
cor(corr$total_37, corr$total_others) # .96
## [1] 0.9682885
Make the short version of the vocabulary scale for IRT.
Vocab_short_with_ids <- full_24 %>%
filter(item_id %in% DF$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
full <- left_join(
Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
)
full_noid <- full %>%
dplyr::select(-"PartID")
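Before fitting, confirm that the item matrix has the 37 complexity plus 37 vocabulary columns assumed by the DETECT partition vector further down.
ncol(full_noid) == 74 # 37 complexity items + 37 sampled vocabulary items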
#m1 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=1000))
#saveRDS(m1, "IRT_models/24mo/m1.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # doesn't improve at 2000 iterations and it's close.
#saveRDS(m2, "IRT_models/24mo/m2.rds")
#m3 <- mirt(full_noid, 1, "2PL", dentype = "EH", technical=list(NCYCLES=2000))
#saveRDS(m3, "IRT_models/24mo/m3.rds")
m1 <- readRDS("IRT_models/24mo/m1.rds")
m2 <- readRDS("IRT_models/24mo/m2.rds")
m3 <- readRDS("IRT_models/24mo/m3.rds")
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
ggpairs()
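The three sets of factor scores can also be compared on global fit. This is a supplementary sketch rather than part of the original analysis; mirt's anova() method reports log-likelihood, AIC, and BIC for the fitted objects.
# Gaussian 2PL vs. empirical-histogram and spline density estimates
anova(m1, m3)
anova(m1, m2)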
grammar <- full_24 %>%
filter(section=="complexity") %>%
group_by(PartID) %>%
mutate(
Complex = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", grammar="Complex")
vocab <- full_24 %>%
filter(class=="Vocab") %>%
group_by(PartID) %>%
mutate(
Vocab = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", vocab="Vocab")
vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
left_join(vg, by="PartID") %>%
pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")
vocab_plot <- ggplot(vglv, aes(x=vocab, y=LV)) +
geom_point() +
stat_smooth(method="gam") +
labs(x = "Vocabulary") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) +
theme_minimal()
grammar_plot <- ggplot(vglv, aes(x=grammar, y=LV)) +
geom_point() +
stat_smooth(method="gam") +
labs(x = "Grammar") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) +
theme_minimal()
vocab_plot/grammar_plot
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37)) # first 37 columns are complexity items, next 37 are vocabulary items
conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.520 0.520
## ASSI 0.352 0.352
## RATIO 0.598 0.598
## MADCOV100 0.869 0.869
## MCOV100 0.076 0.076
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.433 0.433
## ASSI 0.287 0.287
## RATIO 0.522 0.522
## MADCOV100 0.828 0.828
## MCOV100 0.158 0.158
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.530 0.530
## ASSI 0.341 0.341
## RATIO 0.598 0.598
## MADCOV100 0.887 0.887
## MCOV100 0.143 0.143
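To compare the three confirmatory DETECT runs side by side, the summary statistics can be collected into one tibble. This is a convenience step only; it uses the same detect.summary slot that the simulation function below indexes.
detect_compare <- tibble(
statistic = c("DETECT", "ASSI", "RATIO"),
TwoPL = conf$detect.summary[1:3, 1],
Spline = conf_spline$detect.summary[1:3, 1],
EH = conf_EH$detect.summary[1:3, 1]
)
detect_compare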
dim_sim <- function(items, iter){
# Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric(),
corr=numeric())
for(i in 1:iter){
# Make list of vocab items
book <- Item %>%
filter(section %in% open_class) %>% #select open-class items
filter(item_id %in% vocab_atleast_1$item_id) %>% # select items that at least one participant is producing
slice_sample(n= items, replace=FALSE) # select random sample of items
# Select items from vocab dataset
Vocab2<- full_24 %>%
filter(item_id %in% book$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Make data frame with non-selected items for calculating correlation coefficients.
Vocab3 <- full_24 %>%
filter(section %in% open_class) %>%
filter(!(item_id %in% book$item_id)) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Correlation between the sampled short form and the remaining open-class items
r = cor(rowSums(Vocab2[, -1]), rowSums(Vocab3[, -1])) # drop the PartID column before summing
# IRT models
full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
dplyr::select(-"PartID")
m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
m_s <- mirt(full, 1, "spline", technical=list(NCYCLES=2000), verbose=FALSE) # possibly too few iterations, but this model is slow to fit
m_EH <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH")
# Extract latent variables
fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
# DETECT
fulldf <- data.frame(full)
dtct <- c(rep(1, 37), rep(2, items)) # 37 complexity items, then the sampled vocabulary items
conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)
# save results
df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1],
corr = r)
out = bind_rows(out, df)
}
return(out)
}
#sim37 <- dim_sim(37, 100)
#write_rds(sim37, "Dim_Sim_Output/sim_24.rds")
sim40 <- read_rds("Dim_Sim_Output/sim_24.rds")
sim40 %>%
dplyr::select("corr") %>%
summarise(
min = min(corr),
max = max(corr),
mean = mean(corr)
)
library(ggpirate)
sim40 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
ggplot(aes(x=LV, y= value)) +
geom_pirate(aes(color=LV)) +
ylim(.05, .6) +
theme_minimal() +
scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
ggsave(filename="plots/detect_plot_24.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
sim40 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
group_by(LV) %>%
summarise(
Less_40 = sum(value < .40),
mean = mean(value)
)
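Finally, as an added summary not in the original script, the simulated DETECT values can be binned with the same .2 and .4 thresholds marked on the plot above, to show how often each model lands in each band.
sim40 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(run = row_number()) %>%
pivot_longer(!c(run, corr), names_to=c("measure", "LV"), names_sep = "_", values_to="value") %>%
filter(measure=="detect") %>%
mutate(band = cut(value, breaks=c(-Inf, .2, .4, Inf), labels=c("below .2", ".2 to .4", "above .4"))) %>%
count(LV, band)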