In these files, I look at the dimensionality of vocabulary and grammar, using items produced by one or more children.

1 Prep Data

Load the libraries, read in the data, and drop the items that were not entered consistently.

library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_18 <- read_csv("full_18.csv") # data for the 18-month-olds

missing_items <- c(13, 14, 545) # alligator, animal, orange (these items were not consistently entered in L05)

full_18 <- full_18 %>%
  filter(!(item_id %in% missing_items))

Get the number of distinct participants in this age group.

n_part  <- full_18 %>%
  group_by(PartID) %>% # Group by Part ID
  slice(1) %>% # select 1 row
  ungroup(PartID) %>%  # ungroup
  count() # count

Complexity (grammar) items that at least one child is producing

gramm_atleast_1 <- full_18 %>%
  filter(section %in% c("complexity")) %>%
  group_by(item) %>%
  mutate(
    n = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item) %>%
  filter(n > 0) %>%
  dplyr::select("item")
Complex <- full_18 %>%
  filter(section == "combine" | # to drop non-combiners
           section == "complexity" # to calculate complexity scores.
         ) 

Complex_with_ids<- Complex %>%
  dplyr::select("PartID", "item", "out") %>% 
  pivot_wider(id_cols="PartID", names_from="item", values_from="out")

Complex_with_ids_gramm <- Complex_with_ids %>%
  filter(combine > 0) %>% # select only participants combining words
  dplyr::select(-"combine") %>% # remove combine variable
  dplyr::select("PartID", gramm_atleast_1$item) # select items that 5 or more participants are producing. 

# get number of distinct participants not combining words
n_no_g <- Complex %>%
  group_by(PartID) %>%
  filter(section=="combine") %>%
  filter(out==0) %>%
  ungroup() %>%
  count()
  

# get the list of distinct complexity items (one row per item)
n_item <- full_18 %>% 
  filter(PartID == "CLCL_001") %>%
  dplyr::select("section", "item", "item_id") %>%
  filter(section=="complexity")
  

nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_1) + 1
## [1] TRUE

Get a table of the number of non-zero responses for each item on the complexity scale, by sample.

full_18 %>%
  filter(section %in% c("complexity")) %>%
  group_by(df, item) %>%
  summarise(
    mean = sum(out)
  ) %>%
  pivot_wider(id_cols=item, names_from=df, values_from=mean) %>%
  kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
item CLCL LUCID
q01 14 13
q02 11 7
q03 10 8
q04 9 6
q05 4 2
q06 0 2
q07 1 0
q08 1 1
q09 1 0
q10 5 0
q11 5 5
q12 0 2
q13 2 0
q14 2 2
q15 4 7
q16 16 10
q17 5 7
q18 2 2
q19 0 2
q20 0 1
q21 1 0
q22 0 0
q23 0 0
q24 0 0
q25 1 1
q26 0 1
q27 0 0
q28 0 0
q29 2 3
q30 3 4
q31 0 0
q32 2 3
q33 0 2
q34 0 0
q35 0 0
q36 0 0
q37 0 0

13 grammar items were produced by five or more children across the two samples.
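
As a quick check, that count can be recomputed directly from the item totals (a small sketch using the same full_18 columns as above):

full_18 %>%
  filter(section == "complexity") %>%
  group_by(item) %>%
  summarise(n = sum(out), .groups = "drop") %>%
  filter(n >= 5) %>%
  nrow() # should return 13, matching the note above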

2 Test run on one list of 13 items

2.1 Processing vocab

Vocab items that at least one participant is producing

vocab_atleast_1  <- full_18 %>%
  filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
  filter(class=="Vocab") %>%
  group_by(item_id) %>%
  mutate(
    count = sum(out)
  ) %>%
  slice(1) %>%
  ungroup(item_id) %>%
  filter(count>0) %>%
  dplyr::select("item_id")
Item <- filter(full_18, PartID=="CLCL_001") # make list of items

open_class <- c(
  "action.words", # sections that are open-class
  "animal.names", 
  "body.parts", 
  "clothing", 
  "descriptive.words", 
  "food.and.drink", 
  "furniture.and.rooms", 
  "small.household.items", 
  "toys", 
  "vehicles", 
  "outside.things"
)

Item <- filter(Item, (item_id %in% vocab_atleast_1$item_id) & (section %in% open_class)) # select open-class items produced by at least one participant
set.seed(245)

f1 <- 32/nrow(Item) # sampling proportion for 27 items (slightly higher to offset within-section rounding)

DF <- Item %>%
  group_by(section) %>%
  slice_sample(prop= f1, replace=FALSE) %>%
  ungroup()

n_open = nrow(filter(Item, section %in% open_class))
Item %>%
  filter(section %in% open_class) %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/n_open
  )
DF %>%
  group_by(section) %>%
  count() %>%
  ungroup() %>%
  mutate(
    Prop = n/27
  )
total_27 <- full_18 %>%
  filter(item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_27=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_27")

total_n27<- full_18 %>%
  filter(section %in% open_class) %>%
  filter(!item_id %in% DF$item_id) %>%
  group_by(PartID) %>%
  mutate(
    total_others=sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "total_others")

corr = full_join(total_27, total_n27, by="PartID") %>%
  drop_na()

cor(corr$total_27, corr$total_others) # .96
## [1] 0.9649968

Make a short version of the vocabulary data for IRT.

Vocab_short_with_ids <- full_18 %>% 
  filter(item_id %in% DF$item_id) %>%
  mutate(
    id = paste("Vocab", as.character(item_id), by="_")
  ) %>%
  dplyr::select(PartID, id, out) %>%
  pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

2.2 Combine

full <- left_join(
  Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
) 

full_noid <- full %>%
  dplyr::select(-"PartID")
#m1 <- mirt(full_noid, 1, technical=list(NCYCLES=1000), "2PL")
#saveRDS(m1, "IRT_models/18mo/m1_full.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # doesn't improve at 2000  iterations and it's close. 
#saveRDS(m2, "IRT_models/18mo/m2_full.rds")
#m3 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=1000), dentype = "EH")
#saveRDS(m3, "IRT_models/18mo/m3_full.rds")

m1 <-  readRDS("IRT_models/18mo/m1_full.rds")
m2 <-  readRDS("IRT_models/18mo/m2_full.rds")
m3 <-  readRDS("IRT_models/18mo/m3_full.rds")

Get factor scores

fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]

Scatterplot matrix of the factor scores from the three models

tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  ggpairs()

Make data frame with total vocab and complexity scores to compare raw scores to latent variables.

g <- Complex %>% # data frame defined above
  filter(section=="complexity") %>% # select complexity items only
  group_by(PartID) %>% # sort by participant
  mutate(
    grammar = sum(out) # make total score for complexity
  ) %>%
  slice(1) %>% # select 1 row per participant
  ungroup() %>% 
  dplyr::select("PartID", "grammar") 

vg <- full_18 %>%
  filter(class=="Vocab") %>%
  group_by(PartID) %>%
  mutate(
    vocab = sum(out)
  ) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select("PartID", "vocab") %>%
  full_join(., g, by="PartID")

Plot latent variables against total vocabulary and grammar

vglv <- Complex_with_ids_gramm %>%
  bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
  dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
  left_join(vg, by="PartID") %>%
  pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")

vocab <- ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") +
  labs(x = "Vocabulary") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
grammar <- ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") +
  labs(x = "Grammar") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()

vocab/grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

DETECT analysis, conditioning on the factor scores from each model. Cluster 1 is the 27 complexity items; cluster 2 is the 27 sampled vocabulary items.

full2 <- data.frame(full_noid)
dtct <- c(rep(1, 27), rep(2, 27))
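# Quick check: dtct should have one entry per item column of full2
# (first 27 columns = complexity items, next 27 = sampled vocabulary items)
length(dtct) == ncol(full2)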

conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.195    0.195
## ASSI           0.106    0.106
## RATIO          0.309    0.309
## MADCOV100      0.629    0.629
## MCOV100       -0.044   -0.044
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.133    0.133
## ASSI           0.071    0.071
## RATIO          0.255    0.255
## MADCOV100      0.524    0.524
## MCOV100        0.027    0.027
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis 
## Conditioning on 1 Score
## Bandwidth Scale: 1.1 
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## ...........................................................
## Nonparametric Estimation of conditional covariances 
##  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 
## 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## -----------------------------------------------------------
##           unweighted weighted
## DETECT         0.214    0.214
## ASSI           0.092    0.092
## RATIO          0.322    0.322
## MADCOV100      0.664    0.664
## MCOV100       -0.003   -0.003

3 Analysis of 100 data sets

dim_sim <- function(items, iter){
     # Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
             detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
             detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric())
  
for(i in 1:iter){
  # Make list of vocab items
  book <- Item %>% 
  filter(section %in% open_class) %>% #select open-class items
  filter(item_id %in% vocab_atleast_1$item_id) %>% # select items that at least one participant is producing
  slice_sample(n= items, replace=FALSE) # select random sample of items
 
  # Select items from vocab dataset
  Vocab2<- full_18 %>%
     filter(item_id %in% book$item_id) %>% 
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
     pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 

  # Make data frame with non-selected items for calculating correlation coefficients.
  Vocab3 <-  full_18 %>%
     filter(section %in% open_class) %>%
     filter(!(item_id %in% book$item_id)) %>%
     mutate(
       id = paste("Vocab", as.character(item_id), by="_")
     ) %>%
     dplyr::select(PartID, id, out) %>%
    pivot_wider(id_cols=PartID, names_from = "id", values_from="out") 
  
  # Correlation
  r = cor(rowSums(Vocab2[, -1]), rowSums(Vocab3[, -1])) # drop the PartID column before summing the items
  
  # IRT models
  full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
  dplyr::select(-"PartID") 
  m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
  m_s <- mirt(full, 1, "spline",  technical=list(NCYCLES=2000), verbose=FALSE) # likely too few iterations, but this already takes a long time
  m_EH <- mirt(full, 1, "2PL",  technical=list(NCYCLES=2000), dentype = "EH", verbose=FALSE)
  
  # Extract latent variables
  fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
  fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
  fscores_EH  <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
  
  # DETECT
  fulldf <- data.frame(full)
  dtct <- c(rep(1, 27), rep(2, items))
  conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
  conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
  conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)

  # save results
  df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
                 detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
                 detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1], 
                 corr = r)
  out = bind_rows(out, df)
  
  
}

return(out)
}
#sim18 <- dim_sim(27, 100)
#write_rds(sim18, "Dim_Sim_Output/sim_18_full.rds")
sim18 <- read_rds("Dim_Sim_Output/sim_18_full.rds")
sim18 %>%
  dplyr::select("corr") %>%
  summarise(
    min = min(corr), 
    max = max(corr), 
    mean = mean(corr)
  )
library(ggpirate)
sim18 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  ggplot(aes(x=LV, y=value)) +
  geom_pirate(aes(color=LV)) +
  ylim(.05, .6) +
  theme_minimal() +
  scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
  geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
  annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .20, label = "Mild Multidimensionality", vjust = -0.5) + 
  annotate("text", x = "b", y = .60, label = "Moderate Multidimensionality", vjust = -0.5)
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 1 rows containing missing values (geom_hline).

ggsave(filename="plots/detect_plot_18.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Removed 1 rows containing missing values (geom_hline).
sim18 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(
    run = row_number()
  ) %>%
  pivot_longer(!c(run, corr), 
               names_to=c("measure", "LV"), 
               names_sep = "_", 
               values_to="value") %>%
  filter(measure=="detect") %>%
  group_by(LV) %>%
  summarise(
    Less_20 = sum(value < .2),
    mean = mean(value)
  )