This is the analysis of the younger longitudinal data from Wordbank. I have included all items that at least one child is producing. The output using only items that five or more participants produced is available here.
Load the libraries.
options(max.print=500)
library(wordbankr) # WB data
library(patchwork)
library(GGally)
library(tidyverse) # tidy
library(mirt) # IRT models
library(knitr) # some formatting, tables, etc
library(sirt) # additional IRT functions
Open the data. The data are saved locally because Wordbank will be updated sometime soon.
Inst <- read_rds("Wordbank_Data/Inst_WS_Eng.rds")
Admin <- read_rds("Wordbank_Data/Admin_WS_Eng.rds")
N_total = nrow(Admin) # making sure things add up later
N_long = nrow(filter(Admin, longitudinal==TRUE)) # making sure things add up later
Item <- read_rds("Wordbank_Data/Item_WS_Eng.rds")
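For reference, this is roughly how the local .rds files could have been pulled with wordbankr; I have left it commented out, and the language/form arguments are assumptions based on the file names.
#Admin_raw <- get_administration_data(language = "English (American)", form = "WS") # administration-level data
#Inst_raw <- get_instrument_data(language = "English (American)", form = "WS") # item-by-child responses
#Item_raw <- get_item_data(language = "English (American)", form = "WS") # item metadata
#write_rds(Admin_raw, "Wordbank_Data/Admin_WS_Eng.rds")
#write_rds(Inst_raw, "Wordbank_Data/Inst_WS_Eng.rds")
#write_rds(Item_raw, "Wordbank_Data/Item_WS_Eng.rds")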
Select longitudinal data for younger participants.
Younger <- Admin %>%
filter(longitudinal==TRUE) %>%
filter(age < 20)
N_younger = nrow(Younger) # making sure things add up later
Select the complexity items for the younger participants and re-code the responses.
Younger_Complex <- Younger %>% # combine admin, inst and item.
left_join(.,Inst, by="data_id") %>% # left_join so that every younger administration is kept
left_join(., Item, by="num_item_id") %>%
filter(type == "combine" | # to drop non-combiners
type == "complexity" # to calculate complexity scores.
) %>%
mutate(
out = ifelse(value=="complex" | value=="sometimes" | value=="produces", yes=1,
no = ifelse(value=="often", yes=2, no =0))
)
N_complexity_items = nrow(filter(Item, type == "combine" |
type == "complexity"))
nrow(Younger_Complex) == (N_younger)*N_complexity_items # sanity check
## [1] TRUE
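The nested ifelse above scores "often" (on the word-combining item) as 2, "complex", "sometimes", or "produces" as 1, and everything else as 0. The same logic could be written with case_when(); a sketch only, left commented out because the ifelse version is what is actually used.
#out = case_when(
#  value == "often" ~ 2, # combining item: "often" scores 2
#  value %in% c("complex", "sometimes", "produces") ~ 1, # produced / complex responses score 1
#  TRUE ~ 0 # everything else (e.g. "simple") scores 0
#)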
The complexity category variable, coded by WB, is either morphology or syntax for each item on the complexity scale. This will be useful for labeling items later (so we can have morphology761 instead of the full definition), but I'll add "combine" so the word-combining item shows up in the complexity category variable as well.
Younger_Complex$complexity_category <- ifelse(Younger_Complex$complexity_category == "", yes=Younger_Complex$type, no=Younger_Complex$complexity_category)
Make a list of grammar items that at least one participant is producing.
# Make list of grammar items that at least 1 participant produces (items nobody produces can't be used).
gramm_atleast_1 <- Younger_Complex %>%
group_by(definition) %>%
mutate(
ttl = sum(out)
) %>%
slice(1) %>%
ungroup() %>%
filter(ttl > 0 ) %>%
dplyr::select("definition", "num_item_id")
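For reference, the stricter criterion mentioned at the top (items that 5 or more participants produce) only changes the final filter; this object is not used below.
gramm_atleast_5 <- Younger_Complex %>%
group_by(definition) %>%
mutate(
ttl = sum(out)
) %>%
slice(1) %>%
ungroup() %>%
filter(ttl >= 5) %>% # require production by at least 5 participants
dplyr::select("definition", "num_item_id")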
Make wide format for IRT models.
# Long to Wide
Younger_Complex_short_with_ids_all <- Younger_Complex %>%
filter(num_item_id %in% gramm_atleast_1$num_item_id) %>% # select complexity items
dplyr::select(data_id, value, out, complexity_category, num_item_id) %>% # select relevant variables
mutate(
label = str_c(complexity_category, num_item_id) # make new label
) %>%
pivot_wider(id_cols=data_id, names_from = "label", values_from="out") %>% # wide
dplyr::select(starts_with(c("data_id", "combine", "morphology", "syntax"))) #select relevant variables
Younger_Complex_short_with_ids <- Younger_Complex_short_with_ids_all %>%
drop_na() # drop rows of participants missing data
# get number of NAs
Younger_Complex_short_grammatical <- Younger_Complex_short_with_ids %>%
filter(combine760 > 0) %>% # Drop participants who are not yet combining words.
dplyr::select(starts_with(c("morphology", "syntax", "data_id")))
# Sanity Check
N_NA_Younger <- nrow(Younger_Complex_short_with_ids_all) - nrow(Younger_Complex_short_with_ids %>%
dplyr::select(starts_with(c("data_id", "combine", "morphology", "syntax"))) %>%
drop_na()) # Number of NAs
N_nog_younger <- nrow(filter(Younger_Complex_short_with_ids, combine760 == 0)) # Number of children not combining words
nrow(Younger_Complex_short_grammatical) == N_younger - N_NA_Younger - N_nog_younger
## [1] TRUE
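To keep track of the bookkeeping, the counts behind this sanity check can be laid out in a small table (a sketch using the objects defined above):
tibble(
total_younger = N_younger, # all younger longitudinal administrations
missing_item_data = N_NA_Younger, # dropped because of missing item-level data
not_yet_combining = N_nog_younger, # dropped because the child is not yet combining words
analysed = N_younger - N_NA_Younger - N_nog_younger # rows that enter the IRT analyses
)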
In order to illustrate the analysis, both for myself and for readers, I've run the analysis a single time below. In this section, I create a subset of vocabulary items (constrained so that they are sampled proportionally to the number of items in each subsection of the CDI), combine it with the grammar items, and run the analysis on that data set. In the following section, I repeat this analysis 100 times.
Make vocab data: start by selecting all vocabulary items.
Younger_Vocab <- Younger %>%
left_join(.,Inst, by="data_id") %>%
left_join(., Item, by="num_item_id") %>%
filter(type == "word"
) %>%
mutate(
out = ifelse(value=="produces", yes=1, no =0)
)
N_vocab = nrow(filter(Item, type == "word"))
Create a list of 28 vocabulary items, sampled proportionally across CDI categories (28 matches the number of complexity items that at least one participant produces).
set.seed(6531)
f1 <- 33.5/nrow(filter(Item, lexical_category == "nouns" | lexical_category == "predicates")) # sampling fraction; nouns and predicates only
# Make list of words that at least 1 child produces (all-0 items can't be used).
atleast_1 <- Younger_Vocab %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
group_by(definition) %>%
mutate(
ttl = sum(out)
) %>%
slice(1) %>%
ungroup() %>%
filter(ttl > 0)
DF <- Item %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
group_by(category) %>% # sample proportionally within each CDI category; all sampled items turned out valid without restricting to particular words
slice_sample(prop= f1, replace=FALSE) %>%
ungroup()
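Quick check on the realized sample size; with the seed above it should come out at 28, which is why the objects below carry _28 in their names.
nrow(DF) # number of sampled vocabulary items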
Check that the sampled list is similar to the original data set in terms of content.
Item %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
group_by(category) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/474 # proportion of each category among all 474 nouns and predicates
)
DF %>%
group_by(category) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/28 # proportion of each category among the 28 sampled items
)
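The two tables are easier to compare side by side; a sketch that joins them (prop_all and prop_sample are names I introduce here):
Item %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
count(category, name = "n_all") %>%
left_join(count(DF, category, name = "n_sample"), by = "category") %>%
mutate(
n_sample = replace_na(n_sample, 0L), # categories with no sampled items
prop_all = n_all / sum(n_all), # share of each category on the full form
prop_sample = n_sample / sum(n_sample) # share of each category in the sample
)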
Select the subset of vocabulary items defined in the list above, and check the correlation between the total of the selected items and the total of the non-selected items.
total_28 <- Younger_Vocab %>%
filter(item_id %in% DF$item_id) %>%
group_by(data_id) %>%
mutate(
total_28=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("data_id", "total_28")
total_n28 <- Younger_Vocab %>%
filter(lexical_category == "predicates" | lexical_category=="nouns") %>%
filter(!item_id %in% DF$item_id) %>%
group_by(data_id) %>%
mutate(
total_others=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("data_id", "total_others")
corr = full_join(total_28, total_n28, by="data_id") %>%
drop_na()
cor(corr$total_28, corr$total_others) # correlation between selected and non-selected totals
## [1] 0.9445435
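A quick visual check of the same relationship (a sketch; this plot is not saved anywhere):
ggplot(corr, aes(x = total_28, y = total_others)) +
geom_point(alpha = 0.5) + # one point per child
geom_smooth(method = "lm", se = FALSE) + # linear trend
labs(x = "Selected vocabulary items (total)", y = "Non-selected vocabulary items (total)") +
theme_minimal()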
Format data set for IRT
Younger_Vocab_short_with_ids_28 <- Younger_Vocab %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
filter(item_id %in% DF$item_id) %>%
dplyr::select(data_id, value, out, definition) %>%
pivot_wider(id_cols=data_id, names_from = "definition", values_from="out") %>%
drop_na() # no item-level missing data
full_younger_28 <- left_join(
Younger_Complex_short_grammatical, Younger_Vocab_short_with_ids_28, by="data_id"
) %>%
dplyr::select(-c("data_id"))
nrow(full_younger_28) == N_younger - N_NA_Younger - N_nog_younger # Still okay -- not missing item-level data from vocab.
## [1] TRUE
full_younger_28 %>%
pivot_longer(everything(), names_to="item", values_to="out") %>%
mutate(
type = ifelse(str_detect(item, "morphology"), yes="morphology", no =
ifelse(str_detect(item, "syntax"), yes="syntax", no = "word"))
) %>%
group_by(type, item) %>%
summarise(
sum = sum(out)
) %>%
ungroup(item) %>%
kable()
## `summarise()` has grouped output by 'type'. You can override using the `.groups`
## argument.
| type | item | sum |
|---|---|---|
| morphology | morphology761 | 22 |
| morphology | morphology762 | 18 |
| morphology | morphology763 | 22 |
| morphology | morphology764 | 8 |
| morphology | morphology765 | 1 |
| morphology | morphology766 | 5 |
| morphology | morphology767 | 11 |
| morphology | morphology768 | 17 |
| morphology | morphology769 | 12 |
| morphology | morphology770 | 3 |
| morphology | morphology772 | 2 |
| syntax | syntax773 | 5 |
| syntax | syntax774 | 10 |
| syntax | syntax775 | 3 |
| syntax | syntax777 | 1 |
| syntax | syntax778 | 3 |
| syntax | syntax779 | 2 |
| syntax | syntax780 | 13 |
| syntax | syntax781 | 8 |
| syntax | syntax782 | 3 |
| syntax | syntax784 | 5 |
| syntax | syntax785 | 1 |
| syntax | syntax786 | 1 |
| syntax | syntax789 | 1 |
| syntax | syntax790 | 4 |
| syntax | syntax791 | 5 |
| syntax | syntax792 | 5 |
| syntax | syntax793 | 3 |
| word | ant | 21 |
| word | backyard | 9 |
| word | bubbles | 128 |
| word | candy | 31 |
| word | cheerios | 66 |
| word | chicken (animal) | 41 |
| word | clock | 48 |
| word | dish | 15 |
| word | door | 90 |
| word | dryer | 3 |
| word | eat | 68 |
| word | empty | 17 |
| word | eye | 143 |
| word | flag | 11 |
| word | hard | 4 |
| word | hit | 13 |
| word | meat | 20 |
| word | necklace | 19 |
| word | owl | 46 |
| word | paint | 3 |
| word | pancake | 18 |
| word | push | 25 |
| word | ride | 22 |
| word | sing | 18 |
| word | sticky | 6 |
| word | swim | 15 |
| word | tired | 13 |
| word | toothbrush | 49 |
Items that have all 0s need to be removed before fitting, since mirt cannot estimate parameters for them (this is why the all-0 items were dropped above). The models were fit once and saved, so the fitting code is commented out and the saved objects are read back in.
#m1 <- mirt(full_younger_28, 1, "2PL", technical=list(NCYCLES=1000))
#m2 <- mirt(full_younger_28, 1, "spline", technical=list(NCYCLES=2000))
#m3 <- mirt(full_younger_28, 1, "2PL", dentype = "EH")
#write_rds(m1, "IRT_output/m1_young_full.rds")
#write_rds(m2, "IRT_output/m2_young_full.rds")
#write_rds(m3, "IRT_output/m3_young_full.rds")
m1 <- readRDS("IRT_output/m1_young_full.rds")
m2 <- readRDS("IRT_output/m2_young_full.rds")
m3 <- readRDS("IRT_output/m3_young_full.rds")
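The fitted objects can be inspected with standard mirt calls; a sketch showing item parameters for the 2PL and information-criterion comparisons between the three specifications:
head(coef(m1, simplify = TRUE, IRTpars = TRUE)$items) # discrimination/difficulty for the first items
anova(m1, m2) # 2PL vs. monotonic spline item model
anova(m1, m3) # 2PL with normal vs. empirical-histogram latent density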
Save latent variables
fscores = fscores(m1)[,1]
fscores_spline = fscores(m2)[,1]
fscores_EH = fscores(m3, use_dentype_estimate=TRUE)[,1]
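If the precision of these scores matters, mirt can also return standard errors and an empirical marginal reliability; a sketch for the 2PL model (fs_se is a name I introduce here):
fs_se <- fscores(m1, full.scores.SE = TRUE) # factor scores plus their standard errors
empirical_rxx(fs_se) # empirical marginal reliability of the 2PL scores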
Plot latent variables
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
ggpairs()
Make a data set of total (raw) vocabulary and grammar scores.
vocab_and_grammar <- Younger_Complex %>% # data frame defined above
filter(type=="complexity") %>% # select complexity items only
group_by(data_id) %>% # sort by participant
mutate(
total = sum(out) # make total score for complexity
) %>%
slice(1) %>% # select 1 row per participant
ungroup() %>%
dplyr::select("data_id", "production", "total") # this already contains production score from Admin dataset
vglv <- bind_cols(Younger_Complex_short_grammatical, TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
dplyr::select("data_id", "TwoPL", "Spline", "EH") %>%
left_join(vocab_and_grammar, by="data_id") %>%
pivot_longer(-c("data_id", "production", "total"), names_to="model", values_to="LV")
vocab<-ggplot(vglv, aes(x=production, y=LV)) + geom_point() + geom_smooth(method="gam", se=FALSE) + labs(x = "Vocabulary") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
grammar<-ggplot(vglv, aes(x=total, y=LV)) + geom_point() + geom_smooth(method="gam", se=FALSE) + labs(x = "Grammar") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
vocab/grammar
ggsave(filename="plots/wordbank/LV_plot_younger_full.png", last_plot(), dpi=500)
full2 = data.frame(full_younger_28)
dtct <- c(rep(1, 28), rep(2, 28)) # cluster 1 = the 28 grammar items, cluster 2 = the 28 vocabulary items
conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.316 0.316
## ASSI 0.461 0.461
## RATIO 0.697 0.697
## MADCOV100 0.453 0.453
## MCOV100 -0.027 -0.027
conf <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.029 0.029
## ASSI 0.079 0.079
## RATIO 0.117 0.117
## MADCOV100 0.246 0.246
## MCOV100 -0.008 -0.008
conf <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.302 0.302
## ASSI 0.366 0.366
## RATIO 0.626 0.626
## MADCOV100 0.481 0.481
## MCOV100 0.136 0.136
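The confirmatory DETECT above conditions on the prespecified grammar/vocabulary split. sirt also has an exploratory version that clusters the items itself, which would be a useful cross-check; left commented out here to keep the output short.
#expl.detect(data = full2, score = fscores, nclusters = 2) # let DETECT find its own 2-cluster partition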
Now write a function that selects different subsets of vocabulary items (28 per run here, to match the number of complexity items, although the function drops any vocabulary item that no participant produces) and runs DETECT on each of them. I'm sticking to the IRT approach, rather than the sum-score approach, because use_sum_score=TRUE seems to deflate DETECT and I'm not sure why.
dim_sim <- function(items, iter){
# Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(), detect_s =numeric(), assi_s =numeric(), ratio_s=numeric(), detect_EH =numeric(), assi_EH =numeric(), ratio_EH=numeric() )
for(i in 1:iter){
## Sample `items` vocabulary items
book <- atleast_1 %>% # choose items (from the set produced by at least 1 child; nouns and predicates only)
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
slice_sample(n=items, replace=FALSE) # select items
##Select relevant items
Vocab2 <- Younger_Vocab %>%
filter(item_id %in% book$item_id) %>% # filter items selected above
dplyr::select(data_id, value, out, definition) %>% # select variables
pivot_wider(id_cols=data_id, names_from = "definition", values_from="out") # make wide
# Make data set of non-selected items (for calculating correlation coefficients)
Vocab3 <- Younger_Vocab %>%
filter( source_name != "Byers" & data_id != "135056")%>% # missing item-level data
filter(!(item_id %in% book$item_id)) %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
dplyr::select(data_id, value, out, definition) %>%
pivot_wider(id_cols=data_id, names_from = "definition", values_from="out") %>%
drop_na()# Make vocabulary dataframe
# calculate correlation between included and excluded items
r = cor(rowSums(Vocab2[ , -1]), rowSums(Vocab3[ , -1])) # drop the data_id column, then sum over all remaining item columns
# Combine selected vocabulary items with grammar items
full <- left_join(Younger_Complex_short_with_ids, Vocab2, by="data_id") %>%
filter(combine760 > 0) %>%
dplyr::select(-c("data_id", "combine760")) %>%
dplyr::select(where(~sum(.) != 0))
# run IRT models
m1 <- mirt(full, 1, "2PL", verbose=FALSE)
m_s <- mirt(full, 1, "spline", verbose=FALSE) # probably needs more cycles, but this already takes a long time
m_EH <- mirt(full, 1, "2PL", dentype = "EH", verbose=FALSE)
# save factor scores
fscores <-fscores(m1)[,1]
fscores_s <-fscores(m_s)[,1]
fscores_EH <-fscores(m_EH)[,1]
# run DETECT
fulldf <- data.frame(full)
n_grammar <- sum(!colnames(full) %in% book$definition) # grammar columns remaining after dropping all-0 items
dtct <- c(rep(1, n_grammar), rep(2, ncol(full) - n_grammar)) # cluster 1 = grammar, cluster 2 = vocabulary
conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)
#combine results with previous iterations
df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1],
corr = r)
out = bind_rows(out, df)
}
return(out)
}
Run the simulation 100 times and save the output. The call is commented out here because otherwise the output of every run is printed and blows up the R Markdown document (plus it takes a while).
#sim29<- dim_sim(28, 100)
#write_rds(sim29, "Dim_Sim_Output/younger_full.rds")
sim29 <- read_rds("Dim_Sim_Output/younger_full.rds")
Correlations, across runs, between the vocabulary subset and the non-selected vocabulary items.
sim29 %>%
dplyr::select("corr") %>%
summarise(
min = min(corr),
max = max(corr),
mean = mean(corr)
)
Plot
library(ggpirate)
sim29 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
ggplot(aes(x=LV, y= value)) + geom_pirate(aes(color=LV)) + ylim(.05,.6) + theme_minimal() + scale_x_discrete("IRT Model", labels = c("b" = "2PL","EH" = "Empirical Histogram", "s" = "Spline")) + geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
annotate("text", x = "b", y = .1, label = "Essential Unidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 52 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
ggsave(filename="plots/wordbank/detect_plot_younger_full.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 52 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 52 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
sim29 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
group_by(LV) %>%
summarise(
Less_20 = sum(value < .20),
mean = mean(value)
)
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Dutch_Netherlands.1252 LC_CTYPE=Dutch_Netherlands.1252
## [3] LC_MONETARY=Dutch_Netherlands.1252 LC_NUMERIC=C
## [5] LC_TIME=Dutch_Netherlands.1252
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] ggpirate_0.1.2 sirt_3.11-21 knitr_1.37 mirt_1.36.1
## [5] lattice_0.20-45 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7
## [9] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.6
## [13] tidyverse_1.3.1 GGally_2.1.2 ggplot2_3.3.5 patchwork_1.1.2
## [17] wordbankr_0.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-155 fs_1.5.2 lubridate_1.8.0
## [4] RColorBrewer_1.1-3 httr_1.4.2 Deriv_4.1.3
## [7] tools_4.1.0 backports_1.4.1 bslib_0.3.1
## [10] utf8_1.2.2 R6_2.5.1 vegan_2.5-7
## [13] DBI_1.1.2 mgcv_1.8-39 colorspace_2.0-3
## [16] permute_0.9-7 withr_2.5.0 tidyselect_1.1.2
## [19] gridExtra_2.3 compiler_4.1.0 polycor_0.8-1
## [22] cli_3.0.1 rvest_1.0.2 quantreg_5.88
## [25] TAM_3.7-16 SparseM_1.81 xml2_1.3.3
## [28] labeling_0.4.2 sass_0.4.0 scales_1.2.0
## [31] mvtnorm_1.1-3 pbapply_1.5-0 digest_0.6.29
## [34] rmarkdown_2.13 quantregGrowth_1.4-0 pkgconfig_2.0.3
## [37] htmltools_0.5.2 highr_0.9 dbplyr_2.1.1
## [40] fastmap_1.1.0 rlang_1.0.2 readxl_1.3.1
## [43] rstudioapi_0.13 farver_2.1.0 jquerylib_0.1.4
## [46] generics_0.1.3 jsonlite_1.8.0 dcurver_0.9.2
## [49] magrittr_2.0.1 Matrix_1.4-0 Rcpp_1.0.8.3
## [52] munsell_0.5.0 fansi_1.0.2 lifecycle_1.0.1
## [55] stringi_1.7.6 yaml_2.3.5 MASS_7.3-55
## [58] plyr_1.8.6 grid_4.1.0 parallel_4.1.0
## [61] crayon_1.5.0 haven_2.4.3 splines_4.1.0
## [64] hms_1.1.1 pillar_1.8.0 admisc_0.26
## [67] reprex_2.0.1 GPArotation_2014.11-1 glue_1.4.2
## [70] evaluate_0.15 renv_0.15.4 modelr_0.1.8
## [73] vctrs_0.3.8 tzdb_0.2.0 MatrixModels_0.5-0
## [76] cellranger_1.1.0 gtable_0.3.0 reshape_0.8.8
## [79] assertthat_0.2.1 CDM_7.5-15 xfun_0.30
## [82] broom_0.7.12 cluster_2.1.2 ellipsis_0.3.2