In this file, I look at the dimensionality of vocabulary and grammar at 24 months.
Load the libraries.
library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_24 <- read_csv("full_24.csv")
missing_items <- c(13, 14, 545) # alligator, animal, orange
full_24 <- full_24 %>%
filter(!(item_id %in% missing_items)) %>%
filter(PartID != "LUCID_77") # missing complexity
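Quick sanity check (an added step, not in the original pipeline): confirm that the long-format columns used throughout this file are present.
stopifnot(all(c("PartID", "df", "section", "class", "item", "item_id", "out") %in% names(full_24))) # columns referenced later in this script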
Get the number of distinct participants at this age.
n_part <- full_24 %>%
group_by(PartID) %>%
slice(1) %>%
ungroup(PartID) %>%
count()
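For reference (not in the original script), n_distinct() gives the same count directly; the n_part_alt name here is only illustrative.
n_part_alt <- full_24 %>% summarise(n = n_distinct(PartID)) # counts distinct participants without group_by()/slice()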
Complexity items that at least one child is producing.
gramm_atleast_1 <- full_24 %>%
filter(section %in% c("complexity")) %>%
group_by(item) %>%
mutate(
n = sum(out)
) %>%
slice(1) %>%
ungroup(item) %>%
filter(n > 0) %>%
dplyr::select("item")
Complex <- full_24 %>%
filter(section == "combine" | # to drop non-combiners
section == "complexity" # to calculate complexity scores.
)
Complex_with_ids<- Complex %>%
dplyr::select("PartID", "item", "out") %>%
pivot_wider(id_cols="PartID", names_from="item", values_from="out")
Complex_with_ids_gramm <- Complex_with_ids %>%
filter(combine > 0) %>% # select only participants combining words
dplyr::select(-"combine") %>% # remove combine variable
dplyr::select("PartID", gramm_atleast_1$item) # select items that 1 or more participants are producing.
# get number of distinct participants not combining words
n_no_g <- Complex %>%
group_by(PartID) %>%
filter(section=="combine") %>%
filter(out==0) %>%
ungroup() %>%
count()
# get number of distinct items
n_item <- full_24 %>%
filter(PartID == "CLCL_001") %>%
dplyr::select("section", "item", "item_id") %>%
filter(section=="complexity")
nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_1) + 1
## [1] TRUE
Get a table of the number of non-zero responses for each item on the complexity scale, by dataset.
full_24 %>%
filter(section %in% c("complexity")) %>%
group_by(df, item) %>%
summarise(
n_nonzero = sum(out)
) %>%
pivot_wider(id_cols=item, names_from=df, values_from=n_nonzero) %>%
kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
| item | CLCL | LUCID |
|---|---|---|
| q01 | 68 | 43 |
| q02 | 58 | 35 |
| q03 | 72 | 51 |
| q04 | 74 | 36 |
| q05 | 50 | 24 |
| q06 | 26 | 12 |
| q07 | 15 | 7 |
| q08 | 23 | 18 |
| q09 | 21 | 16 |
| q10 | 33 | 22 |
| q11 | 44 | 31 |
| q12 | 29 | 26 |
| q13 | 30 | 16 |
| q14 | 27 | 18 |
| q15 | 48 | 33 |
| q16 | 68 | 41 |
| q17 | 68 | 39 |
| q18 | 20 | 20 |
| q19 | 21 | 13 |
| q20 | 30 | 22 |
| q21 | 17 | 11 |
| q22 | 25 | 16 |
| q23 | 6 | 4 |
| q24 | 8 | 9 |
| q25 | 29 | 21 |
| q26 | 47 | 27 |
| q27 | 16 | 17 |
| q28 | 14 | 8 |
| q29 | 53 | 35 |
| q30 | 44 | 26 |
| q31 | 32 | 24 |
| q32 | 41 | 32 |
| q33 | 34 | 25 |
| q34 | 12 | 10 |
| q35 | 20 | 11 |
| q36 | 14 | 12 |
| q37 | 25 | 20 |
Vocabulary items that at least one participant is producing.
vocab_atleast_1 <- full_24 %>%
filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
filter(class=="Vocab") %>%
group_by(item_id) %>%
mutate(
count = sum(out)
) %>%
slice(1) %>%
ungroup(item_id) %>%
filter(count>0) %>%
dplyr::select("item_id")
Item <- filter(full_24, PartID=="CLCL_001") # make list of items
open_class <- c(
"action.words", # sections that are open-class
"animal.names",
"body.parts",
"clothing",
"descriptive.words",
"food.and.drink",
"furniture.and.rooms",
"small.household.items",
"toys",
"vehicles",
"outside.things"
)
Item <- filter(Item, (item_id %in% vocab_atleast_1$item_id) & (section %in% open_class)) # select open-class items produced by at least one participant
set.seed(245)
f1 <- 43/nrow(Item) # sampling proportion corresponding to 43 items
DF <- Item %>%
group_by(section) %>%
slice_sample(prop= f1, replace=FALSE) %>%
slice(-8) %>% # remove one sampled item so the short form has 37 items
ungroup()
n_open = nrow(filter(Item, section %in% open_class))
Item %>%
filter(section %in% open_class) %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/n_open
)
DF %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/37
)
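As a check, in the spirit of the nrow()/ncol() checks above, the sampled short form should contain the 37 items assumed by the denominators and column counts below.
nrow(DF) == 37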
total_37 <- full_24 %>%
filter(item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_37=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_37")
total_n37 <- full_24 %>%
filter(section %in% open_class) %>%
filter(!item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_others=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_others")
corr = full_join(total_37, total_n37, by="PartID") %>%
drop_na()
cor(corr$total_37, corr$total_others) # .96
## [1] 0.9682885
Make the short version of the vocabulary scale for IRT.
Vocab_short_with_ids <- full_24 %>%
filter(item_id %in% DF$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
full <- left_join(
Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
)
full_noid <- full %>%
dplyr::select(-"PartID")
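Before fitting, confirm that the item matrix has the 37 complexity plus 37 vocabulary columns assumed by the DETECT partition vector further down.
ncol(full_noid) == 74 # 37 complexity items + 37 sampled vocabulary items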
#m1 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=1000))
#saveRDS(m1, "IRT_models/24mo/m1.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # doesn't improve at 2000 iterations and it's close.
#saveRDS(m2, "IRT_models/24mo/m2.rds")
#m3 <- mirt(full_noid, 1, "2PL", dentype = "EH", technical=list(NCYCLES=2000))
#saveRDS(m3, "IRT_models/24mo/m3.rds")
m1 <- readRDS("IRT_models/24mo/m1.rds")
m2 <- readRDS("IRT_models/24mo/m2.rds")
m3 <- readRDS("IRT_models/24mo/m3.rds")
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
ggpairs()
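The three sets of factor scores can also be compared on global fit. This is a supplementary sketch rather than part of the original analysis; mirt's anova() method reports log-likelihood, AIC, and BIC for the fitted objects.
# Gaussian 2PL vs. empirical-histogram and spline density estimates
anova(m1, m3)
anova(m1, m2)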
grammar <- full_24 %>%
filter(section=="complexity") %>%
group_by(PartID) %>%
mutate(
Complex = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", grammar="Complex")
vocab <- full_24 %>%
filter(class=="Vocab") %>%
group_by(PartID) %>%
mutate(
Vocab = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", vocab="Vocab")
vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
left_join(vg, by="PartID") %>%
pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")
vocab_plot <- ggplot(vglv, aes(x=vocab, y=LV)) +
geom_point() +
stat_smooth(method="gam") +
labs(x = "Vocabulary") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) +
theme_minimal()
grammar_plot <- ggplot(vglv, aes(x=grammar, y=LV)) +
geom_point() +
stat_smooth(method="gam") +
labs(x = "Grammar") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) +
theme_minimal()
vocab_plot/grammar_plot
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37)) # first 37 columns are complexity items, next 37 are vocabulary items
conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.520 0.520
## ASSI 0.352 0.352
## RATIO 0.598 0.598
## MADCOV100 0.869 0.869
## MCOV100 0.076 0.076
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.433 0.433
## ASSI 0.287 0.287
## RATIO 0.522 0.522
## MADCOV100 0.828 0.828
## MCOV100 0.158 0.158
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.530 0.530
## ASSI 0.341 0.341
## RATIO 0.598 0.598
## MADCOV100 0.887 0.887
## MCOV100 0.143 0.143
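To compare the three confirmatory DETECT runs side by side, the summary statistics can be collected into one tibble. This is a convenience step only; it uses the same detect.summary slot that the simulation function below indexes.
detect_compare <- tibble(
statistic = c("DETECT", "ASSI", "RATIO"),
TwoPL = conf$detect.summary[1:3, 1],
Spline = conf_spline$detect.summary[1:3, 1],
EH = conf_EH$detect.summary[1:3, 1]
)
detect_compare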
dim_sim <- function(items, iter){
# Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric(),
corr=numeric())
for(i in 1:iter){
# Make list of vocab items
book <- Item %>%
filter(section %in% open_class) %>% #select open-class items
filter(item_id %in% vocab_atleast_1$item_id) %>% # select items that at least one participant is producing
slice_sample(n= items, replace=FALSE) # select random sample of items
# Select items from vocab dataset
Vocab2<- full_24 %>%
filter(item_id %in% book$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Make data frame with non-selected items for calculating correlation coefficients.
Vocab3 <- full_24 %>%
filter(section %in% open_class) %>%
filter(!(item_id %in% book$item_id)) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Correlation between the sampled short form and the remaining open-class items
r = cor(rowSums(Vocab2[, -1]), rowSums(Vocab3[, -1])) # drop the PartID column before summing
# IRT models
full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
dplyr::select(-"PartID")
m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
m_s <- mirt(full, 1, "spline", technical=list(NCYCLES=2000), verbose=FALSE) # possibly too few iterations, but this model is slow to fit
m_EH <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH")
# Extract latent variables
fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
# DETECT
fulldf <- data.frame(full)
dtct <- c(rep(1, 37), rep(2, items)) # 37 complexity items, then the sampled vocabulary items
conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)
# save results
df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1],
corr = r)
out = bind_rows(out, df)
}
return(out)
}
#sim37 <- dim_sim(37, 100)
#write_rds(sim37, "Dim_Sim_Output/sim_24.rds")
sim40 <- read_rds("Dim_Sim_Output/sim_24.rds")
sim40 %>%
dplyr::select("corr") %>%
summarise(
min = min(corr),
max = max(corr),
mean = mean(corr)
)
library(ggpirate)
sim40 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
ggplot(aes(x=LV, y= value)) +
geom_pirate(aes(color=LV)) +
ylim(.05, .6) +
theme_minimal() +
scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
ggsave(filename="plots/detect_plot_24.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 12 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
sim40 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
group_by(LV) %>%
summarise(
Less_40 = sum(value < .40),
mean = mean(value)
)
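Finally, as an added summary not in the original script, the simulated DETECT values can be binned with the same .2 and .4 thresholds marked on the plot above, to show how often each model lands in each band.
sim40 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(run = row_number()) %>%
pivot_longer(!c(run, corr), names_to=c("measure", "LV"), names_sep = "_", values_to="value") %>%
filter(measure=="detect") %>%
mutate(band = cut(value, breaks=c(-Inf, .2, .4, Inf), labels=c("below .2", ".2 to .4", "above .4"))) %>%
count(LV, band)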