In these files, I examine the dimensionality of vocabulary and grammar, using items produced by at least one child.
Load the libraries.
library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_21 <- read_csv("full_21.csv")
missing_items <- c(13, 14, 545) # alligator, animal, orange
full_21 <- full_21 %>%
filter(!(item_id %in% missing_items)) %>%
filter(PartID != "LUCID_12") # excluded: missing some observations
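A quick sanity check (not in the original script) that the exclusions took effect; it only assumes item_id and PartID are the identifier columns used above.
# confirm the dropped items and the excluded participant are gone
stopifnot(
  !any(full_21$item_id %in% missing_items),
  !any(full_21$PartID == "LUCID_12")
)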
Get the number of distinct participants in this age group.
n_part <- full_21 %>%
group_by(PartID) %>%
slice(1) %>%
ungroup(PartID) %>%
count()
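The same count can be cross-checked with n_distinct(); this is just a sketch of an equivalent one-liner, not part of the original pipeline.
# should print TRUE if the group_by/slice/count approach matches a direct distinct count
n_distinct(full_21$PartID) == n_part$n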
Complexity (grammar) items that at least one child is producing.
gramm_atleast_1 <- full_21 %>%
filter(section %in% c("complexity")) %>%
group_by(item) %>%
mutate(
n = sum(out)
) %>%
slice(1) %>%
ungroup(item) %>%
filter(n > 0) %>%
dplyr::select("item")
Complex <- full_21 %>%
filter(section == "combine" | # to drop non-combiners
section == "complexity" # to calculate complexity scores.
)
Complex_with_ids <- Complex %>%
dplyr::select("PartID", "item", "out") %>%
pivot_wider(id_cols="PartID", names_from="item", values_from="out")
Complex_with_ids_gramm <- Complex_with_ids %>%
filter(combine > 0) %>% # select only participants combining words
dplyr::select(-"combine") %>% # remove combine variable
dplyr::select("PartID", gramm_atleast_1$item) # select items that 5 or more participants are producing.
# get number of distinct participants not combining words
n_no_g <- Complex %>%
group_by(PartID) %>%
filter(section=="combine") %>%
filter(out==0) %>%
ungroup() %>%
count()
# get number of distinct items
n_item <- full_21 %>%
filter(PartID == "CLCL_005") %>%
dplyr::select("section", "item", "item_id") %>%
filter(section=="complexity")
nrow(Complex_with_ids_gramm) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_1) + 1
## [1] TRUE
Get a table of the number of non-zero responses for each item on the complexity scale, by dataset.
full_21 %>%
filter(section %in% c("complexity")) %>%
group_by(df, item) %>%
summarise(
n_produced = sum(out)
) %>%
pivot_wider(id_cols=item, names_from=df, values_from=n_produced) %>%
kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
| item | CLCL | LUCID |
|---|---|---|
| q01 | 40 | 26 |
| q02 | 36 | 23 |
| q03 | 45 | 27 |
| q04 | 45 | 17 |
| q05 | 18 | 6 |
| q06 | 4 | 3 |
| q07 | 2 | 1 |
| q08 | 4 | 3 |
| q09 | 8 | 7 |
| q10 | 9 | 5 |
| q11 | 20 | 9 |
| q12 | 5 | 7 |
| q13 | 6 | 3 |
| q14 | 10 | 8 |
| q15 | 23 | 18 |
| q16 | 46 | 23 |
| q17 | 28 | 23 |
| q18 | 10 | 5 |
| q19 | 4 | 2 |
| q20 | 5 | 5 |
| q21 | 2 | 1 |
| q22 | 3 | 3 |
| q23 | 0 | 1 |
| q24 | 5 | 2 |
| q25 | 7 | 6 |
| q26 | 12 | 9 |
| q27 | 5 | 3 |
| q28 | 5 | 2 |
| q29 | 15 | 10 |
| q30 | 13 | 12 |
| q31 | 6 | 5 |
| q32 | 18 | 12 |
| q33 | 9 | 8 |
| q34 | 3 | 1 |
| q35 | 4 | 4 |
| q36 | 1 | 1 |
| q37 | 6 | 5 |
Vocabulary items that at least one participant is producing.
vocab_atleast_1 <- full_21 %>%
filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
filter(class=="Vocab") %>%
group_by(item_id) %>%
mutate(
count = sum(out)
) %>%
slice(1) %>%
ungroup(item_id) %>%
filter(count>0) %>%
dplyr::select("item_id")
Item <- filter(full_21, PartID=="CLCL_005") # make list of items
open_class <- c(
"action.words", # sections that are open-class
"animal.names",
"body.parts",
"clothing",
"descriptive.words",
"food.and.drink",
"furniture.and.rooms",
"small.household.items",
"toys",
"vehicles",
"outside.things"
)
Item <- filter(Item, (item_id %in% vocab_atleast_1$item_id) & (section %in% open_class)) # select open-class items that at least one participant is producing
set.seed(75321179)
f1 <- 43/nrow(Item) # sampling proportion (slightly oversampled, then trimmed to 37 items below)
DF <- Item %>%
group_by(section) %>%
slice_sample(prop= f1, replace=FALSE) %>%
ungroup() %>%
slice(-7) # drop one row of the random sample so exactly 37 items remain
n_open = nrow(filter(Item, section %in% open_class))
Item %>%
filter(section %in% open_class) %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/n_open
)
DF %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/37
)
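To compare the two sets of proportions directly, they can be joined side by side. This is just a sketch; the column names Prop_full and Prop_37 are illustrative, not from the original output.
full_props <- Item %>%
  filter(section %in% open_class) %>%
  count(section) %>%
  mutate(Prop_full = n / n_open)
sample_props <- DF %>%
  count(section) %>%
  mutate(Prop_37 = n / 37)
left_join(full_props, sample_props, by = "section", suffix = c("_full", "_37")) %>%
  dplyr::select(section, Prop_full, Prop_37)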
total_37 <- full_21 %>%
filter(item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_37=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_37")
total_n37 <- full_21 %>%
filter(section %in% open_class) %>%
filter(!item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_others=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_others")
corr = full_join(total_37, total_n37, by="PartID") %>%
drop_na()
cor(corr$total_37, corr$total_others) # .98
## [1] 0.9761915
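An interval around this correlation was not reported in the original analysis; as an optional check, base R's cor.test() gives one.
# quick interval check, assuming the default Pearson correlation is what we want
cor.test(corr$total_37, corr$total_others)$conf.int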
Make a short version of the vocabulary measure for IRT.
Vocab_short_with_ids <- full_21 %>%
filter(item_id %in% DF$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), by="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
full <- left_join(
Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
)
full_noid <- full %>%
dplyr::select(-"PartID")
#m1 <- mirt(full_noid, 1, technical=list(NCYCLES=2000), "2PL")
#saveRDS(m1, "IRT_models/21mo/m1_full.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=2000)) # doesn't improve at 2000 iterations and it's close.
#saveRDS(m2, "IRT_models/21mo/m2_full.rds")
#m3 <- mirt(full_noid, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH")
#saveRDS(m3, "IRT_models/21mo/m3_full.rds")
m1 <- readRDS("IRT_models/21mo/m1_full.rds")
m2 <- readRDS("IRT_models/21mo/m2_full.rds")
m3 <- readRDS("IRT_models/21mo/m3_full.rds")
Get factor scores
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
Scatterplot matrix
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
ggpairs()
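The same pairwise relations can be checked numerically; a small sketch complementing the ggpairs() panel, not part of the original output.
# correlation matrix of the three sets of factor scores
cor(cbind(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH)) %>%
  round(3)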
Make a data frame with total vocabulary and complexity scores, to compare the raw scores to the latent variables.
grammar <- full_21 %>%
filter(section=="complexity") %>%
group_by(PartID) %>%
mutate(
Complex = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", grammar="Complex")
vocab <- full_21 %>%
filter(class=="Vocab") %>%
group_by(PartID) %>%
mutate(
Vocab = sum(out)
) %>%
slice(1) %>%
ungroup(PartID) %>%
dplyr::select("PartID", vocab="Vocab")
vg <- full_join(grammar, vocab, by="PartID")
vglv <- Complex_with_ids_gramm %>%
bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
left_join(vg, by="PartID") %>%
pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")
p_vocab <- ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Vocabulary") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
p_grammar <- ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Grammar") +
facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
p_vocab / p_grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
full2 <- data.frame(full_noid)
dtct <- c(rep(1, 37), rep(2, 37))
conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.312 0.312
## ASSI 0.235 0.235
## RATIO 0.464 0.464
## MADCOV100 0.672 0.672
## MCOV100 -0.027 -0.027
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.329 0.329
## ASSI 0.247 0.247
## RATIO 0.494 0.494
## MADCOV100 0.665 0.665
## MCOV100 0.010 0.010
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.312 0.312
## ASSI 0.229 0.229
## RATIO 0.467 0.467
## MADCOV100 0.669 0.669
## MCOV100 0.002 0.002
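For a side-by-side view of the three conditioning scores, the summary tables can be combined. This is a minimal sketch; it assumes, as dim_sim() below does, that conf.detect() stores the unweighted statistics in the first column of $detect.summary, with rows in the order printed above.
tibble(
  stat = c("DETECT", "ASSI", "RATIO", "MADCOV100", "MCOV100"),
  TwoPL = conf$detect.summary[1:5, 1],
  Spline = conf_spline$detect.summary[1:5, 1],
  EH = conf_EH$detect.summary[1:5, 1]
)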
dim_sim <- function(items, iter){
# Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(),
  detect_s=numeric(), assi_s=numeric(), ratio_s=numeric(),
  detect_EH=numeric(), assi_EH=numeric(), ratio_EH=numeric())
for(i in 1:iter){
# Make list of vocab items
book <- Item %>%
filter(section %in% open_class) %>% #select open-class items
filter(item_id %in% vocab_atleast_1$item_id) %>% # select items that at least 1 participant is producing
slice_sample(n= items, replace=FALSE) # select random sample of items
# Select items from vocab dataset
Vocab2<- full_21 %>%
filter(item_id %in% book$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), by="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Make data frame with non-selected items for calculating correlation coefficients.
Vocab3 <- full_21 %>%
filter(section %in% open_class) %>%
filter(!(item_id %in% book$item_id)) %>%
mutate(
id = paste("Vocab", as.character(item_id), by="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Correlation
r = cor(rowSums(Vocab2[,-1]), rowSums(Vocab3[,-1])) # drop the PartID column; use all item columns
# IRT models
full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
dplyr::select(-"PartID")
m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
m_s <- mirt(full, 1, "spline", technical=list(NCYCLES=2000), verbose=FALSE) # likely too few iterations, but this already takes a long time
m_EH <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH", verbose=FALSE)
# Extract latent variables
fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
# DETECT
fulldf <- data.frame(full)
dtct <- c(rep(1, 37), rep(2, items))
conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)
# save results
df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1],
corr = r)
out = bind_rows(out, df)
}
return(out)
}
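Before running the full 100 iterations, a short smoke test (an arbitrary two iterations, not used in the analysis) confirms the function runs end to end; it is left commented out here because even a couple of iterations are slow.
#test_run <- dim_sim(items = 37, iter = 2)
#test_run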
#sim21 <- dim_sim(37, 100)
#write_rds(sim21, "Dim_Sim_Output/sim_21_full.rds")
sim21 <- read_rds("Dim_Sim_Output/sim_21_full.rds")
sim21 %>%
dplyr::select("corr") %>%
summarise(
min = min(corr),
max = max(corr),
mean = mean(corr)
)
library(ggpirate)
sim21 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
ggplot(aes(x=LV, y= value)) + geom_pirate(aes(color=LV)) + ylim(.05,.6) + theme_minimal() +
scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .35, label = "Mild Multidimensionality", vjust = -0.5)
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 1 rows containing missing values (geom_hline).
ggsave(filename="plots/detect_plot_21.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Removed 1 rows containing missing values (geom_hline).
sim21 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
group_by(LV) %>%
summarise(
Less_20 = sum(value < .20),
mean = mean(value)
)
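The same reshaped data can also be summarised as the proportion of runs falling below the two cut-offs marked in the plot (.20 and .40); this is a small extension of the summary above rather than part of the original output.
sim21 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(run = row_number()) %>%
  pivot_longer(!c(run, corr),
               names_to = c("measure", "LV"),
               names_sep = "_",
               values_to = "value") %>%
  filter(measure == "detect") %>%
  group_by(LV) %>%
  summarise(
    prop_below_20 = mean(value < .20), # essential unidimensionality cut-off
    prop_below_40 = mean(value < .40)  # mild multidimensionality cut-off
  )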