In these files, I look at the dimensionality of vocabulary and grammar for items produced by 5 or more children.
Load the libraries.
library(readr)
library(knitr)
library(tidyverse)
library(mirt)
library(GGally)
library(patchwork)
library(sirt)
full_18 <- read_csv("full_18.csv")
missing_items <- c(13, 14, 545) # alligator, animal, orange (these items were not consistently entered in L05)
full_18 <- full_18 %>%
filter(!(item_id %in% missing_items))
Get the number of distinct participants in this age group.
n_part <- full_18 %>%
group_by(PartID) %>% # Group by Part ID
slice(1) %>% # select 1 row
ungroup(PartID) %>% # ungroup
count() # count
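An equivalent, more compact count uses n_distinct():
full_18 %>% summarise(n = n_distinct(PartID)) # number of unique participants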
Grammar (complexity) items that at least 5 children are producing.
gramm_atleast_5 <- full_18 %>%
filter(section %in% c("complexity")) %>%
group_by(item) %>%
mutate(
n = sum(out)
) %>%
slice(1) %>%
ungroup(item) %>%
filter(n > 4) %>%
dplyr::select("item")
Complex <- full_18 %>%
filter(section == "combine" | # to drop non-combiners
section == "complexity" # to calculate complexity scores.
)
Complex_with_ids<- Complex %>%
dplyr::select("PartID", "item", "out") %>%
pivot_wider(id_cols="PartID", names_from="item", values_from="out")
Complex_with_ids_gramm <- Complex_with_ids %>%
filter(combine > 0) %>% # select only participants combining words
dplyr::select(-"combine") %>% # remove combine variable
dplyr::select("PartID", gramm_atleast_5$item) # select items that 5 or more participants are producing.
# get number of distinct participants not combining words
n_no_g <- Complex %>%
group_by(PartID) %>%
filter(section=="combine") %>%
filter(out==0) %>%
ungroup() %>%
count()
# get number of distinct items
n_item <- full_18 %>%
filter(PartID == "CLCL_001") %>%
dplyr::select("section", "item", "item_id") %>%
filter(section=="complexity")
nrow(Complex_with_ids_gramm ) == (n_part$n - n_no_g$n)
## [1] TRUE
ncol(Complex_with_ids_gramm) == nrow(gramm_atleast_5) + 1
## [1] TRUE
Get a table of the number of non-zero responses for each item on the complexity scale, split by dataset (CLCL and LUCID).
full_18 %>%
filter(section %in% c("complexity")) %>%
group_by(df, item) %>%
summarise(
n = sum(out)
) %>%
pivot_wider(id_cols=item, names_from=df, values_from=n) %>%
kable()
## `summarise()` has grouped output by 'df'. You can override using the `.groups`
## argument.
| item | CLCL | LUCID |
|---|---|---|
| q01 | 14 | 13 |
| q02 | 11 | 7 |
| q03 | 10 | 8 |
| q04 | 9 | 6 |
| q05 | 4 | 2 |
| q06 | 0 | 2 |
| q07 | 1 | 0 |
| q08 | 1 | 1 |
| q09 | 1 | 0 |
| q10 | 5 | 0 |
| q11 | 5 | 5 |
| q12 | 0 | 2 |
| q13 | 2 | 0 |
| q14 | 2 | 2 |
| q15 | 4 | 7 |
| q16 | 16 | 10 |
| q17 | 5 | 7 |
| q18 | 2 | 2 |
| q19 | 0 | 2 |
| q20 | 0 | 1 |
| q21 | 1 | 0 |
| q22 | 0 | 0 |
| q23 | 0 | 0 |
| q24 | 0 | 0 |
| q25 | 1 | 1 |
| q26 | 0 | 1 |
| q27 | 0 | 0 |
| q28 | 0 | 0 |
| q29 | 2 | 3 |
| q30 | 3 | 4 |
| q31 | 0 | 0 |
| q32 | 2 | 3 |
| q33 | 0 | 2 |
| q34 | 0 | 0 |
| q35 | 0 | 0 |
| q36 | 0 | 0 |
| q37 | 0 | 0 |
13 grammar items have 5 or more non-zero responses (summing across CLCL and LUCID).
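This matches the item list computed above:
nrow(gramm_atleast_5) # 13 complexity items produced by 5 or more children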
Vocab items that at least 5 participants are producing
vocab_atleast_5 <- full_18 %>%
filter(PartID %in% Complex_with_ids_gramm$PartID) %>%
filter(class=="Vocab") %>%
group_by(item_id) %>%
mutate(
count = sum(out)
) %>%
slice(1) %>%
ungroup(item_id) %>%
filter(count>4) %>%
dplyr::select("item_id")
Item <- filter(full_18, PartID=="CLCL_001") # make list of items
open_class <- c(
"action.words", # sections that are open-class
"animal.names",
"body.parts",
"clothing",
"descriptive.words",
"food.and.drink",
"furniture.and.rooms",
"small.household.items",
"toys",
"vehicles",
"outside.things"
)
Item <- filter(Item, (item_id %in% vocab_atleast_5$item_id) & (section %in% open_class)) # select open-class items with >= 5 participants
set.seed(245)
f1 <- 17.5/nrow(Item) # proportion for 13 items
DF <- Item %>%
group_by(section) %>%
slice_sample(prop= f1, replace=FALSE) %>%
ungroup()
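A quick check that the stratified sample lands on 13 items, since the later code assumes 13 sampled vocabulary items:
nrow(DF) == 13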
n_open = nrow(filter(Item, section %in% open_class))
Item %>%
filter(section %in% open_class) %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/n_open
)
DF %>%
group_by(section) %>%
count() %>%
ungroup() %>%
mutate(
Prop = n/13
)
total_13 <- full_18 %>%
filter(item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_13=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_13")
total_n13 <- full_18 %>%
filter(section %in% open_class) %>%
filter(!item_id %in% DF$item_id) %>%
group_by(PartID) %>%
mutate(
total_others=sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "total_others")
corr = full_join(total_13, total_n13, by="PartID") %>%
drop_na()
cor(corr$total_13, corr$total_others) # .91
## [1] 0.914061
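The relationship can also be inspected directly; a quick sketch with ggplot2 (loaded via the tidyverse):
ggplot(corr, aes(x=total_13, y=total_others)) +
  geom_point() +
  stat_smooth(method="lm") +
  labs(x = "13-item short-form total", y = "Total over remaining open-class items") +
  theme_minimal()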
Make a short version of the vocabulary scale for IRT.
Vocab_short_with_ids <- full_18 %>%
filter(item_id %in% DF$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
full <- left_join(
Complex_with_ids_gramm, Vocab_short_with_ids, by="PartID"
)
full_noid <- full %>%
dplyr::select(-"PartID")
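Sanity check on the response matrix going into the IRT models: 13 grammar items plus 13 vocabulary items, matching the cluster vector used for DETECT below.
ncol(full_noid) == 26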
#m1 <- mirt(full_noid, 1, "2PL")
#saveRDS(m1, "IRT_models/18mo/m1.rds")
#m2 <- mirt(full_noid, 1, "spline", technical=list(NCYCLES=1000)) # does not improve at 2000 iterations and is already close.
#saveRDS(m2, "IRT_models/18mo/m2.rds")
#m3 <- mirt(full_noid, 1, "2PL", dentype = "EH")
#saveRDS(m3, "IRT_models/18mo/m3.rds")
m1 <- readRDS("IRT_models/18mo/m1.rds")
m2 <- readRDS("IRT_models/18mo/m2.rds")
m3 <- readRDS("IRT_models/18mo/m3.rds")
Get factor scores
fscores <- fscores(m1, use_dentype_estimate=TRUE)[,1]
fscores_spline <- fscores(m2, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m3, use_dentype_estimate=TRUE)[,1]
Scatterplot matrix of the factor scores from the three models.
tibble(TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
ggpairs()
Make a data frame with total vocabulary and complexity scores to compare raw scores to the latent variables.
g <- Complex %>% # data frame defined above
filter(section=="complexity") %>% # select complexity items only
group_by(PartID) %>% # sort by participant
mutate(
grammar = sum(out) # make total score for complexity
) %>%
slice(1) %>% # select 1 row per participant
ungroup() %>%
dplyr::select("PartID", "grammar")
vg <- full_18 %>%
filter(class=="Vocab") %>%
group_by(PartID) %>%
mutate(
vocab = sum(out)
) %>%
slice(1) %>%
ungroup() %>%
dplyr::select("PartID", "vocab") %>%
full_join(., g, by="PartID")
Plot the latent variables against total vocabulary and grammar scores.
vglv <- Complex_with_ids_gramm %>%
bind_cols(., TwoPL = fscores, Spline = fscores_spline, EH = fscores_EH) %>%
dplyr::select("PartID", "TwoPL", "Spline", "EH") %>%
left_join(vg, by="PartID") %>%
pivot_longer(-c("PartID", "vocab", "grammar"), names_to="model", values_to="LV")
vocab<-ggplot(vglv, aes(x=vocab, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Vocabulary") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
grammar<-ggplot(vglv, aes(x=grammar, y=LV)) + geom_point() + stat_smooth(method="gam") + labs(x = "Grammar") + facet_wrap(~factor(model, levels=c('TwoPL','EH','Spline'))) + theme_minimal()
vocab/grammar
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
DETECT. Run confirmatory DETECT analyses conditioning on each model's latent variable. Values below .2 indicate essential unidimensionality, values between .2 and .4 mild multidimensionality, and values between .4 and 1 moderate multidimensionality.
full2 <- data.frame(full_noid)
dtct <- c(rep(1, 13), rep(2, 13)) # item cluster vector: 13 grammar items, then 13 vocabulary items
conf <- conf.detect(full2, fscores, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.282 0.282
## ASSI 0.163 0.163
## RATIO 0.323 0.323
## MADCOV100 0.873 0.873
## MCOV100 -0.222 -0.222
conf_spline <- conf.detect(full2, fscores_spline, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.341 0.341
## ASSI 0.249 0.249
## RATIO 0.465 0.465
## MADCOV100 0.733 0.733
## MCOV100 -0.008 -0.008
conf_EH <- conf.detect(full2, fscores_EH, dtct)
## -----------------------------------------------------------
## Confirmatory DETECT Analysis
## Conditioning on 1 Score
## Bandwidth Scale: 1.1
## Pairwise Estimation of Conditional Covariances
## ...........................................................
## Nonparametric ICC estimation
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## ...........................................................
## Nonparametric Estimation of conditional covariances
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
## 55% 60% 65% 70% 75% 80% 85% 90% 95%
## -----------------------------------------------------------
## unweighted weighted
## DETECT 0.317 0.317
## ASSI 0.249 0.249
## RATIO 0.390 0.390
## MADCOV100 0.811 0.811
## MCOV100 -0.048 -0.048
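For easier comparison, the unweighted DETECT, ASSI, and RATIO values from the three fits can be pulled into one table, using the same detect.summary indexing as in the simulation function below:
tibble(
  model = c("2PL", "Spline", "EH"),
  DETECT = c(conf$detect.summary[1, 1], conf_spline$detect.summary[1, 1], conf_EH$detect.summary[1, 1]),
  ASSI = c(conf$detect.summary[2, 1], conf_spline$detect.summary[2, 1], conf_EH$detect.summary[2, 1]),
  RATIO = c(conf$detect.summary[3, 1], conf_spline$detect.summary[3, 1], conf_EH$detect.summary[3, 1])
) %>%
  kable()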
dim_sim <- function(items, iter){
# Make tibble for output of loop
out = tibble(detect=numeric(), assi=numeric(), ratio=numeric(), detect_s =numeric(), assi_s =numeric(), ratio_s=numeric(), detect_EH =numeric(), assi_EH =numeric(), ratio_EH=numeric() )
for(i in 1:iter){
# Make list of vocab items
book <- Item %>%
filter(section %in% open_class) %>% #select open-class items
filter(item_id %in% vocab_atleast_5$item_id) %>% # select items that at least 5 participants are producing
slice_sample(n= items, replace=FALSE) # select random sample of items
# Select items from vocab dataset
Vocab2<- full_18 %>%
filter(item_id %in% book$item_id) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Make data frame with non-selected items for calculating correlation coefficients.
Vocab3 <- full_18 %>%
filter(section %in% open_class) %>%
filter(!(item_id %in% book$item_id)) %>%
mutate(
id = paste("Vocab", as.character(item_id), sep="_")
) %>%
dplyr::select(PartID, id, out) %>%
pivot_wider(id_cols=PartID, names_from = "id", values_from="out")
# Correlation
r = cor(rowSums(Vocab2[, -1]), rowSums(Vocab3[, -1])) # drop PartID; correlate sampled-item total with total over remaining open-class items
# IRT models
full <- left_join(Complex_with_ids_gramm, Vocab2, by="PartID") %>% # combine with complexity data
dplyr::select(-"PartID")
m <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), verbose=FALSE)
m_s <- mirt(full, 1, "spline", technical=list(NCYCLES=2000), verbose=FALSE) # may need more iterations, but this step is already very slow
m_EH <- mirt(full, 1, "2PL", technical=list(NCYCLES=2000), dentype = "EH")
# Extract latent variables
fscores <- fscores(m, use_dentype_estimate=TRUE)[,1]
fscores_s <- fscores(m_s, use_dentype_estimate=TRUE)[,1]
fscores_EH <- fscores(m_EH, use_dentype_estimate=TRUE)[,1]
# DETECT
fulldf <- data.frame(full)
dtct <- c(rep(1, 13), rep(2, items))
conf <- conf.detect(fulldf, fscores, dtct, progress=FALSE)
conf_s <- conf.detect(fulldf, fscores_s, dtct, progress=FALSE)
conf_EH <- conf.detect(fulldf, fscores_EH, dtct, progress=FALSE)
# save results
df <- tibble(detect=conf$detect.summary[1, 1], assi=conf$detect.summary[2, 1], ratio=conf$detect.summary[3, 1],
detect_s =conf_s$detect.summary[1, 1], assi_s =conf_s$detect.summary[2, 1], ratio_s=conf_s$detect.summary[3, 1],
detect_EH =conf_EH$detect.summary[1, 1], assi_EH =conf_EH$detect.summary[2, 1], ratio_EH=conf_EH$detect.summary[3, 1],
corr = r)
out = bind_rows(out, df)
}
return(out)
}
#sim18 <- dim_sim(13, 100)
#write_rds(sim18, "Dim_Sim_Output/sim_18_shortened.rds")
sim18 <- read_rds("Dim_Sim_Output/sim_18_shortened.rds")
sim18 %>%
dplyr::select("corr") %>%
summarise(
min = min(corr),
max = max(corr),
mean = mean(corr)
)
library(ggpirate)
sim18 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
ggplot(aes(x=LV, y=value)) +
  geom_pirate(aes(color=LV)) +
  ylim(.05, .6) +
  theme_minimal() +
  scale_x_discrete("IRT Model", labels = c("b" = "2PL", "EH" = "Empirical Histogram", "s" = "Spline")) +
  geom_hline(yintercept=c(.2, .4, 1), linetype='dotted') +
annotate("text", x = "b", y = .05, label = "Essential Unidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .20, label = "Mild Multidimensionality", vjust = -0.5) +
annotate("text", x = "b", y = .60, label = "Moderate Multidimensionality", vjust = -0.5)
## Warning: Removed 22 rows containing non-finite values (stat_summary).
## Warning: Removed 22 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 22 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 22 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
ggsave(filename="plots/detect_plot_18.png", last_plot(), dpi=500)
## Saving 7 x 5 in image
## Warning: Removed 22 rows containing non-finite values (stat_summary).
## Warning: Removed 22 rows containing non-finite values (stat_ydensity_pirate).
## Warning: Removed 22 rows containing non-finite values (stat_summary).
## Warning: Removed 3 rows containing missing values (geom_col_pirate).
## Warning: Removed 22 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
sim18 %>%
dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
mutate(
run = row_number()
) %>%
pivot_longer(!c(run, corr),
names_to=c("measure", "LV"),
names_sep = "_",
values_to="value") %>%
filter(measure=="detect") %>%
group_by(LV) %>%
summarise(
Below_40 = sum(value < .4), # number of runs with DETECT below .4
mean = mean(value)
)
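A complementary summary: the proportion of the 100 runs falling into each DETECT band marked in the plot above.
sim18 %>%
  dplyr::rename(detect_b = detect, assi_b = assi, ratio_b = ratio) %>%
  mutate(run = row_number()) %>%
  pivot_longer(!c(run, corr), names_to=c("measure", "LV"), names_sep="_", values_to="value") %>%
  filter(measure=="detect") %>%
  group_by(LV) %>%
  summarise(
    essentially_unidimensional = mean(value < .2),
    mild_multidimensionality = mean(value >= .2 & value < .4),
    moderate_multidimensionality = mean(value >= .4)
  )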