library(readr)
library(knitr)
library(tidyverse)
library(ggplot2)
library(psych)
library(sirt)
library(Gifi)
Inst <- read_rds("Wordbank_Data/Inst_WS_Span.rds") # needs to be saved as rds - csv conflates "" and NA
Admin <- read_rds("Wordbank_Data/Admin_WS_Span.rds")
N_total = nrow(Admin)
N_long = nrow(filter(Admin, longitudinal==TRUE))
Item <- read_rds("Wordbank_Data/Item_WS_Span.rds")
Complex <- Admin %>%
full_join(.,Inst, by="data_id") %>%
full_join(., Item, by="num_item_id") %>%
filter(longitudinal==FALSE) %>%
filter(type == "combine" | # to drop non-combiners
type == "complexity" # to calculate complexity scores.
) %>%
mutate(
out = ifelse(value=="yes" | value=="produces" | value=="complex", yes=1, no=0)
)
N_complexity_items = nrow(filter(Item, type == "combine" |
type == "complexity"))
nrow(Complex) == (N_total - N_long)*N_complexity_items
## [1] TRUE
Complex_short_with_ids_all <- Complex %>%
dplyr::select(data_id, type, value, out, num_item_id) %>%
mutate(
label = str_c(type, num_item_id)
) %>%
pivot_wider(id_cols=data_id, names_from = "label", values_from="out") %>%
dplyr::select(starts_with(c("data_id", "combine", "complexity")))
Complex_short_with_ids <- Complex_short_with_ids_all %>%
drop_na()
N_NA <- nrow(Complex_short_with_ids_all) - nrow(Complex_short_with_ids)
Complex_short_grammatical <- Complex_short_with_ids %>%
filter(combine710 > 0) %>%
dplyr::select(starts_with(c("complexity"))) # need to get rid of participant names for IRT models.
N_nog <- nrow(filter(Complex_short_with_ids, combine710 == 0))
nrow(Complex_short_grammatical) == N_total - N_long - N_NA - N_nog
## [1] TRUE
Quick visualization of Grammar items
Complex_tetra <- polychoric(Complex_short_grammatical)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
rho <- Complex_tetra$rho
corPlot(rho)
fa.parallel(rho, fa="fa", cor="poly", n.obs = 775)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
plot(princals(rho))
Admin %>% # Start with new administration dataset.
left_join(.,Inst, by="data_id") %>%
left_join(., Item, by="num_item_id") %>%
filter(longitudinal==FALSE) %>%
filter(type == "word"
) %>%
group_by(value) %>%
count()
Vocab <- Admin %>% # Start with new administration dataset.
left_join(.,Inst, by="data_id") %>%
left_join(., Item, by="num_item_id") %>%
filter(longitudinal==FALSE) %>%
filter(type == "word"
) %>%
mutate(
out = ifelse(value=="produces", yes=1, no =0)
)
N_vocab = nrow(filter(Item, type == "word"))
nrow(Vocab) == (N_total - N_long)*N_vocab
## [1] TRUE
Vocab_short_with_ids_all <- Vocab %>%
filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
dplyr::select(data_id, value, out, definition) %>%
pivot_wider(id_cols=data_id, names_from = "definition", values_from="out")
Vocab_short_with_ids <- Vocab_short_with_ids_all %>%
drop_na()
N_NA_vocab <- nrow(Vocab_short_with_ids_all) - nrow(Vocab_short_with_ids)
Vocab_short <- Vocab_short_with_ids %>%
dplyr::select(-"data_id") # dataset for IRT can't have IDs
nrow(Vocab_short_with_ids) == N_total - N_long - N_NA_vocab
## [1] TRUE
full <- full_join(
Complex_short_with_ids, Vocab_short_with_ids, by="data_id"
) %>%
filter(combine710 > 0) %>%
dplyr::select(-c("data_id", "combine710")) %>%
drop_na()
full_poly <- polychoric(full)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
rho <- full_poly$rho
fa.parallel(rho, fa="fa", fm="minres", cor="poly", n.obs = 775)
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 15 and the number of components = NA
#dtct <- c(rep(1, 37), rep(2, 478))
#full2 <- data.frame(full)
#fscores = rowSums(full)
#conf <- conf.detect(full2, fscores, dtct)
#saveRDS(conf, "PreReg_Analyses/Span_output/conf1.rds")
conf <- readRDS("PreReg_Analyses/Span_output/conf1.rds")
#conf2 <- conf.detect(full2, fscores, dtct, use_sum_score=TRUE)
#saveRDS(conf, "PreReg_Analyses/Span_output/conf2.rds")
conf2 <- readRDS("PreReg_Analyses/Span_output/conf2.rds")
Exploratory Detect
#m1 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=1)
#m2 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=347)
#m3 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=4861)
#m4 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=6789)
#saveRDS(m1, "PreReg_Analyses/Span_output/m1.rds")
#saveRDS(m2, "PreReg_Analyses/Span_output/m2.rds")
#saveRDS(m3, "PreReg_Analyses/Span_output/m3.rds")
#saveRDS(m4, "PreReg_Analyses/Span_output/m4.rds")
m1<- readRDS("PreReg_Analyses/Span_output/m1.rds")
m2<- readRDS("PreReg_Analyses/Span_output/m2.rds")
m3<- readRDS("PreReg_Analyses/Span_output/m3.rds")
m4<- readRDS("PreReg_Analyses/Span_output/m4.rds")