library(readr)
library(knitr)
library(tidyverse)
library(ggplot2)
library(psych)
library(sirt)
library(Gifi)
Inst <- read_rds("Wordbank_Data/Inst_WS_Span.rds") # needs to be saved as rds - csv conflates "" and NA 
Admin <- read_rds("Wordbank_Data/Admin_WS_Span.rds")
N_total = nrow(Admin) 
N_long = nrow(filter(Admin, longitudinal==TRUE))
Item <- read_rds("Wordbank_Data/Item_WS_Span.rds")

Make Complexity Data

Complex <- Admin %>%
  full_join(.,Inst, by="data_id") %>%
  full_join(., Item, by="num_item_id") %>%
  filter(longitudinal==FALSE) %>%
  filter(type == "combine" | # to drop non-combiners
           type == "complexity" # to calculate complexity scores.
         ) %>%
  mutate(
    out = ifelse(value=="yes" | value=="produces" | value=="complex", yes=1, no=0)
  ) 

N_complexity_items = nrow(filter(Item, type == "combine" | 
           type == "complexity"))

nrow(Complex) == (N_total - N_long)*N_complexity_items
## [1] TRUE
Complex_short_with_ids_all <- Complex %>% 
  dplyr::select(data_id, type, value, out, num_item_id) %>%
  mutate(
    label = str_c(type, num_item_id)
  ) %>%
  pivot_wider(id_cols=data_id, names_from = "label", values_from="out") %>%
  dplyr::select(starts_with(c("data_id", "combine", "complexity"))) 

Complex_short_with_ids <- Complex_short_with_ids_all %>%  
drop_na()

N_NA <- nrow(Complex_short_with_ids_all) - nrow(Complex_short_with_ids)

Complex_short_grammatical <- Complex_short_with_ids %>%
  filter(combine710 > 0) %>%
  dplyr::select(starts_with(c("complexity"))) # need to get rid of participant names for IRT models. 

N_nog <- nrow(filter(Complex_short_with_ids, combine710 == 0))

nrow(Complex_short_grammatical) == N_total - N_long - N_NA - N_nog
## [1] TRUE

Quick visualization of Grammar items

Complex_tetra <- polychoric(Complex_short_grammatical)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
rho <- Complex_tetra$rho
corPlot(rho)

fa.parallel(rho, fa="fa", cor="poly", n.obs = 775)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA
plot(princals(rho))

Make Vocab Data

Admin %>% # Start with new administration dataset.
  left_join(.,Inst, by="data_id") %>%
  left_join(., Item, by="num_item_id") %>%
  filter(longitudinal==FALSE) %>%
  filter(type == "word"
         ) %>%
  group_by(value) %>%
  count()
Vocab <- Admin %>% # Start with new administration dataset.
  left_join(.,Inst, by="data_id") %>%
  left_join(., Item, by="num_item_id") %>%
  filter(longitudinal==FALSE) %>%
  filter(type == "word"
         ) %>%
  mutate(
    out = ifelse(value=="produces", yes=1, no =0)
    ) 

N_vocab = nrow(filter(Item, type == "word")) 

nrow(Vocab) == (N_total - N_long)*N_vocab 
## [1] TRUE
Vocab_short_with_ids_all <- Vocab %>% 
  filter(lexical_category == "nouns" | lexical_category == "predicates") %>%
  dplyr::select(data_id, value, out, definition) %>%
  pivot_wider(id_cols=data_id, names_from = "definition", values_from="out") 

Vocab_short_with_ids <- Vocab_short_with_ids_all %>% 
  drop_na()


N_NA_vocab <- nrow(Vocab_short_with_ids_all) - nrow(Vocab_short_with_ids)

Vocab_short <- Vocab_short_with_ids %>%
  dplyr::select(-"data_id") # dataset for IRT can't have IDs

nrow(Vocab_short_with_ids) == N_total - N_long - N_NA_vocab  
## [1] TRUE
full <- full_join(
  Complex_short_with_ids, Vocab_short_with_ids, by="data_id"
) %>%
  filter(combine710 > 0) %>%
  dplyr::select(-c("data_id", "combine710")) %>%
  drop_na()

Some quick checks of dimensionality.

full_poly <- polychoric(full)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
rho <- full_poly$rho
fa.parallel(rho, fa="fa", fm="minres", cor="poly", n.obs = 775)
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  15  and the number of components =  NA
#dtct <- c(rep(1, 37), rep(2, 478))

#full2 <- data.frame(full)
#fscores = rowSums(full)

#conf <- conf.detect(full2, fscores,  dtct)
#saveRDS(conf, "PreReg_Analyses/Span_output/conf1.rds")

conf <- readRDS("PreReg_Analyses/Span_output/conf1.rds")
#conf2 <- conf.detect(full2, fscores,  dtct, use_sum_score=TRUE)

#saveRDS(conf, "PreReg_Analyses/Span_output/conf2.rds")

conf2 <- readRDS("PreReg_Analyses/Span_output/conf2.rds")

Exploratory Detect

#m1 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=1)
#m2 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=347)
#m3 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=4861)
#m4 <- expl.detect(full2, fscores, nclusters=2, use_sum_score=TRUE, seed=6789)


#saveRDS(m1, "PreReg_Analyses/Span_output/m1.rds")
#saveRDS(m2, "PreReg_Analyses/Span_output/m2.rds")
#saveRDS(m3, "PreReg_Analyses/Span_output/m3.rds")
#saveRDS(m4, "PreReg_Analyses/Span_output/m4.rds")

m1<- readRDS("PreReg_Analyses/Span_output/m1.rds")
m2<- readRDS("PreReg_Analyses/Span_output/m2.rds")
m3<- readRDS("PreReg_Analyses/Span_output/m3.rds")
m4<- readRDS("PreReg_Analyses/Span_output/m4.rds")