Calculate all theta values and plot by age; French (French) behaves weirdly with an unusual peak near 19mo.
calc_theta <- function(xldf, languages, form='WS') {
xx <- tibble()
for(lang in languages) {
# if(lang %in% c("Spanish (Mexican)", "French (French)")) next
message(glue("Processing {lang}\r"))
load(here(paste("data/",form,"/",lang,"_",form,"_data.Rdata", sep='')))
ids <- d_prod |> rownames() |> as_tibble() |>
rename(data_id = value) |>
mutate(data_id = as.double(data_id)) |>
left_join(d_demo |> select(data_id, age), by = "data_id")
thetas <- fscores(models[[lang]], method = "MAP")
thetas <- cbind(ids, thetas) |>
mutate(language = lang)
saveRDS(thetas, glue("data/thetas/{lang}.rds"))
xx <- xx |> bind_rows(thetas)
}
return(xx)
}
all_thetas <- calc_theta(xldf, languages)
saveRDS(all_thetas, "data/thetas/all_thetas.rds")
all_thetas <- readRDS("data/thetas/all_thetas.rds")
theta_plot <- ggplot(all_thetas, aes(x=age, y=G, col=language)) +
theme_classic()
theta_plot +
geom_point(alpha = .1, position="jitter") +
geom_smooth(method="loess", alpha=.5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1621 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1621 rows containing missing values (`geom_point()`).
# No scatter, but with hover tooltips
ggplotly(theta_plot +
geom_smooth(method="loess", se=F))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1621 rows containing non-finite values (`stat_smooth()`).
Compare Swadesh correlations with theta against random sublist correlations with theta
run_swadesh_comparisons_theta <- function(xldf, languages, form='WS') {
xx <- tibble()
for(lang in languages) {
# if(lang %in% c("Spanish (Mexican)", "French (French)")) next
message(glue("Processing {lang}\r"))
load(here(paste("data/",form,"/",lang,"_",form,"_data.Rdata", sep='')))
swad_l <- subset(xldf, language==lang & is.element(uni_lemma, good_prod$uni_lemma))
thetas <- all_thetas |> filter(language == lang)
swad_cor = cor(thetas$G, rowSums(d_prod[,swad_l$item_id], na.rm=T), use = "complete.obs")
rand_cors <- sapply(1:1000, \(x) {
rand_inds = sample(1:ncol(d_prod), nrow(swad_l)) # N random words
rand_cor = cor(thetas$G, rowSums(d_prod[,rand_inds], na.rm=T), use = "complete.obs")
rand_cor
})
xx <- xx |> bind_rows(tibble(language = lang, sublist = "Swadesh",
run = NA, cor = swad_cor, N = nrow(swad_l)))
xx <- xx |> bind_rows(tibble(language = lang, sublist = "random",
run = 1:1000, cor = rand_cors, N = nrow(swad_l)))
# xx <- xx |> bind_rows(tibble(language = lang, `Swadesh r` = swad_cor,
# `Rand r` = rand_cor, N = nrow(swad_l)))
# d_demo, d_long, d_prod
}
return(xx)
}
load(here("data/xling-WSprod-IRTparms.Rdata"))
load("data/multiling_2pl_age_WS_prod_fits.Rdata")
xldf <- xldf |>
mutate(uni_lemma = ifelse(uni_lemma=="NA", NA, uni_lemma)) |>
filter(!is.na(uni_lemma))
xldf[which(xldf$category=="descriptive_words (adjectives)"),]$category = "descriptive_words"
xldf[which(xldf$category=="outside_places"),]$category = "outside"
languages = names(coefs) |> sort()
low_data_langs <- c("Kiswahili", "Kigiriama", "Irish", "Persian", "Finnish",
"English (Irish)", "Spanish (Peruvian)", "Greek (Cypriot)")
languages <- setdiff(languages, low_data_langs)
pars <- xldf |> filter(!is.na(uni_lemma), !is.na(d))
ul <- sort(table(pars$uni_lemma))
short_list = ul[which(ul>9)]
prod_pars <-
pars |> arrange(desc(language), desc(uni_lemma), desc(a1)) |> # get most discriminating uni_lemma per lang
filter(is.element(uni_lemma, names(short_list))) |>
select(uni_lemma, category, lexical_category, language, d) |>
group_by(uni_lemma, language) |>
slice(1) |>
pivot_wider(names_from = language, values_from = d)
prod_pars$sd = apply(prod_pars[,4:ncol(prod_pars)], 1, sd, na.rm=T)
prod_pars$numNAs = apply(prod_pars[,4:ncol(prod_pars)], 1, function(x) { sum(is.na(x)) })
med_d = median(prod_pars$sd, na.rm=T) # 1.17
sd_d = sd(prod_pars$sd, na.rm=T) # .44
good_prod <- prod_pars |>
filter(numNAs < 16) |>
filter(sd < (med_d - .5*sd_d)) |>
arrange(numNAs)
xx <- run_swadesh_comparisons_theta(xldf, languages)
xx_sum <- xx |>
group_by(language, sublist, N) |>
summarise(r = mean(cor))
t.test(xx_sum |> filter(language != "French (French)", sublist == "Swadesh") |> pull(r),
xx_sum |> filter(language != "French (French)", sublist == "random") |> pull(r),
paired=T)
##
## Paired t-test
##
## data: pull(filter(xx_sum, language != "French (French)", sublist == "Swadesh"), r) and pull(filter(xx_sum, language != "French (French)", sublist == "random"), r)
## t = 1.1082, df = 24, p-value = 0.2788
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.002455059 0.008148700
## sample estimates:
## mean difference
## 0.00284682
From the plot, we consider theta in range [-4, 4]. Plot Swadesh vs random test information over this theta range.
theta_range <- matrix(seq(-4,4,.01))
tinfo <- tibble()
form <- "WS"
for(lang in languages) {
message(glue("Processing {lang}\r"))
load(here(paste("data/",form,"/",lang,"_",form,"_data.Rdata", sep='')))
xldf_l <- subset(xldf, language==lang)
good_idx <- is.element(xldf_l$uni_lemma, good_prod$uni_lemma) |> which()
swad_tinfo <- testinfo(models[[lang]], theta_range,
which.items = good_idx)
rand_tinfos <- sapply(1:1000, \(x) {
rand_idx = sample(1:nrow(xldf_l), length(good_idx))
rand_tinfo <- testinfo(models[[lang]], theta_range,
which.items = rand_idx)
rand_tinfo
}) |> as_tibble()
tinfo <- tinfo |> bind_rows(tibble(language = lang,
theta = theta_range[,1],
Swadesh = swad_tinfo) |>
cbind(rand_tinfos) |>
pivot_longer(cols = -c(language, theta),
names_to = "run",
values_to = "tinfo") |>
mutate(sublist = ifelse(run == "Swadesh",
"Swadesh", "random")))
}
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
tinfo_sum <- tinfo |>
group_by(language, theta, sublist) |>
summarise(tinfo = mean(tinfo))
tinfo_plot <- ggplot(tinfo_sum |> filter(language != "French (French)"),
aes(x = theta, y = tinfo, col = language, lty = sublist)) +
geom_line() +
theme_classic()
ggplotly(tinfo_plot)
Compare total test information (area under curve). (Note that this value seems to be quite contingent on the specific random subset; probably good to run multiple simulations)
total_tinfo <- tinfo_sum |>
group_by(language, sublist) |>
summarise(total_tinfo = sum(tinfo)) |>
filter(language != "French (French)")
t.test(total_tinfo |> filter(sublist == "Swadesh") |> pull(total_tinfo),
total_tinfo |> filter(sublist == "random") |> pull(total_tinfo),
paired=T)
##
## Paired t-test
##
## data: pull(filter(total_tinfo, sublist == "Swadesh"), total_tinfo) and pull(filter(total_tinfo, sublist == "random"), total_tinfo)
## t = -2.3657, df = 24, p-value = 0.02642
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -608.02871 -41.42524
## sample estimates:
## mean difference
## -324.727
Compare total test information only for tails (defined loosely as values at ≤20% of the peak of the curve). (Note that this value seems to be quite contingent on the specific random subset; probably good to run multiple simulations)
tinfo_max <- tinfo_sum |> group_by(language, sublist) |>
summarise(max = max(tinfo))
tinfo_tails <- tinfo_sum |>
left_join(tinfo_max, by = c("language", "sublist")) |>
filter(tinfo <= 0.2 * max) |>
group_by(language, sublist) |>
summarise(tail_tinfo = sum(tinfo)) |>
filter(language != "French (French)")
t.test(tinfo_tails |> filter(sublist == "Swadesh") |> pull(tail_tinfo),
tinfo_tails |> filter(sublist == "random") |> pull(tail_tinfo),
paired=T)
##
## Paired t-test
##
## data: pull(filter(tinfo_tails, sublist == "Swadesh"), tail_tinfo) and pull(filter(tinfo_tails, sublist == "random"), tail_tinfo)
## t = 1.1501, df = 24, p-value = 0.2614
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -52.58443 184.95919
## sample estimates:
## mean difference
## 66.18738