Swadesh CDI: Exploratory analyses

Calculate all theta values and plot by age; French (French) behaves weirdly with an unusual peak near 19mo.

calc_theta <- function(xldf, languages, form='WS') {
  xx <- tibble()
  for(lang in languages) {
    # if(lang %in% c("Spanish (Mexican)", "French (French)")) next
    message(glue("Processing {lang}\r"))
    load(here(paste("data/",form,"/",lang,"_",form,"_data.Rdata", sep='')))
    ids <- d_prod |> rownames() |> as_tibble() |> 
      rename(data_id = value) |> 
      mutate(data_id = as.double(data_id)) |> 
      left_join(d_demo |> select(data_id, age), by = "data_id")
    
    thetas <- fscores(models[[lang]], method = "MAP")
    thetas <- cbind(ids, thetas) |> 
      mutate(language = lang)
    
    saveRDS(thetas, glue("data/thetas/{lang}.rds"))
    xx <- xx |> bind_rows(thetas)
  }
  return(xx)
}

all_thetas <- calc_theta(xldf, languages)
saveRDS(all_thetas, "data/thetas/all_thetas.rds")

all_thetas <- readRDS("data/thetas/all_thetas.rds")

theta_plot <- ggplot(all_thetas, aes(x=age, y=G, col=language)) +
  theme_classic()

theta_plot +
  geom_point(alpha = .1, position="jitter") + 
  geom_smooth(method="loess", alpha=.5)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1621 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1621 rows containing missing values (`geom_point()`).

# No scatter, but with hover tooltips
ggplotly(theta_plot + 
  geom_smooth(method="loess", se=F))

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1621 rows containing non-finite values (`stat_smooth()`).

Compare Swadesh correlations with theta against random sublist correlations with theta

run_swadesh_comparisons_theta <- function(xldf, languages, form='WS') {
  xx <- tibble()
  for(lang in languages) {
    # if(lang %in% c("Spanish (Mexican)", "French (French)")) next
    message(glue("Processing {lang}\r"))
    load(here(paste("data/",form,"/",lang,"_",form,"_data.Rdata", sep='')))
    swad_l <- subset(xldf, language==lang & is.element(uni_lemma, good_prod$uni_lemma)) 
    thetas <- all_thetas |> filter(language == lang)
    swad_cor = cor(thetas$G, rowSums(d_prod[,swad_l$item_id], na.rm=T), use = "complete.obs")
    
    rand_cors <- sapply(1:1000, \(x) {
      rand_inds = sample(1:ncol(d_prod), nrow(swad_l)) # N random words 
      rand_cor = cor(thetas$G, rowSums(d_prod[,rand_inds], na.rm=T), use = "complete.obs")
      rand_cor
    })
    
    xx <- xx |> bind_rows(tibble(language = lang, sublist = "Swadesh", 
                                 run = NA, cor = swad_cor, N = nrow(swad_l)))
    xx <- xx |> bind_rows(tibble(language = lang, sublist = "random", 
                                 run = 1:1000, cor = rand_cors, N = nrow(swad_l)))
    
    # xx <- xx |> bind_rows(tibble(language = lang, `Swadesh r` = swad_cor, 
    #                               `Rand r` = rand_cor, N = nrow(swad_l)))
    # d_demo, d_long, d_prod
  }
  return(xx)
}

load(here("data/xling-WSprod-IRTparms.Rdata"))
load("data/multiling_2pl_age_WS_prod_fits.Rdata")
xldf <- xldf |> 
  mutate(uni_lemma = ifelse(uni_lemma=="NA", NA, uni_lemma)) |> 
  filter(!is.na(uni_lemma))
xldf[which(xldf$category=="descriptive_words (adjectives)"),]$category = "descriptive_words"
xldf[which(xldf$category=="outside_places"),]$category = "outside"
languages = names(coefs) |> sort()
low_data_langs <- c("Kiswahili", "Kigiriama", "Irish", "Persian", "Finnish",
                    "English (Irish)", "Spanish (Peruvian)", "Greek (Cypriot)")
languages <- setdiff(languages, low_data_langs)

pars <- xldf |> filter(!is.na(uni_lemma), !is.na(d)) 
ul <- sort(table(pars$uni_lemma)) 
short_list = ul[which(ul>9)]
prod_pars <-
  pars |> arrange(desc(language), desc(uni_lemma), desc(a1)) |> # get most discriminating uni_lemma per lang
  filter(is.element(uni_lemma, names(short_list))) |>
  select(uni_lemma, category, lexical_category, language, d) |>
  group_by(uni_lemma, language) |>
  slice(1) |> 
  pivot_wider(names_from = language, values_from = d)
prod_pars$sd = apply(prod_pars[,4:ncol(prod_pars)], 1, sd, na.rm=T)
prod_pars$numNAs = apply(prod_pars[,4:ncol(prod_pars)], 1, function(x) { sum(is.na(x)) })
med_d = median(prod_pars$sd, na.rm=T) # 1.17
sd_d = sd(prod_pars$sd, na.rm=T) # .44
good_prod <- prod_pars |> 
  filter(numNAs < 16) |>
  filter(sd < (med_d - .5*sd_d)) |> 
  arrange(numNAs)

xx <- run_swadesh_comparisons_theta(xldf, languages)

xx_sum <- xx |> 
  group_by(language, sublist, N) |> 
  summarise(r = mean(cor))

t.test(xx_sum |> filter(language != "French (French)", sublist == "Swadesh") |> pull(r),
       xx_sum |> filter(language != "French (French)", sublist == "random") |> pull(r),
       paired=T)

## 
##  Paired t-test
## 
## data:  pull(filter(xx_sum, language != "French (French)", sublist == "Swadesh"), r) and pull(filter(xx_sum, language != "French (French)", sublist == "random"), r)
## t = 1.1082, df = 24, p-value = 0.2788
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.002455059  0.008148700
## sample estimates:
## mean difference 
##      0.00284682

From the plot, we consider theta in range [-4, 4]. Plot Swadesh vs random test information over this theta range.

theta_range <- matrix(seq(-4,4,.01))
tinfo <- tibble()
form <- "WS"
for(lang in languages) {
  message(glue("Processing {lang}\r"))
  load(here(paste("data/",form,"/",lang,"_",form,"_data.Rdata", sep='')))
  xldf_l <- subset(xldf, language==lang)
  good_idx <- is.element(xldf_l$uni_lemma, good_prod$uni_lemma) |> which()
  
  swad_tinfo <- testinfo(models[[lang]], theta_range,
                         which.items = good_idx)
  
  rand_tinfos <- sapply(1:1000, \(x) {
    rand_idx = sample(1:nrow(xldf_l), length(good_idx))
    rand_tinfo <- testinfo(models[[lang]], theta_range,
                           which.items = rand_idx)
    rand_tinfo
  }) |> as_tibble()
  
  tinfo <- tinfo |> bind_rows(tibble(language = lang,
                                     theta = theta_range[,1],
                                     Swadesh = swad_tinfo) |> 
                                cbind(rand_tinfos) |> 
                                pivot_longer(cols = -c(language, theta),
                                             names_to = "run",
                                             values_to = "tinfo") |> 
                                mutate(sublist = ifelse(run == "Swadesh",
                                                        "Swadesh", "random")))
}

## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.

tinfo_sum <- tinfo |> 
  group_by(language, theta, sublist) |> 
  summarise(tinfo = mean(tinfo))

tinfo_plot <- ggplot(tinfo_sum |> filter(language != "French (French)"), 
       aes(x = theta, y = tinfo, col = language, lty = sublist)) +
  geom_line() +
  theme_classic()

ggplotly(tinfo_plot)

Compare total test information (area under curve). (Note that this value seems to be quite contingent on the specific random subset; probably good to run multiple simulations)

total_tinfo <- tinfo_sum |> 
  group_by(language, sublist) |> 
  summarise(total_tinfo = sum(tinfo)) |> 
  filter(language != "French (French)")

t.test(total_tinfo |> filter(sublist == "Swadesh") |> pull(total_tinfo),
       total_tinfo |> filter(sublist == "random") |> pull(total_tinfo),
       paired=T)

## 
##  Paired t-test
## 
## data:  pull(filter(total_tinfo, sublist == "Swadesh"), total_tinfo) and pull(filter(total_tinfo, sublist == "random"), total_tinfo)
## t = -2.3657, df = 24, p-value = 0.02642
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -608.02871  -41.42524
## sample estimates:
## mean difference 
##        -324.727

Compare total test information only for tails (defined loosely as values at ≤20% of the peak of the curve). (Note that this value seems to be quite contingent on the specific random subset; probably good to run multiple simulations)

tinfo_max <- tinfo_sum |> group_by(language, sublist) |> 
  summarise(max = max(tinfo))

tinfo_tails <- tinfo_sum |> 
  left_join(tinfo_max, by = c("language", "sublist")) |> 
  filter(tinfo <= 0.2 * max) |> 
  group_by(language, sublist) |> 
  summarise(tail_tinfo = sum(tinfo)) |> 
  filter(language != "French (French)")

t.test(tinfo_tails |> filter(sublist == "Swadesh") |> pull(tail_tinfo),
       tinfo_tails |> filter(sublist == "random") |> pull(tail_tinfo),
       paired=T)

## 
##  Paired t-test
## 
## data:  pull(filter(tinfo_tails, sublist == "Swadesh"), tail_tinfo) and pull(filter(tinfo_tails, sublist == "random"), tail_tinfo)
## t = 1.1501, df = 24, p-value = 0.2614
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -52.58443 184.95919
## sample estimates:
## mean difference 
##        66.18738

Swadesh CDI: Exploratory analyses

Alvin Tan

2023-01-25