analysis

Goes through 2 strategies of sampling:

[c_1 … c_i], sampled n_cV times, where n_cV is total number of co-occurrences in VI corpus
[c_1 … c_i] / n_E, sampled n_V times, where n_E and n_V are total number of words in EN and VI corpus respectively

Strategy 1

[c_1 … c_i], sampled n_cV times, where n_cV is total number of co-occurrences in VI corpus

corpusE <- read.csv("corpusE.csv")
corpusV <- read.csv("corpusV.csv")

#calculate number of occurrences in VI corpus
n_coocc <- sum(corpusV["tax.frequency"]) + sum(corpusV["theme.frequency"])

#get co-occurrence vector 
v_tax_E <- corpusE["tax.frequency"]
colnames(v_tax_E) = c("co-occ")
v_theme_E <- corpusE["theme.frequency"]
colnames(v_theme_E) = c("co-occ")
#full vector with tax + theme
v_E <- bind_rows(v_tax_E, v_theme_E) #co-occ count 

v_E_vec = v_E
for(atmp in v_E) { v_E_vec <- atmp }

#samplings has the form 
# | triad (represented by a number from 1 to 210) | # of occurrences after taking samples for 1000 times |

samplings <- data.frame(matrix(ncol = 1000, nrow = 210)) #1-210 as row names
rownames(samplings) <- seq(from = 1, to = 210)

for (i in 1:1000) {
    #create resample
    out <- sample(length(v_E_vec), n_coocc, replace = T, prob = v_E_vec)
    t <- as.data.frame(table(out)) 
    #get table of # of occurrence corresponding to each prob in v_E_vec
    #has the form
    # | triad (represented by a number from 1 to 210) | # of occurrences after taking samples for 1000 times |
    #add to big sampling table
    #some triads are not represented in the table (picked 0 times)
    
    for(j in 1:nrow(t)) {
      row <- t[j,]
      triad_n <- row[,"out"]
      #so we use the triad numbers as index to make sure they go into the correct "slot"
      samplings[as.character(triad_n), i] <- row["Freq"]
      
    }
}
#replace the NA (not presented in the table) with 0
samplings[is.na(samplings)] <- 0

names <- c()
for (i in 1:1000) {
  name = paste("resample", as.character(i), sep = "")
  names <- c(names, name)
}

colnames(samplings) <- names

write.csv(samplings, "resamples_EtoV.csv", row.names=FALSE)

#calculate frequency from vietnamese corpus
corpusV<- read.csv("corpusV.csv")
corpusV <- corpusV %>%
  mutate(theme_ratio_V = theme.frequency / (theme.frequency + tax.frequency)) %>%
  filter(!is.na(theme_ratio_V)) %>%
  select(Cue_renamed, theme_ratio_V)

data <- read.csv("data_anon0115_coded.csv")

resamples <- read.csv("resamples_EtoV.csv")
triads <- read.csv("triads.csv")

fixef_mR <- c()

for (i in 1:ncol(resamples)) {
  
  curr_sample <- resamples[, i]
  tax.frequency <- curr_sample[1:105]
  theme.frequency <- curr_sample[106:210]
  resampleE <- cbind(triads, tax.frequency, theme.frequency)
  
  resampleE <- resampleE %>%
    mutate(theme_ratio_rE = theme.frequency / (theme.frequency + tax.frequency)) %>%
    filter(!is.na(theme_ratio_rE)) %>%
    select(Cue_renamed, theme_ratio_rE)
  
  data_corpus <- left_join(data, resampleE, by = "Cue_renamed")
  data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")

  data_sum <- data_corpus %>%
    group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_rE) %>%
    summarize(theme_resp_ratio_bysubj = mean(responses_theme))
  
  data_model <- data_sum

  mR = glmer(responses_theme ~ theme_ratio_rE + (1 | subject) + (1 | Cue_renamed), 
             data = data_model, family="binomial")
  #plot(fitted(mR), residuals(mR))
  #summary(mR)
  
  fixef_mR <- append(fixef_mR, fixef(mR)[2])
}

write.csv(fixef_mR, "resample_EtoV_est.csv", row.names = FALSE)

Load resample + distribution of coeff from mixed model

fixef_mR <- read.csv("resample_EtoV_est.csv")
fixef_mR_dist <- fixef_mR[["x"]]

fixef_mV = 1.1450 #taken from analysis_E.Rmd

hist(fixef_mR_dist, xlim = c(1,2.5))
abline(v=fixef_mV,col="red")

resamples <- read.csv("resamples_EtoV.csv")
triads <- read.csv("triads.csv")

estPearson_mR <- c()

for (i in 1:ncol(resamples)) {
  
  curr_sample <- resamples[, i]
  tax.frequency <- curr_sample[1:105]
  theme.frequency <- curr_sample[106:210]
  resampleE <- cbind(triads, tax.frequency, theme.frequency)
  
  resampleE <- resampleE %>%
    mutate(theme_ratio_rE = theme.frequency / (theme.frequency + tax.frequency)) %>%
    filter(!is.na(theme_ratio_rE)) %>%
    select(Cue_renamed, theme_ratio_rE)
  
  data_corpus <- left_join(data, resampleE, by = "Cue_renamed")
  data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")

  data_sum <- data_corpus %>%
    group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_rE) %>%
    summarize(theme_resp_ratio_bysubj = mean(responses_theme)) %>%
    select( -theme_resp_ratio_bysubj) %>%
    group_by(Cue_renamed, theme_ratio_rE) %>%
    summarize(emp_theme_prop = mean(responses_theme)) %>%
    rename(triad=Cue_renamed) %>%
    rename(Ecorpus_theme_prop=theme_ratio_rE)
  
  data_model <- data_sum

  test <- cor.test(data_model$Ecorpus_theme_prop,data_model$emp_theme_prop, na.rm=T)
  
  #plot(fitted(mR), residuals(mR))
  #summary(mR)
  
  estPearson_mR <- append(estPearson_mR, test$estimate)
}

write.csv(estPearson_mR, "resample_EtoV_estPearson.csv", row.names = FALSE)

Load resample + distribution of pearson coeff

estPearson_mR <- read.csv("resample_EtoV_estPearson.csv")
estPearson_mR_dist <- estPearson_mR[["x"]]

estPearson_mV = 0.4408283 #taken from analysis.Rmd

hist(estPearson_mR_dist, xlim = c(0,1))
abline(v=estPearson_mV,col="red")

#p-value: 0.046
#x <- order(estPearson_mR_dist)
#estPearson_mR_dist[x]

Strategy 2

[c_1 … c_i] / n_E, sampled n_V times, where n_E and n_V are total number of words in EN and VI corpus respectively

corpusE <- read.csv("corpusE.csv")
corpusV <- read.csv("corpusV.csv")

#calculate number of occurrences in VI corpus
n_coocc <- sum(corpusV["tax.frequency"]) + sum(corpusV["theme.frequency"])

#get co-occurrence vector 
v_tax_E <- corpusE["tax.frequency"]
colnames(v_tax_E) = c("co-occ")
v_theme_E <- corpusE["theme.frequency"]
colnames(v_theme_E) = c("co-occ")
#full vector with tax + theme
v_E <- bind_rows(v_tax_E, v_theme_E) #co-occ count 

n_Ecorpus = 1001610938 #https://www.english-corpora.org/coca/
n_Vcorpus = round(1391781771 / 70014755 * 1000000)

v_E_vec_normed = v_E
for(atmp in v_E) { v_E_vec_normed <- atmp / n_Ecorpus }

samplings_normed <- data.frame(matrix(ncol = 1000, nrow = 210)) #1-210 as row names
rownames(samplings_normed) <- seq(from = 1, to = 210)

for (i in 1:1000) {
  print (i)
    #create resample
    out <- sample(length(v_E_vec_normed), n_Vcorpus, replace = T, prob = v_E_vec_normed)
    t <- as.data.frame(table(out))
    #add to big sampling table
    for(j in 1:nrow(t)) {
      row <- t[j,]
      triad_n <- row[,"out"]
      samplings_normed[as.character(triad_n), i] <- row["Freq"]
    }
}
samplings_normed[is.na(samplings_normed)] <- 0

names <- c()
for (i in 1:1000) {
  name = paste("resample", as.character(i), sep = "")
  names <- c(names, name)
}

colnames(samplings_normed) <- names

write.csv(samplings_normed, "resamples_EtoV_normed.csv", row.names=FALSE)

resamples <- read.csv("resamples_EtoV_normed.csv")
triads <- read.csv("triads.csv")

estPearson_mR_normed <- c()

for (i in 1:ncol(resamples)) {
  
  curr_sample <- resamples[, i]
  tax.frequency <- curr_sample[1:105]
  theme.frequency <- curr_sample[106:210]
  resampleE <- cbind(triads, tax.frequency, theme.frequency)
  
  resampleE <- resampleE %>%
    mutate(theme_ratio_rE = theme.frequency / (theme.frequency + tax.frequency)) %>%
    filter(!is.na(theme_ratio_rE)) %>%
    select(Cue_renamed, theme_ratio_rE)
  
  data_corpus <- left_join(data, resampleE, by = "Cue_renamed")
  data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")

  data_sum <- data_corpus %>%
    group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_rE) %>%
    summarize(theme_resp_ratio_bysubj = mean(responses_theme)) %>%
    select( -theme_resp_ratio_bysubj) %>%
    group_by(Cue_renamed, theme_ratio_rE) %>%
    summarize(emp_theme_prop = mean(responses_theme)) %>%
    rename(triad=Cue_renamed) %>%
    rename(Ecorpus_theme_prop=theme_ratio_rE)
  
  data_model <- data_sum

  test <- cor.test(data_model$Ecorpus_theme_prop,data_model$emp_theme_prop, na.rm=T)
  
  #plot(fitted(mR), residuals(mR))
  #summary(mR)
  
  estPearson_mR_normed <- append(estPearson_mR_normed, test$estimate)
}

write.csv(estPearson_mR_normed, "resamples_EtoV_normed_estPearson.csv", row.names = FALSE)

Load resample + distribution of pearson coeff

estPearson_mR_normed <- read.csv("resamples_EtoV_normed_estPearson.csv")
estPearson_mR_normed_dist <- estPearson_mR_normed[["x"]]

estPearson_mV = 0.4408283 #taken from analysis.Rmd

hist(estPearson_mR_normed_dist, xlim = c(0.4,0.6))
abline(v=estPearson_mV,col="red")