Goes through 2 strategies of sampling:
[c_1 … c_i], sampled n_cV times, where n_cV is total number of co-occurrences in VI corpus
[c_1 … c_i] / n_E, sampled n_V times, where n_E and n_V are total number of words in EN and VI corpus respectively
corpusE <- read.csv("corpusE.csv")
corpusV <- read.csv("corpusV.csv")
#calculate number of occurrences in VI corpus
n_coocc <- sum(corpusV["tax.frequency"]) + sum(corpusV["theme.frequency"])
#get co-occurrence vector
v_tax_E <- corpusE["tax.frequency"]
colnames(v_tax_E) = c("co-occ")
v_theme_E <- corpusE["theme.frequency"]
colnames(v_theme_E) = c("co-occ")
#full vector with tax + theme
v_E <- bind_rows(v_tax_E, v_theme_E) #co-occ count
v_E_vec = v_E
for(atmp in v_E) { v_E_vec <- atmp }
#samplings has the form
# | triad (represented by a number from 1 to 210) | # of occurrences after taking samples for 1000 times |
samplings <- data.frame(matrix(ncol = 1000, nrow = 210)) #1-210 as row names
rownames(samplings) <- seq(from = 1, to = 210)
for (i in 1:1000) {
#create resample
out <- sample(length(v_E_vec), n_coocc, replace = T, prob = v_E_vec)
t <- as.data.frame(table(out))
#get table of # of occurrence corresponding to each prob in v_E_vec
#has the form
# | triad (represented by a number from 1 to 210) | # of occurrences after taking samples for 1000 times |
#add to big sampling table
#some triads are not represented in the table (picked 0 times)
for(j in 1:nrow(t)) {
row <- t[j,]
triad_n <- row[,"out"]
#so we use the triad numbers as index to make sure they go into the correct "slot"
samplings[as.character(triad_n), i] <- row["Freq"]
}
}
#replace the NA (not presented in the table) with 0
samplings[is.na(samplings)] <- 0
names <- c()
for (i in 1:1000) {
name = paste("resample", as.character(i), sep = "")
names <- c(names, name)
}
colnames(samplings) <- names
write.csv(samplings, "resamples_EtoV.csv", row.names=FALSE)
#calculate frequency from vietnamese corpus
corpusV<- read.csv("corpusV.csv")
corpusV <- corpusV %>%
mutate(theme_ratio_V = theme.frequency / (theme.frequency + tax.frequency)) %>%
filter(!is.na(theme_ratio_V)) %>%
select(Cue_renamed, theme_ratio_V)
data <- read.csv("data_anon0115_coded.csv")
resamples <- read.csv("resamples_EtoV.csv")
triads <- read.csv("triads.csv")
fixef_mR <- c()
for (i in 1:ncol(resamples)) {
curr_sample <- resamples[, i]
tax.frequency <- curr_sample[1:105]
theme.frequency <- curr_sample[106:210]
resampleE <- cbind(triads, tax.frequency, theme.frequency)
resampleE <- resampleE %>%
mutate(theme_ratio_rE = theme.frequency / (theme.frequency + tax.frequency)) %>%
filter(!is.na(theme_ratio_rE)) %>%
select(Cue_renamed, theme_ratio_rE)
data_corpus <- left_join(data, resampleE, by = "Cue_renamed")
data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")
data_sum <- data_corpus %>%
group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_rE) %>%
summarize(theme_resp_ratio_bysubj = mean(responses_theme))
data_model <- data_sum
mR = glmer(responses_theme ~ theme_ratio_rE + (1 | subject) + (1 | Cue_renamed),
data = data_model, family="binomial")
#plot(fitted(mR), residuals(mR))
#summary(mR)
fixef_mR <- append(fixef_mR, fixef(mR)[2])
}
write.csv(fixef_mR, "resample_EtoV_est.csv", row.names = FALSE)
fixef_mR <- read.csv("resample_EtoV_est.csv")
fixef_mR_dist <- fixef_mR[["x"]]
fixef_mV = 1.1450 #taken from analysis_E.Rmd
hist(fixef_mR_dist, xlim = c(1,2.5))
abline(v=fixef_mV,col="red")
resamples <- read.csv("resamples_EtoV.csv")
triads <- read.csv("triads.csv")
estPearson_mR <- c()
for (i in 1:ncol(resamples)) {
curr_sample <- resamples[, i]
tax.frequency <- curr_sample[1:105]
theme.frequency <- curr_sample[106:210]
resampleE <- cbind(triads, tax.frequency, theme.frequency)
resampleE <- resampleE %>%
mutate(theme_ratio_rE = theme.frequency / (theme.frequency + tax.frequency)) %>%
filter(!is.na(theme_ratio_rE)) %>%
select(Cue_renamed, theme_ratio_rE)
data_corpus <- left_join(data, resampleE, by = "Cue_renamed")
data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")
data_sum <- data_corpus %>%
group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_rE) %>%
summarize(theme_resp_ratio_bysubj = mean(responses_theme)) %>%
select( -theme_resp_ratio_bysubj) %>%
group_by(Cue_renamed, theme_ratio_rE) %>%
summarize(emp_theme_prop = mean(responses_theme)) %>%
rename(triad=Cue_renamed) %>%
rename(Ecorpus_theme_prop=theme_ratio_rE)
data_model <- data_sum
test <- cor.test(data_model$Ecorpus_theme_prop,data_model$emp_theme_prop, na.rm=T)
#plot(fitted(mR), residuals(mR))
#summary(mR)
estPearson_mR <- append(estPearson_mR, test$estimate)
}
write.csv(estPearson_mR, "resample_EtoV_estPearson.csv", row.names = FALSE)
estPearson_mR <- read.csv("resample_EtoV_estPearson.csv")
estPearson_mR_dist <- estPearson_mR[["x"]]
estPearson_mV = 0.4408283 #taken from analysis.Rmd
hist(estPearson_mR_dist, xlim = c(0,1))
abline(v=estPearson_mV,col="red")
#p-value: 0.046
#x <- order(estPearson_mR_dist)
#estPearson_mR_dist[x]
corpusE <- read.csv("corpusE.csv")
corpusV <- read.csv("corpusV.csv")
#calculate number of occurrences in VI corpus
n_coocc <- sum(corpusV["tax.frequency"]) + sum(corpusV["theme.frequency"])
#get co-occurrence vector
v_tax_E <- corpusE["tax.frequency"]
colnames(v_tax_E) = c("co-occ")
v_theme_E <- corpusE["theme.frequency"]
colnames(v_theme_E) = c("co-occ")
#full vector with tax + theme
v_E <- bind_rows(v_tax_E, v_theme_E) #co-occ count
n_Ecorpus = 1001610938 #https://www.english-corpora.org/coca/
n_Vcorpus = round(1391781771 / 70014755 * 1000000)
v_E_vec_normed = v_E
for(atmp in v_E) { v_E_vec_normed <- atmp / n_Ecorpus }
samplings_normed <- data.frame(matrix(ncol = 1000, nrow = 210)) #1-210 as row names
rownames(samplings_normed) <- seq(from = 1, to = 210)
for (i in 1:1000) {
print (i)
#create resample
out <- sample(length(v_E_vec_normed), n_Vcorpus, replace = T, prob = v_E_vec_normed)
t <- as.data.frame(table(out))
#add to big sampling table
for(j in 1:nrow(t)) {
row <- t[j,]
triad_n <- row[,"out"]
samplings_normed[as.character(triad_n), i] <- row["Freq"]
}
}
samplings_normed[is.na(samplings_normed)] <- 0
names <- c()
for (i in 1:1000) {
name = paste("resample", as.character(i), sep = "")
names <- c(names, name)
}
colnames(samplings_normed) <- names
write.csv(samplings_normed, "resamples_EtoV_normed.csv", row.names=FALSE)
resamples <- read.csv("resamples_EtoV_normed.csv")
triads <- read.csv("triads.csv")
estPearson_mR_normed <- c()
for (i in 1:ncol(resamples)) {
curr_sample <- resamples[, i]
tax.frequency <- curr_sample[1:105]
theme.frequency <- curr_sample[106:210]
resampleE <- cbind(triads, tax.frequency, theme.frequency)
resampleE <- resampleE %>%
mutate(theme_ratio_rE = theme.frequency / (theme.frequency + tax.frequency)) %>%
filter(!is.na(theme_ratio_rE)) %>%
select(Cue_renamed, theme_ratio_rE)
data_corpus <- left_join(data, resampleE, by = "Cue_renamed")
data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")
data_sum <- data_corpus %>%
group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_rE) %>%
summarize(theme_resp_ratio_bysubj = mean(responses_theme)) %>%
select( -theme_resp_ratio_bysubj) %>%
group_by(Cue_renamed, theme_ratio_rE) %>%
summarize(emp_theme_prop = mean(responses_theme)) %>%
rename(triad=Cue_renamed) %>%
rename(Ecorpus_theme_prop=theme_ratio_rE)
data_model <- data_sum
test <- cor.test(data_model$Ecorpus_theme_prop,data_model$emp_theme_prop, na.rm=T)
#plot(fitted(mR), residuals(mR))
#summary(mR)
estPearson_mR_normed <- append(estPearson_mR_normed, test$estimate)
}
write.csv(estPearson_mR_normed, "resamples_EtoV_normed_estPearson.csv", row.names = FALSE)
estPearson_mR_normed <- read.csv("resamples_EtoV_normed_estPearson.csv")
estPearson_mR_normed_dist <- estPearson_mR_normed[["x"]]
estPearson_mV = 0.4408283 #taken from analysis.Rmd
hist(estPearson_mR_normed_dist, xlim = c(0.4,0.6))
abline(v=estPearson_mV,col="red")