# data <- read.csv("data_anon0109_coded.csv")
# corpus <- read.csv("corpusE.csv")
# data_corpus <- left_join(data, corpus, by = "Cue_renamed")
#
# data_responses <- data_corpus %>%
# group_by(subject, Cue_renamed, responses_theme) %>%
# summarize(theme_resp_ratio_bysubj = mean(responses_theme))
#
# data_freq <- corpus %>%
# mutate(theme_ratio = theme.frequency / (theme.frequency + tax.frequency)) %>%
# filter(!is.na(theme_ratio)) %>%
# select(Cue_renamed, theme_ratio)
#
# data_sum <- inner_join(data_responses, data_freq, by = "Cue_renamed") #removed lilypad
data <- read.csv("dataUSVN_corpusEV.csv") %>%
mutate(theme_ratio_E = theme.frequency_E / (theme.frequency_E + tax.frequency_E)) %>%
mutate(theme_ratio_V = theme.frequency_V / (theme.frequency_V + tax.frequency_V))
bias <- read.csv("bias_EV.csv") %>%
select(Cue_renamed, Bias_E, Bias_V)
data <- left_join(data, bias, by = "Cue_renamed")
corpusE_omit <- read.csv("corpusE_0.csv") %>%
filter(tax.frequency == 0 & theme.frequency == 0)
corpusV_omit <- read.csv("corpusV_0.csv") %>%
filter(tax.frequency == 0 & theme.frequency == 0)
triads_omit <- append(corpusE_omit$Cue_renamed, corpusV_omit$Cue_renamed)
data <- data %>%
filter(!(Cue_renamed %in% triads_omit))
# data_corpus <- left_join(data, corpusE, by = "Cue_renamed")
# data_corpus <- inner_join(data_corpus, corpusV, by = "Cue_renamed")
#
# data_sum <- data_corpus %>%
# group_by(subject, Cue_renamed, responses_theme, theme_ratio_V, theme_ratio_E) %>%
# summarize(theme_resp_ratio_bysubj = mean(responses_theme))
#write.csv(data_sum, "dataSumUS_0115.csv")
US responses
Regressions
data_model <- data %>%
filter(language == "English")
mV = glmer(responses_theme ~ theme_ratio_V + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
plot(fitted(mV), residuals(mV))

summary(mV)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_V + (1 | subject) + (1 | Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 10003.5 10032.1 -4997.7 9995.5 9499
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7628 -0.6305 0.2538 0.6144 13.7471
##
## Random effects:
## Groups Name Variance Std.Dev.
## subject (Intercept) 1.071 1.035
## Cue_renamed (Intercept) 1.706 1.306
## Number of obs: 9503, groups: subject, 132; Cue_renamed, 72
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5391 0.2551 -2.113 0.034617 *
## theme_ratio_V 1.2932 0.3651 3.543 0.000396 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## theme_rat_V -0.707
mE = glmer(responses_theme ~ theme_ratio_E + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
plot(fitted(mE), residuals(mE))

summary(mE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_E + (1 | subject) + (1 | Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 9997.8 10026.4 -4994.9 9989.8 9499
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7324 -0.6319 0.2543 0.6122 13.8553
##
## Random effects:
## Groups Name Variance Std.Dev.
## subject (Intercept) 1.071 1.035
## Cue_renamed (Intercept) 1.548 1.244
## Number of obs: 9503, groups: subject, 132; Cue_renamed, 72
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1510 0.3314 -3.473 0.000514 ***
## theme_ratio_E 2.1304 0.4796 4.442 8.92e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## theme_rat_E -0.851
#NEW, model with bias
mE_bias = glmer(responses_theme ~ theme_ratio_E + Bias_E + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
plot(fitted(mE_bias), residuals(mE_bias))

summary(mE_bias)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_E + Bias_E + (1 | subject) + (1 |
## Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 9999.4 10035.2 -4994.7 9989.4 9498
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7341 -0.6319 0.2543 0.6121 13.8576
##
## Random effects:
## Groups Name Variance Std.Dev.
## subject (Intercept) 1.071 1.035
## Cue_renamed (Intercept) 1.538 1.240
## Number of obs: 9503, groups: subject, 132; Cue_renamed, 72
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9825 1.3313 -1.489 0.136
## theme_ratio_E 2.1751 0.4826 4.507 6.58e-06 ***
## Bias_Enone 0.8168 1.2675 0.644 0.519
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) thm__E
## theme_rat_E -0.346
## Bias_Enone -0.969 0.141
## English/Vietnamese comparison
mVE = glmer(responses_theme ~ theme_ratio_V + theme_ratio_E + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
summary(mVE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_V + theme_ratio_E + (1 | subject) +
## (1 | Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 9998.6 10034.4 -4994.3 9988.6 9498
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7480 -0.6313 0.2543 0.6125 13.8545
##
## Random effects:
## Groups Name Variance Std.Dev.
## subject (Intercept) 1.071 1.035
## Cue_renamed (Intercept) 1.524 1.235
## Number of obs: 9503, groups: subject, 132; Cue_renamed, 72
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1351 0.3296 -3.444 0.000573 ***
## theme_ratio_V 0.4951 0.4547 1.089 0.276270
## theme_ratio_E 1.6875 0.6266 2.693 0.007075 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) thm__V
## theme_rat_V 0.044
## theme_rat_E -0.675 -0.650
Plot Model vs Data
require(ggrepel)
## Loading required package: ggrepel
dataUS <- data %>%
filter(language == "English")
data_avgUS <- dataUS %>% #compute emp_theme_prop
#select( -theme_resp_ratio_bysubj) %>%
group_by(Cue_renamed) %>%
summarize(emp_theme_prop = mean(responses_theme))
## `summarise()` ungrouping output (override with `.groups` argument)
corrE <- dataUS %>%
ungroup() %>%
select(-subject) %>%
distinct(Cue_renamed, theme_ratio_E) %>%
left_join(data_avgUS, by="Cue_renamed") %>%
rename(triad=Cue_renamed) %>%
rename(Ecorpus_theme_prop=theme_ratio_E)
corrE %>% ggplot(aes(x=emp_theme_prop, y=Ecorpus_theme_prop, label = triad)) +
geom_text(size = 3) +
#geom_label_repel() +
theme_classic() +
geom_abline(slope=1, colour="red", intercept = 0, alpha = 0.8) +
xlab("Proportion Thematic Chosen (US)") +
xlim (0, 1) +
ylab("Proportion Thematic Predicted (English)") +
scale_x_continuous(expand = c(0, 0), limits = c(0, 1.05)) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 1.05))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

cor.test(corrE$Ecorpus_theme_prop,corrE$emp_theme_prop, na.rm=T)
##
## Pearson's product-moment correlation
##
## data: corrE$Ecorpus_theme_prop and corrE$emp_theme_prop
## t = 4.775, df = 70, p-value = 9.549e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2982524 0.6524220
## sample estimates:
## cor
## 0.4956722
corrV <- dataUS %>%
ungroup() %>%
select(-subject) %>%
distinct(Cue_renamed, theme_ratio_V) %>%
left_join(data_avgUS, by="Cue_renamed") %>%
rename(triad=Cue_renamed) %>%
rename(Vcorpus_theme_prop=theme_ratio_V)
corrV %>% ggplot(aes(x=emp_theme_prop, y=Vcorpus_theme_prop, label = triad)) +
geom_text() +
#geom_label_repel()
geom_abline(slope=1, colour="red", intercept = 0, alpha = 0.8) +
theme_classic() +
xlab("Proportion Thematic Chosen (US)") +
ylab("Proportion Thematic Predicted (Vietnamese)")

cor.test(corrV$Vcorpus_theme_prop,corrV$emp_theme_prop, na.rm=T)
##
## Pearson's product-moment correlation
##
## data: corrV$Vcorpus_theme_prop and corrV$emp_theme_prop
## t = 3.594, df = 70, p-value = 0.0006017
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1794280 0.5738843
## sample estimates:
## cor
## 0.3946901
dataUS %>% ggplot(aes(x=theme_ratio_E, y=theme_ratio_V, label = Cue_renamed)) +
geom_text() +
#geom_label_repel() +
theme_classic() +
xlab("Proportion Thematic Predicted (English)") +
ylab("Proportion Thematic Predicted (Vietnamese)")

cor.test(corrE$Ecorpus_theme_prop,corrV$Vcorpus_theme_prop, na.rm=T)
##
## Pearson's product-moment correlation
##
## data: corrE$Ecorpus_theme_prop and corrV$Vcorpus_theme_prop
## t = 7.1387, df = 70, p-value = 6.975e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4912804 0.7656175
## sample estimates:
## cor
## 0.649075
VN responses
Regressions
#data_model <- data_responses
data_model <- data %>%
filter(language == "Vietnamese")
mV = glmer(responses_theme ~ theme_ratio_V + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
plot(fitted(mV), residuals(mV))

summary(mV)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_V + (1 | subject) + (1 | Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 4214.4 4239.4 -2103.2 4206.4 3859
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2567 -0.6587 0.2701 0.6141 4.5409
##
## Random effects:
## Groups Name Variance Std.Dev.
## Cue_renamed (Intercept) 1.8235 1.3504
## subject (Intercept) 0.4715 0.6866
## Number of obs: 3863, groups: Cue_renamed, 72; subject, 54
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4826 0.2678 -1.802 0.071608 .
## theme_ratio_V 1.3750 0.3848 3.573 0.000353 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## theme_rat_V -0.708
#NEW, with bias
mV_bias = glmer(responses_theme ~ theme_ratio_V + Bias_V + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
plot(fitted(mV_bias), residuals(mV_bias))

summary(mV_bias)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_V + Bias_V + (1 | subject) + (1 |
## Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 4217.3 4254.8 -2102.6 4205.3 3857
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.1698 -0.6596 0.2689 0.6152 4.5095
##
## Random effects:
## Groups Name Variance Std.Dev.
## Cue_renamed (Intercept) 1.7906 1.3381
## subject (Intercept) 0.4714 0.6866
## Number of obs: 3863, groups: Cue_renamed, 72; subject, 54
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.69377 0.36585 -1.896 0.057920 .
## theme_ratio_V 1.43039 0.39672 3.606 0.000312 ***
## Bias_Vnone 0.32958 0.37563 0.877 0.380261
## Bias_Vshared_morpheme -0.05723 0.55430 -0.103 0.917769
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) thm__V Bs_Vnn
## theme_rat_V -0.472
## Bias_Vnone -0.686 -0.017
## Bs_Vshrd_mr -0.352 -0.250 0.462
mE = glmer(responses_theme ~ theme_ratio_E + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
plot(fitted(mE), residuals(mE))

summary(mE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_E + (1 | subject) + (1 | Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 4213.3 4238.3 -2102.6 4205.3 3859
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.1753 -0.6559 0.2708 0.6150 4.4258
##
## Random effects:
## Groups Name Variance Std.Dev.
## Cue_renamed (Intercept) 1.7955 1.3400
## subject (Intercept) 0.4714 0.6866
## Number of obs: 3863, groups: Cue_renamed, 72; subject, 54
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9618 0.3615 -2.660 0.007810 **
## theme_ratio_E 1.9670 0.5250 3.746 0.000179 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## theme_rat_E -0.854
## English/Vietnamese comparison
mVE = glmer(responses_theme ~ theme_ratio_V + theme_ratio_E + (1 | subject) + (1 | Cue_renamed), data = data_model, family="binomial")
summary(mVE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: responses_theme ~ theme_ratio_V + theme_ratio_E + (1 | subject) +
## (1 | Cue_renamed)
## Data: data_model
##
## AIC BIC logLik deviance df.resid
## 4212.9 4244.2 -2101.5 4202.9 3858
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2272 -0.6561 0.2694 0.6141 4.4519
##
## Random effects:
## Groups Name Variance Std.Dev.
## Cue_renamed (Intercept) 1.7291 1.3150
## subject (Intercept) 0.4714 0.6866
## Number of obs: 3863, groups: Cue_renamed, 72; subject, 54
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9350 0.3559 -2.627 0.00862 **
## theme_ratio_V 0.7674 0.4937 1.554 0.12009
## theme_ratio_E 1.2789 0.6784 1.885 0.05941 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) thm__V
## theme_rat_V 0.045
## theme_rat_E -0.677 -0.650
Plot Model vs Data
require(ggrepel)
dataVN <- data %>%
filter(language == "Vietnamese")
data_avgV<- dataVN %>% #compute emp_theme_prop
#select( -theme_resp_ratio_bysubj) %>%
group_by(Cue_renamed) %>%
summarize(emp_theme_prop = mean(responses_theme))
## `summarise()` ungrouping output (override with `.groups` argument)
corrE <- data_model %>%
ungroup() %>%
select(-subject) %>%
distinct(Cue_renamed, theme_ratio_E) %>%
left_join(data_avgV, by="Cue_renamed") %>%
rename(triad=Cue_renamed) %>%
rename(Ecorpus_theme_prop=theme_ratio_E)
corrE %>% ggplot(aes(x=emp_theme_prop, y=Ecorpus_theme_prop, label = triad)) +
geom_text() +
#geom_label_repel() +
theme_classic() +
xlab("Proportion Thematic Chosen (VN)") +
ylab("Proportion Thematic Predicted (English)")
## Warning: Removed 1 rows containing missing values (geom_text).

cor.test(corrE$Ecorpus_theme_prop,corrE$emp_theme_prop, na.rm=T)
##
## Pearson's product-moment correlation
##
## data: corrE$Ecorpus_theme_prop and corrE$emp_theme_prop
## t = 3.8511, df = 70, p-value = 0.0002574
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2064557 0.5924095
## sample estimates:
## cor
## 0.4181256
corrV <- data_model %>%
ungroup() %>%
select(-subject) %>%
distinct(Cue_renamed, theme_ratio_V) %>%
left_join(data_avgV, by="Cue_renamed") %>%
rename(triad=Cue_renamed) %>%
rename(Vcorpus_theme_prop=theme_ratio_V)
corrV %>% ggplot(aes(x=emp_theme_prop, y=Vcorpus_theme_prop, label = triad)) +
geom_text(size = 3) +
#geom_label_repel() +
geom_abline(slope=1, colour="red", intercept = 0.0, alpha = 0.8) +
theme_classic() +
xlab("Proportion Thematic Chosen (VN)") +
xlim(0, 1) +
ylab("Proportion Thematic Predicted (Vietnamese)") +
scale_x_continuous(expand = c(0, 0), limits = c(-0.05, 1.05)) +
scale_y_continuous(expand = c(0, 0), limits = c(-0.05, 1.05))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 1 rows containing missing values (geom_text).

cor.test(corrV$Vcorpus_theme_prop,corrV$emp_theme_prop, na.rm=T)
##
## Pearson's product-moment correlation
##
## data: corrV$Vcorpus_theme_prop and corrV$emp_theme_prop
## t = 3.661, df = 70, p-value = 0.0004838
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1865288 0.5787907
## sample estimates:
## cor
## 0.4008744
data_model %>% ggplot(aes(x=theme_ratio_E, y=theme_ratio_V, label = Cue_renamed)) +
geom_text() +
#geom_label_repel() +
theme_classic() +
xlab("Proportion Thematic Predicted (English)") +
ylab("Proportion Thematic Predicted (Vietnamese)")
## Warning: Removed 25 rows containing missing values (geom_text).

cor.test(corrE$Ecorpus_theme_prop,corrV$Vcorpus_theme_prop, na.rm=T)
##
## Pearson's product-moment correlation
##
## data: corrE$Ecorpus_theme_prop and corrV$Vcorpus_theme_prop
## t = 7.1387, df = 70, p-value = 6.975e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4912804 0.7656175
## sample estimates:
## cor
## 0.649075
Where do empirical results and corpus prediction differ?
dataUS_sum <- dataUS%>%
group_by(subject, Cue_renamed, responses_theme) %>%
summarize(theme_resp_ratio_bysubj = mean(responses_theme))%>%
#select( -theme_resp_ratio_bysubj) %>%
group_by(Cue_renamed) %>%
summarize(Eemp_theme_prop = mean(theme_resp_ratio_bysubj))
## `summarise()` regrouping output by 'subject', 'Cue_renamed' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
dataVN_sum <- dataVN %>%
group_by(subject, Cue_renamed, responses_theme) %>%
summarize(theme_resp_ratio_bysubj = mean(responses_theme))%>%
#select( -theme_resp_ratio_bysubj) %>%
group_by(Cue_renamed) %>%
summarize(Vemp_theme_prop = mean(theme_resp_ratio_bysubj))
## `summarise()` regrouping output by 'subject', 'Cue_renamed' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
dataVN_sum <- na.omit(dataVN_sum)
E_V_resp_compare <- full_join(dataUS_sum, dataVN_sum, by = "Cue_renamed")
E_V_corp_compare <- data %>%
select(Cue_renamed, theme_ratio_E, theme_ratio_V) %>%
group_by(Cue_renamed) %>%
summarize(theme_ratio_E = median(theme_ratio_E),
theme_ratio_V = median(theme_ratio_V))
## `summarise()` ungrouping output (override with `.groups` argument)
E_V_compare <- left_join(E_V_resp_compare, E_V_corp_compare)
## Joining, by = "Cue_renamed"
E_V_compare <- E_V_compare %>%
mutate(emp_diff = Eemp_theme_prop - Vemp_theme_prop) %>%
mutate(corpus_diff = theme_ratio_E - theme_ratio_V)
ggplot(data = E_V_compare,
mapping = aes(x = emp_diff, y = corpus_diff, label = Cue_renamed)) +
geom_text(size = 3) +
#geom_label_repel() +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
theme_classic()
