Are semantic associations related to individuals differences? Namely: self-report measures of visual, linguistic and spatial processing preferences.

The task: participants given cue and asked to produce 4 associates.


Read in and summarize data.

d.raw = read.csv("../data/assocData_complete.csv", header = TRUE)%>%
  select(workerId, cue, associate, num, forwardStrength, backwardStrength, visual, spatial, word, Dom_PoS_SUBTLEX, Lg10WF)

py.measures = read.csv("ranked_LV/all_associate_stringsP.csv")

d.raw = left_join(d.raw, py.measures) %>%
  select(-a1, -a2, -a3, -a4)

mean.associates = d.raw %>% 
  group_by(workerId) %>% 
  summarise(n = n()) %>%
  summarise(mean = mean(n))

There are 65 participants. The mean number of associates per participants is 156.

Processing difference measures

measures = d.raw %>%
  group_by(workerId) %>%
  slice(1) %>%
  ungroup() %>%
  select(workerId, visual, spatial, word) 
  
measures %>%
  gather("measure", "value", 2:4) %>%
  ggplot(aes(x = value, fill = measure)) +
           geom_density(alpha = .4) +
  facet_grid(~ measure) +
  theme_bw() +
  theme(legend.position ="none")

Visual and spatial correlation

kable(tidy(cor.test(measures$visual, measures$spatial)))
estimate statistic p.value parameter conf.low conf.high
0.1928055 1.55961 0.1238619 63 -0.0536148 0.4170909

Visual and word correlation

kable(tidy(cor.test(measures$visual, measures$word)))
estimate statistic p.value parameter conf.low conf.high
0.0018503 0.0146864 0.9883287 63 -0.2421581 0.2456386

Word and spatial correlation

kable(tidy(cor.test(measures$word, measures$spatial)))
estimate statistic p.value parameter conf.low conf.high
-0.0206234 -0.1637279 0.8704695 63 -0.2631986 0.2244045

Measures are uncorrelated. Now on to the key analyses….

Overall agreement

Are there differences in agreement with the average partiicpant, by processing preference? This is the numWordMatch measure taken from the rankedComparision.py script.

Word matches

overall.word.matches = d.raw%>%
  group_by(workerId) %>%
  summarize(mean.numWordMatch = mean(numWordMatch),
            visual = mean(visual),
            spatial = mean(spatial),
            word = mean(word)) %>%
  gather("measure", "value", 3:5)

ggplot(overall.word.matches, aes(y = mean.numWordMatch, x = value)) +
  facet_grid(.~measure) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Number of matched words") +
  theme_bw()

Within-group agreement

Within a given word/visual bin, and given the same cue, what proportion/number of matches are there? Two measures: The raw number of counts (Counted) and proportion two randomly sampled participants give the same associate (Sampled).

Counted

Word

d.raw$word.bin = as.factor(ifelse(d.raw$word > median(measures$word), 1, 0))
d.raw$visual.bin = as.factor(ifelse(d.raw$visual > median(measures$visual), 1, 0))
d.raw$word.quant = quantcut(d.raw$word)
d.raw$visual.quant = quantcut(d.raw$visual)

get_num_matchesW <- function(data, this.cue, this.word.bin, this.num, this.associate){
  all_associates = filter(data, 
                          cue == this.cue & word.bin == this.word.bin
                          & num == this.num)$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matchesW(d.raw, cue, word.bin, num, associate)) 

same.responses %>% 
  group_by(word.bin, num) %>%
  multi_boot_standard(column = "num_same", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = word.bin, 
             color = word.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Num matches") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Stats – collapsing across num.

d.word.stats = same.responses %>%
  group_by(workerId, word.bin) %>%
  summarise(num_same = mean(num_same))
kable(tidy(t.test(filter(d.word.stats, word.bin == 1)$num_same, 
                  filter(d.word.stats, word.bin == 0)$num_same)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
0.2724359 3.387821 3.115385 2.617174 0.0112349 59.24685 0.0641595 0.4807123

Quantiles

get_num_matchesWQ <- function(data, this.cue, this.word.quant, this.num, this.associate){
  all_associates = filter(data, 
                          cue == this.cue & word.quant== this.word.quant
                          & num == this.num)$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matchesWQ(d.raw, cue, word.quant, num, associate)) 

same.responses %>% 
  group_by(word.quant) %>%
  multi_boot_standard(column = "num_same", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, 
             color = word.quant)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Word quantile") +
  ylab("Num matches") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Visual

get_num_matchesV <- function(data, this.cue, this.visual.bin, 
                             this.num, this.associate){
  all_associates = filter(data, 
                          cue == this.cue & visual.bin == this.visual.bin
                          & num == this.num)$associate
  num_matches = sum(all_associates == this.associate) 
  num_matches
}

same.responses = d.raw %>%
  rowwise() %>% 
  mutate(num_same = get_num_matchesV(d.raw, cue, visual.bin, num, associate)) 

same.responses %>% 
  group_by(visual.bin, num) %>%
  multi_boot_standard(column = "num_same", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = visual.bin, 
             color = visual.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Num matches") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Stats – collapsing across num.

d.visual.stats = same.responses %>%
  group_by(workerId, visual.bin) %>%
  summarise(num_same = mean(num_same))
kable(tidy(t.test(filter(d.visual.stats, visual.bin == 1)$num_same, 
                  filter(d.visual.stats, visual.bin == 0)$num_same)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
0.0774704 3.275434 3.197964 0.7183286 0.4753458 59.93184 -0.1382631 0.293204

So, there is a difference for prop same for word (more word, more agreement), but not for visual.

What about the two “high” groups? Comparing high word to high visual. There is no difference there.

kable(tidy(t.test(filter(d.visual.stats, visual.bin == 1)$num_same, 
                  filter(d.word.stats, word.bin == 1)$num_same)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
-0.1123863 3.275434 3.387821 -1.241645 0.2191316 60.86193 -0.2933886 0.0686161

Sampled

Here I calculated the probability that two participants sampled from the same word bin and given the same cue will produce the same associate. To do this, we randomly sample 2 participants x N_PAIR_SAMPLED for each cue with an education level. We then take the proportion agreement across samples, and then average across cues within an education level.

Word

Do people who are higher on word produce more similiar responses to one another?

one_sample <- function(associate) {
  function(k) {
    sample.is = sample(1:length(associate), 2, replace = T)
    associate[sample.is[1]] == associate[sample.is[2]] 
  }
}

all_samples <- function(associate, n_pair_samples) {
  sample_values <- 1:n_pair_samples %>%
    map(one_sample(associate)) %>%
    unlist()
  data.frame(prop = sum(sample_values)/length(sample_values))
}

N_PAIR_SAMPLED = 100
sim.responses = d.raw %>%
  filter(associate != "NA") %>%
  group_by(cue, word.bin, num) %>%
  mutate(prop_similar = unlist(all_samples(associate,
                                           N_PAIR_SAMPLED)))

sim.responses %>%
  group_by(word.bin, num) %>%
  multi_boot_standard(column = "prop_similar", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = word.bin, 
             color = word.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Proportion Same") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Quantiles

sim.responses = d.raw %>%
  filter(associate != "NA") %>%
  group_by(cue, word.quant, num) %>%
  mutate(prop_similar = unlist(all_samples(associate,
                                           N_PAIR_SAMPLED)))
sim.responses %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "prop_similar", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, 
             color = word.quant)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Proportion Same") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Visual

Do people who are higher on visual produce less similiar responses to one another?

d.raw$visual.bin = as.factor(ifelse(d.raw$visual > median(measures$visual), 1, 0))

N_PAIR_SAMPLED = 100
sim.responses = d.raw %>%
  filter(associate != "NA") %>%
  group_by(cue, visual.bin, num) %>%
  mutate(prop_similar = unlist(all_samples(associate,
                                           N_PAIR_SAMPLED))) 

sim.responses %>%
  group_by(visual.bin, num) %>%
  multi_boot_standard(column = "prop_similar", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = visual.bin, 
             color = visual.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("Proportion Same") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Overall, this measure looks like the counted measure.

Edit distance

unranked

Using the by-word procedure you emailed.

# spread associates
d.raw.wide = d.raw %>%
  select(workerId, cue, num, associate, word.bin, visual.bin, workerId) %>%
  spread(num, associate) %>%
  rename(a1 = `1`, a2 = `2`, a3 = `3`, a4 = `4`)

# write.csv(select(d.raw.wide, workerId, cue, a1, a2, a3, a4), "all_associate_strings.csv") # for rankedComparision.py

d.raw.wide$all = paste(d.raw.wide$a1, 
                       d.raw.wide$a2, d.raw.wide$a3, d.raw.wide$a4, " ") 
d.raw.wide$all=  gsub("NA","", d.raw.wide$all)

get_edit_distance <- function(this.cue, this.workerId) {
  this.string = filter(d.raw.wide, 
                       cue == this.cue & workerId == this.workerId)$all
  all.strings = filter(d.raw.wide, 
                       cue == this.cue & workerId != this.workerId)$all
  
  this.string.int = hashr::hash(strsplit(this.string,"[[:blank:]]+"))
  all.strings.int = hashr::hash(strsplit(all.strings,"[[:blank:]]+"))
  
  edits = unlist(lapply(all.strings.int, 
                        function(x) seq_dist(x, this.string.int)))
  
  mean(edits, na.rm = T)
}

word.edits = d.raw.wide %>%
  group_by(cue, workerId) %>%
  summarize(edit_distance = get_edit_distance(cue[1], workerId[1]))  %>%
  select(cue, workerId, edit_distance) %>% 
  group_by(workerId) %>% # collapse across trials for each worker
  summarise(mean.edit.distance = mean(edit_distance, na.rm = T))

word.edits.full = left_join(word.edits, d.raw %>% ungroup() %>% group_by(workerId) %>% slice(1) %>% ungroup() %>% select(spatial, visual, word, workerId, word.bin, visual.bin), by = "workerId") 

word.edits.long = select(word.edits.full, workerId, visual, spatial, word, mean.edit.distance) %>%
  gather("measure", "value", 2:4)

ggplot(word.edits.long, aes(y= mean.edit.distance, x = value)) +
  facet_grid(.~measure) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()

kable(word.edits.long %>%
        group_by(measure) %>%
        do(tidy(cor.test(.$mean.edit.distance, .$value)))) 
measure estimate statistic p.value parameter conf.low conf.high
spatial -0.0644754 -0.5128243 0.6098677 63 -0.3036002 0.1822904
visual -0.1451754 -1.1646320 0.2485592 63 -0.3757692 0.1023477
word -0.0176941 -0.1404649 0.8887408 63 -0.2604692 0.2271854

There is not a significant correlation between edit distance and any of the processing preferences.

kable(tidy(t.test(filter(word.edits.full, word.bin == 1)$mean.edit.distance, 
                  filter(word.edits.full, visual.bin == 1)$mean.edit.distance)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
-0.0036752 3.712891 3.716566 -0.1857602 0.8532517 60.75032 -0.0432408 0.0358903

No difference in edit distance for high word and visual groups.

kable(tidy(t.test(filter(word.edits.full, word.bin == 0)$mean.edit.distance, 
                  filter(word.edits.full, word.bin == 1)$mean.edit.distance)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
0.0263285 3.739219 3.712891 1.315183 0.1932291 62.81698 -0.0136783 0.0663353

No difference in edit distance between high and low word groups.

kable(tidy(lm(mean.edit.distance~visual+ spatial + word, data = word.edits.full)))
term estimate std.error statistic p.value
(Intercept) 3.7262574 0.0102047 365.1500461 0.0000000
visual -0.0144586 0.0135385 -1.0679658 0.2897442
spatial -0.0037199 0.0125341 -0.2967822 0.7676410
word -0.0018809 0.0130628 -0.1439900 0.8859831

Predicting edit distance with all the individual differences, none are significant predictors.

ranked

This is the measure taken from the rankedComparision.py script writted by Gary.

word.edits.full.ranked = d.raw %>%
  group_by(workerId) %>% # collapse across trials for each worker
  summarise(mean.rankDiff = mean(rankDiff, na.rm = T),
            mean.numWordMatch = mean(numWordMatch, na.rm = T)) %>%  
  left_join(d.raw %>% ungroup() %>%group_by(workerId) %>% slice(1)%>% ungroup() %>% select(workerId, word.bin, visual.bin, word, visual, spatial), by = "workerId") 

word.edits.long.ranked = select(word.edits.full.ranked, workerId, visual,
                                spatial, word, mean.numWordMatch, mean.rankDiff) %>%
  gather("measure", "value", 2:4)

ggplot(word.edits.long.ranked, aes(y = mean.rankDiff, x = value)) +
  facet_grid(.~measure) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Rank edit difference") +
  theme_bw()

kable(word.edits.long.ranked %>%
        group_by(measure) %>%
        do(tidy(cor.test(.$mean.rankDiff, .$value)))) 
measure estimate statistic p.value parameter conf.low conf.high
spatial -0.1141937 -0.9123523 0.3650615 63 -0.3483895 0.1334215
visual -0.2232166 -1.8175867 0.0738831 63 -0.4429979 0.0218733
word -0.0722010 -0.5745775 0.5676233 63 -0.3106300 0.1747758

There is not a significant correlation between edit distance and any of the processing preferences.

kable(tidy(t.test(filter(word.edits.full.ranked, word.bin == 1)$mean.rankDiff, 
                  filter(word.edits.full.ranked, visual.bin == 1)$mean.rankDiff)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
-0.0015179 2.493409 2.494927 -0.1582802 0.8747659 60.18372 -0.0206996 0.0176638

No difference in edit distance for high word and visual groups.

kable(tidy(t.test(filter(word.edits.full.ranked, word.bin == 0)$mean.rankDiff, 
                  filter(word.edits.full.ranked, word.bin == 1)$mean.rankDiff)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
0.0190235 2.512433 2.493409 1.646336 0.1053344 55.6191 -0.0041276 0.0421746

No difference in edit distance between high and low word groups.

kable(tidy(lm(mean.rankDiff~visual+ spatial + word, data = word.edits.full.ranked)))
term estimate std.error statistic p.value
(Intercept) 2.5030672 0.0058460 428.1675984 0.0000000
visual -0.0127867 0.0077558 -1.6486556 0.1043597
spatial -0.0042854 0.0071804 -0.5968203 0.5528361
word -0.0044232 0.0074833 -0.5910723 0.5566559

Predicting edit distance with all the individual differences, none are significant predictors.

Frequency

Using Lg10WF.

d.raw %>%
  group_by(workerId, word.bin, num) %>%
  summarize(log.freq = mean(Lg10WF, na.rm = T)) %>%
  group_by(word.bin, num) %>%
  multi_boot_standard(column = "log.freq", na.rm = T) %>%
  ggplot(aes(x = num, y = mean, group = word.bin, 
             color = word.bin)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  xlab("Associate") +
  ylab("log frequency") +
  geom_point() + 
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Forward strength (Nelson)

Word - non log transformed

FS.by.participant = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(forwardStrength = mean(forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, word.bin == 1)$forwardStrength, 
                  filter(FS.by.participant, word.bin == 0)$forwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
0.0088468 0.1456551 0.1368083 1.89681 0.0627037 59.57996 -0.000484 0.0181776
summary(lmer(forwardStrength~word+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: forwardStrength ~ word + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -3984.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9723 -0.5354 -0.1128  0.3927  4.3775 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  workerId (Intercept) 4.706e-04 0.021693      
##           num         4.811e-05 0.006936 -1.00
##  cue      (Intercept) 8.298e-03 0.091091      
##  Residual             2.388e-02 0.154536      
## Number of obs: 4650, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.228973   0.015692  14.592
## word         0.003569   0.003085   1.157
## num         -0.037061   0.002237 -16.571
## 
## Correlation of Fixed Effects:
##      (Intr) word  
## word  0.000       
## num  -0.333  0.008

This is marginal.

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, word.quant) %>%
  summarize(forwardStrength = mean(forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, group = word.quant, 
             fill = word.quant)) +
  xlab("Word Quantile") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, word) %>%
  summarize(forwardStrength = mean(forwardStrength, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = word, y = forwardStrength)) +
  xlab("Word Quantile") +
  ylab("Forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(cor.test(FS.by.participant$forwardStrength, FS.by.participant$word)))
estimate statistic p.value parameter conf.low conf.high
0.2010658 1.629182 0.1082652 63 -0.0450422 0.424164

Visual - non log transformed

FS.by.participant = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(forwardStrength = mean(forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "forwardStrength", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

  kable(tidy(t.test(filter(FS.by.participant, visual.bin == 1)$forwardStrength, 
                  filter(FS.by.participant, visual.bin == 0)$forwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
0.0032322 0.1428543 0.1396221 0.6783044 0.5000751 62.68405 -0.0062911 0.0127555
summary(lmer(forwardStrength~visual+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: forwardStrength ~ visual + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -3983.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9773 -0.5340 -0.1135  0.3926  4.3726 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  workerId (Intercept) 0.0005216 0.022838      
##           num         0.0000519 0.007204 -1.00
##  cue      (Intercept) 0.0082980 0.091093      
##  Residual             0.0238787 0.154527      
## Number of obs: 4650, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.2289642  0.0157177  14.567
## visual      -0.0009031  0.0032081  -0.282
## num         -0.0370787  0.0022495 -16.483
## 
## Correlation of Fixed Effects:
##        (Intr) visual
## visual -0.006       
## num    -0.336  0.005

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, visual.quant) %>%
  summarize(forwardStrength = mean(forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(visual.quant) %>%
  multi_boot_standard(column = "forwardStrength", na.rm = T) %>%
  ggplot(aes(x = visual.quant, y = mean, group = visual.quant, 
             fill = visual.quant)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Word - log transformed

Forward strengths are skewed. Let’s try log transforming.

ggplot(d.raw, aes(x = log(forwardStrength))) +
         geom_histogram() +
         theme_bw()

d.raw$log.forwardStrength = log(d.raw$forwardStrength)
FS.by.participant = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Associate") +
  ylab("Log Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, word.bin == 1)$log.forwardStrength, 
                  filter(FS.by.participant, word.bin == 0)$log.forwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0694542 -2.610892 -2.680347 2.013825 0.0487148 57.50959 0.000405 0.1385035 Welch Two Sample t-test two.sided
summary(lmer(log.forwardStrength~word+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.forwardStrength ~ word + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: 13697.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6884 -0.7391 -0.0161  0.7117  3.3780 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  workerId (Intercept) 0.022936 0.15145       
##           num         0.001868 0.04322  -1.00
##  cue      (Intercept) 0.203911 0.45157       
##  Residual             1.077435 1.03800       
## Number of obs: 4650, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -2.04561    0.08224 -24.873
## word         0.03100    0.02143   1.447
## num         -0.26474    0.01486 -17.814
## 
## Correlation of Fixed Effects:
##      (Intr) word  
## word  0.000       
## num  -0.428  0.008

This is marginal.

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, word.quant) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, group = word.quant, 
             fill = word.quant)) +
  xlab("Word Quantile") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, word) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = word, y = log.forwardStrength)) +
  xlab("Word Quantile") +
  ylab("Forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(cor.test(FS.by.participant$log.forwardStrength, FS.by.participant$word)))
estimate statistic p.value parameter conf.low conf.high method alternative
0.2400805 1.962992 0.0540657 63 -0.0040561 0.4572076 Pearson’s product-moment correlation two.sided

Visual -log transformed

FS.by.participant = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, visual.bin == 1)$log.forwardStrength, 
                  filter(FS.by.participant, visual.bin == 0)$log.forwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0359891 -2.627329 -2.663318 1.030719 0.306895 58.78184 -0.0338841 0.1058623 Welch Two Sample t-test two.sided
summary(lmer(log.forwardStrength~visual+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.forwardStrength ~ visual + num + (num | workerId) + (1 |  
##     cue)
##    Data: d.raw
## 
## REML criterion at convergence: 13699.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6939 -0.7352 -0.0206  0.7135  3.3691 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  workerId (Intercept) 0.026763 0.16359       
##           num         0.002104 0.04587  -1.00
##  cue      (Intercept) 0.204127 0.45180       
##  Residual             1.077159 1.03786       
## Number of obs: 4650, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -2.04596    0.08264 -24.759
## visual      -0.00464    0.02244  -0.207
## num         -0.26485    0.01498 -17.678
## 
## Correlation of Fixed Effects:
##        (Intr) visual
## visual -0.007       
## num    -0.434  0.004

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, visual.quant) %>%
  summarize(log.forwardStrength = mean(log.forwardStrength, na.rm = T))

FS.by.participant %>%
  group_by(visual.quant) %>%
  multi_boot_standard(column = "log.forwardStrength", na.rm = T) %>%
  ggplot(aes(x = visual.quant, y = mean, group = visual.quant, 
             fill = visual.quant)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Forward strength (SWOW)

Word - non log transformed

swow.FS = read.csv("SWOW_CP/mean_SWOW_CP.csv")
swow.FS.weights = read.csv("SWOW_CP/SWOW_associate_weights.csv")

d.raw = d.raw %>%
        mutate(bigram = paste(cue, associate)) %>%
        left_join(swow.FS) %>% # merge in cps
        left_join(swow.FS.weights, by=c("num" = "associate")) %>% # merge in weights
        mutate(swow.weighted.FS = trans.prob * weights) # get weighted cp

ggplot(d.raw, aes(x = swow.weighted.FS)) +
         geom_histogram() +
         theme_bw() +
         ggtitle("weighted forward strengths")

FS.by.participant = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(forwardStrengths.swow = mean(swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, word.bin == 1)$forwardStrengths.swow, 
                  filter(FS.by.participant, word.bin == 0)$forwardStrengths.swow)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0002752 0.0039151 0.0036399 2.339964 0.0231573 52.02398 3.92e-05 0.0005111 Welch Two Sample t-test two.sided
summary(lmer(swow.weighted.FS~word+num+(num|workerId)+(1|cue),data=d.raw)) # log values don't converge here - but this may be what we want?s
## Linear mixed model fit by REML ['lmerMod']
## Formula: swow.weighted.FS ~ word + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -62071.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7584 -0.6163 -0.0690  0.4064  4.1859 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 1.858e-06 0.0013631      
##           num         1.689e-07 0.0004109 -1.00
##  cue      (Intercept) 3.689e-06 0.0019208      
##  Residual             2.655e-05 0.0051526      
## Number of obs: 8094, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  1.192e-02  3.762e-04   31.68
## word         7.449e-05  8.504e-05    0.88
## num         -3.407e-03  7.232e-05  -47.11
## 
## Correlation of Fixed Effects:
##      (Intr) word  
## word -0.001       
## num  -0.548  0.006

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, word.quant) %>%
  summarize(forwardStrengths.swow = mean(swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, group = word.quant, 
             fill = word.quant)) +
  xlab("Word Quantile") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, word) %>%
  summarize(forwardStrength = mean(swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = word, y = forwardStrength)) +
  xlab("Word") +
  ylab("Forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(cor.test(FS.by.participant$forwardStrength, FS.by.participant$word)))
estimate statistic p.value parameter conf.low conf.high method alternative
0.2256263 1.838255 0.0707401 63 -0.0193369 0.4450351 Pearson’s product-moment correlation two.sided

Visual-non log transformed

FS.by.participant = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(forwardStrengths.swow = mean(swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, visual.bin == 1)$forwardStrengths.swow, 
                  filter(FS.by.participant, visual.bin == 0)$forwardStrengths.swow)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0001258 0.0038412 0.0037154 1.033532 0.3053198 62.78353 -0.0001174 0.0003689 Welch Two Sample t-test two.sided
summary(lmer(swow.weighted.FS~visual+num+(num|workerId)+(1|cue),data=d.raw)) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: swow.weighted.FS ~ visual + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -62071.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7499 -0.6138 -0.0707  0.4034  4.1868 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 1.892e-06 0.0013754      
##           num         1.707e-07 0.0004131 -1.00
##  cue      (Intercept) 3.689e-06 0.0019208      
##  Residual             2.655e-05 0.0051524      
## Number of obs: 8094, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  1.192e-02  3.768e-04   31.62
## visual       5.474e-05  8.794e-05    0.62
## num         -3.407e-03  7.250e-05  -46.99
## 
## Correlation of Fixed Effects:
##        (Intr) visual
## visual  0.000       
## num    -0.550 -0.005

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, visual.quant) %>%
  summarize(forwardStrengths.swow = mean(swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(visual.quant) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = visual.quant, y = mean, group = visual.quant, 
             fill = visual.quant)) +
  xlab("Visual Quantile") +
  ylab("Forward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, visual) %>%
  summarize(forwardStrength = mean(swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = visual, y = forwardStrength)) +
  xlab("Visual") +
  ylab("Forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(cor.test(FS.by.participant$forwardStrength, FS.by.participant$visual)))
estimate statistic p.value parameter conf.low conf.high method alternative
0.0638193 0.5075848 0.613517 63 -0.1829271 0.3030021 Pearson’s product-moment correlation two.sided

Word -log transformed

Data are skewed, let’s log-transform and replot.

d.raw$log.swow.weighted.FS = log(d.raw$swow.weighted.FS)
ggplot(d.raw, aes(x = log.swow.weighted.FS)) +
         geom_histogram() +
         theme_bw() +
         ggtitle("weighted forward strengths")

FS.by.participant = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(forwardStrengths.swow = mean(log.swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  #geom_bar(position = "dodge", stat = "identity") +
  ylim(-7,-5)+
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, word.bin == 1)$forwardStrengths.swow, 
                  filter(FS.by.participant, word.bin == 0)$forwardStrengths.swow)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0936953 -6.395849 -6.489545 2.081713 0.0423024 52.05855 0.0033811 0.1840095 Welch Two Sample t-test two.sided
summary(lmer(log.swow.weighted.FS~word+num+(num|workerId)+(1|cue),data=d.raw)) # log values don't converge here - but this may be what we want?s
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.swow.weighted.FS ~ word + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: 20164.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.81224 -0.72345  0.09215  0.80781  2.17407 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  workerId (Intercept) 0.000000 0.00000      
##           num         0.004743 0.06887   NaN
##  cue      (Intercept) 0.124581 0.35296      
##  Residual             1.387895 1.17809      
## Number of obs: 6314, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -3.82581    0.06821  -56.09
## word         0.05208    0.02685    1.94
## num         -1.35626    0.02012  -67.40
## 
## Correlation of Fixed Effects:
##      (Intr) word  
## word  0.004       
## num  -0.467 -0.003
## convergence code: 0
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, word.quant) %>%
  summarize(forwardStrengths.swow = mean(log.swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(word.quant) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = word.quant, y = mean, group = word.quant, 
             fill = word.quant)) +
  xlab("Word Quantile") +
  ylab("Forward strength") +
  #geom_bar(position = "dodge", stat = "identity") +
  ylim(-7,-5)+
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, word) %>%
  summarize(forwardStrength = mean(log.swow.weighted.FS, na.rm = T)) 

FS.by.participant %>%
  ggplot(aes(x = word, y = forwardStrength)) +
  xlab("Word") +
  ylab("Forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(cor.test(FS.by.participant$forwardStrength, FS.by.participant$word)))
estimate statistic p.value parameter conf.low conf.high method alternative
0.1795606 1.448765 0.1523625 63 -0.0672853 0.4056925 Pearson’s product-moment correlation two.sided

Visual - log transformed

FS.by.participant = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(forwardStrengths.swow = mean(log.swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Associate") +
  ylab("Forward strength") +
  #geom_bar(position = "dodge", stat = "identity") +
  ylim(-7,-5)+
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(FS.by.participant, visual.bin == 1)$forwardStrengths.swow, 
                  filter(FS.by.participant, visual.bin == 0)$forwardStrengths.swow)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0583484 -6.412897 -6.471246 1.279863 0.2055876 59.18273 -0.0328702 0.1495671 Welch Two Sample t-test two.sided
summary(lmer(log.swow.weighted.FS~visual+num+(num|workerId)+(1|cue),data=d.raw)) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.swow.weighted.FS ~ visual + num + (num | workerId) + (1 |  
##     cue)
##    Data: d.raw
## 
## REML criterion at convergence: 20166.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7956 -0.7225  0.0915  0.8093  2.1782 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  workerId (Intercept) 0.000000 0.00000      
##           num         0.004616 0.06794   NaN
##  cue      (Intercept) 0.124609 0.35300      
##  Residual             1.388505 1.17835      
## Number of obs: 6314, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -3.82652    0.06822  -56.09
## visual       0.03897    0.02734    1.43
## num         -1.35608    0.02008  -67.54
## 
## Correlation of Fixed Effects:
##        (Intr) visual
## visual -0.002       
## num    -0.468  0.000
## convergence code: 0
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues

Quantiles

FS.by.participant = d.raw %>%
  group_by(workerId, visual.quant) %>%
  summarize(forwardStrengths.swow = mean(log.swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  group_by(visual.quant) %>%
  multi_boot_standard(column = "forwardStrengths.swow", na.rm = T) %>%
  ggplot(aes(x = visual.quant, y = mean, group = visual.quant, 
             fill = visual.quant)) +
  xlab("Visual Quantile") +
  ylab("Forward strength") +
  #geom_bar(position = "dodge", stat = "identity") + 
  ylim(-7,-5)+
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Continuous

FS.by.participant = d.raw %>%
  group_by(workerId, visual) %>%
  summarize(forwardStrength = mean(log.swow.weighted.FS, na.rm = T))

FS.by.participant %>%
  ggplot(aes(x = visual, y = forwardStrength)) +
  xlab("Visual") +
  ylab("Forward strength") +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(cor.test(FS.by.participant$forwardStrength, FS.by.participant$visual)))
estimate statistic p.value parameter conf.low conf.high method alternative
0.1198526 0.9582078 0.3416209 63 -0.1277818 0.3534206 Pearson’s product-moment correlation two.sided

Backward strength

Word - non log transformed

BS.by.participant = d.raw %>%
  group_by(workerId, word.bin) %>%
  summarize(backwardStrength = mean(backwardStrength, na.rm = T))

BS.by.participant %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "backwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Associate") +
  ylab("Backward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(BS.by.participant, word.bin == 1)$backwardStrength, 
                  filter(BS.by.participant, word.bin == 0)$backwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
-0.0034444 0.1200617 0.1235062 -0.5128392 0.6098637 62.77598 -0.016867 0.0099782 Welch Two Sample t-test two.sided
summary(lmer(backwardStrength~word+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: backwardStrength ~ word + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -2980.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3118 -0.4587 -0.1067  0.1687  4.9219 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  workerId (Intercept) 6.400e-04 0.025299      
##           num         8.219e-05 0.009066 -0.90
##  cue      (Intercept) 1.900e-02 0.137855      
##  Residual             2.769e-02 0.166390      
## Number of obs: 4273, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.146964   0.023066   6.371
## word        -0.003025   0.003767  -0.803
## num         -0.007831   0.002589  -3.024
## 
## Correlation of Fixed Effects:
##      (Intr) word  
## word  0.001       
## num  -0.258  0.003

Visual - non log transformed

BS.by.participant = d.raw %>%
  group_by(workerId, visual.bin) %>%
  summarize(backwardStrength = mean(backwardStrength, na.rm = T))

BS.by.participant %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "backwardStrength", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Associate") +
  ylab("Backward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(BS.by.participant, visual.bin == 1)$backwardStrength, 
                  filter(BS.by.participant, visual.bin == 0)$backwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0045679 0.1241998 0.1196319 0.6802049 0.4988805 62.65541 -0.0088534 0.0179893 Welch Two Sample t-test two.sided
summary(lmer(backwardStrength~visual+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: backwardStrength ~ visual + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -2984
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2686 -0.4621 -0.1077  0.1720  4.9286 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  workerId (Intercept) 5.948e-04 0.024388      
##           num         8.398e-05 0.009164 -0.91
##  cue      (Intercept) 1.899e-02 0.137820      
##  Residual             2.768e-02 0.166376      
## Number of obs: 4273, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.146792   0.023046   6.370
## visual       0.007619   0.003773   2.019
## num         -0.007785   0.002595  -3.001
## 
## Correlation of Fixed Effects:
##        (Intr) visual
## visual -0.004       
## num    -0.257  0.007

Word - log transformed

d.raw$log.backwardStrength = log(d.raw$backwardStrength)

BS.by.participant = d.raw %>%
  filter(is.finite(log.backwardStrength )) %>%
  group_by(workerId, word.bin) %>%
  summarize(log.backwardStrength = mean(log.backwardStrength, na.rm = T)) 

BS.by.participant %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "log.backwardStrength", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Associate") +
  ylab("Backward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(BS.by.participant, word.bin == 1)$log.backwardStrength, 
                  filter(BS.by.participant, word.bin == 0)$log.backwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
-0.0356786 -2.141023 -2.105344 -0.5409371 0.5904694 62.64453 -0.167498 0.0961407 Welch Two Sample t-test two.sided

Visual - log transformed

BS.by.participant = d.raw %>%
  filter(is.finite(log.backwardStrength )) %>%
  group_by(workerId, visual.bin) %>%
  summarize(log.backwardStrength = mean(log.backwardStrength, na.rm = T))

BS.by.participant %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "log.backwardStrength", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Associate") +
  ylab("Backward strength") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

kable(tidy(t.test(filter(BS.by.participant, visual.bin == 1)$log.backwardStrength, 
                  filter(BS.by.participant, visual.bin == 0)$log.backwardStrength)))
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0294428 -2.107508 -2.136951 0.4484982 0.6553421 62.60762 -0.1017594 0.160645 Welch Two Sample t-test two.sided
summary(lmer(backwardStrength~visual+num+(num|workerId)+(1|cue),data=d.raw))
## Linear mixed model fit by REML ['lmerMod']
## Formula: backwardStrength ~ visual + num + (num | workerId) + (1 | cue)
##    Data: d.raw
## 
## REML criterion at convergence: -2984
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2686 -0.4621 -0.1077  0.1720  4.9286 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  workerId (Intercept) 5.948e-04 0.024388      
##           num         8.398e-05 0.009164 -0.91
##  cue      (Intercept) 1.899e-02 0.137820      
##  Residual             2.768e-02 0.166376      
## Number of obs: 4273, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.146792   0.023046   6.370
## visual       0.007619   0.003773   2.019
## num         -0.007785   0.002595  -3.001
## 
## Correlation of Fixed Effects:
##        (Intr) visual
## visual -0.004       
## num    -0.257  0.007

Semantic similarity measure - sentence method

Based on word bins

i.e. How similar is a participant to other high/low word participants?

#write.csv(select(d.raw.wide, workerId, cue, a1, a2, a3, a4, word.bin, visual.bin), "all_associate_strings_bins.csv",  row.names=FALSE) # for semantic similarity.py

#freqs = select(d.raw, associate, Lg10WF) %>% 
#  mutate(Lg10WF = ifelse(is.na(Lg10WF), 0, Lg10WF))  %>%
#        unique()write.csv(freqs, "freqs.csv",  row.names=FALSE)

py.sim.measures = read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_word.csv")

sim.long.word = py.sim.measures %>%
  gather("measure", "word_name", 3:6) %>%
  left_join(d.raw %>% group_by(workerId) %>% slice(1) %>% ungroup () %>% select(visual, word, workerId)) %>%
  left_join(d.raw %>% select(associate, cue, workerId, num))

sim.by.participant.word = sim.long.word %>%
  group_by(workerId, word.bin, word) %>%
  summarize(sim.word.bin = mean(sim.word.bin, na.rm = T)) %>%
  ungroup() %>%
  mutate(word.bin = as.factor(word.bin))

sim.by.participant.word %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "sim.word.bin", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Word bin") +
  ylab("Overall similarity") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position ="none")

t.test:

word.t.test = tidy(t.test(filter(sim.by.participant.word, word.bin == 1)$sim.word.bin, 
                  filter(sim.by.participant.word, word.bin == 0)$sim.word.bin))
kable(word.t.test)
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0211509 0.4586593 0.4375084 3.12636 0.0027021 61.49797 0.007625 0.0346768 Welch Two Sample t-test two.sided

Effect size:

es_all = tes(word.t.test$statistic, nrow(filter(sim.by.participant.word, word.bin == 1)), nrow(filter(sim.by.participant.word, word.bin == 0)), verbose = F)
es_all$d
## [1] 0.78
summary(lmer(sim.word.bin~word + visual + num + (num|workerId) + (1|cue), data=sim.long.word)) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: sim.word.bin ~ word + visual + num + (num | workerId) + (1 |  
##     cue)
##    Data: sim.long.word
## 
## REML criterion at convergence: -92907.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.4994 -0.6164  0.1016  0.7244  2.7114 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr
##  workerId (Intercept) 8.172e-04 2.859e-02     
##           num         6.361e-18 2.522e-09 1.00
##  cue      (Intercept) 3.642e-03 6.035e-02     
##  Residual             5.842e-03 7.643e-02     
## Number of obs: 40560, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 4.479e-01  1.034e-02   43.34
## word        5.478e-03  4.564e-03    1.20
## visual      7.117e-03  4.642e-03    1.53
## num         1.902e-15  3.395e-04    0.00
## 
## Correlation of Fixed Effects:
##        (Intr) word   visual
## word    0.000              
## visual  0.000 -0.002       
## num    -0.082  0.000  0.000
summary(glmer(sim.word.bin~word.bin + visual.bin + num + (num|workerId) + (1|cue), data=sim.long.word))
## Linear mixed model fit by REML ['lmerMod']
## Formula: sim.word.bin ~ word.bin + visual.bin + num + (num | workerId) +  
##     (1 | cue)
##    Data: sim.long.word
## 
## REML criterion at convergence: -92915.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.5002 -0.6160  0.1014  0.7249  2.7095 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 7.401e-04 2.720e-02      
##           num         3.027e-17 5.501e-09 -1.00
##  cue      (Intercept) 3.642e-03 6.035e-02      
##  Residual             5.842e-03 7.643e-02      
## Number of obs: 40560, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  4.344e-01  1.130e-02   38.44
## word.bin     2.125e-02  6.793e-03    3.13
## visual.bin   6.362e-03  6.799e-03    0.94
## num         -3.140e-16  3.395e-04    0.00
## 
## Correlation of Fixed Effects:
##            (Intr) wrd.bn vsl.bn
## word.bin   -0.301              
## visual.bin -0.292  0.016       
## num        -0.075  0.000  0.000

Based on visual bins

py.sim.measures.visual = read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_visual.csv")

sim.long.vis = py.sim.measures.visual %>%
  gather("measure", "associate", 3:6) %>%
  left_join(d.raw %>% group_by(workerId) %>% slice(1) %>% ungroup () %>% select(visual, workerId, word)) %>%
  left_join(d.raw %>% select(associate, cue, workerId, num))

sim.by.participant.vis = sim.long.vis %>%
  group_by(workerId, visual.bin) %>%
  summarize(sim.visual.bin = mean(sim.visual.bin, na.rm = T)) %>%
  ungroup() %>%
  mutate(visual.bin = as.factor(visual.bin))

sim.by.participant.vis %>%
  group_by(visual.bin) %>%
  multi_boot_standard(column = "sim.visual.bin", na.rm = T) %>%
  ggplot(aes(x = visual.bin, y = mean, group = visual.bin, 
             fill = visual.bin)) +
  xlab("Visual bin") +
  ylab("Overall similarity") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position ="none")

t.test:

visual.t.test = tidy(t.test(filter(sim.by.participant.vis, visual.bin == 1)$sim.visual.bin, 
                  filter(sim.by.participant.vis, visual.bin == 0)$sim.visual.bin))
kable(visual.t.test)
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0158635 0.4557792 0.4399157 2.301396 0.0247314 62.21466 0.0020856 0.0296414 Welch Two Sample t-test two.sided

effect size:

es_all = tes(visual.t.test$statistic, nrow(filter(sim.by.participant.vis, visual.bin == 1)), nrow(filter(sim.by.participant.vis, visual.bin == 0)), verbose = F)
es_all$d
## [1] 0.57
summary(lmer(sim.visual.bin ~ visual + word + num + (num|workerId) + (1|cue), data = sim.long.vis)) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: sim.visual.bin ~ visual + word + num + (num | workerId) + (1 |  
##     cue)
##    Data: sim.long.vis
## 
## REML criterion at convergence: -22937.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.2922 -0.6091  0.1018  0.7087  2.4636 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 7.489e-04 2.737e-02      
##           num         1.422e-17 3.771e-09 -0.90
##  cue      (Intercept) 3.614e-03 6.012e-02      
##  Residual             5.846e-03 7.646e-02      
## Number of obs: 10140, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  4.475e-01  1.038e-02   43.13
## visual       1.114e-02  4.528e-03    2.46
## word        -1.038e-03  4.451e-03   -0.23
## num         -2.230e-15  6.791e-04    0.00
## 
## Correlation of Fixed Effects:
##        (Intr) visual word  
## visual  0.000              
## word    0.000 -0.002       
## num    -0.164  0.000  0.000
summary(glmer(sim.visual.bin ~ visual.bin + word.bin + num + (num|workerId) + (1|cue), data=sim.long.vis))
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## sim.visual.bin ~ visual.bin + word.bin + num + (num | workerId) +  
##     (1 | cue)
##    Data: sim.long.vis
## 
## REML criterion at convergence: -22940
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.2947 -0.6071  0.1017  0.7087  2.4582 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 7.361e-04 2.713e-02      
##           num         3.465e-16 1.861e-08 -1.00
##  cue      (Intercept) 3.614e-03 6.012e-02      
##  Residual             5.846e-03 7.646e-02      
## Number of obs: 10140, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  4.351e-01  1.141e-02   38.13
## visual.bin   1.602e-02  6.908e-03    2.32
## word.bin     9.665e-03  6.901e-03    1.40
## num         -3.450e-15  6.791e-04    0.00
## 
## Correlation of Fixed Effects:
##            (Intr) vsl.bn wrd.bn
## visual.bin -0.294              
## word.bin   -0.302  0.016       
## num        -0.149  0.000  0.000

t-tests are reliable for both word and visual, but effect size is bigger for word.

Predicting word.binned similaritiy with both bins, word but not visual bin is significant. Continuous analysis not reliable.

Predicting visual.binned similarity with visual is a siginficant predictor in both binned and continuous analysis.

Semantic similarity measure - by-word-method

Based on word bins

py.sim.measures.byword = read.csv("semantic_similarity/all_associate_strings_bins_subtlexus_P_word_associate_wise.csv")

sim.long.word = py.sim.measures.byword %>%
  gather("measure", "word_name", 3:6) %>%
  left_join(d.raw %>% group_by(workerId) %>% slice(1) %>% ungroup () %>% select(visual, word, workerId)) %>%
  left_join(d.raw %>% select(associate, cue, workerId, num))

sim.by.participant.word = sim.long.word %>%
  group_by(workerId, word.bin) %>%
  summarize(sim.word.bin = mean(sim.word.bin, na.rm = T)) %>%
  ungroup() %>%
  mutate(word.bin = as.factor(word.bin))

sim.by.participant.word %>%
  group_by(word.bin) %>%
  multi_boot_standard(column = "sim.word.bin", na.rm = T) %>%
  ggplot(aes(x = word.bin, y = mean, group = word.bin, 
             fill = word.bin)) +
  xlab("Word bin") +
  ylab("Overall similarity") +
  geom_bar(position = "dodge", stat = "identity") + 
  theme_bw(base_size = 15) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position ="none")

t.test:

word.t.test = tidy(t.test(filter(sim.by.participant.word, word.bin == 1)$sim.word.bin, 
                  filter(sim.by.participant.word, word.bin == 0)$sim.word.bin))
kable(word.t.test)
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
0.0022976 0.4378299 0.4355323 0.3072799 0.75965 62.61635 -0.0126462 0.0172413 Welch Two Sample t-test two.sided

Effect size:

es_all = tes(word.t.test$statistic, nrow(filter(sim.by.participant.word, word.bin == 1)), nrow(filter(sim.by.participant.word, word.bin == 0)), verbose = F)
es_all$d
## [1] 0.08
summary(lmer(sim.word.bin~word + visual + num + (num|workerId) + (1|cue), data=sim.long.word)) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: sim.word.bin ~ word + visual + num + (num | workerId) + (1 |  
##     cue)
##    Data: sim.long.word
## 
## REML criterion at convergence: -81439.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9972 -0.6403 -0.0072  0.6381  5.4906 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 8.367e-04 2.893e-02      
##           num         1.656e-18 1.287e-09 -1.00
##  cue      (Intercept) 4.779e-03 6.913e-02      
##  Residual             7.748e-03 8.802e-02      
## Number of obs: 40544, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  4.367e-01  1.169e-02   37.37
## word        -7.719e-04  4.626e-03   -0.17
## visual       1.127e-02  4.705e-03    2.40
## num         -7.859e-13  3.910e-04    0.00
## 
## Correlation of Fixed Effects:
##        (Intr) word   visual
## word    0.000              
## visual  0.000 -0.002       
## num    -0.084  0.000  0.000
summary(glmer(sim.word.bin~word.bin + visual.bin + num + (num|workerId) + (1|cue), data=sim.long.word))
## Linear mixed model fit by REML ['lmerMod']
## Formula: sim.word.bin ~ word.bin + visual.bin + num + (num | workerId) +  
##     (1 | cue)
##    Data: sim.long.word
## 
## REML criterion at convergence: -81439
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9995 -0.6396 -0.0065  0.6377  5.4909 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  workerId (Intercept) 8.688e-04 2.948e-02      
##           num         2.061e-17 4.540e-09 -1.00
##  cue      (Intercept) 4.780e-03 6.913e-02      
##  Residual             7.748e-03 8.802e-02      
## Number of obs: 40544, groups:  workerId, 65; cue, 39
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  4.291e-01  1.277e-02   33.61
## word.bin     2.601e-03  7.366e-03    0.35
## visual.bin   1.317e-02  7.373e-03    1.79
## num         -2.829e-12  3.910e-04    0.00
## 
## Correlation of Fixed Effects:
##            (Intr) wrd.bn vsl.bn
## word.bin   -0.288              
## visual.bin -0.280  0.016       
## num        -0.077  0.000  0.000