Across Experiments (ranked image comparisons)

mem <- read_csv("../memorability/memorability_task_cleaned_byImg.csv")
mem <- mem %>% select(image_code, fa_r) 
mem <- mem[order(mem$fa_r, mem$image_code, decreasing = TRUE),]

# centering by subject
f_rate <- read_csv("familiarity_image_response.csv")
f_rate <- f_rate %>%
  group_by(image_code) %>%
  summarise(avg = mean(response - ave(f_rate$response, f_rate$subject_id))) %>% 
  filter(! is.na(image_code)) 
f_rate <- f_rate[order(f_rate$avg, f_rate$image_code, decreasing = TRUE),]

f_fc <- read_csv("../fam_2AFC/familiarity_fc_cleaned.csv")
fc_seen <- table(f_fc$left) + table(f_fc$right)
f_fc <- f_fc$chosen %>% table() %>% as.data.frame()
f_fc$rate <- f_fc$Freq / fc_seen
rm(fc_seen)
f_fc$image_code <- gsub("[^0-9]", "", f_fc$.) %>% as.numeric()
f_fc <- f_fc %>%
  select(image_code, rate)
f_fc <- f_fc[order(f_fc$rate, f_fc$image_code, decreasing = TRUE),]
f_fc$image_code <- f_fc$image_code %>% as.numeric()

image_code <- f_rate[order(f_rate$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything

rank_dt <- data.frame(image_code = image_code, 
                      memorability_rank = mem$image_code,
                      memorability_index = match(image_code, mem$image_code),
                      rating_rank = f_rate$image_code,
                      rating_index = match(image_code, f_rate$image_code),
                      fc_rank = f_fc$image_code,
                      fc_index = match(image_code, f_fc$image_code))


cor.test(x = rank_dt$memorability_index, y = rank_dt$rating_index, method = 'kendall')

## 
##  Kendall's rank correlation tau
## 
## data:  rank_dt$memorability_index and rank_dt$rating_index
## z = 4.4017, p-value = 1.074e-05
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.2985859

cor.test(x = rank_dt$fc_index, y = rank_dt$rating_index, method = 'kendall')

## 
##  Kendall's rank correlation tau
## 
## data:  rank_dt$fc_index and rank_dt$rating_index
## z = 7.07, p-value = 1.549e-12
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##      tau 
## 0.479596

cor.test(x = rank_dt$fc_index, y = rank_dt$memorability_index, method = 'kendall')

## 
##  Kendall's rank correlation tau
## 
## data:  rank_dt$fc_index and rank_dt$memorability_index
## z = 3.3772, p-value = 0.0007323
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.2290909

Doing rank-by-rank images

image_code <- f_rate[order(f_rate$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything

rank_dt <- data.frame(image_code = image_code, 
                      memorability_rank = mem$image_code,
                      memorability_index = match(image_code, mem$image_code),
                      rating_rank = f_rate$image_code,
                      rating_index = match(image_code, f_rate$image_code),
                      fc_rank = f_fc$image_code,
                      fc_index = match(image_code, f_fc$image_code))

cor.test(x = rank_dt$memorability_index, y = rank_dt$rating_index, method = 'kendall')

## 
##  Kendall's rank correlation tau
## 
## data:  rank_dt$memorability_index and rank_dt$rating_index
## z = 4.4017, p-value = 1.074e-05
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.2985859

cor.test(x = rank_dt$fc_index, y = rank_dt$rating_index, method = 'kendall')

## 
##  Kendall's rank correlation tau
## 
## data:  rank_dt$fc_index and rank_dt$rating_index
## z = 7.07, p-value = 1.549e-12
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##      tau 
## 0.479596

cor.test(x = rank_dt$fc_index, y = rank_dt$memorability_index, method = 'kendall')

## 
##  Kendall's rank correlation tau
## 
## data:  rank_dt$fc_index and rank_dt$memorability_index
## z = 3.3772, p-value = 0.0007323
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.2290909

# Memorability by Rating

p1 <- ggplot(data = rank_dt, aes(x = memorability_index,
                           y = rating_index)) +
    geom_point(aes(alpha = .5)) +
    geom_smooth(method = "lm") +
    geom_text(aes(label = image_code)) +
  theme(legend.position = "none") +
  xlab("Memorability Ranking") + ylab("Subjective Rating Ranking") +
  labs(title = "Image Ranking: Memorability x Subjective Rating",
       subtitle = "Kendall tau-b correlation = .30")


p2 <- ggplot(data = rank_dt, aes(x = memorability_index,
                           y = fc_index)) +
  geom_point(aes(alpha = .5)) +
  geom_smooth(method = "lm") +
  geom_text(aes(label = image_code)) +
  theme(legend.position = "none") +
  xlab("Memorability Ranking") + ylab("Forced Choice Ranking") +
  labs(title = "Memorability x Forced Choice",
       subtitle = "Kendall tau-b correlation = .23") 


p3 <- ggplot(data = rank_dt, aes(x = rating_index,
                           y = fc_index)) +
  geom_point(aes(alpha = .5)) +
  geom_smooth(method = "lm") +
  geom_text(aes(label = image_code)) +
  theme(legend.position = "none") +
  xlab("Subjective Rating Ranking") + ylab("Forced Choice Ranking") +
  labs(title = "Subjective Rating x Forced Choice",
       subtitle = "Kendall tau-b correlation = ..48",
       caption = "point labels = image IDs") 

plot_grid(p1, p2, p3, nrow = 1)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Memorability & Familiarity

Face images can be more or less intrinsically “memorable” (Bainbridge et al., 2016) – can they be intrinsically familiar as well, possessing some property that is consistently deemed “familiar” across observers?

We were interested in whether images with salient familiarity signals (based on false alarm rates in the memorability task) would correspond to other, more explicit measures of familiarity. We expect consistent memorability and familiarity scores for each image across individuals.

Been looking at data quality issues. I’m thinking about removing people that have false alarms/misses that are above/below two standard deviations from the mean. How to deal with people that don’t have any false alarms?

Calculating Sensitivity (d’) and Response Bias

d’= someone’s ability to distinguish whether they’ve seen something before or not (“seen” signal is present or absent) aka sensitivity to the signal

c = response bias! How likely they are to just… say yes

We could regress d’ or c onto the strategy scores! We would use them like normal DVs in our regression.

By ID

rates_id <- d %>%
  group_by(id) %>%
    summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
              fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))

sdt_indices_id <- psycho::dprime(n_hit = counts_id$hit,
               n_fa = counts_id$fa,
               n_miss = counts_id$miss,
               n_cr = counts_id$cr,
               adjusted = TRUE)

rates_id$dprime <- sdt_indices_id$dprime
rates_id$c <- sdt_indices_id$c


### OLD WAY  (about the same, just doesn't handle NAs) ###
# first we'll calculate d'
# rates_id$dprime <- qnorm(rates_id$hit_r) - qnorm(rates_id$fa_r)

# next we'll calculate sensitivity 
#rates_id$sensi <- -.5 * (qnorm(rates_id$hit_r) + qnorm(rates_id$fa_r))
### !! ### !! ###

By Image

rates_img <- d %>%
  group_by(image_code) %>%
    summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
              fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))


sdt_indices_img <- psycho::dprime(n_hit = counts_img$hit,
               n_fa = counts_img$fa,
               n_miss = counts_img$miss,
               n_cr = counts_img$cr,
               adjusted = TRUE)

rates_img$dprime <- sdt_indices_img$dprime
rates_img$c <- sdt_indices_img$c

### !! OLD WAY !! ###
# rates_img$dprime <- qnorm(rates_img$hit_r) - qnorm(rates_img$fa_r)
# rates_img$sensi <- -.5 * (qnorm(rates_img$hit_r) + qnorm(rates_img$fa_r))
### !! ### !! ###

Split Half Ranked Correlations (“Reliability”)

# make lists for FA and hitS
fa_cor_list <- list()
hit_cor_list <- list()
d_cor_list <- list()

for (i in 1:100){
  set.seed(i+10)
  shuffled_ids <- d$id %>% unique() %>% sample(size = length(unique(d$id)))
  end <- length(shuffled_ids) 
  split <- length(shuffled_ids)/2
  
  ids_1 <- data.frame(id = shuffled_ids[1:split])
  ids_2 <- data.frame(id = shuffled_ids[(split+1):end])
  
  half_1 <- d[d$id %in% ids_1$id,]
  half_2 <- d[d$id %in% ids_2$id,]
  
  
  list_1 <- half_1 %>%
    group_by(image_code) %>%
    summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
              fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
  
  list_2  <- half_2 %>%
    group_by(image_code) %>%
    summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
              fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
  
  
  counts_1 <- half_1 %>% 
  group_by(image) %>%
  summarise(fa = sum(perf_str == "FA"),
            hit = sum(perf_str == "hit"),
            cr = sum(perf_str == "CR"),
            miss = sum(perf_str == "miss"),
            seen = n()) 
  
  
  counts_2 <- half_2 %>% 
  group_by(image) %>%
  summarise(fa = sum(perf_str == "FA"),
            hit = sum(perf_str == "hit"),
            cr = sum(perf_str == "CR"),
            miss = sum(perf_str == "miss"),
            seen = n()) 
  
  sdt_1 <- psycho::dprime(n_hit = counts_1$hit,
               n_fa = counts_1$fa,
               n_miss = counts_1$miss,
               n_cr = counts_1$cr,
               adjusted = TRUE)
  list_1$dprime <- sdt_1$dprime
  list_1$c <- sdt_1$c
  
  sdt_2 <- psycho::dprime(n_hit = counts_2$hit,
               n_fa = counts_2$fa,
               n_miss = counts_2$miss,
               n_cr = counts_2$cr,
               adjusted = TRUE)
  list_2$dprime <- sdt_2$dprime
  list_2$c <- sdt_2$c

  
    image_code <- list_1[order(list_1$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything

    # sort by false alarm rate, then by name
  list_1 <- list_1[
    with(list_1, order(fa_r, image_code, decreasing = T)),]
  
  
  list_2 <- list_2[
    with(list_2, order(fa_r, image_code, decreasing = T)),]
  


  rank_dt <- data.frame(image_code = image_code, 
                      index_1 = match(image_code, list_1$image_code),
                      index_2 = match(image_code, list_2$image_code))
  
  
  # get ranked correlation for FA sorted list
  fa_cor_list[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate

  
  # sort by hit rate, then by name
  list_1 <- list_1[
    with(list_1, order(hit_r, image_code, decreasing = T)),]
  
  list_2 <- list_2[
    with(list_2, order(hit_r, image_code, decreasing = T)),]
  
  rank_dt <- data.frame(image_code = image_code, 
                      index_1 = match(image_code, list_1$image_code),
                      index_2 = match(image_code, list_2$image_code))
  
  
  hit_cor_list[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
  
  
  
  # sort by d-prime, then by name
  list_1 <- list_1[
    with(list_1, order(dprime, image_code, decreasing = T)),]
  
  list_2 <- list_2[
    with(list_2, order(dprime, image_code, decreasing = T)),]
  
    rank_dt <- data.frame(image_code = image_code, 
                      index_1 = match(image_code, list_1$image_code),
                      index_2 = match(image_code, list_2$image_code))
  
  
  d_cor_list[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
  
  # sort by c, then by name
  
}

# fa_cor_list %>% unlist() %>% summary()
fa_cor_list %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##     average       min       max         sd
## 1 0.2561535 0.1656566 0.3729293 0.04154242

hit_cor_list %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##     average       min       max         sd
## 1 0.2121535 0.0820202 0.3979798 0.04600192

d_cor_list %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##     average        min       max         sd
## 1 0.1934909 0.06626263 0.3220202 0.04609074

rm(half_1)
rm(half_2)
rm(ids_1)
rm(ids_2)
rm(counts_1)
rm(counts_2)
rm(sdt_1)
rm(sdt_2)

Random Permutations

# make lists for FA and hitS
image_codes <- unique(d$image_code)
rand_cor_list <- list()
for (i in 1:100){
  set.seed(i)
  list_1 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
  list_2 <- image_codes %>% as.data.frame %>% sample_n(size = 100)

  rand_cor_list[i] <- cor.test(x = list_1$., y = list_2$., method = 'kendall')$estimate
}

rand_cor_list %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##       average        min       max         sd
## 1 0.009862626 -0.1329293 0.1721212 0.05830017

rm(list_1)
rm(list_2)

Plotting the distribution of split-half ranked correlations

cor_rates_dt <- data.frame(i = c(1:100), far = unlist(fa_cor_list), hits = unlist(hit_cor_list), dprime = unlist(d_cor_list))

cor_fa_dt <- cor_rates_dt
cor_rates_dt <- pivot_longer(cor_rates_dt, c("far", "hits", "dprime"), names_to = "measure")

# to include random permutation correlation distribution
permute_dt <- data.frame(i = c(1:100), permuted_cor = unlist(rand_cor_list))

# ggplot(data = cor_rates_dt, aes(x = i, y = value, color = as.factor(measure))) +
#   geom_line(stat = "identity") + xlab("iteration") + ylab("coefficient (Kendall tau-b)") +
#   labs(title = "Distribution of Correlations: Memorability", subtitle ="100 Split-Halves of Participants; N = 93") + scale_color_brewer(palette = "Dark2") + theme(legend.title = element_blank())
#   

ggplot(data = cor_rates_dt, aes(value, color = as.factor(measure))) +
  geom_histogram(bins = 50) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
  labs(title = "Histogram of Correlations: Memorability", subtitle ="100 Split-Halves of Participants; N = 93") + scale_color_brewer(palette = "Dark2") + theme(legend.position = "none") +
  facet_wrap(~measure)

ggplot(data = cor_fa_dt, aes(far)) +
  geom_histogram(bins = 50, aes(fill = "blue")) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
  labs(title = "Histogram of Correlations: Memorability, ranked by FAR", subtitle ="100 Split-Halves of Participants; N = 93; Grey bars = Null Distribution") + theme(legend.position = "none") +
  geom_histogram(bins = 50, data = permute_dt, aes(permuted_cor, alpha = .001))

Are these significantly different distributions?

test_dt <- data.frame(far = cor_fa_dt$far, permute = permute_dt$permuted_cor)
test_dt <- pivot_longer(test_dt, c("far", "permute"), names_to = "distribution")
test_dt$distribution_c <- if_else(test_dt$distribution == "far", .5, -.5)
t.test(value ~ distribution, data = test_dt, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  value by distribution
## t = 34.404, df = 178.93, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2321646 0.2604172
## sample estimates:
##     mean in group far mean in group permute 
##           0.256153535           0.009862626

They are significantly different! Interesting!

Check for uniform distribution of FAs

# Just a histograms of false alarms (count is # of images with that false alarm rate)
fa_rates <- d %>%
  group_by(image_code) %>%
     summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
              fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")),
              seen_count = n())

ggplot(fa_rates, aes(x=fa_r)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666")

# Check out the distributions of other ratings by image!
counts_img <- d %>% 
  group_by(image) %>%
  summarise(fa_count = sum(perf_str == "FA"),
            hit_count = sum(perf_str == "hit"),
            cr_count = sum(perf_str == "CR"),
            miss_count = sum(perf_str == "miss"),
            seen = n()) 


rates_plot <- melt(counts_img[,-6])
a <- ggplot(rates_plot, aes(x=value, color = variable)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="grey") 
a

# histogram of false alarms, testing for uniformity...
fa_hist <- hist(fa_rates$fa_r, breaks = 100)

uniform.test(fa_hist)

## 
##  Chi-squared test for given probabilities
## 
## data:  hist.output$counts
## X-squared = 763.76, df = 121, p-value < 2.2e-16

Logistic Regression, Leave-One-Out by ID (“Consistency”)

# idea is for each person, for each image, use the responses from everyone else for that image to predict test response
d <- read.csv("../memorability/memorability_all_observations.csv")

variance <- list()
estimates <- list()
p_values <- list()
summary_list <- list()
j <- 1
for (id in unique(d$id)){
    curr_id <- id
    train <- d %>% filter(!id == curr_id) %>%
      group_by(image_code) %>%
      summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
                fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
    train[is.na(train)] <- 0
    
    
    test <- d %>% filter(id == curr_id) %>%
      group_by(image_code) %>%
      summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
                fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
    test[is.na(test)] <- 0
    
    dt <- data.frame(image_code = train$image_code, train_fa = train$fa_r, test_fa = test$fa_r)
    
    # try this:
    #   model_fa <- glm(test_fa ~ train_fa, data = dt, family = binomial(logit))
    
    model_fa <- glm(test_fa ~ train_fa, data = dt, family = binomial())
    temp <- model_fa %>% summary()
    
    summary_list[j] <- model_fa
    
    variance[j] <- (temp$coefficients[2,2]^2 / 100)
    estimates[j] <- temp$coefficients[2,1]
    p_values[j] <- temp$coefficients[2,4]
    
    j <- j + 1
}


# weight by inverse variance
unlist(estimates) %>% summary()

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -31.3551   0.2057   5.0443   5.4133   8.3634  32.5332

avg_b1_weighted <- unlist(estimates)[-14] * (1/unlist(variance)[-14])
avg_b1_weighted %>% summary()

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -28.8708   0.5694  14.9662  12.5627  21.2791  47.5897

unlist(p_values)[-14] %>% summary()

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0005386 0.1251106 0.3892863 0.4302077 0.7486809 1.0000000

“Consistency” - As Recommended by Markus Brauer

“By averaging across participants the authors are basically ignoring the variability due to participants. They try to work their way around it by using weighted means (weighting by the inverse variance of the b1 estimate for each participant). But there are more elegant ways to do that. The authors could have simply estimated a generalized linear mixed-effects model and examined if the by-participant random intercept and by-stimulus random intercept are reliably different from zero … In addition, one may wonder whether the slope of the logit function (= the predicted increase in log-odds for every one-unit increase in the predictor) is the right indicator: Why the increase in log-odds, and not the increase in odds or the increase in probability? (You will learn about logistic regression in unit 34). But their analysis and”my" analysis will probably yield the same conclusion in 97% of the cases, so no major issue here."

Next steps:

Do Markus’ recommendation for consistency
Monte Carlo simulations to generate null distribution regression coefficients?

Two Alternative Forced Choice

In another sample, we captured image familiarity somewhat more explicitly, using a two-alternative, forced-choice task where participants chose the “more familiar” face from pairs of two, randomly paired, GAN face images.

Split Half Ranked Correlations (“Reliability”)

fam_fc_cor <- list()
for (i in 1:100){
  set.seed(i+100)
  shuffled_ids <- d$subject_id %>% unique() %>% sample(size = length(unique(d$subject_id)))
  end <- length(shuffled_ids) 
  split <- length(shuffled_ids)/2
  
  ids_1 <- data.frame(id = shuffled_ids[1:split])
  ids_2 <- data.frame(id = shuffled_ids[(split+1):end])
  
  half_1 <- d[d$subject_id %in% ids_1$id,]
  half_2 <- d[d$subject_id %in% ids_2$id,]
  
  seen_1 <- (table(half_1$left) + table(half_1$right)) %>% as.data.frame()
  seen_2 <- (table(half_2$left) + table(half_2$right)) %>% as.data.frame()
  
  # get rates based on number of times seen

  list_1 <- half_1$chosen %>%
    table() %>% as.data.frame()
  
  list_2 <- half_2$chosen %>%
    table() %>% as.data.frame()
  
  list_1$rate <- list_1$Freq / seen_1$Freq
  list_2$rate <- list_2$Freq / seen_2$Freq
  
  list_1$image_code <- gsub("[^0-9]", "", list_1$.) %>% as.numeric()
  list_2$image_code <- gsub("[^0-9]", "", list_2$.) %>% as.numeric()
  
  list_1 <- list_1[order(list_1$rate, list_1$image_code, decreasing = TRUE),]
  list_2 <- list_2[order(list_2$rate, list_2$image_code, decreasing = TRUE),]
  
    image_code <- list_1[order(list_1$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything

    rank_dt <- data.frame(image_code = image_code, 
                      index_1 = match(image_code, list_1$image_code),
                      index_2 = match(image_code, list_2$image_code))
  
    
  fam_fc_cor[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
}  

 fam_fc_cor %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##    average       min       max         sd
## 1 0.512396 0.4363636 0.5894949 0.03502862

rates_dt <- data.frame(iteration = c(1:100), coefficient = unlist(fam_fc_cor))
# 
# ggplot(data = rates_dt, aes(coefficient)) +
#   geom_histogram(bins = 50) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
#   labs(title = "Distribution of Correlations: 2-Alternative Forced Choice", subtitle ="100 Split-Halves of Participants; N = 100") + scale_color_brewer(palette = "Dark2") + theme(legend.title = element_blank()) +
#   geom_density(alpha = .2)

Random Permutations

# make lists for FA and hitS
image_codes <- unique(d$image_code)
rand_cor_list <- list()
for (i in 1:100){
  set.seed(i)
  list_1 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
  list_2 <- image_codes %>% as.data.frame %>% sample_n(size = 100)

  rand_cor_list[i] <- cor.test(x = list_1$., y = list_2$., method = 'kendall')$estimate
}

rand_cor_list %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##       average        min       max         sd
## 1 0.006929293 -0.1818182 0.1628283 0.06490201

rm(list_1)
rm(list_2)

permute_dt <- data.frame(i = c(1:100), permuted_cor = unlist(rand_cor_list))

ggplot(data = rates_dt, aes(coefficient)) +
  geom_histogram(bins = 50, aes(fill = "red")) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
  labs(title = "Distribution of Correlations: 2-Alternative Forced Choice", subtitle ="100 Split-Halves of Participants; N = 100; Grey bars = Null Distribution") + theme(legend.position = "none") +
  geom_histogram(bins = 50, data = permute_dt, aes(permuted_cor, alpha = .001))

Are these significantly different distributions?

test_dt <- data.frame(fc_cors = unlist(fam_fc_cor), permute = permute_dt$permuted_cor)
test_dt <- pivot_longer(test_dt, c("fc_cors", "permute"), names_to = "distribution")
test_dt$distribution_c <- if_else(test_dt$distribution == "fc_cors", .5, -.5)
t.test(value ~ distribution, data = test_dt, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  value by distribution
## t = 68.537, df = 152.16, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4908958 0.5200376
## sample estimates:
## mean in group fc_cors mean in group permute 
##           0.512395960           0.006929293

Next steps:

“by item consistency”

Explicit Ratings & Written Responses

Image ratings

In our third experiment we obtained a hyper-explicit measures of familiarity. Participants rated 100 “unfamiliar” GAN face images on a sliding scale between “Not at all familiar” and “Extremely familiar” (N = 43).

## Warning: Missing column names filled in: 'X1' [1]

## # A tibble: 6 x 5
##      X1 subject_id image                          image_code response
##   <dbl> <chr>      <chr>                               <dbl>    <dbl>
## 1     1 0l70vzbs   resources/face_55_resized.jpg          55       58
## 2     2 0l70vzbs   resources/face_291_resized.jpg        291       60
## 3     3 0l70vzbs   resources/face_359_resized.jpg        359       39
## 4     4 0l70vzbs   resources/face_560_resized.jpg        560       73
## 5     5 0l70vzbs   resources/face_228_resized.jpg        228       67
## 6     6 0l70vzbs   resources/face_428_resized.jpg        428       68

Should I z-score around participant average?

d$response_c <- d$response - ave(d$response, d$subject_id)

Split Half Ranked Correlations - “Reliability”

d <- d %>% filter(!grepl("mickey", d$image))
d$response_c <- d$response - ave(d$response, d$subject_id)

fam_rating_cor <- list()
for (i in 1:100){
  set.seed(i+100)
  shuffled_ids <- d$subject_id %>% unique() %>% sample(size = length(unique(d$subject_id)))
  end <- length(shuffled_ids) 
  split <- round(length(shuffled_ids)/2)
  
  ids_1 <- data.frame(id = shuffled_ids[1:split])
  ids_2 <- data.frame(id = shuffled_ids[(split+1):end])
  
  half_1 <- d[d$subject_id %in% ids_1$id,]
  half_2 <- d[d$subject_id %in% ids_2$id,]
  
  list_1 <- half_1 %>%
    group_by(image_code) %>%
    summarise(response_c = mean(response_c))

  list_2 <- half_2 %>%
    group_by(image_code) %>%
    summarise(response_c = mean(response_c))

  list_1 <- list_1[order(list_1$response_c, list_1$image_code, decreasing = TRUE),]
  list_2 <- list_2[order(list_2$response_c, list_2$image_code, decreasing = TRUE),]
  
      image_code <- list_1[order(list_1$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything

      rank_dt <-  data.frame(image_code = image_code, 
                  index_1 = match(image_code, list_1$image_code),
                  index_2 = match(image_code, list_2$image_code))
  
  fam_rating_cor[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
}  

fam_rating_cor %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##     average      min       max         sd
## 1 0.3224485 0.179798 0.4464646 0.04938527

rates_dt <- data.frame(iteration = c(1:100), coefficient = unlist(fam_rating_cor))

# ggplot(data = rates_dt, aes(coefficient)) +
#   geom_histogram() + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
#   labs(title = "Distribution of Correlations: Subjective Ratings", subtitle ="100 Split-Halves of Participants; N = 95") + scale_color_brewer(palette = "Dark2") + theme(legend.title = element_blank())

# make lists for FA and hitS
image_codes <- unique(d$image_code)
rand_cor_list <- list()
for (i in 1:100){
  set.seed(i)
  list_1 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
  list_2 <- image_codes %>% as.data.frame %>% sample_n(size = 100)

  rand_cor_list[i] <- cor.test(x = list_1$., y = list_2$., method = 'kendall')$estimate
}

rand_cor_list %>% unlist() %>% as.data.frame() %>%
  summarise(average = mean(.),
            min = min(.),
            max = max(.),
            sd = sd(.))

##        average        min       max         sd
## 1 -0.004472727 -0.1624242 0.2141414 0.07277703

rm(list_1)
rm(list_2)

permute_dt <- data.frame(i = c(1:100), permuted_cor = unlist(rand_cor_list))

rates_dt <- data.frame(iteration = c(1:100), coefficient = unlist(fam_rating_cor))


ggplot(data = rates_dt, aes(coefficient)) +
  geom_histogram(bins = 50, aes(fill = "red")) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
  labs(title = "Distribution of Correlations: Subjective Ratings", subtitle ="100 Split-Halves of Participants; N = 95; Grey bars = Null Distribution") + theme(legend.position = "none") +
  geom_histogram(bins = 50, data = permute_dt, aes(permuted_cor, alpha = .001))

Are these significantly different distributions?

test_dt <- data.frame(fc_cors = unlist(fam_rating_cor), permute = permute_dt$permuted_cor)
test_dt <- pivot_longer(test_dt, c("fc_cors", "permute"), names_to = "distribution")
test_dt$distribution_c <- if_else(test_dt$distribution == "fc_cors", .5, -.5)
t.test(value ~ distribution, data = test_dt, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  value by distribution
## t = 37.171, df = 174.22, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.3095625 0.3442799
## sample estimates:
## mean in group fc_cors mean in group permute 
##           0.322448485          -0.004472727

Analyses by individual:

Analyses by image:

Unsurprisingly, the three faces with high mean responses are the three Mickey Mouse catch trials.

# do Cronbach's alpha

d$image_code <- as.numeric(as.factor(d$image)) # dummy code as suggested

d %>%
  select(response, image_code) %>%
  psych::alpha(check.keys = T)

## Number of categories should be increased  in order to count frequencies.

## Warning in psych::alpha(., check.keys = T): Some items were negatively correlated with total scale and were automatically reversed.
##  This is indicated by a negative sign for the variable name.

## 
## Reliability analysis   
## Call: psych::alpha(x = ., check.keys = T)
## 
##   raw_alpha std.alpha G6(smc) average_r   S/N   ase mean sd median_r
##      0.075     0.076   0.039     0.039 0.082 0.029   53 21    0.039
## 
##  lower alpha upper     95% confidence boundaries
## 0.02 0.08 0.13 
## 
##  Reliability if an item is dropped:
##            raw_alpha std.alpha G6(smc) average_r   S/N alpha se var.r med.r
## response-      0.041     0.039  0.0015     0.039 0.041       NA     0 0.039
## image_code     0.038     0.039  0.0015     0.039 0.041       NA     0 0.039
## 
##  Item statistics 
##               n raw.r std.r r.cor r.drop mean sd
## response-  4000  0.74  0.72  0.14  0.039   55 30
## image_code 4000  0.71  0.72  0.14  0.039   50 29

Not at all reliable! How do I compute an alpha without this dummy coding?

That makes sense because this measure is experience-based…

Next steps:

ranked correlations within people
- within/out of 95% of average rating range?…then do simulations?
make “response profile” plots

Strategy Statements

Beyond determining whether different measurements of familiarity are consistent with one another, we were also interested in capturing the experience of familiarity. To this end, our third experiment included a survey, in which participants rated their agreement with statements about different strategies for rating familiarity (e.g. “I was likely to rate a face as familiar when it looked like a celebrity”; “I used specific face features when rating familiarity”). ~data visualization to come~

Next steps:

clean strings for analysis
response patterns for those who use whole/face parts & celeb/personal

Describing Familiarity… in words!

In addition, participants provided a short written response to the question: “How would you describe ‘familiarity’?”. The results indicate varying working definitions of familiarity, reflected by diverse use of the familiarity scale.

Many adhere to a more traditional definition:

“whether they resembled someone i’d seen before in real life or on this survey”
“When something new looks like something you’ve seen before”
“If they look somewhat like someone that I know”

To some, it can be a property of the face or image, related to memory… sometimes conflating with "memorability:

“the kind of face i’ve seen at least once or twice before”
“fame and the face shapes”
“a feature or identifying composition that is stored in one’s memory”

Some describe it as a “knowing”:

“Familiarity is a measure of the level to which you know something”
“Being well experienced with/in something”

To others, a feeling of closeness:

“something that brings me comfort in knowing”
“familiarity is a feeling of comfort and closeness and a friendly relationship”
“A feeling you have of recognition or closeness or knowing about someone”

Some illustrative combinations of these things:

“Familiarity is defined as knowledge of someone or something, or to a feeling of comfort and closeness with someone or something. When you have heard of a brand of computer, this is an example of a familiarity with the computer.”
“Familiarity is when I see a face that reminds me of someone I personally know or someone famous such as a movie actor, musician, etc.”

And our problematically ambiguous favorite: “good”

Extra notes:

Email from Markus:

My overall reaction is that I have no (major) concerns about the authors’ analyses. As I said in lecture, these authors are interested in how stimuli differ is “recognizability.” Are certain faces recognized more easily than others? So what are participants for us is stimuli for them (rows), and what is stimuli for us is participants for them (columns). Whereas we compute a hit rate and a false alarm rate for each participant, they compute a hit rate and a false alarm rate for each stimulus. Whereas we examine if participants differ in recognition performance they examine whether faces differ if recognizability.

When looking at reliability, we (in lecture) may compare the correlations between participants’ d’ scores [and HRs and FARs] for a randomly selected half of the stimuli and their d’ scores [and HRs and FARs] for the other half of the stimuli. In contrast, the authors look at reliability by computing the correlations between all stimuli’s d’ scores [and HRs and FARs] for a randomly selected half of the participants and the stimuli’s d’ scores [and HRs and FARs] for the other half of the participants. For these types of analyses, it’s a plus that the authors show it for all three indicators (d’, HR, and FAR).

In one of the typical recognition memory studies discussed in lecture, we may wonder whether inter-individual differences in recognition performance hold across stimuli. The authors are examining whether inter-stimulus differences in recognizability hold across participants.

I have a (minor) issue with the analyses reported on the bottom of page 1326 (right column). By averaging across participants the authors are basically ignoring the variability due to participants. They try to work their way around it by using weighted means (weighting by the inverse variance of the b1 estimate for each participant). But there are more elegant ways to do that. The authors could have simply estimated a generalized linear mixed-effects model and examined if the by-participant random intercept and by-stimulus random intercept are reliably different from zero (You will learn about models with multiple types of random effects in unit 33). In addition, one may wonder whether the slope of the logit function (= the predicted increase in log-odds for every one-unit increase in the predictor) is the right indicator: Why the increase in log-odds, and not the increase in odds or the increase in probability? (You will learn about logistic regression in unit 34). But their analysis and “my” analysis will probably yield the same conclusion in 97% of the cases, so no major issue here.

I am not very impressed by the median splits reported in the left column of page 1327. But the basic idea is fine. In lecture, we might examine if there are distinct types of participants: recognition experts (high FA, low FA -> highly positive d’, c around 0), yea sayers with poor recognition performance (high FA, high FA -> d’ around 0, highly negative c), nay sayers with poor recognition performance (low FA, low FA -> d’ around 0, highly positive c), and participants with good recognition performance who want to annoy the experimenter or mixed up the two response buttons (low FA, high FA -> highly negative d’, c around 0). The authors do something similar but for stimuli:

distinctive and highly memorable faces (high HR, low FAR -> highly positive d’, c around 0),

typical faces (high HR, high FAR -> d’ around 0, highly negative c),

forgettable faces (low HR, low FAR -> d’ around 0, highly positive c),

and faces evoking many false memories but few true ones (low HR, high FAR -> highly negative d’, c around 0).

I probably would have examined the extent to which the stimuli’s position in the two-dimensional space defined by d’ and c changes across random split halves of the participants. But the authors’ data-analytic approach yields four nicely defined groups of faces that readers can easily comprehend and remember, whereas “my” analysis doesn’t, so you could certainly argue that the authors’ data analytic approach, despite being slightly sub-optimal from a statistical perspective, is the better approach.

More notes cont’d:

For those with more rigid definitions of familiarity, consistently low ratings were given to all of the unfamiliar GAN faces, while consistently high ratings were given to the Mickey Mouse catch trials.

Getting Turker response of concrete vs. abstract definitions based on stuff in the written responses

TO DO:

reanalyze with linear mixed-effect models…
Incremental N rankings on participant level
standardize by individual! DO WHAT WE DID IN STATS to compute d’ and c!
Analyses by strategy in rating study (maybe include this in regression models? talk to Markus)
Confirm method for removing bad data in memorability experiment -> include a catch trial

DONE:

ranked correlation for “most familiar” faces between people and groups
- for false alarms and hit rates
incremental rank agreement measures by experiment
make plot of ordered face list overall for each experiment
Finalize participant count -> doubling seems reasonable for all three! I’m worried about being underpowered…
- settled on ~96 for memorability -> double!

Abstract / Rough Outline

Familiarity for never-before-seen faces is a phenomenon tied to both visual perception and personal experience (Lyon, 1996). Can face images be intrinsically familiar? If so, can familiarity be measured consistently? We obtained three measures of familiarity for 100 hyper-realistic, GAN-generated faces, and examined the correspondence in responses among participants and among experiments. Our first task captured memorability (accurate recognition of something previously seen; recognition hit rate, Bainbridge et al., 2016) and familiarity (false recognition of something not previously seen; false alarm rate). However, false alarm-based quantification of familiarity alone is likely more conservative than our typical experiences of familiarity.

Therefore, in a second experiment, we measured familiarity using an untimed forced-choice task in which participants chose the “more familiar” face in random pairs of faces. The resulting score for each face across participants serves as its familiarity score. Finally, in a third study, we aimed to capture the subjective nature of familiarity for individual faces by having participants rate faces on a sliding scale between “Not at all familiar” and “Extremely familiar”.

To establish the reliability among participants in their familiarity judgements, we computed Kendall ranked correlations between image rankings (by familiarity score) for 100 split-halves of the data for each experiment. We found widespread variability in image rankings (Exp.1 mean tau=0.07, Exp.2=0.01, Exp.3=0.04). We calculated the consistency of participant responses relative to population responses using logistic regression to predict familiarity scores and found varying levels of agreement by participant. Finally, we computed a Kendall correlation for image rankings among all experiments and found no significant correlation. The lack of correspondence in responses among participants and experiments suggests that “familiar” is likely not an intrinsic property and that experimental measures may fail to capture our everyday experience of face familiarity consistently.

Compost Pile!

We computed ranked correlations by “familiarity” score across all three experiments… Our results indicate that face images can have measurable, visually “familiar” properties that roughly correspond to increasingly more explicit measures of familiarity, suggesting some agreement between perceptual familiarity, and the individual experience of familiarity.

is it consistent and is it distinct from memorability? don’t overpromise, lead with reasons why we would choose three tasks

Face images can be intrinsically “memorable”– a measurable visual property independent from individual experience (Bainbridge et al., 2013).

Many of us have experienced a sense of “familiarity” for faces we have never seen before (e.g. at the extreme, pictures of celebrity look-alikes). Is the phenomenon of face familiarity one of individual experience and memory– a sense of “knowing”or “feeling”–, or a consequence of intrinsic properties of the face image itself? We obtained three measures of familiarity for 100, hyper-realistic, GAN-generated face images, and examined if these face images were consistently rated as familiar– across participants, measurements, and conceptual definitions.

To establish whether, and which, images in our dataset have salient, “familiar”, visual properties, we employed a memorability task paradigm that captured both visual memorability scores for 103 images (100 “unfamiliar” GAN faces, 3 images of Mickey Mouse), as well as inferred measurements of image familiarity (Bainbridge et al., 2016). We expected highly “familiar” images identified in this experiment to be consistently highly-scored under other measurements of familiarity.

In another sample, we captured image familiarity somewhat more explicitly, using a two-alternative, forced-choice task. Here, participants chose the “more familiar” face from pairs of two GAN face images, randomly paired without repeats. For each trial, images either get a score of 1 (familiar) or 0 (not familiar). The resulting score for each image across all participants serves as a measure of an image’s generalized familiarity score.

In our third experiment we obtained explicit measures of familiarity. Participants rated 100 “unfamiliar” GAN face images on a sliding scale between “Not at all familiar” and “Extremely familiar”. This is intended to capture the subtleties of “familiarity” for a given face, in comparison to the time and task pressures of the forced choice and memorability tasks.

Beyond determining whether our three measurements of familiarity are consistent with one another, we were also interested in capturing the experience of familiarity. To this end, our third experiment included a survey, in which participants rated their agreement with statements about different strategies for rating familiarity. Participants also provided a short written response to the question: “How would you describe ‘familiarity’?”.

Our results indicate that familiarity is likely not an intrinsic visual feature of face images, but rather a subjective rating of individual experience. Approximating measurements of familiarity with memory, forced-choice, or explicit rating tasks yield inconsistent definitions, and therefore measurements, of what “familiar” is.

Familiarity

Ivette Colón

4/28/2021

Across Experiments (ranked image comparisons)

Doing rank-by-rank images

Memorability & Familiarity

Calculating Sensitivity (d’) and Response Bias

By ID

By Image

Split Half Ranked Correlations (“Reliability”)

Random Permutations

Plotting the distribution of split-half ranked correlations

Are these significantly different distributions?

Check for uniform distribution of FAs

Logistic Regression, Leave-One-Out by ID (“Consistency”)

“Consistency” - As Recommended by Markus Brauer

Two Alternative Forced Choice

Split Half Ranked Correlations (“Reliability”)

Random Permutations

Are these significantly different distributions?

Explicit Ratings & Written Responses

Image ratings

Should I z-score around participant average?

Split Half Ranked Correlations - “Reliability”

Are these significantly different distributions?

Analyses by individual:

Analyses by image:

Strategy Statements

Describing Familiarity… in words!

And our problematically ambiguous favorite: “good”

Extra notes:

Email from Markus:

More notes cont’d:

TO DO:

DONE:

Abstract / Rough Outline

Compost Pile!