mem <- read_csv("../memorability/memorability_task_cleaned_byImg.csv")
mem <- mem %>% select(image_code, fa_r)
mem <- mem[order(mem$fa_r, mem$image_code, decreasing = TRUE),]
# centering by subject
f_rate <- read_csv("familiarity_image_response.csv")
f_rate <- f_rate %>%
group_by(image_code) %>%
summarise(avg = mean(response - ave(f_rate$response, f_rate$subject_id))) %>%
filter(! is.na(image_code))
f_rate <- f_rate[order(f_rate$avg, f_rate$image_code, decreasing = TRUE),]
f_fc <- read_csv("../fam_2AFC/familiarity_fc_cleaned.csv")
fc_seen <- table(f_fc$left) + table(f_fc$right)
f_fc <- f_fc$chosen %>% table() %>% as.data.frame()
f_fc$rate <- f_fc$Freq / fc_seen
rm(fc_seen)
f_fc$image_code <- gsub("[^0-9]", "", f_fc$.) %>% as.numeric()
f_fc <- f_fc %>%
select(image_code, rate)
f_fc <- f_fc[order(f_fc$rate, f_fc$image_code, decreasing = TRUE),]
f_fc$image_code <- f_fc$image_code %>% as.numeric()
image_code <- f_rate[order(f_rate$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything
rank_dt <- data.frame(image_code = image_code,
memorability_rank = mem$image_code,
memorability_index = match(image_code, mem$image_code),
rating_rank = f_rate$image_code,
rating_index = match(image_code, f_rate$image_code),
fc_rank = f_fc$image_code,
fc_index = match(image_code, f_fc$image_code))
cor.test(x = rank_dt$memorability_index, y = rank_dt$rating_index, method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: rank_dt$memorability_index and rank_dt$rating_index
## z = 4.4017, p-value = 1.074e-05
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.2985859
cor.test(x = rank_dt$fc_index, y = rank_dt$rating_index, method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: rank_dt$fc_index and rank_dt$rating_index
## z = 7.07, p-value = 1.549e-12
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.479596
cor.test(x = rank_dt$fc_index, y = rank_dt$memorability_index, method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: rank_dt$fc_index and rank_dt$memorability_index
## z = 3.3772, p-value = 0.0007323
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.2290909
image_code <- f_rate[order(f_rate$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything
rank_dt <- data.frame(image_code = image_code,
memorability_rank = mem$image_code,
memorability_index = match(image_code, mem$image_code),
rating_rank = f_rate$image_code,
rating_index = match(image_code, f_rate$image_code),
fc_rank = f_fc$image_code,
fc_index = match(image_code, f_fc$image_code))
cor.test(x = rank_dt$memorability_index, y = rank_dt$rating_index, method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: rank_dt$memorability_index and rank_dt$rating_index
## z = 4.4017, p-value = 1.074e-05
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.2985859
cor.test(x = rank_dt$fc_index, y = rank_dt$rating_index, method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: rank_dt$fc_index and rank_dt$rating_index
## z = 7.07, p-value = 1.549e-12
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.479596
cor.test(x = rank_dt$fc_index, y = rank_dt$memorability_index, method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: rank_dt$fc_index and rank_dt$memorability_index
## z = 3.3772, p-value = 0.0007323
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.2290909
# Memorability by Rating
p1 <- ggplot(data = rank_dt, aes(x = memorability_index,
y = rating_index)) +
geom_point(aes(alpha = .5)) +
geom_smooth(method = "lm") +
geom_text(aes(label = image_code)) +
theme(legend.position = "none") +
xlab("Memorability Ranking") + ylab("Subjective Rating Ranking") +
labs(title = "Image Ranking: Memorability x Subjective Rating",
subtitle = "Kendall tau-b correlation = .30")
p2 <- ggplot(data = rank_dt, aes(x = memorability_index,
y = fc_index)) +
geom_point(aes(alpha = .5)) +
geom_smooth(method = "lm") +
geom_text(aes(label = image_code)) +
theme(legend.position = "none") +
xlab("Memorability Ranking") + ylab("Forced Choice Ranking") +
labs(title = "Memorability x Forced Choice",
subtitle = "Kendall tau-b correlation = .23")
p3 <- ggplot(data = rank_dt, aes(x = rating_index,
y = fc_index)) +
geom_point(aes(alpha = .5)) +
geom_smooth(method = "lm") +
geom_text(aes(label = image_code)) +
theme(legend.position = "none") +
xlab("Subjective Rating Ranking") + ylab("Forced Choice Ranking") +
labs(title = "Subjective Rating x Forced Choice",
subtitle = "Kendall tau-b correlation = ..48",
caption = "point labels = image IDs")
plot_grid(p1, p2, p3, nrow = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Face images can be more or less intrinsically “memorable” (Bainbridge et al., 2016) – can they be intrinsically familiar as well, possessing some property that is consistently deemed “familiar” across observers?
We were interested in whether images with salient familiarity signals (based on false alarm rates in the memorability task) would correspond to other, more explicit measures of familiarity. We expect consistent memorability and familiarity scores for each image across individuals.
Been looking at data quality issues. I’m thinking about removing people that have false alarms/misses that are above/below two standard deviations from the mean. How to deal with people that don’t have any false alarms?
d’= someone’s ability to distinguish whether they’ve seen something before or not (“seen” signal is present or absent) aka sensitivity to the signal
c = response bias! How likely they are to just… say yes
We could regress d’ or c onto the strategy scores! We would use them like normal DVs in our regression.
rates_id <- d %>%
group_by(id) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
sdt_indices_id <- psycho::dprime(n_hit = counts_id$hit,
n_fa = counts_id$fa,
n_miss = counts_id$miss,
n_cr = counts_id$cr,
adjusted = TRUE)
rates_id$dprime <- sdt_indices_id$dprime
rates_id$c <- sdt_indices_id$c
### OLD WAY (about the same, just doesn't handle NAs) ###
# first we'll calculate d'
# rates_id$dprime <- qnorm(rates_id$hit_r) - qnorm(rates_id$fa_r)
# next we'll calculate sensitivity
#rates_id$sensi <- -.5 * (qnorm(rates_id$hit_r) + qnorm(rates_id$fa_r))
### !! ### !! ###
rates_img <- d %>%
group_by(image_code) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
sdt_indices_img <- psycho::dprime(n_hit = counts_img$hit,
n_fa = counts_img$fa,
n_miss = counts_img$miss,
n_cr = counts_img$cr,
adjusted = TRUE)
rates_img$dprime <- sdt_indices_img$dprime
rates_img$c <- sdt_indices_img$c
### !! OLD WAY !! ###
# rates_img$dprime <- qnorm(rates_img$hit_r) - qnorm(rates_img$fa_r)
# rates_img$sensi <- -.5 * (qnorm(rates_img$hit_r) + qnorm(rates_img$fa_r))
### !! ### !! ###
# make lists for FA and hitS
fa_cor_list <- list()
hit_cor_list <- list()
d_cor_list <- list()
for (i in 1:100){
set.seed(i+10)
shuffled_ids <- d$id %>% unique() %>% sample(size = length(unique(d$id)))
end <- length(shuffled_ids)
split <- length(shuffled_ids)/2
ids_1 <- data.frame(id = shuffled_ids[1:split])
ids_2 <- data.frame(id = shuffled_ids[(split+1):end])
half_1 <- d[d$id %in% ids_1$id,]
half_2 <- d[d$id %in% ids_2$id,]
list_1 <- half_1 %>%
group_by(image_code) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
list_2 <- half_2 %>%
group_by(image_code) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
counts_1 <- half_1 %>%
group_by(image) %>%
summarise(fa = sum(perf_str == "FA"),
hit = sum(perf_str == "hit"),
cr = sum(perf_str == "CR"),
miss = sum(perf_str == "miss"),
seen = n())
counts_2 <- half_2 %>%
group_by(image) %>%
summarise(fa = sum(perf_str == "FA"),
hit = sum(perf_str == "hit"),
cr = sum(perf_str == "CR"),
miss = sum(perf_str == "miss"),
seen = n())
sdt_1 <- psycho::dprime(n_hit = counts_1$hit,
n_fa = counts_1$fa,
n_miss = counts_1$miss,
n_cr = counts_1$cr,
adjusted = TRUE)
list_1$dprime <- sdt_1$dprime
list_1$c <- sdt_1$c
sdt_2 <- psycho::dprime(n_hit = counts_2$hit,
n_fa = counts_2$fa,
n_miss = counts_2$miss,
n_cr = counts_2$cr,
adjusted = TRUE)
list_2$dprime <- sdt_2$dprime
list_2$c <- sdt_2$c
image_code <- list_1[order(list_1$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything
# sort by false alarm rate, then by name
list_1 <- list_1[
with(list_1, order(fa_r, image_code, decreasing = T)),]
list_2 <- list_2[
with(list_2, order(fa_r, image_code, decreasing = T)),]
rank_dt <- data.frame(image_code = image_code,
index_1 = match(image_code, list_1$image_code),
index_2 = match(image_code, list_2$image_code))
# get ranked correlation for FA sorted list
fa_cor_list[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
# sort by hit rate, then by name
list_1 <- list_1[
with(list_1, order(hit_r, image_code, decreasing = T)),]
list_2 <- list_2[
with(list_2, order(hit_r, image_code, decreasing = T)),]
rank_dt <- data.frame(image_code = image_code,
index_1 = match(image_code, list_1$image_code),
index_2 = match(image_code, list_2$image_code))
hit_cor_list[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
# sort by d-prime, then by name
list_1 <- list_1[
with(list_1, order(dprime, image_code, decreasing = T)),]
list_2 <- list_2[
with(list_2, order(dprime, image_code, decreasing = T)),]
rank_dt <- data.frame(image_code = image_code,
index_1 = match(image_code, list_1$image_code),
index_2 = match(image_code, list_2$image_code))
d_cor_list[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
# sort by c, then by name
}
# fa_cor_list %>% unlist() %>% summary()
fa_cor_list %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.2561535 0.1656566 0.3729293 0.04154242
hit_cor_list %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.2121535 0.0820202 0.3979798 0.04600192
d_cor_list %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.1934909 0.06626263 0.3220202 0.04609074
rm(half_1)
rm(half_2)
rm(ids_1)
rm(ids_2)
rm(counts_1)
rm(counts_2)
rm(sdt_1)
rm(sdt_2)
# make lists for FA and hitS
image_codes <- unique(d$image_code)
rand_cor_list <- list()
for (i in 1:100){
set.seed(i)
list_1 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
list_2 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
rand_cor_list[i] <- cor.test(x = list_1$., y = list_2$., method = 'kendall')$estimate
}
rand_cor_list %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.009862626 -0.1329293 0.1721212 0.05830017
rm(list_1)
rm(list_2)
cor_rates_dt <- data.frame(i = c(1:100), far = unlist(fa_cor_list), hits = unlist(hit_cor_list), dprime = unlist(d_cor_list))
cor_fa_dt <- cor_rates_dt
cor_rates_dt <- pivot_longer(cor_rates_dt, c("far", "hits", "dprime"), names_to = "measure")
# to include random permutation correlation distribution
permute_dt <- data.frame(i = c(1:100), permuted_cor = unlist(rand_cor_list))
# ggplot(data = cor_rates_dt, aes(x = i, y = value, color = as.factor(measure))) +
# geom_line(stat = "identity") + xlab("iteration") + ylab("coefficient (Kendall tau-b)") +
# labs(title = "Distribution of Correlations: Memorability", subtitle ="100 Split-Halves of Participants; N = 93") + scale_color_brewer(palette = "Dark2") + theme(legend.title = element_blank())
#
ggplot(data = cor_rates_dt, aes(value, color = as.factor(measure))) +
geom_histogram(bins = 50) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
labs(title = "Histogram of Correlations: Memorability", subtitle ="100 Split-Halves of Participants; N = 93") + scale_color_brewer(palette = "Dark2") + theme(legend.position = "none") +
facet_wrap(~measure)
ggplot(data = cor_fa_dt, aes(far)) +
geom_histogram(bins = 50, aes(fill = "blue")) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
labs(title = "Histogram of Correlations: Memorability, ranked by FAR", subtitle ="100 Split-Halves of Participants; N = 93; Grey bars = Null Distribution") + theme(legend.position = "none") +
geom_histogram(bins = 50, data = permute_dt, aes(permuted_cor, alpha = .001))
test_dt <- data.frame(far = cor_fa_dt$far, permute = permute_dt$permuted_cor)
test_dt <- pivot_longer(test_dt, c("far", "permute"), names_to = "distribution")
test_dt$distribution_c <- if_else(test_dt$distribution == "far", .5, -.5)
t.test(value ~ distribution, data = test_dt, var.equal = F)
##
## Welch Two Sample t-test
##
## data: value by distribution
## t = 34.404, df = 178.93, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2321646 0.2604172
## sample estimates:
## mean in group far mean in group permute
## 0.256153535 0.009862626
They are significantly different! Interesting!
# Just a histograms of false alarms (count is # of images with that false alarm rate)
fa_rates <- d %>%
group_by(image_code) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")),
seen_count = n())
ggplot(fa_rates, aes(x=fa_r)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666")
# Check out the distributions of other ratings by image!
counts_img <- d %>%
group_by(image) %>%
summarise(fa_count = sum(perf_str == "FA"),
hit_count = sum(perf_str == "hit"),
cr_count = sum(perf_str == "CR"),
miss_count = sum(perf_str == "miss"),
seen = n())
rates_plot <- melt(counts_img[,-6])
a <- ggplot(rates_plot, aes(x=value, color = variable)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="grey")
a
# histogram of false alarms, testing for uniformity...
fa_hist <- hist(fa_rates$fa_r, breaks = 100)
uniform.test(fa_hist)
##
## Chi-squared test for given probabilities
##
## data: hist.output$counts
## X-squared = 763.76, df = 121, p-value < 2.2e-16
# idea is for each person, for each image, use the responses from everyone else for that image to predict test response
d <- read.csv("../memorability/memorability_all_observations.csv")
variance <- list()
estimates <- list()
p_values <- list()
summary_list <- list()
j <- 1
for (id in unique(d$id)){
curr_id <- id
train <- d %>% filter(!id == curr_id) %>%
group_by(image_code) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
train[is.na(train)] <- 0
test <- d %>% filter(id == curr_id) %>%
group_by(image_code) %>%
summarise(hit_r = sum(perf_str == "hit")/(sum(perf_str == "hit") + sum(perf_str == "miss")),
fa_r = sum(perf_str == "FA") / (sum(perf_str == "FA") + sum(perf_str == "CR")))
test[is.na(test)] <- 0
dt <- data.frame(image_code = train$image_code, train_fa = train$fa_r, test_fa = test$fa_r)
# try this:
# model_fa <- glm(test_fa ~ train_fa, data = dt, family = binomial(logit))
model_fa <- glm(test_fa ~ train_fa, data = dt, family = binomial())
temp <- model_fa %>% summary()
summary_list[j] <- model_fa
variance[j] <- (temp$coefficients[2,2]^2 / 100)
estimates[j] <- temp$coefficients[2,1]
p_values[j] <- temp$coefficients[2,4]
j <- j + 1
}
# weight by inverse variance
unlist(estimates) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -31.3551 0.2057 5.0443 5.4133 8.3634 32.5332
avg_b1_weighted <- unlist(estimates)[-14] * (1/unlist(variance)[-14])
avg_b1_weighted %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -28.8708 0.5694 14.9662 12.5627 21.2791 47.5897
unlist(p_values)[-14] %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0005386 0.1251106 0.3892863 0.4302077 0.7486809 1.0000000
“By averaging across participants the authors are basically ignoring the variability due to participants. They try to work their way around it by using weighted means (weighting by the inverse variance of the b1 estimate for each participant). But there are more elegant ways to do that. The authors could have simply estimated a generalized linear mixed-effects model and examined if the by-participant random intercept and by-stimulus random intercept are reliably different from zero … In addition, one may wonder whether the slope of the logit function (= the predicted increase in log-odds for every one-unit increase in the predictor) is the right indicator: Why the increase in log-odds, and not the increase in odds or the increase in probability? (You will learn about logistic regression in unit 34). But their analysis and”my" analysis will probably yield the same conclusion in 97% of the cases, so no major issue here."
Next steps:
In another sample, we captured image familiarity somewhat more explicitly, using a two-alternative, forced-choice task where participants chose the “more familiar” face from pairs of two, randomly paired, GAN face images.
fam_fc_cor <- list()
for (i in 1:100){
set.seed(i+100)
shuffled_ids <- d$subject_id %>% unique() %>% sample(size = length(unique(d$subject_id)))
end <- length(shuffled_ids)
split <- length(shuffled_ids)/2
ids_1 <- data.frame(id = shuffled_ids[1:split])
ids_2 <- data.frame(id = shuffled_ids[(split+1):end])
half_1 <- d[d$subject_id %in% ids_1$id,]
half_2 <- d[d$subject_id %in% ids_2$id,]
seen_1 <- (table(half_1$left) + table(half_1$right)) %>% as.data.frame()
seen_2 <- (table(half_2$left) + table(half_2$right)) %>% as.data.frame()
# get rates based on number of times seen
list_1 <- half_1$chosen %>%
table() %>% as.data.frame()
list_2 <- half_2$chosen %>%
table() %>% as.data.frame()
list_1$rate <- list_1$Freq / seen_1$Freq
list_2$rate <- list_2$Freq / seen_2$Freq
list_1$image_code <- gsub("[^0-9]", "", list_1$.) %>% as.numeric()
list_2$image_code <- gsub("[^0-9]", "", list_2$.) %>% as.numeric()
list_1 <- list_1[order(list_1$rate, list_1$image_code, decreasing = TRUE),]
list_2 <- list_2[order(list_2$rate, list_2$image_code, decreasing = TRUE),]
image_code <- list_1[order(list_1$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything
rank_dt <- data.frame(image_code = image_code,
index_1 = match(image_code, list_1$image_code),
index_2 = match(image_code, list_2$image_code))
fam_fc_cor[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
}
fam_fc_cor %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.512396 0.4363636 0.5894949 0.03502862
rates_dt <- data.frame(iteration = c(1:100), coefficient = unlist(fam_fc_cor))
#
# ggplot(data = rates_dt, aes(coefficient)) +
# geom_histogram(bins = 50) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
# labs(title = "Distribution of Correlations: 2-Alternative Forced Choice", subtitle ="100 Split-Halves of Participants; N = 100") + scale_color_brewer(palette = "Dark2") + theme(legend.title = element_blank()) +
# geom_density(alpha = .2)
# make lists for FA and hitS
image_codes <- unique(d$image_code)
rand_cor_list <- list()
for (i in 1:100){
set.seed(i)
list_1 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
list_2 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
rand_cor_list[i] <- cor.test(x = list_1$., y = list_2$., method = 'kendall')$estimate
}
rand_cor_list %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.006929293 -0.1818182 0.1628283 0.06490201
rm(list_1)
rm(list_2)
permute_dt <- data.frame(i = c(1:100), permuted_cor = unlist(rand_cor_list))
ggplot(data = rates_dt, aes(coefficient)) +
geom_histogram(bins = 50, aes(fill = "red")) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
labs(title = "Distribution of Correlations: 2-Alternative Forced Choice", subtitle ="100 Split-Halves of Participants; N = 100; Grey bars = Null Distribution") + theme(legend.position = "none") +
geom_histogram(bins = 50, data = permute_dt, aes(permuted_cor, alpha = .001))
test_dt <- data.frame(fc_cors = unlist(fam_fc_cor), permute = permute_dt$permuted_cor)
test_dt <- pivot_longer(test_dt, c("fc_cors", "permute"), names_to = "distribution")
test_dt$distribution_c <- if_else(test_dt$distribution == "fc_cors", .5, -.5)
t.test(value ~ distribution, data = test_dt, var.equal = F)
##
## Welch Two Sample t-test
##
## data: value by distribution
## t = 68.537, df = 152.16, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4908958 0.5200376
## sample estimates:
## mean in group fc_cors mean in group permute
## 0.512395960 0.006929293
Next steps:
“by item consistency”
In our third experiment we obtained a hyper-explicit measures of familiarity. Participants rated 100 “unfamiliar” GAN face images on a sliding scale between “Not at all familiar” and “Extremely familiar” (N = 43).
## Warning: Missing column names filled in: 'X1' [1]
## # A tibble: 6 x 5
## X1 subject_id image image_code response
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 0l70vzbs resources/face_55_resized.jpg 55 58
## 2 2 0l70vzbs resources/face_291_resized.jpg 291 60
## 3 3 0l70vzbs resources/face_359_resized.jpg 359 39
## 4 4 0l70vzbs resources/face_560_resized.jpg 560 73
## 5 5 0l70vzbs resources/face_228_resized.jpg 228 67
## 6 6 0l70vzbs resources/face_428_resized.jpg 428 68
d$response_c <- d$response - ave(d$response, d$subject_id)
d <- d %>% filter(!grepl("mickey", d$image))
d$response_c <- d$response - ave(d$response, d$subject_id)
fam_rating_cor <- list()
for (i in 1:100){
set.seed(i+100)
shuffled_ids <- d$subject_id %>% unique() %>% sample(size = length(unique(d$subject_id)))
end <- length(shuffled_ids)
split <- round(length(shuffled_ids)/2)
ids_1 <- data.frame(id = shuffled_ids[1:split])
ids_2 <- data.frame(id = shuffled_ids[(split+1):end])
half_1 <- d[d$subject_id %in% ids_1$id,]
half_2 <- d[d$subject_id %in% ids_2$id,]
list_1 <- half_1 %>%
group_by(image_code) %>%
summarise(response_c = mean(response_c))
list_2 <- half_2 %>%
group_by(image_code) %>%
summarise(response_c = mean(response_c))
list_1 <- list_1[order(list_1$response_c, list_1$image_code, decreasing = TRUE),]
list_2 <- list_2[order(list_2$response_c, list_2$image_code, decreasing = TRUE),]
image_code <- list_1[order(list_1$image_code, decreasing = FALSE),]$image_code # just to get image codes in numerical order, not by a ranking or anything
rank_dt <- data.frame(image_code = image_code,
index_1 = match(image_code, list_1$image_code),
index_2 = match(image_code, list_2$image_code))
fam_rating_cor[i] <- cor.test(x = rank_dt$index_1, y = rank_dt$index_2, method = 'kendall')$estimate
}
fam_rating_cor %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 0.3224485 0.179798 0.4464646 0.04938527
rates_dt <- data.frame(iteration = c(1:100), coefficient = unlist(fam_rating_cor))
# ggplot(data = rates_dt, aes(coefficient)) +
# geom_histogram() + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
# labs(title = "Distribution of Correlations: Subjective Ratings", subtitle ="100 Split-Halves of Participants; N = 95") + scale_color_brewer(palette = "Dark2") + theme(legend.title = element_blank())
# make lists for FA and hitS
image_codes <- unique(d$image_code)
rand_cor_list <- list()
for (i in 1:100){
set.seed(i)
list_1 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
list_2 <- image_codes %>% as.data.frame %>% sample_n(size = 100)
rand_cor_list[i] <- cor.test(x = list_1$., y = list_2$., method = 'kendall')$estimate
}
rand_cor_list %>% unlist() %>% as.data.frame() %>%
summarise(average = mean(.),
min = min(.),
max = max(.),
sd = sd(.))
## average min max sd
## 1 -0.004472727 -0.1624242 0.2141414 0.07277703
rm(list_1)
rm(list_2)
permute_dt <- data.frame(i = c(1:100), permuted_cor = unlist(rand_cor_list))
rates_dt <- data.frame(iteration = c(1:100), coefficient = unlist(fam_rating_cor))
ggplot(data = rates_dt, aes(coefficient)) +
geom_histogram(bins = 50, aes(fill = "red")) + xlab("correlation coefficient (tau-b)") + ylab("frequency") +
labs(title = "Distribution of Correlations: Subjective Ratings", subtitle ="100 Split-Halves of Participants; N = 95; Grey bars = Null Distribution") + theme(legend.position = "none") +
geom_histogram(bins = 50, data = permute_dt, aes(permuted_cor, alpha = .001))
test_dt <- data.frame(fc_cors = unlist(fam_rating_cor), permute = permute_dt$permuted_cor)
test_dt <- pivot_longer(test_dt, c("fc_cors", "permute"), names_to = "distribution")
test_dt$distribution_c <- if_else(test_dt$distribution == "fc_cors", .5, -.5)
t.test(value ~ distribution, data = test_dt, var.equal = F)
##
## Welch Two Sample t-test
##
## data: value by distribution
## t = 37.171, df = 174.22, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.3095625 0.3442799
## sample estimates:
## mean in group fc_cors mean in group permute
## 0.322448485 -0.004472727
Unsurprisingly, the three faces with high mean responses are the three Mickey Mouse catch trials.
# do Cronbach's alpha
d$image_code <- as.numeric(as.factor(d$image)) # dummy code as suggested
d %>%
select(response, image_code) %>%
psych::alpha(check.keys = T)
## Number of categories should be increased in order to count frequencies.
## Warning in psych::alpha(., check.keys = T): Some items were negatively correlated with total scale and were automatically reversed.
## This is indicated by a negative sign for the variable name.
##
## Reliability analysis
## Call: psych::alpha(x = ., check.keys = T)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.075 0.076 0.039 0.039 0.082 0.029 53 21 0.039
##
## lower alpha upper 95% confidence boundaries
## 0.02 0.08 0.13
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## response- 0.041 0.039 0.0015 0.039 0.041 NA 0 0.039
## image_code 0.038 0.039 0.0015 0.039 0.041 NA 0 0.039
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## response- 4000 0.74 0.72 0.14 0.039 55 30
## image_code 4000 0.71 0.72 0.14 0.039 50 29
Not at all reliable! How do I compute an alpha without this dummy coding?
That makes sense because this measure is experience-based…
Next steps:
Beyond determining whether different measurements of familiarity are consistent with one another, we were also interested in capturing the experience of familiarity. To this end, our third experiment included a survey, in which participants rated their agreement with statements about different strategies for rating familiarity (e.g. “I was likely to rate a face as familiar when it looked like a celebrity”; “I used specific face features when rating familiarity”). ~data visualization to come~
Next steps:
In addition, participants provided a short written response to the question: “How would you describe ‘familiarity’?”. The results indicate varying working definitions of familiarity, reflected by diverse use of the familiarity scale.
Many adhere to a more traditional definition:
To some, it can be a property of the face or image, related to memory… sometimes conflating with "memorability:
Some describe it as a “knowing”:
To others, a feeling of closeness:
Some illustrative combinations of these things:
My overall reaction is that I have no (major) concerns about the authors’ analyses. As I said in lecture, these authors are interested in how stimuli differ is “recognizability.” Are certain faces recognized more easily than others? So what are participants for us is stimuli for them (rows), and what is stimuli for us is participants for them (columns). Whereas we compute a hit rate and a false alarm rate for each participant, they compute a hit rate and a false alarm rate for each stimulus. Whereas we examine if participants differ in recognition performance they examine whether faces differ if recognizability.
When looking at reliability, we (in lecture) may compare the correlations between participants’ d’ scores [and HRs and FARs] for a randomly selected half of the stimuli and their d’ scores [and HRs and FARs] for the other half of the stimuli. In contrast, the authors look at reliability by computing the correlations between all stimuli’s d’ scores [and HRs and FARs] for a randomly selected half of the participants and the stimuli’s d’ scores [and HRs and FARs] for the other half of the participants. For these types of analyses, it’s a plus that the authors show it for all three indicators (d’, HR, and FAR).
In one of the typical recognition memory studies discussed in lecture, we may wonder whether inter-individual differences in recognition performance hold across stimuli. The authors are examining whether inter-stimulus differences in recognizability hold across participants.
I have a (minor) issue with the analyses reported on the bottom of page 1326 (right column). By averaging across participants the authors are basically ignoring the variability due to participants. They try to work their way around it by using weighted means (weighting by the inverse variance of the b1 estimate for each participant). But there are more elegant ways to do that. The authors could have simply estimated a generalized linear mixed-effects model and examined if the by-participant random intercept and by-stimulus random intercept are reliably different from zero (You will learn about models with multiple types of random effects in unit 33). In addition, one may wonder whether the slope of the logit function (= the predicted increase in log-odds for every one-unit increase in the predictor) is the right indicator: Why the increase in log-odds, and not the increase in odds or the increase in probability? (You will learn about logistic regression in unit 34). But their analysis and “my” analysis will probably yield the same conclusion in 97% of the cases, so no major issue here.
I am not very impressed by the median splits reported in the left column of page 1327. But the basic idea is fine. In lecture, we might examine if there are distinct types of participants: recognition experts (high FA, low FA -> highly positive d’, c around 0), yea sayers with poor recognition performance (high FA, high FA -> d’ around 0, highly negative c), nay sayers with poor recognition performance (low FA, low FA -> d’ around 0, highly positive c), and participants with good recognition performance who want to annoy the experimenter or mixed up the two response buttons (low FA, high FA -> highly negative d’, c around 0). The authors do something similar but for stimuli:
distinctive and highly memorable faces (high HR, low FAR -> highly positive d’, c around 0),
typical faces (high HR, high FAR -> d’ around 0, highly negative c),
forgettable faces (low HR, low FAR -> d’ around 0, highly positive c),
and faces evoking many false memories but few true ones (low HR, high FAR -> highly negative d’, c around 0).
I probably would have examined the extent to which the stimuli’s position in the two-dimensional space defined by d’ and c changes across random split halves of the participants. But the authors’ data-analytic approach yields four nicely defined groups of faces that readers can easily comprehend and remember, whereas “my” analysis doesn’t, so you could certainly argue that the authors’ data analytic approach, despite being slightly sub-optimal from a statistical perspective, is the better approach.
For those with more rigid definitions of familiarity, consistently low ratings were given to all of the unfamiliar GAN faces, while consistently high ratings were given to the Mickey Mouse catch trials.
Getting Turker response of concrete vs. abstract definitions based on stuff in the written responses
reanalyze with linear mixed-effect models…
Incremental N rankings on participant level
standardize by individual! DO WHAT WE DID IN STATS to compute d’ and c!
Analyses by strategy in rating study (maybe include this in regression models? talk to Markus)
Confirm method for removing bad data in memorability experiment -> include a catch trial
ranked correlation for “most familiar” faces between people and groups
incremental rank agreement measures by experiment
make plot of ordered face list overall for each experiment
Finalize participant count -> doubling seems reasonable for all three! I’m worried about being underpowered…
Familiarity for never-before-seen faces is a phenomenon tied to both visual perception and personal experience (Lyon, 1996). Can face images be intrinsically familiar? If so, can familiarity be measured consistently? We obtained three measures of familiarity for 100 hyper-realistic, GAN-generated faces, and examined the correspondence in responses among participants and among experiments. Our first task captured memorability (accurate recognition of something previously seen; recognition hit rate, Bainbridge et al., 2016) and familiarity (false recognition of something not previously seen; false alarm rate). However, false alarm-based quantification of familiarity alone is likely more conservative than our typical experiences of familiarity.
Therefore, in a second experiment, we measured familiarity using an untimed forced-choice task in which participants chose the “more familiar” face in random pairs of faces. The resulting score for each face across participants serves as its familiarity score. Finally, in a third study, we aimed to capture the subjective nature of familiarity for individual faces by having participants rate faces on a sliding scale between “Not at all familiar” and “Extremely familiar”.
To establish the reliability among participants in their familiarity judgements, we computed Kendall ranked correlations between image rankings (by familiarity score) for 100 split-halves of the data for each experiment. We found widespread variability in image rankings (Exp.1 mean tau=0.07, Exp.2=0.01, Exp.3=0.04). We calculated the consistency of participant responses relative to population responses using logistic regression to predict familiarity scores and found varying levels of agreement by participant. Finally, we computed a Kendall correlation for image rankings among all experiments and found no significant correlation. The lack of correspondence in responses among participants and experiments suggests that “familiar” is likely not an intrinsic property and that experimental measures may fail to capture our everyday experience of face familiarity consistently.
We computed ranked correlations by “familiarity” score across all three experiments… Our results indicate that face images can have measurable, visually “familiar” properties that roughly correspond to increasingly more explicit measures of familiarity, suggesting some agreement between perceptual familiarity, and the individual experience of familiarity.
is it consistent and is it distinct from memorability? don’t overpromise, lead with reasons why we would choose three tasks
Face images can be intrinsically “memorable”– a measurable visual property independent from individual experience (Bainbridge et al., 2013).
Many of us have experienced a sense of “familiarity” for faces we have never seen before (e.g. at the extreme, pictures of celebrity look-alikes). Is the phenomenon of face familiarity one of individual experience and memory– a sense of “knowing”or “feeling”–, or a consequence of intrinsic properties of the face image itself? We obtained three measures of familiarity for 100, hyper-realistic, GAN-generated face images, and examined if these face images were consistently rated as familiar– across participants, measurements, and conceptual definitions.
To establish whether, and which, images in our dataset have salient, “familiar”, visual properties, we employed a memorability task paradigm that captured both visual memorability scores for 103 images (100 “unfamiliar” GAN faces, 3 images of Mickey Mouse), as well as inferred measurements of image familiarity (Bainbridge et al., 2016). We expected highly “familiar” images identified in this experiment to be consistently highly-scored under other measurements of familiarity.
In another sample, we captured image familiarity somewhat more explicitly, using a two-alternative, forced-choice task. Here, participants chose the “more familiar” face from pairs of two GAN face images, randomly paired without repeats. For each trial, images either get a score of 1 (familiar) or 0 (not familiar). The resulting score for each image across all participants serves as a measure of an image’s generalized familiarity score.
In our third experiment we obtained explicit measures of familiarity. Participants rated 100 “unfamiliar” GAN face images on a sliding scale between “Not at all familiar” and “Extremely familiar”. This is intended to capture the subtleties of “familiarity” for a given face, in comparison to the time and task pressures of the forced choice and memorability tasks.
Beyond determining whether our three measurements of familiarity are consistent with one another, we were also interested in capturing the experience of familiarity. To this end, our third experiment included a survey, in which participants rated their agreement with statements about different strategies for rating familiarity. Participants also provided a short written response to the question: “How would you describe ‘familiarity’?”.
Our results indicate that familiarity is likely not an intrinsic visual feature of face images, but rather a subjective rating of individual experience. Approximating measurements of familiarity with memory, forced-choice, or explicit rating tasks yield inconsistent definitions, and therefore measurements, of what “familiar” is.