1 Set up R environment

library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plyr)
library(magick)
library(png)
library(EBImage)

Set the R working drectory to the main experiment directory.

setwd("/Users/adambarnas/Box/Mudsplash/Results")  

2 Format & manipulate raw data files

2.1 Read-in datafiles

First, read in the individual subject files (saved automatically on the server as csv files).

tbl_all <- list.files(path = "./Wolfe1_data/", pattern = "*.csv", full.names = T) %>%
  map_df(~read_csv(.))
tbl_all <- data.frame(tbl_all)
#head(tbl_all,10)

Get a count of the number of subjects.

nrow(tbl_all %>% distinct(workerId,.keep_all = FALSE))
## [1] 115

Next, rename the catch trials to the same convention as the main trials and break apart the unmod_image column into database (the lab where the stims come from) and image (the name of the image file).

tbl_all$unmod_image[tbl_all$unmod_image == "catchAirplane-a"] <- "wolfe_catchAirplane-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchBoat-a"] <- "wolfe_catchBoat-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchCow-a"] <- "wolfe_catchCow-a"
tbl_all$unmod_image <- lapply(tbl_all$unmod_image, gsub, pattern='-a', replacement='')
tbl_all <- tbl_all %>%
separate(unmod_image,into=c('database', 'image'), sep = "([\\_])")
#head(tbl_all,10)

Let’s, for now, also assign the trials to bins based on the trial number. The 2 practice trials at the beginning and the 1 catch trial at the end will be labeled “filler”.

tbl_all$bin = "filler"
tbl_all[which(tbl_all$trial_number %in% c(3:8)), "bin"] = "block_1"
tbl_all[which(tbl_all$trial_number %in% c(9:14)), "bin"] = "block_2"
tbl_all[which(tbl_all$trial_number %in% c(15:20)), "bin"] = "block_3"
tbl_all[which(tbl_all$trial_number %in% c(21:26)), "bin"] = "block_4"
tbl_all[which(tbl_all$trial_number %in% c(27:32)), "bin"] = "block_5"

Get the total number of trials for each subject and the initial count for each image.

tbl_all_counts <- tbl_all %>%
  group_by(workerId,image) %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow") %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_all_counts,10)

image_count_initial <- data.frame(image_count = colSums(tbl_all_counts[,2:112], na.rm = TRUE))
knitr::kable(image_count_initial)
image_count
image-001 33
image-002 28
image-003 31
image-004 35
image-005 31
image-006 33
image-007 34
image-008 31
image-009 32
image-010 32
image-011 33
image-012 33
image-013 33
image-014 32
image-015 33
image-016 33
image-017 32
image-018 30
image-019 33
image-020 34
image-021 31
image-022 30
image-023 32
image-024 33
image-025 33
image-026 32
image-027 32
image-028 33
image-029 33
image-030 33
image-031 32
image-032 31
image-033 33
image-034 34
image-035 33
image-037 32
image-038 33
image-039 34
image-040 29
image-041 30
image-042 32
image-043 31
image-044 31
image-045 34
image-046 32
image-047 34
image-048 31
image-049 31
image-050 30
image-076 32
image-077 33
image-078 31
image-079 31
image-080 34
image-081 32
image-082 33
image-083 32
image-084 33
image-085 33
image-086 34
image-087 32
image-088 35
image-089 33
image-090 33
image-091 31
image-092 33
image-093 32
image-094 33
image-095 33
image-096 33
image-097 32
image-098 33
image-099 34
image-100 32
image-101 31
image-102 33
image-103 30
image-104 30
image-105 30
image-106 29
image-107 32
image-108 32
image-109 34
image-110 32
image-111 31
image-112 32
image-113 32
image-114 33
image-115 35
image-116 37
image-117 30
image-118 29
image-119 35
image-120 29
image-121 32
image-122 34
image-123 30
image-124 34
image-125 30
image-126 33
image-127 31
image-128 33
image-129 33
image-130 33
image-131 32
image-132 31
image-133 30
image-134 31
image-135 34
image-136 32
image-137 32

The data are loaded. Let’s move on and examine the quality of the data.

2.2 Analyze accuracy

In this chunk, every click for a given image is compared to the image difference hull. The process involves the addition of two arrays - the difference hull array and an array created by the script and the subject’s click. The difference hull array is composed of 0s and 1s, with 1s corresponding to the changing object. An equally sized array of all 0s is composed, with one 1 corresponding to the X,Y coordinates of the click. These two arrays are added together and the maximum value is queried. A maximum value of 2 indicates that the click occurred within the boundaries of the image difference hall (an accurate click). A values less than 2 indicates that the click occurred outside the boundaries of the image difference hall (an inaccurate click). In the new click_ACC column, 1s correspond to accurate clicks and 0s correspond to inaccurate clicks. This will analyze the accuracy for the 2 practice images, all main images, and the 1 catch image.

img_train <- list.files(path = "/Users/adambarnas/Box/Mudsplash/Boxes_Wolfe1/", pattern = ".png", all.files = TRUE,full.names = TRUE,no.. = TRUE)
img_array <- readPNG(img_train)
img_list <- lapply(img_train, readPNG)
img_names <- row.names(image_count_initial)
img_names <- c("catchAirplane", "catchBoat", "catchCow", img_names)
names(img_list) = img_names

tbl_all$x[tbl_all$x == "0"] <- 1
tbl_all$y[tbl_all$y == "0"] <- 1

tbl_all$click_ACC= "filler"

for (i in 1:length(tbl_all$workerId)){
  img <- data.frame(img_list[tbl_all$image[i]])
  blank <- data.frame(array(c(0,0), dim = c(nrow(img),ncol(img))))
  blank[tbl_all$y[i], tbl_all$x[i]] <- 1
  combo <- img + blank
  which(combo==2, arr.ind=TRUE)
  if (max(combo, na.rm=TRUE) == 2){
    tbl_all$click_ACC[i] = 1
  } else {
    tbl_all$click_ACC[i] = 0
  }
} 

2.2.1 Catch trials

Check the accuracy of the catch trial. As a reminder, the catch trial was a large, salient changing object. If a subject did not click on the changing object during the catch trial, their performance on the main trials is likely poor and will be excluded. This chunk will filter the data by accuracy for both inaccurate (bad) catch trials and accurate (good) catch trials and save new dataframes. This chunk will also provide the number and workerIds for inaccurate and accurate catch trial performance.

tbl_all_catch_acc <- tbl_all %>%
  filter(image == "catchCow")
tbl_bad_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 0)
tbl_good_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 1)

tbl_bad_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_bad_catch_acc$workerId),]
nrow(tbl_bad_catch_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 2
tbl_good_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_good_catch_acc$workerId),]
nrow(tbl_good_catch_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 113

2.2.2 Main trials

Now, check the accuracy of the clicks for the main images. This chunk will compute the total number of inaccurate and accurate clicks for each subject.

tbl_good_catch_acc_all_main_acc <- tbl_good_catch_acc %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow")
tbl_good_catch_acc_all_main_acc_counts <- tbl_good_catch_acc_all_main_acc %>%
  group_by(workerId,click_ACC) %>%
  dplyr::summarize(counts = n()) %>%
  spread(click_ACC,counts) %>%
  mutate(total = rowSums(.[2:3], na.rm = TRUE))
colnames(tbl_good_catch_acc_all_main_acc_counts) <- c("workerId", "inacc", "acc", "total")

Here, we can plot the overall accuracy of the main trial clicks for the group.

tbl_good_catch_acc_all_main_acc_rate <- (tbl_good_catch_acc_all_main_acc_counts$acc / tbl_good_catch_acc_all_main_acc_counts$total)
tbl_good_catch_acc_all_main_acc_rate <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_counts[,1], tbl_good_catch_acc_all_main_acc_rate)
colnames(tbl_good_catch_acc_all_main_acc_rate) <- c("workerId", "acc_rate")
tbl_good_catch_acc_all_main_acc_rate[is.na(tbl_good_catch_acc_all_main_acc_rate)] <- 0


tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", add = "mean_se", ylim = c(0, 1), xlab = "Group", width = 0.5, label = TRUE, lab.nb.digits = 2, lab.vjust = -1.6, title = "Main Trial Accuracy for All Subjects")

tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(x = "workerId", y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), title = "Main Trial Accuracy for Individual Subjects", font.xtickslab = 4, sort.val = c("asc")) + rotate_x_text()

Count the number of subjects and only remove inaccurate trials.

nrow(tbl_good_catch_acc_all_main_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 113
tbl_good_catch_acc_all_main_acc_inacc_trials_removed <- tbl_good_catch_acc_all_main_acc %>% 
  filter(click_ACC == 1)

2.3 Remove outlier trials

Next, we can remove outlier RTs that are more than 3 SDs away from the mean.

Let’s get the number of trials. This is the initial number of trials.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts,10)

Before the data are trimmed, let’s generate histograms of all RTs and the mean RT of each subject

tbl_good_catch_acc_all_main_acc_inacc_trials_removed$rt_s = tbl_good_catch_acc_all_main_acc_inacc_trials_removed$rt/1000
tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,600), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_mean_subj_RT <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 25, xlim = c(0,25), ylim = c(0,12), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

Trial timer maxed out at 60 sec. Any RTs recorded as 60 sec should be discarded.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>% 
  filter(rt < 60000)

Next, data are inspected for RT outliers. Two additional columns are added to the data table. First, an “outliers” column is added that labels an RT as an outlier or not (0 = not an outlier, 1 = an outlier less than 3 SDs, 2 = an outlier greater than 3 SDs). Second, a “removed_RT” column is added that contains non-outlier RTs.

Note: code can be changed to allow for replacement of outliers with the cutoff values.

correct.trials <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed[tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed$click_ACC == "1",]
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed <- ddply(correct.trials, .(workerId), function(x){
  m <- mean(x$rt)
  s <- sd(x$rt)
  upper <- m + 3*s #change 3 with another number to increase or decrease cutoff criteria
  lower <- m - 3*s #change 3 with another number to increase or decrease cutoff criteria

  x$outliers <- 0
  x$outliers[x$rt > upper] <- 2
  x$outliers[x$rt < lower] <- 1
  x$removed_RT <- x$rt
  x$removed_RT[x$rt > upper]<- NA #change NA with upper to replace an outlier with the upper cutoff
  x$removed_RT[x$rt < lower]<- NA #change NA with lower to replace an outlier with the lower cutoff
  
  x
})
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed,10)

Next, let’s completely toss out the outlier trials (labeled as NA).

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed[!is.na(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed$removed_RT),]
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed,10)

Let’s get the number of trials. This is the number of trials that “survive” the data trimming.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts,10)

Here are new histograms of all RTs and the mean RT of each subject.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,600), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 25, xlim = c(0,25), ylim = c(0,15), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

What is the percentage of outlier RTs that were removed overall?

tbl_all_main_acc_rts_3SD_removed_count <- data.frame(total_removed = tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum - tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts$sum)

per_RTs_removed <- (sum(tbl_all_main_acc_rts_3SD_removed_count) / sum(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum)) * 100
per_RTs_removed
## [1] 2.521008

What is the percentage of outlier RTs that were removed per subject? This is easy to visualize in a plot.

tbl_per_rts_3SD_removed_by_subj <- data.frame((tbl_all_main_acc_rts_3SD_removed_count / tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum) * 100)
tbl_per_rts_3SD_removed_by_subj <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts[1],tbl_all_main_acc_rts_3SD_removed_count,tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum,tbl_per_rts_3SD_removed_by_subj)
colnames(tbl_per_rts_3SD_removed_by_subj) <- c("workerId", "outlier_RTs", "total_RTs", "percent_excluded")
#head(tbl_per_rts_3SD_removed_by_subj,10)

tbl_per_rts_3SD_removed_by_subj %>% 
  ggbarplot(x = "workerId", y = "percent_excluded", ylab = "% RTs excluded", fill = "#f7a800", font.tickslab = 4, sort.val = c("asc")) + rotate_x_text()

3 Analyze data

3.1 Some summary statistics

Let’s again confirm how many subjects we’re working with. This is the total number of subjects with good catch trial accuracy and good main trial accuracy.

nrow(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts %>% distinct(workerId,.keep_all = FALSE))
## [1] 106

3.2 Plot the results

This is a plot of the mean detection RT for each image.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  ggbarplot(x = "image", y = "rt_s", ylab = "Mean Detection RT (sec)", fill = "#f7a800", add = "mean_se", font.xtickslab = 4, sort.val = c("asc")) + rotate_x_text() + theme(legend.position = "none")

This table contains the final count for each image. This is after RTs were excluded that were more than 3 SDs from the mean.

image_count_final <- data.frame(image_count = colSums(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts[,2:112], na.rm = TRUE))
knitr::kable(image_count_final)
image_count
image-001 27
image-002 22
image-003 28
image-004 29
image-005 23
image-006 20
image-007 23
image-008 23
image-009 24
image-010 27
image-011 27
image-012 21
image-013 20
image-014 24
image-015 25
image-016 14
image-017 10
image-018 23
image-019 29
image-020 22
image-021 21
image-022 17
image-023 22
image-024 21
image-025 23
image-026 26
image-027 5
image-028 23
image-029 23
image-030 20
image-031 20
image-032 20
image-033 21
image-034 23
image-035 6
image-037 23
image-038 10
image-039 22
image-040 23
image-041 22
image-042 18
image-043 20
image-044 14
image-045 16
image-046 20
image-047 23
image-048 10
image-049 25
image-050 25
image-076 15
image-077 22
image-078 22
image-079 17
image-080 29
image-081 17
image-082 26
image-083 23
image-084 26
image-085 21
image-086 13
image-087 24
image-088 27
image-089 22
image-090 15
image-091 16
image-092 24
image-093 19
image-094 20
image-095 27
image-096 18
image-097 21
image-098 18
image-099 24
image-100 22
image-101 24
image-102 8
image-103 19
image-104 17
image-105 21
image-106 21
image-107 24
image-108 22
image-109 26
image-110 25
image-111 20
image-112 24
image-113 22
image-114 25
image-115 26
image-116 5
image-117 24
image-118 17
image-119 28
image-120 26
image-121 21
image-122 20
image-123 19
image-124 18
image-125 23
image-126 24
image-127 18
image-128 24
image-129 10
image-130 18
image-131 15
image-132 19
image-133 22
image-134 21
image-135 26
image-136 23
image-137 24