1 Set up R environment

library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plyr)
library(magick)
library(png)
library(EBImage)

Set the R working drectory to the main experiment directory.

setwd("/Users/adambarnas/Box/Mudsplash/Results")  

2 Format & manipulate raw data files

2.1 Read-in datafiles

First, read in the individual subject files (saved automatically on the server as csv files).

tbl_all <- list.files(path = "./Ma_data/", pattern = "*.csv", full.names = T) %>%
  map_df(~read_csv(.))
tbl_all <- data.frame(tbl_all)
#head(tbl_all,10)

Get a count of the number of subjects.

nrow(tbl_all %>% distinct(workerId,.keep_all = FALSE))
## [1] 63

Next, rename the catch trials to the same convention as the main trials and break apart the unmod_image column into database (the lab where the stims come from) and image (the name of the image file).

tbl_all$unmod_image[tbl_all$unmod_image == "catchAirplane-a"] <- "rensink_catchAirplane-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchBoat-a"] <- "rensink_catchBoat-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchCow-a"] <- "rensink_catchCow-a"
tbl_all <- tbl_all %>%
separate(unmod_image,into=c('database', 'image', NA), sep = "([\\_\\-])")
#head(tbl_all,10)

Let’s, for now, also assign the trials to bins based on the trial number. The 2 practice trials at the beginning and the 1 catch trial at the end will be labeled “filler”.

tbl_all$bin = "filler"
tbl_all[which(tbl_all$trial_number %in% c(3:8)), "bin"] = "block_1"
tbl_all[which(tbl_all$trial_number %in% c(9:14)), "bin"] = "block_2"
tbl_all[which(tbl_all$trial_number %in% c(15:20)), "bin"] = "block_3"
tbl_all[which(tbl_all$trial_number %in% c(21:26)), "bin"] = "block_4"
tbl_all[which(tbl_all$trial_number %in% c(27:32)), "bin"] = "block_5"

Get the total number of trials for each subject and the initial count for each image.

tbl_all_counts <- tbl_all %>%
  group_by(workerId,image) %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow") %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_all_counts,10)

image_count_initial <- data.frame(image_count = colSums(tbl_all_counts[,2:70], na.rm = TRUE))
knitr::kable(image_count_initial)
image_count
10504629 27
10810329 25
1191801 28
12115280 25
12178414 27
13141692 27
1383096 30
13873251 28
16527526 28
18169626 27
18345691 28
22020472 28
23024660 30
23199105 27
24383097 25
25107991 28
27857618 25
3099758 28
31236119 28
32289063 26
38466626 28
38546029 28
42429798 24
4247084 29
44993860 23
45525109 29
46475259 27
46635293 27
48384711 25
48486405 29
51537628 29
51856108 29
55174490 26
56835136 27
57861456 26
61118260 30
62096551 29
62224663 30
67862299 29
69128765 27
70687495 27
72488522 29
73637203 27
74173745 27
75081153 25
75958241 26
77345858 27
77574131 29
77793328 28
79191795 29
79222679 27
79241011 29
79573638 28
8197559 26
81993755 28
83536470 28
83691215 28
83785171 26
85741618 27
86520382 29
87983207 26
88767165 27
89354846 30
8974554 27
90405028 27
95091295 26
97475929 28
98156944 26
98265889 26

The data are loaded. Let’s move on and examine the quality of the data.

2.2 Analyze accuracy

In this chunk, every click for a given image is compared to the image difference hull. The process involves the addition of two arrays - the difference hull array and an array created by the script and the subject’s click. The difference hull array is composed of 0s and 1s, with 1s corresponding to the changing object. An equally sized array of all 0s is composed, with one 1 corresponding to the X,Y coordinates of the click. These two arrays are added together and the maximum value is queried. A maximum value of 2 indicates that the click occurred within the boundaries of the image difference hall (an accurate click). A values less than 2 indicates that the click occurred outside the boundaries of the image difference hall (an inaccurate click). In the new click_ACC column, 1s correspond to accurate clicks and 0s correspond to inaccurate clicks. This will analyze the accuracy for the 2 practice images, all main images, and the 1 catch image.

img_train <- list.files(path = "/Users/adambarnas/Box/Mudsplash/Boxes_Ma/", pattern = ".png", all.files = TRUE,full.names = TRUE,no.. = TRUE)
img_array <- readPNG(img_train)
img_list <- lapply(img_train, readPNG)
img_names <- row.names(image_count_initial)
img_names <- c("catchAirplane", "catchBoat", "catchCow", img_names)
names(img_list) = img_names

tbl_all$x[tbl_all$x == "0"] <- 1
tbl_all$y[tbl_all$y == "0"] <- 1

tbl_all$click_ACC= "filler"

for (i in 1:length(tbl_all$workerId)){
  img <- data.frame(img_list[tbl_all$image[i]])
  blank <- data.frame(array(c(0,0), dim = c(nrow(img),ncol(img))))
  blank[tbl_all$y[i], tbl_all$x[i]] <- 1
  combo <- img + blank
  which(combo==2, arr.ind=TRUE)
  if (max(combo, na.rm=TRUE) == 2){
    tbl_all$click_ACC[i] = 1
  } else {
    tbl_all$click_ACC[i] = 0
  }
} 

2.2.1 Catch trials

Check the accuracy of the catch trial. As a reminder, the catch trial was a large, salient changing object. If a subject did not click on the changing object during the catch trial, their performance on the main trials is likely poor and will be excluded. This chunk will filter the data by accuracy for both inaccurate (bad) catch trials and accurate (good) catch trials and save new dataframes. This chunk will also provide the number and workerIds for inaccurate and accurate catch trial performance.

tbl_all_catch_acc <- tbl_all %>%
  filter(image == "catchCow")
tbl_bad_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 0)
tbl_good_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 1)

tbl_bad_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_bad_catch_acc$workerId),]
nrow(tbl_bad_catch_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 1
tbl_good_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_good_catch_acc$workerId),]
nrow(tbl_good_catch_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 62

2.2.2 Main trials

Now, check the accuracy of the clicks for the main images. This chunk will compute the total number of inaccurate and accurate clicks for each subject.

tbl_good_catch_acc_all_main_acc <- tbl_good_catch_acc %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow")
tbl_good_catch_acc_all_main_acc_counts <- tbl_good_catch_acc_all_main_acc %>%
  group_by(workerId,click_ACC) %>%
  dplyr::summarize(counts = n()) %>%
  spread(click_ACC,counts) %>%
  mutate(total = rowSums(.[2:3], na.rm = TRUE))
colnames(tbl_good_catch_acc_all_main_acc_counts) <- c("workerId", "inacc", "acc", "total")

Here, we can plot the overall accuracy of the main trial clicks for the group.

tbl_good_catch_acc_all_main_acc_rate <- (tbl_good_catch_acc_all_main_acc_counts$acc / tbl_good_catch_acc_all_main_acc_counts$total)
tbl_good_catch_acc_all_main_acc_rate <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_counts[,1], tbl_good_catch_acc_all_main_acc_rate)
colnames(tbl_good_catch_acc_all_main_acc_rate) <- c("workerId", "acc_rate")
tbl_good_catch_acc_all_main_acc_rate[is.na(tbl_good_catch_acc_all_main_acc_rate)] <- 0


tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", add = "mean_se", ylim = c(0, 1), xlab = "Group", width = 0.5, label = TRUE, lab.nb.digits = 2, lab.vjust = -1.6, title = "Main Trial Accuracy for All Subjects")

tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(x = "workerId", y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), title = "Main Trial Accuracy for Individual Subjects", font.xtickslab = 8, sort.val = c("asc")) + rotate_x_text()

Count the number of subjects and only remove inaccurate trials.

nrow(tbl_good_catch_acc_all_main_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 62
tbl_good_catch_acc_all_main_acc_inacc_trials_removed <- tbl_good_catch_acc_all_main_acc %>% 
  filter(click_ACC == 1)

2.3 Remove outlier trials

Next, we can remove outlier RTs that are more than 3 SDs away from the mean.

Let’s get the number of trials. This is the initial number of trials.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts,10)

Before the data are trimmed, let’s generate histograms of all RTs and the mean RT of each subject

tbl_good_catch_acc_all_main_acc_inacc_trials_removed$rt_s = tbl_good_catch_acc_all_main_acc_inacc_trials_removed$rt/1000
tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,500), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_mean_subj_RT <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 20, xlim = c(0,20), ylim = c(0,10), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

Trial timer maxed out at 60 sec. Any RTs recorded as 60 sec should be discarded.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>% 
  filter(rt < 60000)

Next, data are inspected for RT outliers. Two additional columns are added to the data table. First, an “outliers” column is added that labels an RT as an outlier or not (0 = not an outlier, 1 = an outlier less than 3 SDs, 2 = an outlier greater than 3 SDs). Second, a “removed_RT” column is added that contains non-outlier RTs.

Note: code can be changed to allow for replacement of outliers with the cutoff values.

correct.trials <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed[tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed$click_ACC == "1",]
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed <- ddply(correct.trials, .(workerId), function(x){
  m <- mean(x$rt)
  s <- sd(x$rt)
  upper <- m + 3*s #change 3 with another number to increase or decrease cutoff criteria
  lower <- m - 3*s #change 3 with another number to increase or decrease cutoff criteria

  x$outliers <- 0
  x$outliers[x$rt > upper] <- 2
  x$outliers[x$rt < lower] <- 1
  x$removed_RT <- x$rt
  x$removed_RT[x$rt > upper]<- NA #change NA with upper to replace an outlier with the upper cutoff
  x$removed_RT[x$rt < lower]<- NA #change NA with lower to replace an outlier with the lower cutoff
  
  x
})
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed,10)

Next, let’s completely toss out the outlier trials (labeled as NA).

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed[!is.na(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed$removed_RT),]
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed,10)

Let’s get the number of trials. This is the number of trials that “survive” the data trimming.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts,10)

Here are new histograms of all RTs and the mean RT of each subject.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,400), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 20, xlim = c(0,20), ylim = c(0,10), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

What is the percentage of outlier RTs that were removed overall?

tbl_all_main_acc_rts_3SD_removed_count <- data.frame(total_removed = tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum - tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts$sum)

per_RTs_removed <- (sum(tbl_all_main_acc_rts_3SD_removed_count) / sum(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum)) * 100
per_RTs_removed
## [1] 2.941176

What is the percentage of outlier RTs that were removed per subject? This is easy to visualize in a plot.

tbl_per_rts_3SD_removed_by_subj <- data.frame((tbl_all_main_acc_rts_3SD_removed_count / tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum) * 100)
tbl_per_rts_3SD_removed_by_subj <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts[1],tbl_all_main_acc_rts_3SD_removed_count,tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum,tbl_per_rts_3SD_removed_by_subj)
colnames(tbl_per_rts_3SD_removed_by_subj) <- c("workerId", "outlier_RTs", "total_RTs", "percent_excluded")
#head(tbl_per_rts_3SD_removed_by_subj,10)

tbl_per_rts_3SD_removed_by_subj %>% 
  ggbarplot(x = "workerId", y = "percent_excluded", ylab = "% RTs excluded", fill = "#f7a800", font.tickslab = 8, sort.val = c("asc")) + rotate_x_text()

3 Analyze data

3.1 Some summary statistics

Let’s again confirm how many subjects we’re working with. This is the total number of subjects with good catch trial accuracy and good main trial accuracy.

nrow(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts %>% distinct(workerId,.keep_all = FALSE))
## [1] 57

3.2 Plot the results

This is a plot of the mean detection RT for each image.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  ggbarplot(x = "image", y = "rt_s", ylab = "Mean Detection RT (sec)", fill = "#f7a800", add = "mean_se", font.xtickslab = 8, sort.val = c("asc")) + rotate_x_text() + theme(legend.position = "none")

This table contains the final count for each image. This is after RTs were excluded that were more than 3 SDs from the mean.

image_count_final <- data.frame(image_count = colSums(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts[,2:70], na.rm = TRUE))
knitr::kable(image_count_final)
image_count
10504629 21
10810329 18
1191801 23
12115280 15
12178414 16
13141692 19
1383096 19
13873251 21
16527526 15
18169626 15
18345691 18
22020472 20
23024660 25
23199105 15
24383097 18
25107991 20
27857618 24
3099758 20
31236119 21
32289063 15
38466626 22
38546029 15
42429798 13
4247084 16
44993860 20
45525109 13
46475259 16
46635293 18
48384711 10
48486405 21
51537628 20
51856108 19
55174490 19
56835136 19
57861456 18
61118260 25
62096551 22
62224663 13
67862299 20
69128765 8
70687495 16
72488522 22
73637203 18
74173745 20
75081153 19
75958241 20
77345858 21
77574131 19
77793328 19
79191795 19
79222679 21
79241011 24
79573638 18
8197559 13
81993755 4
83536470 18
83691215 17
83785171 22
85741618 13
86520382 7
87983207 13
88767165 16
89354846 16
8974554 19
90405028 19
95091295 17
97475929 15
98156944 20
98265889 11