1 Set up R environment
2 Format & manipulate raw data files
3 Analyze data
- 3.1 Some summary statistics
- 3.2 Plot the results

1 Set up R environment

library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plyr)
library(magick)
library(png)
library(EBImage)

Set the R working drectory to the main experiment directory.

setwd("/Users/adambarnas/Box/Mudsplash/Results")

2 Format & manipulate raw data files

2.1 Read-in datafiles

First, read in the individual subject files (saved automatically on the server as csv files).

tbl_all <- list.files(path = "./Wolfe1_data/", pattern = "*.csv", full.names = T) %>%
  map_df(~read_csv(.))
tbl_all <- data.frame(tbl_all)
#head(tbl_all,10)

Get a count of the number of subjects.

nrow(tbl_all %>% distinct(workerId,.keep_all = FALSE))

## [1] 115

Next, rename the catch trials to the same convention as the main trials and break apart the unmod_image column into database (the lab where the stims come from) and image (the name of the image file).

tbl_all$unmod_image[tbl_all$unmod_image == "catchAirplane-a"] <- "wolfe_catchAirplane-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchBoat-a"] <- "wolfe_catchBoat-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchCow-a"] <- "wolfe_catchCow-a"
tbl_all$unmod_image <- lapply(tbl_all$unmod_image, gsub, pattern='-a', replacement='')
tbl_all <- tbl_all %>%
separate(unmod_image,into=c('database', 'image'), sep = "([\\_])")
#head(tbl_all,10)

Let’s, for now, also assign the trials to bins based on the trial number. The 2 practice trials at the beginning and the 1 catch trial at the end will be labeled “filler”.

tbl_all$bin = "filler"
tbl_all[which(tbl_all$trial_number %in% c(3:8)), "bin"] = "block_1"
tbl_all[which(tbl_all$trial_number %in% c(9:14)), "bin"] = "block_2"
tbl_all[which(tbl_all$trial_number %in% c(15:20)), "bin"] = "block_3"
tbl_all[which(tbl_all$trial_number %in% c(21:26)), "bin"] = "block_4"
tbl_all[which(tbl_all$trial_number %in% c(27:32)), "bin"] = "block_5"

Get the total number of trials for each subject and the initial count for each image.

tbl_all_counts <- tbl_all %>%
  group_by(workerId,image) %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow") %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_all_counts,10)

image_count_initial <- data.frame(image_count = colSums(tbl_all_counts[,2:112], na.rm = TRUE))
knitr::kable(image_count_initial)

	image_count
image-001	33
image-002	28
image-003	31
image-004	35
image-005	31
image-006	33
image-007	34
image-008	31
image-009	32
image-010	32
image-011	33
image-012	33
image-013	33
image-014	32
image-015	33
image-016	33
image-017	32
image-018	30
image-019	33
image-020	34
image-021	31
image-022	30
image-023	32
image-024	33
image-025	33
image-026	32
image-027	32
image-028	33
image-029	33
image-030	33
image-031	32
image-032	31
image-033	33
image-034	34
image-035	33
image-037	32
image-038	33
image-039	34
image-040	29
image-041	30
image-042	32
image-043	31
image-044	31
image-045	34
image-046	32
image-047	34
image-048	31
image-049	31
image-050	30
image-076	32
image-077	33
image-078	31
image-079	31
image-080	34
image-081	32
image-082	33
image-083	32
image-084	33
image-085	33
image-086	34
image-087	32
image-088	35
image-089	33
image-090	33
image-091	31
image-092	33
image-093	32
image-094	33
image-095	33
image-096	33
image-097	32
image-098	33
image-099	34
image-100	32
image-101	31
image-102	33
image-103	30
image-104	30
image-105	30
image-106	29
image-107	32
image-108	32
image-109	34
image-110	32
image-111	31
image-112	32
image-113	32
image-114	33
image-115	35
image-116	37
image-117	30
image-118	29
image-119	35
image-120	29
image-121	32
image-122	34
image-123	30
image-124	34
image-125	30
image-126	33
image-127	31
image-128	33
image-129	33
image-130	33
image-131	32
image-132	31
image-133	30
image-134	31
image-135	34
image-136	32
image-137	32

The data are loaded. Let’s move on and examine the quality of the data.

2.2 Analyze accuracy

In this chunk, every click for a given image is compared to the image difference hull. The process involves the addition of two arrays - the difference hull array and an array created by the script and the subject’s click. The difference hull array is composed of 0s and 1s, with 1s corresponding to the changing object. An equally sized array of all 0s is composed, with one 1 corresponding to the X,Y coordinates of the click. These two arrays are added together and the maximum value is queried. A maximum value of 2 indicates that the click occurred within the boundaries of the image difference hall (an accurate click). A values less than 2 indicates that the click occurred outside the boundaries of the image difference hall (an inaccurate click). In the new click_ACC column, 1s correspond to accurate clicks and 0s correspond to inaccurate clicks. This will analyze the accuracy for the 2 practice images, all main images, and the 1 catch image.

img_train <- list.files(path = "/Users/adambarnas/Box/Mudsplash/Boxes_Wolfe1/", pattern = ".png", all.files = TRUE,full.names = TRUE,no.. = TRUE)
img_array <- readPNG(img_train)
img_list <- lapply(img_train, readPNG)
img_names <- row.names(image_count_initial)
img_names <- c("catchAirplane", "catchBoat", "catchCow", img_names)
names(img_list) = img_names

tbl_all$x[tbl_all$x == "0"] <- 1
tbl_all$y[tbl_all$y == "0"] <- 1

tbl_all$click_ACC= "filler"

for (i in 1:length(tbl_all$workerId)){
  img <- data.frame(img_list[tbl_all$image[i]])
  blank <- data.frame(array(c(0,0), dim = c(nrow(img),ncol(img))))
  blank[tbl_all$y[i], tbl_all$x[i]] <- 1
  combo <- img + blank
  which(combo==2, arr.ind=TRUE)
  if (max(combo, na.rm=TRUE) == 2){
    tbl_all$click_ACC[i] = 1
  } else {
    tbl_all$click_ACC[i] = 0
  }
}

2.2.1 Catch trials

Check the accuracy of the catch trial. As a reminder, the catch trial was a large, salient changing object. If a subject did not click on the changing object during the catch trial, their performance on the main trials is likely poor and will be excluded. This chunk will filter the data by accuracy for both inaccurate (bad) catch trials and accurate (good) catch trials and save new dataframes. This chunk will also provide the number and workerIds for inaccurate and accurate catch trial performance.

tbl_all_catch_acc <- tbl_all %>%
  filter(image == "catchCow")
tbl_bad_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 0)
tbl_good_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 1)

tbl_bad_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_bad_catch_acc$workerId),]
nrow(tbl_bad_catch_acc %>% distinct(workerId,.keep_all = FALSE))

## [1] 2

tbl_good_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_good_catch_acc$workerId),]
nrow(tbl_good_catch_acc %>% distinct(workerId,.keep_all = FALSE))

## [1] 113

2.2.2 Main trials

Now, check the accuracy of the clicks for the main images. This chunk will compute the total number of inaccurate and accurate clicks for each subject.

tbl_good_catch_acc_all_main_acc <- tbl_good_catch_acc %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow")
tbl_good_catch_acc_all_main_acc_counts <- tbl_good_catch_acc_all_main_acc %>%
  group_by(workerId,click_ACC) %>%
  dplyr::summarize(counts = n()) %>%
  spread(click_ACC,counts) %>%
  mutate(total = rowSums(.[2:3], na.rm = TRUE))
colnames(tbl_good_catch_acc_all_main_acc_counts) <- c("workerId", "inacc", "acc", "total")

Here, we can plot the overall accuracy of the main trial clicks for the group.

tbl_good_catch_acc_all_main_acc_rate <- (tbl_good_catch_acc_all_main_acc_counts$acc / tbl_good_catch_acc_all_main_acc_counts$total)
tbl_good_catch_acc_all_main_acc_rate <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_counts[,1], tbl_good_catch_acc_all_main_acc_rate)
colnames(tbl_good_catch_acc_all_main_acc_rate) <- c("workerId", "acc_rate")
tbl_good_catch_acc_all_main_acc_rate[is.na(tbl_good_catch_acc_all_main_acc_rate)] <- 0


tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", add = "mean_se", ylim = c(0, 1), xlab = "Group", width = 0.5, label = TRUE, lab.nb.digits = 2, lab.vjust = -1.6, title = "Main Trial Accuracy for All Subjects")

tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(x = "workerId", y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), title = "Main Trial Accuracy for Individual Subjects", font.xtickslab = 4, sort.val = c("asc")) + rotate_x_text()

Count the number of subjects and only remove inaccurate trials.

nrow(tbl_good_catch_acc_all_main_acc %>% distinct(workerId,.keep_all = FALSE))

## [1] 113

tbl_good_catch_acc_all_main_acc_inacc_trials_removed <- tbl_good_catch_acc_all_main_acc %>% 
  filter(click_ACC == 1)

2.3 Remove outlier trials

Next, we can remove outlier RTs that are more than 3 SDs away from the mean.

Let’s get the number of trials. This is the initial number of trials.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts,10)

Before the data are trimmed, let’s generate histograms of all RTs and the mean RT of each subject

tbl_good_catch_acc_all_main_acc_inacc_trials_removed$rt_s = tbl_good_catch_acc_all_main_acc_inacc_trials_removed$rt/1000
tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,600), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_mean_subj_RT <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 25, xlim = c(0,25), ylim = c(0,12), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

Trial timer maxed out at 60 sec. Any RTs recorded as 60 sec should be discarded.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed %>% 
  filter(rt < 60000)

Next, data are inspected for RT outliers. Two additional columns are added to the data table. First, an “outliers” column is added that labels an RT as an outlier or not (0 = not an outlier, 1 = an outlier less than 3 SDs, 2 = an outlier greater than 3 SDs). Second, a “removed_RT” column is added that contains non-outlier RTs.

Note: code can be changed to allow for replacement of outliers with the cutoff values.

correct.trials <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed[tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed$click_ACC == "1",]
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed <- ddply(correct.trials, .(workerId), function(x){
  m <- mean(x$rt)
  s <- sd(x$rt)
  upper <- m + 3*s #change 3 with another number to increase or decrease cutoff criteria
  lower <- m - 3*s #change 3 with another number to increase or decrease cutoff criteria

  x$outliers <- 0
  x$outliers[x$rt > upper] <- 2
  x$outliers[x$rt < lower] <- 1
  x$removed_RT <- x$rt
  x$removed_RT[x$rt > upper]<- NA #change NA with upper to replace an outlier with the upper cutoff
  x$removed_RT[x$rt < lower]<- NA #change NA with lower to replace an outlier with the lower cutoff
  
  x
})
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed,10)

Next, let’s completely toss out the outlier trials (labeled as NA).

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed[!is.na(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed$removed_RT),]
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed,10)

Let’s get the number of trials. This is the number of trials that “survive” the data trimming.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts,10)

Here are new histograms of all RTs and the mean RT of each subject.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,600), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT <- tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 25, xlim = c(0,25), ylim = c(0,15), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

What is the percentage of outlier RTs that were removed overall?

tbl_all_main_acc_rts_3SD_removed_count <- data.frame(total_removed = tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum - tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts$sum)

per_RTs_removed <- (sum(tbl_all_main_acc_rts_3SD_removed_count) / sum(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum)) * 100
per_RTs_removed

## [1] 2.521008

What is the percentage of outlier RTs that were removed per subject? This is easy to visualize in a plot.

tbl_per_rts_3SD_removed_by_subj <- data.frame((tbl_all_main_acc_rts_3SD_removed_count / tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum) * 100)
tbl_per_rts_3SD_removed_by_subj <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts[1],tbl_all_main_acc_rts_3SD_removed_count,tbl_good_catch_acc_all_main_acc_inacc_trials_removed_counts$sum,tbl_per_rts_3SD_removed_by_subj)
colnames(tbl_per_rts_3SD_removed_by_subj) <- c("workerId", "outlier_RTs", "total_RTs", "percent_excluded")
#head(tbl_per_rts_3SD_removed_by_subj,10)

tbl_per_rts_3SD_removed_by_subj %>% 
  ggbarplot(x = "workerId", y = "percent_excluded", ylab = "% RTs excluded", fill = "#f7a800", font.tickslab = 4, sort.val = c("asc")) + rotate_x_text()

3 Analyze data

3.1 Some summary statistics

Let’s again confirm how many subjects we’re working with. This is the total number of subjects with good catch trial accuracy and good main trial accuracy.

nrow(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts %>% distinct(workerId,.keep_all = FALSE))

## [1] 106

3.2 Plot the results

This is a plot of the mean detection RT for each image.

tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  ggbarplot(x = "image", y = "rt_s", ylab = "Mean Detection RT (sec)", fill = "#f7a800", add = "mean_se", font.xtickslab = 4, sort.val = c("asc")) + rotate_x_text() + theme(legend.position = "none")

This table contains the final count for each image. This is after RTs were excluded that were more than 3 SDs from the mean.

image_count_final <- data.frame(image_count = colSums(tbl_good_catch_acc_all_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts[,2:112], na.rm = TRUE))
knitr::kable(image_count_final)

	image_count
image-001	27
image-002	22
image-003	28
image-004	29
image-005	23
image-006	20
image-007	23
image-008	23
image-009	24
image-010	27
image-011	27
image-012	21
image-013	20
image-014	24
image-015	25
image-016	14
image-017	10
image-018	23
image-019	29
image-020	22
image-021	21
image-022	17
image-023	22
image-024	21
image-025	23
image-026	26
image-027	5
image-028	23
image-029	23
image-030	20
image-031	20
image-032	20
image-033	21
image-034	23
image-035	6
image-037	23
image-038	10
image-039	22
image-040	23
image-041	22
image-042	18
image-043	20
image-044	14
image-045	16
image-046	20
image-047	23
image-048	10
image-049	25
image-050	25
image-076	15
image-077	22
image-078	22
image-079	17
image-080	29
image-081	17
image-082	26
image-083	23
image-084	26
image-085	21
image-086	13
image-087	24
image-088	27
image-089	22
image-090	15
image-091	16
image-092	24
image-093	19
image-094	20
image-095	27
image-096	18
image-097	21
image-098	18
image-099	24
image-100	22
image-101	24
image-102	8
image-103	19
image-104	17
image-105	21
image-106	21
image-107	24
image-108	22
image-109	26
image-110	25
image-111	20
image-112	24
image-113	22
image-114	25
image-115	26
image-116	5
image-117	24
image-118	17
image-119	28
image-120	26
image-121	21
image-122	20
image-123	19
image-124	18
image-125	23
image-126	24
image-127	18
image-128	24
image-129	10
image-130	18
image-131	15
image-132	19
image-133	22
image-134	21
image-135	26
image-136	23
image-137	24

Wolfe1 Mudsplash

Adam Barnas

Last compiled at 5:07 PM on May 09, 2020