1 Set up R environment
2 Format & manipulate raw data files
3 Analyzing data
- 3.1 Some summary statistics
- 3.2 Plot the results

1 Set up R environment

library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plyr)
library(magick)
library(png)
library(EBImage)

Set the R working drectory to the main experiment directory.

setwd("/Users/adambarnas/Box/Mudsplash/Results")

2 Format & manipulate raw data files

2.1 Read-in datafiles

First, read in the individual subject files (saved automatically on the server as csv files).

tbl_all <- list.files(path = "./Ma_data/", pattern = "*.csv", full.names = T) %>%
  map_df(~read_csv(.))
tbl_all <- data.frame(tbl_all)
#head(tbl_all,10)

Get a count of the number of subjects.

nrow(tbl_all %>% distinct(workerId,.keep_all = FALSE))

## [1] 63

Next, rename the catch trials to the same convention as the main trials and break apart the unmod_image column into database (the lab where the stims come from) and image (the name of the image file).

tbl_all$unmod_image[tbl_all$unmod_image == "catchAirplane-a"] <- "rensink_catchAirplane-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchBoat-a"] <- "rensink_catchBoat-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchCow-a"] <- "rensink_catchCow-a"
tbl_all <- tbl_all %>%
separate(unmod_image,into=c('database', 'image', NA), sep = "([\\_\\-])")
#head(tbl_all,10)

Let’s, for now, also assign the trials to bins based on the trial number. The 2 practice trials at the beginning and the 1 catch trial at the end will be labeled “filler”.

tbl_all$bin = "filler"
tbl_all[which(tbl_all$trial_number %in% c(3:8)), "bin"] = "block_1"
tbl_all[which(tbl_all$trial_number %in% c(9:14)), "bin"] = "block_2"
tbl_all[which(tbl_all$trial_number %in% c(15:20)), "bin"] = "block_3"
tbl_all[which(tbl_all$trial_number %in% c(21:26)), "bin"] = "block_4"
tbl_all[which(tbl_all$trial_number %in% c(27:32)), "bin"] = "block_5"

Get the total number of trials for each subject and the initial count for each image.

tbl_all_counts <- tbl_all %>%
  group_by(workerId,image) %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow") %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_all_counts,10)

image_count_initial <- data.frame(image_count = colSums(tbl_all_counts[,2:70], na.rm = TRUE))
image_count_initial

##          image_count
## 10504629          27
## 10810329          25
## 1191801           28
## 12115280          25
## 12178414          27
## 13141692          27
## 1383096           30
## 13873251          28
## 16527526          28
## 18169626          27
## 18345691          28
## 22020472          28
## 23024660          30
## 23199105          27
## 24383097          25
## 25107991          28
## 27857618          25
## 3099758           28
## 31236119          28
## 32289063          26
## 38466626          28
## 38546029          28
## 42429798          24
## 4247084           29
## 44993860          23
## 45525109          29
## 46475259          27
## 46635293          27
## 48384711          25
## 48486405          29
## 51537628          29
## 51856108          29
## 55174490          26
## 56835136          27
## 57861456          26
## 61118260          30
## 62096551          29
## 62224663          30
## 67862299          29
## 69128765          27
## 70687495          27
## 72488522          29
## 73637203          27
## 74173745          27
## 75081153          25
## 75958241          26
## 77345858          27
## 77574131          29
## 77793328          28
## 79191795          29
## 79222679          27
## 79241011          29
## 79573638          28
## 8197559           26
## 81993755          28
## 83536470          28
## 83691215          28
## 83785171          26
## 85741618          27
## 86520382          29
## 87983207          26
## 88767165          27
## 89354846          30
## 8974554           27
## 90405028          27
## 95091295          26
## 97475929          28
## 98156944          26
## 98265889          26

The data are loaded. Let’s move on and examine the quality of the data.

2.2 Analyze accuracy

In this chunk, every click for a given image is compared to the image difference hull. The process involves the addition of two arrays - the difference hull array and an array created by the script and the subject’s click. The difference hull array is composed of 0s and 1s, with 1s corresponding to the changing object. An equally sized array of all 0s is composed, with one 1 corresponding to the X,Y coordinates of the click. These two arrays are added together and the maximum value is queried. A maximum value of 2 indicates that the click occurred within the boundaries of the image difference hall (an accurate click). A values less than 2 indicates that the click occurred outside the boundaries of the image difference hall (an inaccurate click). In the new click_ACC column, 1s correspond to accurate clicks and 0s correspond to inaccurate clicks. This will analyze the accuracy for the 2 practice images, all main images, and the 1 catch image.

img_train <- list.files(path = "/Users/adambarnas/Box/Mudsplash/Boxes_Ma/", pattern = ".png", all.files = TRUE,full.names = TRUE,no.. = TRUE)
img_array <- readPNG(img_train)
img_list <- lapply(img_train, readPNG)
img_names <- row.names(image_count_initial)
img_names <- c("catchAirplane", "catchBoat", "catchCow", img_names)
names(img_list) = img_names

tbl_all$x[tbl_all$x == "0"] <- 1
tbl_all$y[tbl_all$y == "0"] <- 1

tbl_all$click_ACC= "filler"

for (i in 1:length(tbl_all$workerId)){
  img <- data.frame(img_list[tbl_all$image[i]])
  blank <- data.frame(array(c(0,0), dim = c(nrow(img),ncol(img))))
  blank[tbl_all$y[i], tbl_all$x[i]] <- 1
  combo <- img + blank
  which(combo==2, arr.ind=TRUE)
  if (max(combo, na.rm=TRUE) == 2){
    tbl_all$click_ACC[i] = 1
  } else {
    tbl_all$click_ACC[i] = 0
  }
}

2.2.1 Catch trials

Check the accuracy of the catch trial. As a reminder, the catch trial was a large, salient changing object. If a subject did not click on the changing object during the catch trial, their performance on the main trials is likely poor and will be excluded. This chunk will filter the data by accuracy for both inaccurate (bad) catch trials and accurate (good) catch trials and save new dataframes. This chunk will also provide the number and workerIds for inaccurate and accurate catch trial performance.

tbl_all_catch_acc <- tbl_all %>%
  filter(image == "catchCow")
tbl_bad_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 0)
tbl_good_catch_acc <- tbl_all_catch_acc %>%
  filter(click_ACC == 1)

tbl_bad_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_bad_catch_acc$workerId),]
nrow(tbl_bad_catch_acc %>% distinct(workerId,.keep_all = FALSE))

## [1] 1

tbl_good_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_good_catch_acc$workerId),]
nrow(tbl_good_catch_acc %>% distinct(workerId,.keep_all = FALSE))

## [1] 62

2.2.2 Main trials

Now, check the accuracy of the clicks for the main images. This chunk will compute the total number of inaccurate and accurate clicks for each subject.

tbl_good_catch_acc_all_main_acc <- tbl_good_catch_acc %>%
  filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow")
tbl_good_catch_acc_all_main_acc_counts <- tbl_good_catch_acc_all_main_acc %>%
  group_by(workerId,click_ACC) %>%
  dplyr::summarize(counts = n()) %>%
  spread(click_ACC,counts) %>%
  mutate(total = rowSums(.[2:3], na.rm = TRUE))
colnames(tbl_good_catch_acc_all_main_acc_counts) <- c("workerId", "inacc", "acc", "total")

Here, we can plot the overall accuracy of the main trial clicks for the group and for each individual subject. We can split the group by accuracy rate and plot “bad” and “good” subgroups. Subjects who clicked on the changing object 75% or more of the time are grouped as “good” and subjects who clicked on the changing object less than 75% are grouped as “bad”.

tbl_good_catch_acc_all_main_acc_rate <- (tbl_good_catch_acc_all_main_acc_counts$acc / tbl_good_catch_acc_all_main_acc_counts$total)
tbl_good_catch_acc_all_main_acc_rate <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_counts[,1], tbl_good_catch_acc_all_main_acc_rate)
colnames(tbl_good_catch_acc_all_main_acc_rate) <- c("workerId", "acc_rate")
tbl_good_catch_acc_all_main_acc_rate[is.na(tbl_good_catch_acc_all_main_acc_rate)] <- 0
tbl_good_catch_acc_all_main_acc_rate$cat = "filler"
for (i in 1:length(tbl_good_catch_acc_all_main_acc_rate$workerId)){
  if (tbl_good_catch_acc_all_main_acc_rate$acc_rate[i] >= 0.75){
    tbl_good_catch_acc_all_main_acc_rate$cat[i] = "Good"
  } else {
    tbl_good_catch_acc_all_main_acc_rate$cat[i] = "Bad"
  }
}

tbl_good_catch_acc_good_main_acc_rate <- tbl_good_catch_acc_all_main_acc_rate %>%
  filter(acc_rate >= 0.75)
tbl_good_catch_acc_bad_main_acc_rate <- tbl_good_catch_acc_all_main_acc_rate %>%
  filter(acc_rate < 0.75)

tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", add = "mean_se", ylim = c(0, 1), xlab = "Group", width = 0.5, label = TRUE, lab.nb.digits = 2, lab.vjust = -1.6, title = "Main Trial Accuracy for All Subjects")

tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(x = "workerId", y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), title = "Main Trial Accuracy for Individual Subjects", font.xtickslab = 8, sort.val = c("asc")) + rotate_x_text() + geom_hline(yintercept = .75, linetype = 2)

tbl_good_catch_acc_all_main_acc_rate %>% 
  ggbarplot(x = "cat", y = "acc_rate", ylab = "Accuracy", xlab = "Accuracy Group", add = "mean_se", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), label = TRUE, lab.nb.digits = 2, lab.vjust = c(-2.3, -0.8), title = "Main Trial Accuracy for Bad and Good Subjects", sort.val = c("asc"))

This dataframe consists of subjects with good catch trial accuracy and good main trial accuracy (again, greater than 75%).

tbl_good_catch_acc_good_main_acc <- tbl_good_catch_acc_all_main_acc[(tbl_good_catch_acc_all_main_acc$workerId %in% tbl_good_catch_acc_good_main_acc_rate$workerId),]
nrow(tbl_good_catch_acc_good_main_acc %>% distinct(workerId,.keep_all = FALSE))

## [1] 38

Remove inaccurate trials from the subjects with good main accuracy.

tbl_good_catch_acc_good_main_acc_inacc_trials_removed <- tbl_good_catch_acc_good_main_acc %>% 
  filter(click_ACC == 1)

2.3 Remove outlier trials

Next, we can remove outlier RTs that are more than 3 SDs away from the mean.

Let’s get the number of trials. This is the initial number of trials.

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts,10)

Before the data are trimmed, let’s generate histograms of all RTs and the mean RT of each subject

tbl_good_catch_acc_good_main_acc_inacc_trials_removed$rt_s = tbl_good_catch_acc_good_main_acc_inacc_trials_removed$rt/1000
tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,400), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_mean_subj_RT <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 20, xlim = c(0,20), ylim = c(0,8), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

Trial timer maxed out at 60 sec. Any RTs recorded as 60 sec should be discarded.

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>% 
  filter(rt < 60000)

Next, data are inspected for RT outliers. Two additional columns are added to the data table. First, an “outliers” column is added that labels an RT as an outlier or not (0 = not an outlier, 1 = an outlier less than 3 SDs, 2 = an outlier greater than 3 SDs). Second, a “removed_RT” column is added that contains non-outlier RTs.

Note: code can be changed to allow for replacement of outliers with the cutoff values.

correct.trials <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed[tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed$click_ACC == "1",]
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed <- ddply(correct.trials, .(workerId), function(x){
  m <- mean(x$rt)
  s <- sd(x$rt)
  upper <- m + 3*s #change 3 with another number to increase or decrease cutoff criteria
  lower <- m - 3*s #change 3 with another number to increase or decrease cutoff criteria

  x$outliers <- 0
  x$outliers[x$rt > upper] <- 2
  x$outliers[x$rt < lower] <- 1
  x$removed_RT <- x$rt
  x$removed_RT[x$rt > upper]<- NA #change NA with upper to replace an outlier with the upper cutoff
  x$removed_RT[x$rt < lower]<- NA #change NA with lower to replace an outlier with the lower cutoff
  
  x
})
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed,10)

Next, let’s completely toss out the outlier trials (labeled as NA).

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed[!is.na(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed$removed_RT),]
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed,10)

Let’s get the number of trials. This is the number of trials that “survive” the data trimming.

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId,image) %>%
  dplyr::summarize(counts = n()) %>%
  spread(image,counts) %>%
  mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts,10)

Here are new histograms of all RTs and the mean RT of each subject.

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,400), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  group_by(workerId) %>%
  dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT %>%
  gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 20, xlim = c(0,20), ylim = c(0,8), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

What is the percentage of outlier RTs that were removed overall?

tbl_rts_3SD_removed_count <- data.frame(total_removed = tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum - tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts$sum)

per_RTs_removed <- (sum(tbl_rts_3SD_removed_count) / sum(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum)) * 100
per_RTs_removed

## [1] 2.544031

What is the percentage of outlier RTs that were removed per subject? This is easy to visualize in a plot.

tbl_per_rts_3SD_removed_by_subj <- data.frame((tbl_rts_3SD_removed_count / tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum) * 100)
tbl_per_rts_3SD_removed_by_subj <- cbind.data.frame(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts[1],tbl_rts_3SD_removed_count,tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum,tbl_per_rts_3SD_removed_by_subj)
colnames(tbl_per_rts_3SD_removed_by_subj) <- c("workerId", "outlier_RTs", "total_RTs", "percent_excluded")
#head(tbl_per_rts_3SD_removed_by_subj,10)

tbl_per_rts_3SD_removed_by_subj %>% 
  ggbarplot(x = "workerId", y = "percent_excluded", ylab = "% RTs excluded", fill = "#f7a800", font.tickslab = 8, sort.val = c("asc")) + rotate_x_text()

3 Analyzing data

3.1 Some summary statistics

Let’s again confirm how many subjects we’re working with. This is the total number of subjects with good catch trial accuracy and good main trial accuracy.

nrow(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts %>% distinct(workerId,.keep_all = FALSE))

## [1] 38

3.2 Plot the results

This is a plot of the mean detection RT for each image.

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
  ggbarplot(x = "image", y = "rt_s", ylab = "Mean Detection RT (sec)", fill = "#f7a800", add = "mean_se", font.xtickslab = 8, sort.val = c("asc")) + rotate_x_text() + theme(legend.position = "none")

This table contains the final count for each image. This is after RTs were excluded that were more than 3 SDs from the mean.

image_count_final <- data.frame(image_count = colSums(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts[,2:70], na.rm = TRUE))
image_count_final

##          image_count
## 10504629          17
## 10810329          16
## 1191801           20
## 12115280          13
## 12178414          13
## 13141692          14
## 1383096           14
## 13873251          19
## 16527526          11
## 18169626          12
## 18345691          15
## 22020472          17
## 23024660          19
## 23199105          11
## 24383097          15
## 25107991          15
## 27857618          18
## 3099758           18
## 31236119          18
## 32289063          11
## 38466626          17
## 38546029          11
## 42429798          10
## 4247084           14
## 44993860          16
## 45525109          11
## 46475259          15
## 46635293          15
## 48384711           9
## 48486405          18
## 51537628          15
## 51856108          14
## 55174490          18
## 56835136          15
## 57861456          18
## 61118260          21
## 62096551          18
## 62224663          13
## 67862299          15
## 69128765           6
## 70687495          12
## 72488522          18
## 73637203          16
## 74173745          16
## 75081153          17
## 75958241          17
## 77345858          17
## 77574131          17
## 77793328          15
## 79191795          14
## 79222679          15
## 79241011          16
## 79573638          13
## 8197559           12
## 81993755           2
## 83536470          13
## 83691215          15
## 83785171          18
## 85741618          11
## 86520382           7
## 87983207          12
## 88767165          12
## 89354846          14
## 8974554           16
## 90405028          15
## 95091295          14
## 97475929          14
## 98156944          15
## 98265889           8

Rensink Mudsplash

Adam Barnas

Last compiled at 1:29 PM on May 06, 2020