Read-in datafiles
First, read in the individual subject files (saved automatically on the server as csv files).
tbl_all <- list.files(path = "./Wolfe1_data/", pattern = "*.csv", full.names = T) %>%
map_df(~read_csv(.))
tbl_all <- data.frame(tbl_all)
#head(tbl_all,10)
Get a count of the number of subjects.
nrow(tbl_all %>% distinct(workerId,.keep_all = FALSE))
## [1] 115
Next, rename the catch trials to the same convention as the main trials and break apart the unmod_image column into database (the lab where the stims come from) and image (the name of the image file).
tbl_all$unmod_image[tbl_all$unmod_image == "catchAirplane-a"] <- "wolfe_catchAirplane-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchBoat-a"] <- "wolfe_catchBoat-a"
tbl_all$unmod_image[tbl_all$unmod_image == "catchCow-a"] <- "wolfe_catchCow-a"
tbl_all$unmod_image <- lapply(tbl_all$unmod_image, gsub, pattern='-a', replacement='')
tbl_all <- tbl_all %>%
separate(unmod_image,into=c('database', 'image'), sep = "([\\_])")
#head(tbl_all,10)
Let’s, for now, also assign the trials to bins based on the trial number. The 2 practice trials at the beginning and the 1 catch trial at the end will be labeled “filler”.
tbl_all$bin = "filler"
tbl_all[which(tbl_all$trial_number %in% c(3:8)), "bin"] = "block_1"
tbl_all[which(tbl_all$trial_number %in% c(9:14)), "bin"] = "block_2"
tbl_all[which(tbl_all$trial_number %in% c(15:20)), "bin"] = "block_3"
tbl_all[which(tbl_all$trial_number %in% c(21:26)), "bin"] = "block_4"
tbl_all[which(tbl_all$trial_number %in% c(27:32)), "bin"] = "block_5"
Get the total number of trials for each subject and the initial count for each image.
tbl_all_counts <- tbl_all %>%
group_by(workerId,image) %>%
filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow") %>%
dplyr::summarize(counts = n()) %>%
spread(image,counts) %>%
mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_all_counts,10)
image_count_initial <- data.frame(image_count = colSums(tbl_all_counts[,2:112], na.rm = TRUE))
image_count_initial
## image_count
## image-001 33
## image-002 28
## image-003 31
## image-004 35
## image-005 31
## image-006 33
## image-007 34
## image-008 31
## image-009 32
## image-010 32
## image-011 33
## image-012 33
## image-013 33
## image-014 32
## image-015 33
## image-016 33
## image-017 32
## image-018 30
## image-019 33
## image-020 34
## image-021 31
## image-022 30
## image-023 32
## image-024 33
## image-025 33
## image-026 32
## image-027 32
## image-028 33
## image-029 33
## image-030 33
## image-031 32
## image-032 31
## image-033 33
## image-034 34
## image-035 33
## image-037 32
## image-038 33
## image-039 34
## image-040 29
## image-041 30
## image-042 32
## image-043 31
## image-044 31
## image-045 34
## image-046 32
## image-047 34
## image-048 31
## image-049 31
## image-050 30
## image-076 32
## image-077 33
## image-078 31
## image-079 31
## image-080 34
## image-081 32
## image-082 33
## image-083 32
## image-084 33
## image-085 33
## image-086 34
## image-087 32
## image-088 35
## image-089 33
## image-090 33
## image-091 31
## image-092 33
## image-093 32
## image-094 33
## image-095 33
## image-096 33
## image-097 32
## image-098 33
## image-099 34
## image-100 32
## image-101 31
## image-102 33
## image-103 30
## image-104 30
## image-105 30
## image-106 29
## image-107 32
## image-108 32
## image-109 34
## image-110 32
## image-111 31
## image-112 32
## image-113 32
## image-114 33
## image-115 35
## image-116 37
## image-117 30
## image-118 29
## image-119 35
## image-120 29
## image-121 32
## image-122 34
## image-123 30
## image-124 34
## image-125 30
## image-126 33
## image-127 31
## image-128 33
## image-129 33
## image-130 33
## image-131 32
## image-132 31
## image-133 30
## image-134 31
## image-135 34
## image-136 32
## image-137 32
The data are loaded. Let’s move on and examine the quality of the data.
Analyze accuracy
In this chunk, every click for a given image is compared to the image difference hull. The process involves the addition of two arrays - the difference hull array and an array created by the script and the subject’s click. The difference hull array is composed of 0s and 1s, with 1s corresponding to the changing object. An equally sized array of all 0s is composed, with one 1 corresponding to the X,Y coordinates of the click. These two arrays are added together and the maximum value is queried. A maximum value of 2 indicates that the click occurred within the boundaries of the image difference hall (an accurate click). A values less than 2 indicates that the click occurred outside the boundaries of the image difference hall (an inaccurate click). In the new click_ACC column, 1s correspond to accurate clicks and 0s correspond to inaccurate clicks. This will analyze the accuracy for the 2 practice images, all main images, and the 1 catch image.
img_train <- list.files(path = "/Users/adambarnas/Box/Mudsplash/Boxes_Wolfe1/", pattern = ".png", all.files = TRUE,full.names = TRUE,no.. = TRUE)
img_array <- readPNG(img_train)
img_list <- lapply(img_train, readPNG)
img_names <- row.names(image_count_initial)
img_names <- c("catchAirplane", "catchBoat", "catchCow", img_names)
names(img_list) = img_names
tbl_all$x[tbl_all$x == "0"] <- 1
tbl_all$y[tbl_all$y == "0"] <- 1
tbl_all$click_ACC= "filler"
for (i in 1:length(tbl_all$workerId)){
img <- data.frame(img_list[tbl_all$image[i]])
blank <- data.frame(array(c(0,0), dim = c(nrow(img),ncol(img))))
blank[tbl_all$y[i], tbl_all$x[i]] <- 1
combo <- img + blank
which(combo==2, arr.ind=TRUE)
if (max(combo, na.rm=TRUE) == 2){
tbl_all$click_ACC[i] = 1
} else {
tbl_all$click_ACC[i] = 0
}
}
Catch trials
Check the accuracy of the catch trial. As a reminder, the catch trial was a large, salient changing object. If a subject did not click on the changing object during the catch trial, their performance on the main trials is likely poor and will be excluded. This chunk will filter the data by accuracy for both inaccurate (bad) catch trials and accurate (good) catch trials and save new dataframes. This chunk will also provide the number and workerIds for inaccurate and accurate catch trial performance.
tbl_all_catch_acc <- tbl_all %>%
filter(image == "catchCow")
tbl_bad_catch_acc <- tbl_all_catch_acc %>%
filter(click_ACC == 0)
tbl_good_catch_acc <- tbl_all_catch_acc %>%
filter(click_ACC == 1)
tbl_bad_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_bad_catch_acc$workerId),]
nrow(tbl_bad_catch_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 2
tbl_good_catch_acc <- tbl_all[(tbl_all$workerId %in% tbl_good_catch_acc$workerId),]
nrow(tbl_good_catch_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 113
Main trials
Now, check the accuracy of the clicks for the main images. This chunk will compute the total number of inaccurate and accurate clicks for each subject.
tbl_good_catch_acc_all_main_acc <- tbl_good_catch_acc %>%
filter(image!= "catchAirplane" & image!= "catchBoat" & image!= "catchCow")
tbl_good_catch_acc_all_main_acc_counts <- tbl_good_catch_acc_all_main_acc %>%
group_by(workerId,click_ACC) %>%
dplyr::summarize(counts = n()) %>%
spread(click_ACC,counts) %>%
mutate(total = rowSums(.[2:3], na.rm = TRUE))
colnames(tbl_good_catch_acc_all_main_acc_counts) <- c("workerId", "inacc", "acc", "total")
Here, we can plot the overall accuracy of the main trial clicks for the group and for each individual subject. We can split the group by accuracy rate and plot “bad” and “good” subgroups. Subjects who clicked on the changing object 75% or more of the time are grouped as “good” and subjects who clicked on the changing object less than 75% are grouped as “bad”.
tbl_good_catch_acc_all_main_acc_rate <- (tbl_good_catch_acc_all_main_acc_counts$acc / tbl_good_catch_acc_all_main_acc_counts$total)
tbl_good_catch_acc_all_main_acc_rate <- cbind.data.frame(tbl_good_catch_acc_all_main_acc_counts[,1], tbl_good_catch_acc_all_main_acc_rate)
colnames(tbl_good_catch_acc_all_main_acc_rate) <- c("workerId", "acc_rate")
tbl_good_catch_acc_all_main_acc_rate[is.na(tbl_good_catch_acc_all_main_acc_rate)] <- 0
tbl_good_catch_acc_all_main_acc_rate$cat = "filler"
for (i in 1:length(tbl_good_catch_acc_all_main_acc_rate$workerId)){
if (tbl_good_catch_acc_all_main_acc_rate$acc_rate[i] >= 0.75){
tbl_good_catch_acc_all_main_acc_rate$cat[i] = "Good"
} else {
tbl_good_catch_acc_all_main_acc_rate$cat[i] = "Bad"
}
}
tbl_good_catch_acc_good_main_acc_rate <- tbl_good_catch_acc_all_main_acc_rate %>%
filter(acc_rate >= 0.75)
tbl_good_catch_acc_bad_main_acc_rate <- tbl_good_catch_acc_all_main_acc_rate %>%
filter(acc_rate < 0.75)
tbl_good_catch_acc_all_main_acc_rate %>%
ggbarplot(y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", add = "mean_se", ylim = c(0, 1), xlab = "Group", width = 0.5, label = TRUE, lab.nb.digits = 2, lab.vjust = -1.6, title = "Main Trial Accuracy for All Subjects")

tbl_good_catch_acc_all_main_acc_rate %>%
ggbarplot(x = "workerId", y = "acc_rate", ylab = "Accuracy", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), title = "Main Trial Accuracy for Individual Subjects", font.xtickslab = 4, sort.val = c("asc")) + rotate_x_text() + geom_hline(yintercept = .75, linetype = 2)

tbl_good_catch_acc_all_main_acc_rate %>%
ggbarplot(x = "cat", y = "acc_rate", ylab = "Accuracy", xlab = "Accuracy Group", add = "mean_se", fill = "#f7a800", color = "#f7a800", ylim = c(0, 1), label = TRUE, lab.nb.digits = 2, lab.vjust = c(-2.3, -0.8), title = "Main Trial Accuracy for Bad and Good Subjects", sort.val = c("asc"))

This dataframe consists of subjects with good catch trial accuracy and good main trial accuracy (again, greater than 75%).
tbl_good_catch_acc_good_main_acc <- tbl_good_catch_acc_all_main_acc[(tbl_good_catch_acc_all_main_acc$workerId %in% tbl_good_catch_acc_good_main_acc_rate$workerId),]
nrow(tbl_good_catch_acc_good_main_acc %>% distinct(workerId,.keep_all = FALSE))
## [1] 61
Remove inaccurate trials from the subjects with good main accuracy.
tbl_good_catch_acc_good_main_acc_inacc_trials_removed <- tbl_good_catch_acc_good_main_acc %>%
filter(click_ACC == 1)
Remove outlier trials
Next, we can remove outlier RTs that are more than 3 SDs away from the mean.
Let’s get the number of trials. This is the initial number of trials.
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
group_by(workerId,image) %>%
dplyr::summarize(counts = n()) %>%
spread(image,counts) %>%
mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts,10)
Before the data are trimmed, let’s generate histograms of all RTs and the mean RT of each subject
tbl_good_catch_acc_good_main_acc_inacc_trials_removed$rt_s = tbl_good_catch_acc_good_main_acc_inacc_trials_removed$rt/1000
tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,400), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_mean_subj_RT <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
group_by(workerId) %>%
dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_mean_subj_RT %>%
gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 25, xlim = c(0,25), ylim = c(0,8), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

Trial timer maxed out at 60 sec. Any RTs recorded as 60 sec should be discarded.
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed %>%
filter(rt < 60000)
Next, data are inspected for RT outliers. Two additional columns are added to the data table. First, an “outliers” column is added that labels an RT as an outlier or not (0 = not an outlier, 1 = an outlier less than 3 SDs, 2 = an outlier greater than 3 SDs). Second, a “removed_RT” column is added that contains non-outlier RTs.
Note: code can be changed to allow for replacement of outliers with the cutoff values.
correct.trials <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed[tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed$click_ACC == "1",]
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed <- ddply(correct.trials, .(workerId), function(x){
m <- mean(x$rt)
s <- sd(x$rt)
upper <- m + 3*s #change 3 with another number to increase or decrease cutoff criteria
lower <- m - 3*s #change 3 with another number to increase or decrease cutoff criteria
x$outliers <- 0
x$outliers[x$rt > upper] <- 2
x$outliers[x$rt < lower] <- 1
x$removed_RT <- x$rt
x$removed_RT[x$rt > upper]<- NA #change NA with upper to replace an outlier with the upper cutoff
x$removed_RT[x$rt < lower]<- NA #change NA with lower to replace an outlier with the lower cutoff
x
})
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed,10)
Next, let’s completely toss out the outlier trials (labeled as NA).
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed[!is.na(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed$removed_RT),]
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed,10)
Let’s get the number of trials. This is the number of trials that “survive” the data trimming.
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
group_by(workerId,image) %>%
dplyr::summarize(counts = n()) %>%
spread(image,counts) %>%
mutate(sum = rowSums(.[-1], na.rm = TRUE))
#head(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts,10)
Here are new histograms of all RTs and the mean RT of each subject.
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
gghistogram(x = "rt_s", fill = "#f7a800", rug = TRUE, bins = 60, xlim = c(0,60), ylim = c(0,400), xlab = ("Detection RT (sec)"), title = "All RTs")

tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT <- tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed %>%
group_by(workerId) %>%
dplyr::summarize(mean_rt = mean(rt_s, na.rm=TRUE))
tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_mean_subj_RT %>%
gghistogram(x = "mean_rt", fill = "#f7a800", rug = TRUE, bins = 25, xlim = c(0,25), ylim = c(0,8), xlab = ("Mean Detection RT (sec)"), title = "Subject Mean RT")

What is the percentage of outlier RTs that were removed overall?
tbl_rts_3SD_removed_count <- data.frame(total_removed = tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum - tbl_good_catch_acc_good_main_acc_inacc_trials_removed_timeout_trials_removed_rts_3SD_trimmed_rts_3SD_removed_counts$sum)
per_RTs_removed <- (sum(tbl_rts_3SD_removed_count) / sum(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum)) * 100
per_RTs_removed
## [1] 2.604482
What is the percentage of outlier RTs that were removed per subject? This is easy to visualize in a plot.
tbl_per_rts_3SD_removed_by_subj <- data.frame((tbl_rts_3SD_removed_count / tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum) * 100)
tbl_per_rts_3SD_removed_by_subj <- cbind.data.frame(tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts[1],tbl_rts_3SD_removed_count,tbl_good_catch_acc_good_main_acc_inacc_trials_removed_counts$sum,tbl_per_rts_3SD_removed_by_subj)
colnames(tbl_per_rts_3SD_removed_by_subj) <- c("workerId", "outlier_RTs", "total_RTs", "percent_excluded")
#head(tbl_per_rts_3SD_removed_by_subj,10)
tbl_per_rts_3SD_removed_by_subj %>%
ggbarplot(x = "workerId", y = "percent_excluded", ylab = "% RTs excluded", fill = "#f7a800", font.tickslab = 8, sort.val = c("asc")) + rotate_x_text()
