Sorterator_2 Data Analysis

Summer 2015

Molly Lewis and Andres Camperi

(1) SETUP

Do preprocessing on data or just read in data?

preprocess.data = FALSE

(2) PREPROCESSING

setwd("/Documents/GRADUATE_SCHOOL/Projects/sorterator/sorterator_2/processed_data/")

if (preprocess.data) {
  # loop to read in files
  all.data <- data.frame()
  files <- dir(raw.data.path_2pt,pattern="*.txt")
  for (file.name in files) {
    print(file.name)
    d <- read.smi.idf(paste(raw.data.path_2pt,file.name,sep=""),header.rows=35)
    d <- preprocess.data(d)
    d$subid <- file.name
    
    ## now here's where data get bound together
    all.data <- rbind(all.data, d)
    }
  
    files <- dir(raw.data.path_5pt,pattern="*.txt")
    for (file.name in files) {
    print(file.name)
    d <- read.smi.idf(paste(raw.data.path_5pt,file.name,sep=""),header.rows=38)
    d <- preprocess.data(d)
    d$subid <- file.name
    
    ## now here's where data get bound together
    all.data <- rbind(all.data, d)
    }
  
  write.csv(all.data,paste(processed.data.path,
                           "all_data.csv",sep=""),
            row.names=FALSE) 
}

#d <- read.csv("processed_data/all_data.csv")
d <- read.csv("all_data.csv")

Change no-data coordinates to “NA”

d$y[d$y == "1050"] <- NA 
d$x[d$x == "0"] <- NA
d$count.na <- is.na(d$x) | is.na(d$y) # make a column that shows whether NA (true) or not (false)

(3) DATAFRAME SETUP

Add item information

setwd("/Documents/GRADUATE_SCHOOL/Projects/sorterator/sorterator_2/analysis/")

#order <- read.csv("analysis/stim_order.csv")
order <- read.csv("stim_order.csv")
d <- left_join(d, order)

Add subject drop information and drop bad subjects

d$subid = as.factor(sapply(as.character(d$subid), 
                           function(a) strsplit(a,"-")[[1]][1]))
demo <- read.csv("demographics.csv")
d <- left_join(d, demo) 

m = d[d$keep_drop == "keep",] # drop bad subjects

Drop Fillers

d = d[!is.na(d$shape),]

Get trial order

d = get.trial.order(d)
d$trial.set = as.factor(ifelse(grepl("training",d$stimulus), "training", "testing"))
d$block <- factor (d$block, levels = c("training","testing"))

Define regions of interest (roi)

rois <- list()
rois[[1]] <- c(0,400,650,650)
rois[[2]] <- c(850,400,650,650)
names(rois) <- c("L","R")
#roi.image(rois)

d$roi <- roi.check(d,rois)
d$roi_left <- d$roi == "L"
summary(d$roi_left)

##    Mode   FALSE    TRUE    NA's 
## logical   72408   63497  212537

Define down sampling rate

# in one of the test videos (YC) the object emerges earlier than the others - fix this
d$t.stim = ifelse(d$trial.type == "yellow_cross", d$t.stim + .33, d$t.stim)

subsample.hz <- 10 
d$t.stim.binned <- round(d$t.stim*subsample.hz)/subsample.hz

allS <- d # save dataframe before exclusions

Drop trials based on number of NAs

t.reject.threshold = .5

# get proportion nas for each trial
trial.nas <- ddply(d,c("subid", "stimulus"), 
                   function (temp) {"prop.nas" = length(which(is.na(temp$x)))/length(temp$x)})
names(trial.nas)[3] = "prop.nas"
reject.trials <- trial.nas[trial.nas$prop.nas > t.reject.threshold,]

# add a column in data that combines subid and stim (trial)
reject.trials$subtrial <- paste(reject.trials$subid,"_",reject.trials$stimulus,sep="")
d$subtrial <- paste(d$subid,"_",d$stimulus,sep="")

# reject trials
d <- d[!(d$subtrial %in% reject.trials$subtrial),]

print(paste("Dropped", round(dim(reject.trials)[1]/dim(trial.nas)[1],2), "trials", sep = " "))

## [1] "Dropped 0.23 trials"

Drop subjects based on number of NAs

s.reject.threshold = .7

# proportion nas for each subject
subject.nas <- ddply(trial.nas, "subid", function (temp){mean(temp$prop.nas)})

# reject subjects
reject.subjects <- subject.nas[subject.nas$prop.nas > s.reject.threshold,]
d <- d[!(d$subid %in% reject.subjects$subid),]

# print rejects
print(paste("Dropped", 
            round(dim(reject.subjects)[1]/
                    length(which(summary(d$subid)>0)),2), 
            "subjects", sep = " "))

## [1] "Dropped 0 subjects"

Drop subjects based on number of missing trials

n.trials.threshold = 10
total_n = length(unique(d$subid))

# reject subjects
d = d %>% 
    group_by(subid) %>% 
    summarise(n.trials = length(unique(stimulus))) %>% # get number of trials/subj
    left_join(d) %>% # merge num trials back into giant data frame
    filter(n.trials > n.trials.threshold) # then exclude subjects based on criteria

# print rejects
print(paste("Dropped", total_n -length(unique(d$subid)) , 
            "subjects", sep = " "))

## [1] "Dropped 5 subjects"

# print rejects
print(paste("TOTAL:", length(unique(d$subid)) , 
            "subjects", sep = " "))

## [1] "TOTAL: 17 subjects"

(4) Descriptive plots

(1) Histogram of participant age

d$age.binned <- cut(d$age_group, 
                    seq(floor(min(d$age_group)),ceiling(max(d$age_group)),.5), 
                    include.lowest = TRUE)

sub.by.age = d %>%
            group_by(age.binned) %>%
            summarise(num = length(unique(subid))) %>%
            separate(age.binned,  c("min", "max"),  sep = ",") %>%
            mutate(min = gsub("[(]","",min)) 

qplot(x=min,y=num, data=sub.by.age, stat="identity",
      geom="bar", 
      xlab="Age Group", 
      ylab="Number of Participants")+
  scale_y_continuous(expand=c(0,0)) + 
  themeML

(2) Histogram of trials by trial number

includes.by.order <- d %>%
  group_by(subid, trial.order) %>%
  summarise(x = na.mean(x)) %>%
  filter(!is.na(x)) %>%
  group_by(trial.order) %>%
  summarise(n = n()) 

ggplot(includes.by.order, aes(x = trial.order, y = n)) +
  geom_bar(stat="identity") +
  ylim(0,15) +
  ylab("number of participants")

(3) Histogram of trials by stimulus

includes.by.stimulus <- d %>%
  group_by(subid, stimulus) %>%
  summarise(x = na.mean(x)) %>%
  filter(!is.na(x)) %>%
  group_by(stimulus) %>%
  summarise(n = n()) 

ggplot(includes.by.stimulus, aes(x = stimulus, y = n))+
  geom_bar(stat="identity") +
  ylim(0,15) +
  ylab("number of participants") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

(4) Number of trials by subject

includes.by.each.subj = d %>%
        group_by(subid, trial.order, age_group) %>%
        summarise(x = na.mean(x)) %>%
        filter(!is.na(x)) %>%
        group_by(subid, age_group) %>%
        summarise(n = n()) %>%  
        ungroup() %>% 
        arrange(n)

mean_trials = mean(includes.by.each.subj$n)

includes.by.each.subj$subid <- reorder(includes.by.each.subj$subid,
                                       includes.by.each.subj$n)

ggplot(includes.by.each.subj, aes(y=n, x=subid, fill = age_group, order = n)) +
  geom_bar(position="dodge", stat="identity") +  
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   scale_fill_continuous(low="beige", high="red") +
  geom_hline(yintercept=mean_trials) +
  themeML

#creates new data frame with only one unique observation for each subject
#newd <- d
#newd <- newd[!duplicated(newd[,"subid",]),]

#qplot(x=subid,y=n.trials, data = newd , stat="identity",
    #  geom="bar", 
    #  xlab="subject id", 
    #  ylab="Number of Trials")+
  #scale_y_continuous(expand=c(0,0)) + 
 # themeML

(5) Number of trials by age group

includes.by.subj <- d %>%
  group_by(block, subid, trial.order, age.binned) %>%
  summarise(x = na.mean(x)) %>%
  filter(!is.na(x)) %>%
  group_by(block, subid, age.binned) %>%
  summarise(n = n())

#aged <- d
#aged <- aged[!duplicated(aged[,"subid",]),]
#aged <- aged[,c("subid","n.trials","age.binned")]

trials.by.age = d %>%
                group_by(age.binned) %>%
                summarise(avg.trials = mean(n.trials))

#histogram of average number of trials completed by age group
qplot(x=age.binned,y=avg.trials, data = trials.by.age , stat="identity",
      geom="bar", 
      xlab="age", 
      ylab="Number of Trials")+
  scale_y_continuous(expand=c(0,0)) + 
  themeML

(6) Number of trials by age group and block

ggplot(includes.by.subj, aes(y=n, x=block) ) +
  geom_bar(position="dodge", stat="identity", fill = "red") +  
  facet_grid( ~ age.binned) +
  themeML +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

(7) Number of trials by block

includes.by.subj.block = includes.by.subj %>% 
  group_by(block)  %>%
  summarise(cih = ci.high(n),
            cil = ci.low(n),
            n = na.mean(n))

ggplot(includes.by.subj.block, aes(y=n, x=block), ) +
  geom_bar(position="dodge", stat="identity", fill = "red") +  
  geom_errorbar(aes(ymin = n - cil, ymax= n + cih), width=0.2, position="dodge") +
  themeML

(8) Distribution of gazes across all trials by subjects

window<-c(1.8,6)
qplot(x,y,geom="density2d",
      data=d,
      xlim=c(0,1680),ylim=c(0,1050), facets=~subid)  +
  annotate("rect", 0,0,0,650,400,1000, alpha=0.3) +
  annotate("rect", 0,0,850,1500,400,1000, alpha=0.3)

(5) Analysis plots

(1) PLOT OF X-POSITION OF GAZE

time_enter = 2.28
time_emerge = 3.08

ms <- d %>% 
      group_by(t.stim.binned, trial.type, block) %>% 
      summarise(cih = ci.high(x),
                cil = ci.low(x),
                x = mean(x, na.rm = T))
  
qplot(t.stim.binned,x,colour= trial.type, 
      geom="line", lty=trial.type, facets= block~.,
      data=ms) + 
  geom_ribbon(aes(ymin=x-cil, ymax=x+cih, fill=trial.type), 
              alpha=.2, colour=NA) +
  geom_hline(yintercept=740, lty=4)+
  annotate("rect", 0, 0, time_emerge, time_enter, -Inf, Inf, alpha=0.3)+
  ylim(c(300, 1150)) +
  scale_fill_manual(values=c("yellow_sphere"= "blue", 
                             "red_cross"= "red", 
                             "red_sphere" = "darkred", 
                             "yellow_cross"= "cyan")) +
  scale_color_manual(values=c("red", "red", "blue", 
                              "blue", "red", "blue")) +
  scale_linetype_manual(values = c("solid", "dashed", "solid", 
                                   "dashed","solid", "dashed")) + 
  xlab("Time (s)") + 
  ylab("X-Position of Gaze") + 
  themeML

(2) PLOT OF X-POSITION OF GAZE BY HALF

training.by.block <- d %>%
  mutate(first.half = trial.order < 10) %>%
  filter(block == "training") %>%
  filter(!is.na(trial.order)) %>%
  group_by(t.stim.binned, trial.type, first.half) %>%
  summarise(x = na.mean(x),
            n = n()) 

training.by.block$first.half <- factor(training.by.block$first.half, 
                                       labels = c("second", "first"))

qplot(t.stim.binned,x,colour= trial.type, 
      geom="line", lty=trial.type, facets= first.half~.,
      data=training.by.block) + 
  geom_hline(yintercept=740, lty=4)+
  annotate("rect", 0, 0, time_emerge, time_enter, -Inf, Inf, alpha=0.3)+
  ylim(c(300, 1150)) +
  scale_fill_manual(values=c("yellow_sphere"= "blue", 
                             "red_cross"= "red", 
                             "red_sphere" = "darkred", 
                             "yellow_cross"= "cyan")) +
  scale_color_manual(values=c("red", "red", "blue", 
                              "blue", "red", "blue")) +
  scale_linetype_manual(values = c("solid", "dashed", "solid", 
                                   "dashed","solid", "dashed")) + 
  xlab("Time (s)") + 
  ylab("X-Position of Gaze") + 
  themeML

(2) BY SUBJECT PLOTS

ms4 <- d %>%
  mutate(first.half = trial.order < 10) %>%
  filter(block == "training") %>%
  group_by(t.stim.binned, trial.type, first.half, subid) %>%
  summarise(x = na.mean(x),
            n = n()) 

qplot(t.stim.binned,x,colour= trial.type, 
      geom="line", lty=trial.type, facets= subid ~ first.half,
      data=ms4) + 
  geom_hline(yintercept=740, lty=4)+
  annotate("rect", 0,0, time_emerge, time_enter,-Inf,Inf, alpha=0.3)+
  ylim(c(300, 1150)) +
  scale_fill_manual(values=c("yellow_sphere"= "blue", 
                             "red_cross"= "red", 
                             "red_sphere" = "darkred", 
                             "yellow_cross"= "cyan")) +
  scale_color_manual(values=c("red", "red", "blue", 
                              "blue", "red", "blue"))+
  scale_linetype_manual(values = c("solid", "dashed", "solid", 
                                   "dashed","solid", "dashed")) + 
  xlab("Time (s)") + 
  ylab("X-Position of Gaze")

ALL PLOTS WITHOUT EXCLUSIONS:

(4) Descriptive plots

(1) Histogram of participant age

allS$age.binned <- cut(allS$age_group, 
                    seq(floor(min(allS$age_group)),ceiling(max(allS$age_group)),.5), 
                    include.lowest = TRUE)

sub.by.age = allS %>%
            group_by(age.binned) %>%
            summarise(num = length(unique(subid))) %>%
            separate(age.binned,  c("min", "max"),  sep = ",") %>%
            mutate(min = gsub("[(]","",min)) 

qplot(x=min,y=num, data=sub.by.age, stat="identity",
      geom="bar", 
      xlab="Age Group", 
      ylab="Number of Participants")+
  scale_y_continuous(expand=c(0,0)) + 
  themeML

(2) Histogram of trials by trial number

includes.by.order <- allS %>%
  group_by(subid, trial.order) %>%
  summarise(x = na.mean(x)) %>%
  filter(!is.na(x)) %>%
  group_by(trial.order) %>%
  summarise(n = n()) 

ggplot(includes.by.order, aes(x = trial.order, y = n)) +
  geom_bar(stat="identity") +
  ylim(0,25) +
  ylab("number of participants")

(3) Histogram of trials by stimulus

includes.by.stimulus <- allS %>%
  group_by(subid, stimulus) %>%
  summarise(x = na.mean(x)) %>%
  filter(!is.na(x)) %>%
  group_by(stimulus) %>%
  summarise(n = n()) 

ggplot(includes.by.stimulus, aes(x = stimulus, y = n))+
  geom_bar(stat="identity") +
  ylim(0,25) +
  ylab("number of participants") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

(4) Number of trials by subject

includes.by.each.subj = allS %>%
        group_by(subid, trial.order, age_group) %>%
        summarise(x = na.mean(x)) %>%
        filter(!is.na(x)) %>%
        group_by(subid, age_group) %>%
        summarise(n = n()) %>%  
        ungroup() %>% 
        arrange(n)

mean_trials = mean(includes.by.each.subj$n)

includes.by.each.subj$subid <- reorder(includes.by.each.subj$subid,
                                       includes.by.each.subj$n)

ggplot(includes.by.each.subj, aes(y=n, x=subid, fill = age_group, order = n)) +
  geom_bar(position="dodge", stat="identity") +  
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
   scale_fill_continuous(low="beige", high="red") +
  geom_hline(yintercept=mean_trials) +
  themeML

#creates new data frame with only one unique observation for each subject
#newd <- d
#newd <- newd[!duplicated(newd[,"subid",]),]

#qplot(x=subid,y=n.trials, data = newd , stat="identity",
    #  geom="bar", 
    #  xlab="subject id", 
    #  ylab="Number of Trials")+
  #scale_y_continuous(expand=c(0,0)) + 
 # themeML

(5) Number of trials by age group

includes.by.subj <- allS %>%
  group_by(block, subid, trial.order, age.binned) %>%
  summarise(x = na.mean(x)) %>%
  filter(!is.na(x)) %>%
  group_by(block, subid, age.binned) %>%
  summarise(n = n())

#aged <- allS
#aged <- aged[!duplicated(aged[,"subid",]),]
#aged <- aged[,c("subid","n.trials","age.binned")]

#trials.by.age = aged %>%
#                group_by(age.binned) %>%
 #               summarise(avg.trials = mean(n.trials))

#histogram of average number of trials completed by age group
#qplot(x=age.binned,y=avg.trials, data = trials.by.age , stat="identity",
#      geom="bar", 
#      xlab="age", 
#      ylab="Number of Trials")+
##  themeML

(6) Number of trials by age group and block

ggplot(includes.by.subj, aes(y=n, x=block) ) +
  geom_bar(position="dodge", stat="identity", fill = "red") +  
  facet_grid( ~ age.binned) +
  themeML +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

(7) Number of trials by block

includes.by.subj.block = includes.by.subj %>% 
  group_by(block)  %>%
  summarise(cih = ci.high(n),
            cil = ci.low(n),
            n = na.mean(n))

ggplot(includes.by.subj.block, aes(y=n, x=block), ) +
  geom_bar(position="dodge", stat="identity", fill = "red") +  
  geom_errorbar(aes(ymin = n - cil, ymax= n + cih), width=0.2, position="dodge") +
  themeML

(8) Distribution of gazes across allS trials by subjects

window<-c(1.8,6)
qplot(x,y,geom="density2d",
      data=allS,
      xlim=c(0,1680),ylim=c(0,1050), facets=~subid)  +
  annotate("rect", 0,0,0,650,400,1000, alpha=0.3) +
  annotate("rect", 0,0,850,1500,400,1000, alpha=0.3)

(5) Analysis plots

(1) PLOT OF X-POSITION OF GAZE

time_enter = 2.28
time_emerge = 3.08

ms <- allS %>% 
      group_by(t.stim.binned, trial.type, block) %>% 
      summarise(cih = ci.high(x),
                cil = ci.low(x),
                x = mean(x, na.rm = T))
  
qplot(t.stim.binned,x,colour= trial.type, 
      geom="line", lty=trial.type, facets= block~.,
      data=ms) + 
  geom_ribbon(aes(ymin=x-cil, ymax=x+cih, fill=trial.type), 
              alpha=.2, colour=NA) +
  geom_hline(yintercept=740, lty=4)+
  annotate("rect", 0, 0, time_emerge, time_enter, -Inf, Inf, alpha=0.3)+
  ylim(c(300, 1150)) +
  scale_fill_manual(values=c("yellow_sphere"= "blue", 
                             "red_cross"= "red", 
                             "red_sphere" = "darkred", 
                             "yellow_cross"= "cyan")) +
  scale_color_manual(values=c("red", "red", "blue", 
                              "blue", "red", "blue")) +
  scale_linetype_manual(values = c("solid", "dashed", "solid", 
                                   "dashed","solid", "dashed")) + 
  xlab("Time (s)") + 
  ylab("X-Position of Gaze") + 
  themeML

(2) PLOT OF X-POSITION OF GAZE BY HALF

training.by.block <- allS %>%
  mutate(first.half = trial.order < 10) %>%
  filter(block == "training") %>%
  filter(!is.na(trial.order)) %>%
  group_by(t.stim.binned, trial.type, first.half) %>%
  summarise(x = na.mean(x),
            n = n()) 

training.by.block$first.half <- factor(training.by.block$first.half, 
                                       labels = c("second", "first"))

qplot(t.stim.binned,x,colour= trial.type, 
      geom="line", lty=trial.type, facets= first.half~.,
      data=training.by.block) + 
  geom_hline(yintercept=740, lty=4)+
  annotate("rect", 0, 0, time_emerge, time_enter, -Inf, Inf, alpha=0.3)+
  ylim(c(300, 1150)) +
  scale_fill_manual(values=c("yellow_sphere"= "blue", 
                             "red_cross"= "red", 
                             "red_sphere" = "darkred", 
                             "yellow_cross"= "cyan")) +
  scale_color_manual(values=c("red", "red", "blue", 
                              "blue", "red", "blue")) +
  scale_linetype_manual(values = c("solid", "dashed", "solid", 
                                   "dashed","solid", "dashed")) + 
  xlab("Time (s)") + 
  ylab("X-Position of Gaze") + 
  themeML

(2) BY SUBJECT PLOTS

ms4 <- allS %>%
  mutate(first.half = trial.order < 10) %>%
  filter(block == "training") %>%
  group_by(t.stim.binned, trial.type, first.half, subid) %>%
  summarise(x = na.mean(x),
            n = n()) 

qplot(t.stim.binned,x,colour= trial.type, 
      geom="line", lty=trial.type, facets= subid ~ first.half,
      data=ms4) + 
  geom_hline(yintercept=740, lty=4)+
  annotate("rect", 0,0, time_emerge, time_enter,-Inf,Inf, alpha=0.3)+
  ylim(c(300, 1150)) +
  scale_fill_manual(values=c("yellow_sphere"= "blue", 
                             "red_cross"= "red", 
                             "red_sphere" = "darkred", 
                             "yellow_cross"= "cyan")) +
  scale_color_manual(values=c("red", "red", "blue", 
                              "blue", "red", "blue"))+
  scale_linetype_manual(values = c("solid", "dashed", "solid", 
                                   "dashed","solid", "dashed")) + 
  xlab("Time (s)") + 
  ylab("X-Position of Gaze")