Preliminaries

rm(list=ls())
library(lsr)
library(lme4)
## Loading required package: Matrix
## Loading required package: Rcpp
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rjson)
## Warning: package 'rjson' was built under R version 3.1.2
library(RSQLite)
## Loading required package: DBI
library(stringr)
library(ggplot2)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.2
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.2
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(tidyr)

mean.na.rm <- function(x) { mean(x, na.rm = TRUE) }  # mean, ignoring NAs
sum.na.rm <- function(x) { sum(x, na.rm = TRUE) }    # sum, ignoring NAs
stderr <- function(x) sqrt(var(x)/length(x))         # standard error of the mean (NB: masks base::stderr)

Read data

con = dbConnect(SQLite(),dbname = "participants.db");
df.complete = dbReadTable(con,"turkdemo") #change the table name here to match your database (mine was called "almost")
dbDisconnect(con)
## [1] TRUE
#filter out incompletes: keep only finished submissions (status 3 or 4)
df.complete = subset(df.complete, status %in% c(3,4))

#filter to a particular day (since codeversion wasn't set); OR together multiple days if needed (see the sketch below)
df.complete$currentVersion = str_detect(df.complete$beginhit, "2014-12-11")
df.complete = subset(df.complete, currentVersion %in% TRUE)
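
If data collection spans more than one day, the same str_detect() call can OR several dates together with an alternation pattern. A minimal sketch; the second date here is hypothetical:

keep.days = c("2014-12-11", "2014-12-12")  # second collection day is a hypothetical example
df.complete$currentVersion = str_detect(df.complete$beginhit, paste(keep.days, collapse = "|"))
df.complete = subset(df.complete, currentVersion %in% TRUE)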

Structure data

#Note: compile in wide form: 1 row per participant; each trial's measures become columns named FIELD_# (e.g. rt_5)
#Also, no extra underscores in the column names: extra underscores break wideToLong's name splitting
df.wide = data.frame(matrix(nrow=nrow(df.complete),ncol=4))
colnames(df.wide) = c("participant","workerId","browser","beginhit") #will dynamically add columns from datastring below

for (i in 1:nrow(df.wide)){
  a = fromJSON(df.complete$datastring[i])
  #Note! Indexing a$data can misbehave for participants with an unexpected number of blocks; the length check below keeps the loop from running past the end of a$data
  mylength = length(a$data)
  print(mylength)
  #Weird: some people (six, per the printout below) have the wrong number of blocks! For now, just keep the ones whose data is length 23
  if (mylength == 23){
    df.wide$participant[i] = i
    df.wide$workerId[i] = a$workerId
    df.wide$browser[i] = df.complete$browser[i]
    df.wide$beginhit[i] = df.complete$beginhit[i]
    #cycle through all the trials, but only record where isTestTrial = 1
    for (j in 1:mylength){
      if(a$data[[j]]$trialdata$isTestTrial == "1"){
        df.wide[[paste("rt_",j, sep="")]][i] = a$data[[j]]$trialdata$rt
        df.wide[[paste("keypress_",j, sep="")]][i] = a$data[[j]]$trialdata$key_press
        df.wide[[paste("stimcondition_",j, sep="")]][i] = a$data[[j]]$trialdata$stimcondition
        df.wide[[paste("exposureManner_",j, sep="")]][i] = a$data[[j]]$trialdata$exposure_manner
        df.wide[[paste("exposurePath_",j, sep="")]][i] = a$data[[j]]$trialdata$exposure_path
        df.wide[[paste("exposureNumber_",j, sep="")]][i] = a$data[[j]]$trialdata$exposure_number
        df.wide[[paste("condition_",j, sep="")]][i] = a$data[[j]]$trialdata$condition_name
      } #Else: not a test trial, so don't add any columns for it
    }
  
    #Grab the free-response feedback from the second-to-last 'trial'
    if (is.null(a$data[[mylength-1]]$trialdata$Q0)){df.wide$feedback[i] = "none"
    }else{
      df.wide$feedback[i] = a$data[[mylength-1]]$trialdata$Q0
    }
  }
} #End of this participant
## [1] 23
## (23 printed for 95 of the 101 participants; the remaining six printed 45, 27, 44, 25, 46, and 26)

Weird behavior! The wrong-length participants end up with a participant number of NA, which at least makes them easy to filter out.

Lost 6 people to this.
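
A quick way to see which submissions have the wrong number of blocks, before (or instead of) eyeballing the loop's printout, is to tabulate the block counts directly. A sketch, assuming every datastring parses cleanly with fromJSON:

block.counts = sapply(df.complete$datastring, function(s) length(fromJSON(s)$data), USE.NAMES = FALSE)
table(block.counts)            # how many participants have each block count
which(block.counts != 23)      # row indices of the wrong-length participants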

nrow(df.wide)
## [1] 101
df.wide = df.wide[!is.na(df.wide$participant),]
nrow(df.wide)
## [1] 95

Reformat into long form!

df.long = wideToLong(subset(df.wide,select=-feedback),within="trial") #drop the free-text feedback column before reshaping

#create factors
df.long = mutate(df.long, participant = as.numeric(participant),
          trial = as.numeric(as.character(trial)),
          rt = as.numeric(as.character(rt)),
          keypress = as.numeric(as.character(keypress))-48, #JS key codes 49-55 are the keys '1'-'7', so subtracting 48 gives the numeric response
          stimcondition = factor(stimcondition,levels=c("mismatch","match", "mannerchange","pathchange")),
          exposureNumber = as.numeric(as.character(exposureNumber)),
          condition = factor(condition, levels=c("Noun","Verb")))

df.long = df.long[order(df.long$participant,df.long$trial),]
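
Quick sanity check on the reshape (a sketch): every kept participant should contribute the same number of test-trial rows — 17 each, judging by the 1615 observations over 95 workers in the lmer output further down.

# Count non-missing test-trial rows per participant after reshaping (sketch)
trial.counts = df.long %>%
  group_by(participant) %>%
  summarise(n.trials = sum(!is.na(rt)))
table(trial.counts$n.trials)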

Analyze data!

Response variable histograms. The RTs have a long right tail; maybe clip these at 8s?

qplot(keypress, facets=stimcondition~condition,
      data=df.long)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## (message repeated once per facet)

[plot of chunk unnamed-chunk-6: histograms of keypress, faceted by stimcondition and condition]

qplot(rt, facets=stimcondition~condition,
      data=df.long)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## (message repeated once per facet)

[plot of chunk unnamed-chunk-6: histograms of rt, faceted by stimcondition and condition]
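
To put a number on that tail, here's a one-liner sketch of the fraction of test trials slower than the 8-second cutoff used in the summaries below:

# Proportion of test-trial RTs above 8000 ms (sketch)
mean(df.long$rt > 8000, na.rm = TRUE)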

Basic summary. This is the first finding: noun vs. verb construal changes the relative weighting of manner and path in generalization. I think this is kinda cool, actually.

ms <- df.long %>%
  filter(rt < 8000, !grepl("debug", workerId)) %>% #drop slow trials and debug-mode runs
  group_by(stimcondition, condition, workerId) %>%
  summarise(resp = mean(keypress)) %>%
  group_by(stimcondition, condition, add=FALSE) %>%
  summarise(ci = stderr(resp)*1.96,
            resp = mean(resp))
  
ms$stimcondition <- factor(ms$stimcondition, 
                           levels=c("match","mannerchange","pathchange", "mismatch"))

qplot(stimcondition, resp, fill=condition, 
      position="dodge", geom="bar", stat="identity",
      data=ms) + 
  geom_linerange(aes(ymin=resp - ci, ymax = resp + ci), 
                 position = position_dodge(width=.9))

[plot of chunk unnamed-chunk-7: mean response by stimcondition and condition, dodged bars with 95% CIs]
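
The same figure written as a full ggplot() call, in case qplot()'s stat= argument isn't available in your ggplot2 version; a sketch:

ggplot(ms, aes(stimcondition, resp, fill = condition)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = resp - ci, ymax = resp + ci),
                 position = position_dodge(width = .9))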

Change score. A different way of capturing this.

changescore <- df.long %>%
  filter(rt < 8000, !grepl("debug", workerId)) %>% #drop slow trials and debug-mode runs
  group_by(stimcondition, condition, workerId) %>%
  summarise(resp = mean(keypress)) %>%
  ungroup() %>%
  spread(stimcondition, resp) %>%
  mutate(mp.diff = abs(mannerchange - pathchange))

with(changescore, t.test(mp.diff[condition=="Verb"],
                         mp.diff[condition=="Noun"]))         
## 
##  Welch Two Sample t-test
## 
## data:  mp.diff[condition == "Verb"] and mp.diff[condition == "Noun"]
## t = 2.119, df = 79.14, p-value = 0.0372
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.04186 1.33480
## sample estimates:
## mean of x mean of y 
##     2.187     1.498
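
Since lsr is already loaded, an effect size for that Verb vs. Noun difference is one more line. A sketch:

# Cohen's d for the condition difference in |mannerchange - pathchange| (sketch)
cohensD(mp.diff ~ condition, data = changescore)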

Now the individual differences analysis.

Try a linear mixed-effects model. Not much doing.

mod <- lmer(keypress ~ stimcondition * condition + (stimcondition | workerId), 
     data=df.long)
summary(mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: keypress ~ stimcondition * condition + (stimcondition | workerId)
##    Data: df.long
## 
## REML criterion at convergence: 4694
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -6.259 -0.239  0.033  0.239  5.980 
## 
## Random effects:
##  Groups   Name                      Variance Std.Dev. Corr             
##  workerId (Intercept)               1.686    1.299                     
##           stimconditionmatch        3.245    1.802    -0.82            
##           stimconditionmannerchange 2.150    1.466    -0.20  0.25      
##           stimconditionpathchange   1.377    1.174    -0.22  0.30 -0.30
##  Residual                           0.604    0.777                     
## Number of obs: 1615, groups:  workerId, 95
## 
## Fixed effects:
##                                         Estimate Std. Error t value
## (Intercept)                                2.092      0.194   10.80
## stimconditionmatch                         4.333      0.268   16.17
## stimconditionmannerchange                  1.337      0.224    5.98
## stimconditionpathchange                    1.036      0.185    5.59
## conditionVerb                             -0.179      0.278   -0.64
## stimconditionmatch:conditionVerb           0.224      0.385    0.58
## stimconditionmannerchange:conditionVerb    0.837      0.321    2.60
## stimconditionpathchange:conditionVerb     -0.291      0.266   -1.09
## 
## Correlation of Fixed Effects:
##              (Intr) stmcndtnmt stmcndtnmn stmcndtnp cndtnV stmcndtnmt:V
## stmcndtnmtc  -0.815                                                    
## stmcndtnmnn  -0.250  0.274                                             
## stmcndtnpth  -0.276  0.321     -0.184                                  
## conditinVrb  -0.696  0.567      0.174      0.192                       
## stmcndtnmt:V  0.567 -0.696     -0.191     -0.224    -0.815             
## stmcndtnmn:V  0.174 -0.191     -0.696      0.128    -0.250  0.274      
## stmcndtnp:V   0.192 -0.224      0.128     -0.696    -0.276  0.321      
##              stmcndtnmn:V
## stmcndtnmtc              
## stmcndtnmnn              
## stmcndtnpth              
## conditinVrb              
## stmcndtnmt:V             
## stmcndtnmn:V             
## stmcndtnp:V  -0.184
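
To get a single test of the stimcondition-by-condition interaction (rather than eyeballing the t values above), one option is a likelihood-ratio test against a main-effects-only model. A sketch; mod.main is a name introduced here, and anova() refits both models with ML before comparing:

# Likelihood-ratio test for the interaction (sketch)
mod.main <- lmer(keypress ~ stimcondition + condition + (stimcondition | workerId),
                 data=df.long)
anova(mod.main, mod)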

Plot manner vs. path scores:

qplot(mannerchange, pathchange, facets=.~condition, 
      data=changescore) + 
  geom_smooth(method="lm") + 
  xlim(c(1,7)) + ylim(c(1,7))

[plot of chunk unnamed-chunk-10: mannerchange vs. pathchange scatter by condition, with linear fits]

Quantify and test this. Could do it both as an lmer over trials and as an lm over participant means; the lm version is below, and a sketch of the lmer version comes at the end.

summary(lm(mannerchange ~ pathchange * condition, data=changescore))
## 
## Call:
## lm(formula = mannerchange ~ pathchange * condition, data = changescore)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.176 -1.403  0.082  1.211  3.192 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.337      0.574    4.07   0.0001 ***
## pathchange                  0.368      0.160    2.30   0.0239 *  
## conditionVerb               1.849      0.778    2.38   0.0196 *  
## pathchange:conditionVerb   -0.405      0.235   -1.73   0.0875 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.78 on 90 degrees of freedom
## Multiple R-squared:  0.0791, Adjusted R-squared:  0.0484 
## F-statistic: 2.58 on 3 and 90 DF,  p-value: 0.0587
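
And the lmer-over-trials version mentioned above. One possible setup (a sketch, not necessarily the analysis to settle on): predict trial-level responses on mannerchange trials from each participant's mean pathchange response, with a random intercept per worker. path.mean is a helper column introduced here for illustration.

# Sketch: trial-level analogue of the lm over participant means
manner.trials <- df.long %>%
  filter(rt < 8000, !grepl("debug", workerId), stimcondition == "mannerchange") %>%
  left_join(select(changescore, workerId, path.mean = pathchange), by = "workerId")

summary(lmer(keypress ~ path.mean * condition + (1 | workerId), data = manner.trials))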