library(dplyr)
library(ggplot2)
library(Hmisc)
library(lme4)
library(afex)
library(knitr)
library(data.table)
library(zoo) #replace missing with col mean

Data Preprocessing

df<-read.csv('revised_alldata.csv')

#convert numerical cols to factors
cols<-c('participant','prime','end','gender','idiom_lang','nat_LAN','phrase_condition','list')
df[cols]<-lapply(df[cols],factor)


colnames(df)[colnames(df)=="idiom_lang"] <- "Idiom_Language"
colnames(df)[colnames(df)=="phrase_condition"] <- "Phrase_Type"


levels(df$Phrase_Type)[levels(df$Phrase_Type)=="Experimental"] <- 'Idiomatic'
levels(df$nat_LAN)[levels(df$nat_LAN)=="2"] <- 'Mandarin-English Bilinguals'
levels(df$nat_LAN)[levels(df$nat_LAN)=="1"] <- 'Native English Monolinguals'

#replacing missing numerical values with col mean
cols1<-c('EN_rating','CN_rating','freq_HAL','age')
df[cols1] <- sapply(na.aggregate(df[cols1]),as.numeric)


head(df)

Remove RTs 2SD away from group mean

df3 <- df %>% 
            group_by(nat_LAN) %>% 
            filter(!(exp_resp.rt - median(exp_resp.rt)) > abs(2*sd(exp_resp.rt)))

#% of data points removed
1-nrow(df3)/nrow(df)
## [1] 0.0691818

Visualize RT distribution

ggplot(df3, aes(Idiom_Language, exp_resp.rt, color = Phrase_Type))+
  geom_boxplot()+
  ylab('Reaction time')+
  facet_wrap(~nat_LAN)

Dummy code Binary categorical variables

#preserve df3 as the trimmed dataset with original variable levels
df4<-df3

levels(df4$nat_LAN)<-c(0,1)
colnames(df4)[colnames(df4)=="nat_LAN"] <- "IsBilingual"

levels(df4$Idiom_Language)<-c(1,0)
colnames(df4)[colnames(df4)=="Idiom_Language"] <- "IsChineseIdiom"

levels(df4$Phrase_Type)<-c(0,1)
colnames(df4)[colnames(df4)=="Phrase_Type"] <- "IsExperimental"
df5<-df4[!df4$end=="gold",]
df5<-df5[!df5$end=="water",]

All subjects

Native language x phrase type x idiom language

ICC for subejects and items- null model

null_m<-lmer(exp_resp.rt~1+(1|participant)+(1|end), df5, REML=FALSE) 
summary(null_m)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: exp_resp.rt ~ 1 + (1 | participant) + (1 | end)
##    Data: df5
## 
##      AIC      BIC   logLik deviance df.resid 
## 122151.5 122180.2 -61071.8 122143.5     9704 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.6333 -0.6813 -0.1397  0.5332  4.4332 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  2642     51.40  
##  participant (Intercept)  8106     90.03  
##  Residual                15585    124.84  
## Number of obs: 9708, groups:  end, 185; participant, 122
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)  678.292      9.081 169.432    74.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ICC item
2645/(2645+15556)
## [1] 0.1453217
#ICC subject
8118/(8118+15556)
## [1] 0.3429078

About 34% of the overall variances in RT can be attributed to the variances between subjects, and 14% of variations in RT can be attributed to inter-item differences. Higher ICC indicate more similarity within cluster.

Add phrase type as a predictor

null_m<-lmer(exp_resp.rt~1+(1|participant)+(1|end), df5, REML=FALSE)
m1<-lmer(exp_resp.rt~IsExperimental+(1|participant)+(1|end), df5, REML=FALSE)
anova(null_m,m1)

Add native language as a predictor

m1<-lmer(exp_resp.rt~IsExperimental+(1|participant)+(1|end), df5, REML=FALSE)
m2<-lmer(exp_resp.rt~IsBilingual+IsExperimental+(1|participant)+(1|end), df5, REML=FALSE)
anova(m1,m2)

Add idiom language as a predictor

m2<-lmer(exp_resp.rt~IsBilingual+IsExperimental+(1|participant)+(1|end), df5, REML=FALSE)
m3<-lmer(exp_resp.rt~IsBilingual+IsExperimental+IsChineseIdiom+(1|participant)+(1|end), df5, REML=FALSE)
anova(m2,m3)

Add interactions

m3<-lmer(exp_resp.rt~IsBilingual+IsExperimental+IsChineseIdiom+(1|participant)+(1|end), df5, REML=FALSE)
m_inter<-lmer(exp_resp.rt~IsBilingual*IsExperimental*IsChineseIdiom+(1|participant)+(1|end), df5, REML=FALSE)
anova(m3, m_inter)

Output of final model

summary(m_inter)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: exp_resp.rt ~ IsBilingual * IsExperimental * IsChineseIdiom +  
##     (1 | participant) + (1 | end)
##    Data: df5
## 
##      AIC      BIC   logLik deviance df.resid 
## 122049.2 122128.2 -61013.6 122027.2     9697 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.5912 -0.6807 -0.1471  0.5333  4.4695 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  2180     46.69  
##  participant (Intercept)  5567     74.62  
##  Residual                15517    124.57  
## Number of obs: 9708, groups:  end, 185; participant, 122
## 
## Fixed effects:
##                                              Estimate Std. Error       df
## (Intercept)                                   649.796     12.142  248.587
## IsBilingual1                                   92.658     14.472  144.638
## IsExperimental1                               -29.080     10.987  213.935
## IsChineseIdiom0                                10.829     10.851  221.735
## IsBilingual1:IsExperimental1                   -1.606      7.212 9413.997
## IsBilingual1:IsChineseIdiom0                   -7.194      7.283 9419.087
## IsExperimental1:IsChineseIdiom0               -43.105     15.450  215.915
## IsBilingual1:IsExperimental1:IsChineseIdiom0   42.721     10.180 9418.393
##                                              t value Pr(>|t|)    
## (Intercept)                                   53.517  < 2e-16 ***
## IsBilingual1                                   6.403 2.00e-09 ***
## IsExperimental1                               -2.647  0.00873 ** 
## IsChineseIdiom0                                0.998  0.31936    
## IsBilingual1:IsExperimental1                  -0.223  0.82376    
## IsBilingual1:IsChineseIdiom0                  -0.988  0.32323    
## IsExperimental1:IsChineseIdiom0               -2.790  0.00574 ** 
## IsBilingual1:IsExperimental1:IsChineseIdiom0   4.197 2.73e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) IsBln1 IsExp1 IsChI0 IsB1:IE1 IB1:IC IE1:IC
## IsBilingul1 -0.574                                            
## IsExprmntl1 -0.441  0.078                                     
## IsChinsIdm0 -0.447  0.079  0.494                              
## IsBlng1:IE1  0.141 -0.254 -0.307 -0.158                       
## IsBln1:ICI0  0.140 -0.251 -0.154 -0.322  0.503                
## IsExp1:ICI0  0.314 -0.055 -0.711 -0.702  0.219    0.226       
## IB1:IE1:ICI -0.100  0.180  0.218  0.230 -0.708   -0.715 -0.312

Native English speakers

ICC-null model

n_m_e<-lmer(exp_resp.rt~1+(1|participant)+(1|end), df5[df5$IsBilingual=='0',], REML=FALSE) 
summary(n_m_e)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: exp_resp.rt ~ 1 + (1 | participant) + (1 | end)
##    Data: df5[df5$IsBilingual == "0", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  62656.9  62683.1 -31324.5  62648.9     5080 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9164 -0.6671 -0.1227  0.5453  3.8928 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  2407     49.06  
##  participant (Intercept)  4785     69.17  
##  Residual                11778    108.53  
## Number of obs: 5084, groups:  end, 185; participant, 63
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)  629.655      9.565  80.162   65.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#by item
2413/(2413+11760)
## [1] 0.1702533
#by subejct
4757/(4757+11760)
## [1] 0.2880063

Add phrase type as a predictor

n_m_e<-lmer(exp_resp.rt~1+(1|participant)+(1|end), df5[df5$IsBilingual=='0',], REML=FALSE) 
m1_e<-lmer(exp_resp.rt~IsExperimental+(1|participant)+(1|end), df5[df5$IsBilingual=='0',], REML=FALSE) 
anova(n_m_e,m1_e)

Add idiom language as a predictor

m2_e<-lmer(exp_resp.rt~IsExperimental + IsChineseIdiom +(1|participant)+(1|end), df5[df5$IsBilingual=='0',], REML=FALSE) 
anova(m2_e,m1_e)
summary(m2_e)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: 
## exp_resp.rt ~ IsExperimental + IsChineseIdiom + (1 | participant) +  
##     (1 | end)
##    Data: df5[df5$IsBilingual == "0", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  62611.1  62650.3 -31299.6  62599.1     5078 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9491 -0.6688 -0.1188  0.5400  3.9052 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  1711     41.36  
##  participant (Intercept)  4778     69.13  
##  Residual                11782    108.54  
## Number of obs: 5084, groups:  end, 185; participant, 63
## 
## Fixed effects:
##                 Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)       659.88      10.52 111.33  62.744  < 2e-16 ***
## IsExperimental1   -50.31       6.83 172.23  -7.366 6.97e-12 ***
## IsChineseIdiom0   -11.30       6.83 172.34  -1.654   0.0999 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) IsExp1
## IsExprmntl1 -0.316       
## IsChinsIdm0 -0.324 -0.013

Add interaction as a predictor

m_inter_e<-lmer(exp_resp.rt~IsExperimental * IsChineseIdiom +(1|participant)+(1|end), df5[df5$IsBilingual=='0',], REML=FALSE) 
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00482567
## (tol = 0.002, component 1)
anova(m_inter_e, m2_e)

Output of interaction model

summary(m_inter_e)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: 
## exp_resp.rt ~ IsExperimental * IsChineseIdiom + (1 | participant) +  
##     (1 | end)
##    Data: df5[df5$IsBilingual == "0", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  62603.6  62649.3 -31294.8  62589.6     5077 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9727 -0.6666 -0.1242  0.5525  3.9147 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  1602     40.03  
##  participant (Intercept)  4777     69.12  
##  Residual                11781    108.54  
## Number of obs: 5084, groups:  end, 185; participant, 63
## 
## Fixed effects:
##                                 Estimate Std. Error      df t value
## (Intercept)                      649.582     10.936 125.334  59.399
## IsExperimental1                  -29.274      9.460 170.753  -3.094
## IsChineseIdiom0                    9.218      9.356 176.896   0.985
## IsExperimental1:IsChineseIdiom0  -41.571     13.312 172.279  -3.123
##                                 Pr(>|t|)    
## (Intercept)                      < 2e-16 ***
## IsExperimental1                  0.00231 ** 
## IsChineseIdiom0                  0.32588    
## IsExperimental1:IsChineseIdiom0  0.00210 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) IsExp1 IsChI0
## IsExprmntl1 -0.422              
## IsChinsIdm0 -0.427  0.493       
## IsExp1:ICI0  0.300 -0.711 -0.703
## convergence code: 0
## Model failed to converge with max|grad| = 0.00482567 (tol = 0.002, component 1)

Add word frequency as a predictor

m_inter2_e<-lmer(exp_resp.rt~IsExperimental * IsChineseIdiom +
                   scale(freq_HAL)+
                   (1|participant)+(1|end), 
                 df5[df5$IsBilingual=='0',], REML=FALSE) 

anova(m_inter_e, m_inter2_e)
summary(m_inter2_e)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: 
## exp_resp.rt ~ IsExperimental * IsChineseIdiom + scale(freq_HAL) +  
##     (1 | participant) + (1 | end)
##    Data: df5[df5$IsBilingual == "0", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  62600.6  62652.9 -31292.3  62584.6     5076 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9391 -0.6661 -0.1236  0.5510  3.9189 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  1549     39.36  
##  participant (Intercept)  4775     69.10  
##  Residual                11781    108.54  
## Number of obs: 5084, groups:  end, 185; participant, 63
## 
## Fixed effects:
##                                 Estimate Std. Error      df t value
## (Intercept)                      649.624     10.882 123.765  59.695
## IsExperimental1                  -29.105      9.335 170.949  -3.118
## IsChineseIdiom0                    8.908      9.235 177.219   0.965
## scale(freq_HAL)                   -7.228      3.223 174.852  -2.243
## IsExperimental1:IsChineseIdiom0  -41.683     13.137 172.518  -3.173
##                                 Pr(>|t|)    
## (Intercept)                      < 2e-16 ***
## IsExperimental1                  0.00214 ** 
## IsChineseIdiom0                  0.33606    
## scale(freq_HAL)                  0.02615 *  
## IsExperimental1:IsChineseIdiom0  0.00179 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) IsExp1 IsChI0 s(_HAL
## IsExprmntl1 -0.418                     
## IsChinsIdm0 -0.423  0.493              
## scl(fr_HAL) -0.003 -0.008  0.014       
## IsExp1:ICI0  0.297 -0.711 -0.703  0.005

Add English speaker familiarity as a predictor

m_inter3_e<-lmer(exp_resp.rt~IsExperimental * IsChineseIdiom +
                   scale(freq_HAL)+ EN_rating+
                   (1|participant)+(1|end), 
                 df5[df5$IsBilingual=='0',], REML=FALSE) 

anova(m_inter2_e,m_inter3_e)

Bilingual

ICC null model

n_m_c<-lmer(exp_resp.rt~1+(1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 
summary(n_m_c)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: exp_resp.rt ~ 1 + (1 | participant) + (1 | end)
##    Data: df5[df5$IsBilingual == "1", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  59183.6  59209.3 -29587.8  59175.6     4620 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8624 -0.6791 -0.1689  0.5317  4.0958 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  3876     62.25  
##  participant (Intercept)  6420     80.13  
##  Residual                18894    137.45  
## Number of obs: 4624, groups:  end, 185; participant, 59
## 
## Fixed effects:
##             Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)   730.38      11.59  80.75   63.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#by item
3874/(3874+18861)
## [1] 0.1703981
#by subject
6487/(6487+18861)
## [1] 0.2559176

Add phrase type as a predictor

n_m_c<-lmer(exp_resp.rt~1+(1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 
m1_c<-lmer(exp_resp.rt~IsExperimental+(1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 
anova(n_m_c,m1_c)

Add idiom language as a predictor

m2_c<-lmer(exp_resp.rt~IsExperimental + IsChineseIdiom +(1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 
anova(m2_c,m1_c)
summary(m1_c)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: exp_resp.rt ~ IsExperimental + (1 | participant) + (1 | end)
##    Data: df5[df5$IsBilingual == "1", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  59176.2  59208.4 -29583.1  59166.2     4619 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8847 -0.6809 -0.1675  0.5351  4.1117 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  3612     60.10  
##  participant (Intercept)  6421     80.13  
##  Residual                18899    137.47  
## Number of obs: 4624, groups:  end, 185; participant, 59
## 
## Fixed effects:
##                 Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)      745.131     12.480 104.203  59.708  < 2e-16 ***
## IsExperimental1  -30.446      9.783 167.652  -3.112  0.00218 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## IsExprmntl1 -0.384

Add interaction as a predictor

m_inter_c<-lmer(exp_resp.rt~IsExperimental * IsChineseIdiom +(1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 
anova(m_inter_c, m2_c)

Add word frequency as a predictor

m1_1_c<-lmer(exp_resp.rt~IsExperimental + scale(freq_HAL) +
                  (1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 

anova(m1_c, m1_1_c)

Add Chinese familiarity rating as a predictor

m1_2_c<-lmer(exp_resp.rt~IsExperimental + scale(freq_HAL) + CN_rating+
                  (1|participant)+(1|end), df5[df5$IsBilingual=='1',], REML=FALSE) 

anova(m1_2_c,m1_1_c)

Final model output

summary(m1_1_c)
## Linear mixed model fit by maximum likelihood . t-tests use
##   Satterthwaite's method [lmerModLmerTest]
## Formula: 
## exp_resp.rt ~ IsExperimental + scale(freq_HAL) + (1 | participant) +  
##     (1 | end)
##    Data: df5[df5$IsBilingual == "1", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  59164.5  59203.2 -29576.3  59152.5     4618 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8922 -0.6826 -0.1690  0.5328  4.1082 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  end         (Intercept)  3294     57.40  
##  participant (Intercept)  6418     80.11  
##  Residual                18899    137.47  
## Number of obs: 4624, groups:  end, 185; participant, 59
## 
## Fixed effects:
##                 Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)      744.244     12.344 100.670  60.293  < 2e-16 ***
## IsExperimental1  -30.077      9.424 167.842  -3.191 0.001690 ** 
## scale(freq_HAL)  -18.114      4.817 162.094  -3.761 0.000236 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) IsExp1
## IsExprmntl1 -0.374       
## scl(fr_HAL)  0.016 -0.012

References:
http://www.sfs.uni-tuebingen.de/~hbaayen/publications/baayenDavidsonBates.pdf

library(texreg)
## Version:  1.36.23
## Date:     2017-03-03
## Author:   Philip Leifeld (University of Glasgow)
## 
## Please cite the JSS article in your publications -- see citation("texreg").
htmlreg(list(null_m, m_inter), 
          file='texreg_all.doc',
          single.row = T, 
          stars = numeric(0))
## The table was written to the file 'texreg_all.doc'.
htmlreg(list(n_m_e, m2_e, m_inter_e, m_inter2_e), 
          file='texreg_en.doc',
          single.row = T, 
          stars = numeric(0))
## The table was written to the file 'texreg_en.doc'.
htmlreg(list(n_m_c, m1_c, m1_1_c), 
          file='texreg_ch.doc',
          single.row = T, 
          stars = numeric(0))
## The table was written to the file 'texreg_ch.doc'.