1 Data preprocessing

Preliminaries.

## [1] "dplyr"   "langcog" "tidyr"   "ggplot2" "lme4"
## 
## Attaching package: 'langcog'
## The following object is masked from 'package:base':
## 
##     scale
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## %+%():    ggplot2, psych
## alpha():  ggplot2, psych
## filter(): dplyr, stats
## lag():    dplyr, stats
## 
## Attaching package: 'ggthemes'
## The following objects are masked from 'package:langcog':
## 
##     scale_color_solarized, scale_colour_solarized,
##     scale_fill_solarized
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step

Read in participant data.

# Read the per-recording measures, the demographics sheet and the condition
# assignments from CSV.
data <- read.csv("data.csv", header = TRUE)
dem <- read.csv("parenting_proj_emilyhembacher_demo2016.csv", header = TRUE)
conditions <- read.csv("joint_attention/conditions.csv")
# NOTE(review): `ids`, which is joined onto the data further down, is not
# created anywhere in this script, so it presumably comes from this .RData
# file -- confirm.
load("paq/paq_demo.RData")

# fix ids
# SIDs carry leftovers of the transcript file names. Each step removes or
# rewrites the first occurrence of one literal fragment, in this order.
# Patterns are wrapped in stringr::fixed() so they are matched as plain text:
# as regular expressions the "." in ".txt" would match any character.
data$SID <- data$SID %>%
  stringr::str_trim() %>%
  stringr::str_replace(stringr::fixed("_cut.txt"), "") %>%
  stringr::str_replace(stringr::fixed("8283_"), "") %>%
  stringr::str_replace(stringr::fixed("283_"), "") %>%
  stringr::str_replace(stringr::fixed("_cut_save.txt"), "") %>%
  stringr::str_replace(stringr::fixed("_cut_save"), "") %>%
  stringr::str_replace(stringr::fixed("_cutsave.txt"), "") %>%
  stringr::str_replace(stringr::fixed("parenting_obs_"), "0") %>%
  stringr::str_replace(stringr::fixed("_cut_m4a.txt"), "") %>%
  # Two ids were entered in a different form; rewrite them to "050116_NN".
  stringr::str_replace(stringr::fixed("05116_05"), "050116_05") %>%
  stringr::str_replace(stringr::fixed("05116_2"), "050116_02")

Make data frames.

# Build the analysis data frame `d`.
# No `by` is given to any join, so dplyr joins on every column name the two
# tables share.
d <- data %>%
  left_join(conditions) %>%
  left_join(dem) %>%
  # Keep only the columns used downstream, under shorter names.
  transmute(
    sid = SID,
    types = Type.count,
    tokens = Token.count,
    lexdiv = Lexical.diversity,
    condition = Condition,
    video = Video,
    age,
    gender,
    parent_ed
  ) %>%
  # NOTE(review): `ids` is presumably the PAQ table loaded from
  # paq/paq_demo.RData (source of AA / EL / RR used later) -- confirm.
  left_join(ids) %>%
  # Drop rows that did not get a condition from the joins above.
  filter(!is.na(condition))

2 Plots

2.1 Lexical Diversity

# Per-condition summary of lexical diversity. multi_boot_standard() supplies
# the `mean`, `ci_lower` and `ci_upper` columns used by the plot below.
ms_lex <- d %>%
  group_by(condition) %>%
  multi_boot_standard(col = "lexdiv")

# Bar = condition mean, vertical line = confidence interval.
ggplot(ms_lex, aes(x = condition, y = mean, fill = condition)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  xlab("Condition") +
  ylab("Lexical Diversity") +
  # The bars are coloured through the `fill` aesthetic, so the solarized
  # palette has to be applied with the fill scale; a colour scale would have
  # no effect here because nothing is mapped to `colour`.
  langcog::scale_fill_solarized() +
  ggthemes::theme_few()

2.2 Word Tokens

# Per-condition summary of the word-token counts. multi_boot_standard()
# supplies the `mean`, `ci_lower` and `ci_upper` columns used by the plot
# below.
ms_tok <- d %>%
  group_by(condition) %>%
  multi_boot_standard(col = "tokens")

# Bar = condition mean, vertical line = confidence interval.
ggplot(ms_tok, aes(x = condition, y = mean, fill = condition)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  xlab("Condition") +
  ylab("Total Number of Word Tokens") +
  # The bars are coloured through the `fill` aesthetic, so the solarized
  # palette has to be applied with the fill scale; a colour scale would have
  # no effect here because nothing is mapped to `colour`.
  langcog::scale_fill_solarized() +
  ggthemes::theme_few()

2.3 Word Types

# Per-condition summary of the word-type counts. multi_boot_standard()
# supplies the `mean`, `ci_lower` and `ci_upper` columns used by the plot
# below.
ms_type <- d %>%
  group_by(condition) %>%
  multi_boot_standard(col = "types")

# Plot the word-type summary (`ms_type`). The earlier version passed `ms_tok`
# here, so the "Word Types" figure actually displayed the token data.
ggplot(ms_type, aes(x = condition, y = mean, fill = condition)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  xlab("Condition") +
  ylab("Total Number of Word Types") +
  # The bars are coloured through the `fill` aesthetic, so the solarized
  # palette has to be applied with the fill scale; a colour scale would have
  # no effect here because nothing is mapped to `colour`.
  langcog::scale_fill_solarized() +
  ggthemes::theme_few()

3 Analyses

Prepare data.

# Data set for the mixed models: rows with all three scores (AA, EL, RR)
# present, numeric covariates passed through langcog::scale(scale = FALSE),
# and categorical predictors stored as factors.
lmer_data <- d %>%
  filter(!is.na(AA) & !is.na(EL) & !is.na(RR)) %>%
  mutate(
    # scale = FALSE: presumably centers without rescaling, as base::scale()
    # does (langcog masks it) -- confirm. as.numeric() turns the result back
    # into a plain vector.
    EL = as.numeric(langcog::scale(EL, scale = FALSE)),
    AA = as.numeric(langcog::scale(AA, scale = FALSE)),
    RR = as.numeric(langcog::scale(RR, scale = FALSE)),
    age = as.numeric(langcog::scale(age, scale = FALSE)),
    # Outcome as numeric; grouping / categorical variables as factors.
    lexdiv = as.numeric(lexdiv),
    condition = factor(condition),
    gender = as.factor(gender),
    video = as.factor(video)
  )

3.1 Lexical diversity

Predicting lexical diversity based on experimental condition, PAQ, demographics.

# Lexical diversity model: condition crossed with each of EL, AA and RR,
# plus age, gender and parent education as covariates, with a random
# intercept for video.
maximal_mod <- lmer(
  formula = lexdiv ~ condition * EL + condition * AA + condition * RR +
    age + gender + parent_ed +
    (1 | video),
  data = lmer_data
)
summary(maximal_mod)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: 
## lexdiv ~ condition * EL + condition * AA + condition * RR + age +  
##     gender + parent_ed + (1 | video)
##    Data: lmer_data
## 
## REML criterion at convergence: -41.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8678 -0.5672 -0.1596  0.5007  2.3151 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.
##  video    (Intercept) 8.055e-05 0.008975
##  Residual             9.955e-03 0.099776
## Number of obs: 50, groups:  video, 6
## 
## Fixed effects:
##                  Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)      0.514448   0.059614 27.330000   8.630 2.74e-09 ***
## conditionexp    -0.107469   0.032222 37.690000  -3.335  0.00192 ** 
## EL              -0.028839   0.057046 38.970000  -0.506  0.61603    
## AA               0.023258   0.046785 38.840000   0.497  0.62190    
## RR              -0.042137   0.024417 38.310000  -1.726  0.09245 .  
## age              0.029031   0.036707  5.520000   0.791  0.46161    
## genderM         -0.021122   0.033301 38.670000  -0.634  0.52963    
## parent_ed       -0.015315   0.012676 33.930000  -1.208  0.23532    
## conditionexp:EL  0.053452   0.079118 31.840000   0.676  0.50417    
## conditionexp:AA -0.011718   0.060339 36.000000  -0.194  0.84710    
## conditionexp:RR  0.002142   0.037569 38.910000   0.057  0.95482    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtnx EL     AA     RR     age    gendrM prnt_d cnd:EL
## conditionxp  0.048                                                        
## EL           0.082  0.196                                                 
## AA          -0.229  0.189 -0.101                                          
## RR          -0.312 -0.134 -0.443 -0.020                                   
## age          0.146  0.080  0.175 -0.115 -0.064                            
## genderM     -0.098  0.028 -0.082  0.179  0.150 -0.158                     
## parent_ed   -0.906 -0.340 -0.155  0.082  0.322 -0.136 -0.094              
## condtnxp:EL -0.219 -0.157 -0.748  0.086  0.378 -0.130  0.050  0.286       
## condtnxp:AA  0.319 -0.045  0.117 -0.812 -0.048  0.164 -0.239 -0.197 -0.277
## condtnxp:RR  0.102  0.097  0.256  0.036 -0.608 -0.054 -0.047 -0.109 -0.302
##             cnd:AA
## conditionxp       
## EL                
## AA                
## RR                
## age               
## genderM           
## parent_ed         
## condtnxp:EL       
## condtnxp:AA       
## condtnxp:RR -0.006

3.2 Word tokens

Predicting the number of word tokens based on experimental condition, PAQ, demographics.

# Word-token model: condition crossed with each of EL, AA and RR, plus age,
# gender and parent education as covariates, with a random intercept for
# video. Reuses (overwrites) the `maximal_mod` name.
maximal_mod <- lmer(
  formula = tokens ~ condition * EL + condition * AA + condition * RR +
    age + gender + parent_ed +
    (1 | video),
  data = lmer_data
)
summary(maximal_mod)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: 
## tokens ~ condition * EL + condition * AA + condition * RR + age +  
##     gender + parent_ed + (1 | video)
##    Data: lmer_data
## 
## REML criterion at convergence: 471
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.72599 -0.62886 -0.08587  0.66739  1.83112 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  video    (Intercept)  274.9   16.58   
##  Residual             4977.6   70.55   
## Number of obs: 50, groups:  video, 6
## 
## Fixed effects:
##                 Estimate Std. Error       df t value Pr(>|t|)  
## (Intercept)      87.9631    43.8688  30.0400   2.005   0.0540 .
## conditionexp     51.3530    22.9033  37.5500   2.242   0.0309 *
## EL               -3.4370    40.7816  38.7000  -0.084   0.9333  
## AA               -7.6797    33.5899  38.9900  -0.229   0.8204  
## RR               19.8059    17.3896  38.0300   1.139   0.2618  
## age             -26.0479    29.2139   6.2500  -0.892   0.4056  
## genderM           5.6041    23.7486  38.2700   0.236   0.8147  
## parent_ed        17.2185     9.2143  36.6400   1.869   0.0697 .
## conditionexp:EL  -5.5297    57.6030  36.6900  -0.096   0.9240  
## conditionexp:AA  39.4475    43.7104  37.5800   0.902   0.3726  
## conditionexp:RR  -0.4329    26.8385  38.5900  -0.016   0.9872  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtnx EL     AA     RR     age    gendrM prnt_d cnd:EL
## conditionxp  0.059                                                        
## EL           0.079  0.195                                                 
## AA          -0.245  0.170 -0.099                                          
## RR          -0.323 -0.140 -0.441 -0.006                                   
## age          0.127  0.054  0.122 -0.090 -0.038                            
## genderM     -0.083  0.022 -0.080  0.176  0.144 -0.129                     
## parent_ed   -0.902 -0.341 -0.152  0.105  0.336 -0.113 -0.102              
## condtnxp:EL -0.227 -0.156 -0.747  0.085  0.382 -0.093  0.047  0.294       
## condtnxp:AA  0.338 -0.024  0.119 -0.815 -0.066  0.138 -0.236 -0.223 -0.283
## condtnxp:RR  0.119  0.107  0.261  0.015 -0.610 -0.053 -0.047 -0.126 -0.311
##             cnd:AA
## conditionxp       
## EL                
## AA                
## RR                
## age               
## genderM           
## parent_ed         
## condtnxp:EL       
## condtnxp:AA       
## condtnxp:RR  0.023

3.3 Word types

Predicting the number of word types based on experimental condition, PAQ, demographics.

# Word-type model: condition crossed with each of EL, AA and RR, plus age,
# gender and parent education as covariates, with a random intercept for
# video. Reuses (overwrites) the `maximal_mod` name.
maximal_mod <- lmer(
  formula = types ~ condition * EL + condition * AA + condition * RR +
    age + gender + parent_ed +
    (1 | video),
  data = lmer_data
)
summary(maximal_mod)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: types ~ condition * EL + condition * AA + condition * RR + age +  
##     gender + parent_ed + (1 | video)
##    Data: lmer_data
## 
## REML criterion at convergence: 373.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.67957 -0.72517  0.07765  0.50239  2.16847 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  video    (Intercept)   0.0     0.00   
##  Residual             421.5    20.53   
## Number of obs: 50, groups:  video, 6
## 
## Fixed effects:
##                 Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)       52.971     12.160 39.000   4.356 9.31e-05 ***
## conditionexp      -5.348      6.623 39.000  -0.808   0.4242    
## EL                -7.319     11.708 39.000  -0.625   0.5355    
## AA                -7.009      9.594 39.000  -0.731   0.4694    
## RR                 2.312      5.016 39.000   0.461   0.6475    
## age               -4.739      7.368 39.000  -0.643   0.5239    
## genderM           -6.576      6.838 39.000  -0.962   0.3422    
## parent_ed          4.539      2.592 39.000   1.751   0.0877 .  
## conditionexp:EL    6.117     16.162 39.000   0.378   0.7072    
## conditionexp:AA   19.894     12.349 39.000   1.611   0.1152    
## conditionexp:RR   -2.656      7.712 39.000  -0.344   0.7324    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtnx EL     AA     RR     age    gendrM prnt_d cnd:EL
## conditionxp  0.046                                                        
## EL           0.082  0.196                                                 
## AA          -0.226  0.194 -0.102                                          
## RR          -0.309 -0.132 -0.443 -0.023                                   
## age          0.151  0.086  0.187 -0.120 -0.070                            
## genderM     -0.102  0.029 -0.082  0.180  0.151 -0.165                     
## parent_ed   -0.907 -0.339 -0.155  0.077  0.318 -0.142 -0.092              
## condtnxp:EL -0.217 -0.157 -0.748  0.086  0.377 -0.139  0.050  0.284       
## condtnxp:AA  0.314 -0.049  0.116 -0.811 -0.044  0.170 -0.239 -0.191 -0.275
## condtnxp:RR  0.098  0.095  0.255  0.040 -0.608 -0.055 -0.047 -0.106 -0.300
##             cnd:AA
## conditionxp       
## EL                
## AA                
## RR                
## age               
## genderM           
## parent_ed         
## condtnxp:EL       
## condtnxp:AA       
## condtnxp:RR -0.013

4 Conclusions

Both the number of tokens and types are higher in the experimental condition, while lexical diversity (type-token ratio) is higher in the control condition. Parents may be relatively more repetitive in the experimental condition since they are attempting to stick to a specific prescribed task, but they talk more overall! Demographics and PAQ do not interact with condition, but there is a marginal effect of RR score on lexical diversity (lower lexical diversity for higher RR scores), and marginal effects of parent education on word types and tokens (more types and tokens for higher parent ed).