Referential Complexity Analyses

M. Lewis

14 July 2014

Analyses:

Google cross-linguistic analyses
(A) Correlation between all lengths
(B) Correlation between all lengths, controlling for frequency, open class only
(C) Correlation between all lengths and complexity, controlling for frequency
(D) Translation check data

High frequency words in mapping task

Novel real objects
(A) Norms
(B) Mapping task (adults)
(C) Mapping task (children) TO DO
(D) Production task (labels + descriptions)

Geons TO DO
(A) Norms
(B) Mapping task

(1) Google cross-linguistic analyses (Complexity norms task)

read in data

# Load the cross-linguistic length table and the norm / frequency tables that
# are merged into it below. All paths are absolute.
xling <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/corpus/xling/RC_xling_clean_WITH_checks.csv"
)
# LF complexity norms
norms.lf <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Experiment_26/Analysis/complexity_word_norms.csv"
)
# Brysbaert concreteness
norms.b <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/corpus/brysbaert_database/brysbaert_corpus.csv",
  header = TRUE
)
# MRC
norms.mrc <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/corpus/MRC_database/MRC_corpus.csv"
)
# SUBTLEXus frequency
freqs <- read.table(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/corpus/MRC_database/SUBTLEXusDataBase.txt",
  header = TRUE
)

merge norms to xling df

# Each norm is attached to xling by looking up the English word in the norm
# table; match() gives NA for words missing from a table, so those rows get NA.

# LF complexity norms
index <- match(xling$ENGLISH, norms.lf$word)
xling$lf.complex <- norms.lf$complexity[index]

# Brysbaert concreteness
index <- match(xling$ENGLISH, norms.b$Word)
xling$b.conc <- norms.b$Conc.M[index]

# MRC norms (one lookup, five columns copied)
index <- match(xling$ENGLISH, norms.mrc$word)
xling$mrc.fam  <- norms.mrc$mrc.fam[index]
xling$mrc.conc <- norms.mrc$mrc.conc[index]
xling$mrc.imag <- norms.mrc$mrc.imag[index]
xling$mrc.phon <- norms.mrc$mrc.phon[index]
xling$mrc.syl  <- norms.mrc$mrc.syl[index]

# Frequency (Lg10WF column of the frequency table)
index <- match(xling$ENGLISH, freqs$Word)
xling$log.E.freq <- freqs$Lg10WF[index]

Word class distribution

# Count the corpus words in each word class and show the counts as a bar chart.
xling$Open_class <- as.factor(xling$Open_class)
counts <- as.data.frame(summary(xling$Open_class))
# Labels are assigned by position, i.e. in factor-level order.
# NOTE(review): assumes exactly three levels ordered closed / open bare /
# open inflected, and no extra NA row from summary() -- confirm against the data.
counts$class <- c("closed class", "open class bare", "open class inflected")
names(counts) <- c("freq", "class")

ggplot(counts, aes(x = class, y = freq, fill = class)) +
  geom_bar(stat = "identity") +
  ggtitle("Word types in corpus")

plot of chunk unnamed-chunk-4

(A) Correlation between all lengths

# Column positions of the length variables (every column whose name contains "LEN").
lens <- grep("LEN", names(xling))
# Blue-white-red palette generator used by the correlation plots.
col1 <- colorRampPalette(c("blue", "white", "red"))

## Correlations between all lengths, all words
xling_len <- xling[, lens]
# Keep only the part of each column name before the first "_", lower-cased.
names(xling_len) <- tolower(
  vapply(str_split(names(xling_len), "_"), function(parts) parts[1], character(1))
)

# Pairwise correlations between all lengths
cmat <- cor(xling_len, use = "pairwise.complete.obs")
corrplot(
  cmat,
  tl.cex = .5, tl.srt = 45, method = "color", tl.col = "black",
  col = col1(100), order = "FPC"
)

# NOTE(review): this averages every cell of the matrix, so it includes the
# diagonal of 1s and counts each pair twice.
mean(cmat)

## [1] 0.3206

## Correlations between all lengths, open class words only
# Rows with Open_class == 0 are excluded; only the length columns are kept.
xlingO <- xling[xling$Open_class != 0, lens]
# Keep only the part of each column name before the first "_", lower-cased.
names(xlingO) <- tolower(
  vapply(str_split(names(xlingO), "_"), function(parts) parts[1], character(1))
)

# Pairwise correlations between all lengths
cmat <- cor(xlingO, use = "pairwise.complete.obs")
corrplot(
  cmat,
  tl.cex = .5, tl.srt = 45, method = "color", tl.col = "black",
  col = col1(100), order = "FPC"
)

# NOTE(review): averages the full matrix, diagonal included.
mean(cmat)

## [1] 0.2897

(B) Correlation between all lengths, controlling for frequency, open class only

## open class words only
# Length columns first, then the English log-frequency column as the last column.
xlingOF <- xling[xling$Open_class != 0, c(lens, which(names(xling) == "log.E.freq"))]
# Keep only the part of each column name before the first "_", lower-cased
# (so "log.E.freq" becomes "log.e.freq").
names(xlingOF) <- tolower(
  vapply(str_split(names(xlingOF), "_"), function(parts) parts[1], character(1))
)

# Partial correlations between all lengths with frequency partialled out.
# The column positions are derived from `lens` rather than hard-coded, so the
# call stays correct if the number of length columns changes.
cmat.p <- partial.r(xlingOF, seq_along(lens), length(lens) + 1)

# sorted by first principal component
corrplot(
  cmat.p,
  tl.cex = .5, tl.srt = 45, order = "FPC", method = "color",
  tl.col = "black", col = col1(100)
)

# sorted by angular order of the eigenvectors
corrplot(
  cmat.p,
  tl.cex = .5, tl.srt = 45, order = "AOE", method = "color",
  tl.col = "black", col = col1(100)
)

# sorted by hierarchical clustering (option name spelled out in full)
corrplot(
  cmat.p,
  tl.cex = .5, tl.srt = 45, order = "hclust", method = "color",
  tl.col = "black", col = col1(100)
)

# NOTE(review): averages the full matrix, diagonal included.
mean(cmat.p)

## [1] 0.2201

(C) Correlation between all lengths and complexity, controlling for frequency

### Per-language bars of Pearson's r (corr) with lower.ci/upper.ci ranges, and
### p.corr overlaid as triangles.
# c_l is built in a chunk not shown here; presumably the CIs are bootstrapped
# and p.corr is the frequency-partialled correlation -- confirm there.
ggplot(c_l, aes(x = language, y = corr)) +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    x = "Language",
    y = "Pearson's r",
    title = "Correlation between word length and complexity norms"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_linerange(aes(ymax = upper.ci, ymin = lower.ci)) +
  geom_point(
    data = c_l,
    mapping = aes(x = language, y = p.corr),
    size = 2, shape = 17
  )

# Verify the correlations on English: partial correlation between column 1 of
# xlingOC and lf.complex, with log.e.freq partialled out.
partial.r(
  xlingOC,
  c(1, which(names(xlingOC) == "lf.complex")),
  which(names(xlingOC) == "log.e.freq")
)

## partial correlations 
##            english lf.complex
## english       1.00       0.57
## lf.complex    0.57       1.00

# Zero-order correlation for English, for comparison with the partial value.
cor(xlingOC$english, xlingOC$lf.complex, use = "pairwise.complete.obs")

## [1] 0.6387

# Mean of the per-language length/complexity correlations in c_l
mean(x = c_l$corr)

## [1] 0.3089

(D) Translation checking data

# Translation-check results; only the first 500 rows of the file are used.
checksR <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/corpus/xling/translation_accuracy.csv"
)[1:500, ]

# Bring in each word's class from xling, then move the word into the row names.
index <- match(checksR$ENGLISH, xling$ENGLISH)
checksR$class <- xling$Open_class[index]
row.names(checksR) <- checksR$ENGLISH
checksR$ENGLISH <- NULL
# Open_class is a factor; convert via character to recover the numeric codes.
checksR$class <- as.numeric(as.character(checksR$class))
# Per-column proportion over all rows.
# NOTE(review): assumes the first 12 remaining columns are the per-language
# check results -- confirm against the file.
accuracy <- colSums(checksR[1:12], dims = 1) / nrow(checksR)
mean(accuracy)

## [1] 0.919

# One row per checked column, with the column name kept as `lang` for plotting.
dfa <- as.data.frame(accuracy)
dfa$lang <- row.names(dfa)

plot accuracy

# Bar chart of translation-check accuracy per language
ggplot(dfa, aes(x = lang, y = accuracy, fill = lang)) +
  geom_bar(stat = "identity") +
  labs(x = "Language", title = "Google Translate Check Accuracy") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Same accuracy computation restricted to rows with class != 0 (open class)
accuracy_open <- colSums(checksR[checksR$class != 0, 1:12], dims = 1) /
  nrow(checksR[checksR$class != 0, ])
mean(accuracy_open)

## [1] 0.922

# Accuracy restricted to rows with class == 1 (open class, bare)
accuracy_open_bare <- colSums(checksR[checksR$class == 1, 1:12], dims = 1) /
  nrow(checksR[checksR$class == 1, ])
mean(accuracy_open_bare)

## [1] 0.923

ISSUES

how to count characters (chinese are the same)
bad word: peso
remove participants who have participated in previous studies?

(2) High frequency words in mapping task (Task)

read in data and prep variables

merge in stuff

get quintiles

aggregate by word

# Per-word summary of LongBet. n.unique and ci95 are helpers defined elsewhere.
#
# All three aggregates use the SAME grouping terms: aggregate() orders its rows
# by the grouping variables, so a different term set can return the groups in a
# different order and the columns attached below by position would not line up.
ms <- aggregate(LongBet ~ engWord + norms.lf + l.freq + norms.lf_ci + quintile,
                data = d, mean)
# The worker count must aggregate workerid itself: an aggregate of LongBet has
# no `workerid` column, so `$workerid` was NULL and `ms$n` was never created.
# Rows with a missing LongBet are dropped first so the groups match `ms`.
ms$n <- aggregate(workerid ~ engWord + norms.lf + l.freq + norms.lf_ci + quintile,
                  data = d[!is.na(d$LongBet), ], n.unique)$workerid
ms$bet_ci <- aggregate(LongBet ~ engWord + norms.lf + l.freq + norms.lf_ci + quintile,
                       data = d, ci95)$LongBet

plot bet to long word vs. complexity norms

# Per-word scatter of LongBet against norms.lf with a linear fit, error bars of
# +/- norms.lf_ci (horizontal) and +/- bet_ci (vertical), and the correlation
# printed on the plot. Red vertical lines are drawn at the values in `q`
# (defined in a chunk not shown here).
ggplot(ms, aes(x = norms.lf, y = LongBet)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbarh(
    aes(xmin = norms.lf - norms.lf_ci, xmax = norms.lf + norms.lf_ci),
    size = 0.2, colour = "grey"
  ) +
  geom_errorbar(
    aes(ymin = LongBet - bet_ci, ymax = LongBet + bet_ci),
    size = 0.2, colour = "grey"
  ) +
  annotate(
    "text", x = 6, y = 25,
    label = paste("r=", round(cor(ms$norms.lf, ms$LongBet, use = "complete.obs"), 2))
  ) +
  xlab("Complexity Norms") +
  ylab("Bet to Long Word") +
  geom_vline(xintercept = q, col = "red") +
  ggtitle("High Frequency meanings (words)")

plot of chunk unnamed-chunk-16

correlation between norms and length

# Correlation between norms and bets to the long word, over all rows of d
cor.test(x = d$LongBet, y = d$norms.lf)

## 
##  Pearson's product-moment correlation
## 
## data:  d$LongBet and d$norms.lf
## t = 4.229, df = 1998, p-value = 2.459e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05056 0.13744
## sample estimates:
##     cor 
## 0.09418

# Same relation with l.freq passed as the third argument to bm.partial
# (helper defined elsewhere; presumably the variable partialled out).
bm.partial(d$LongBet, d$norms.lf, d$l.freq)

## [1] 0.0904

#partial.r(d[,c(4,8,10)],c(1,2),3 )

# Correlation between norms and bets to the long word, on the per-word means in ms
cor.test(x = ms$LongBet, y = ms$norms.lf)

## 
##  Pearson's product-moment correlation
## 
## data:  ms$LongBet and ms$norms.lf
## t = 2.998, df = 94, p-value = 0.003476
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1009 0.4682
## sample estimates:
##    cor 
## 0.2954

# Per-word version of the bm.partial call, with l.freq as the third argument.
bm.partial(ms$LongBet, ms$norms.lf, ms$l.freq)

## [1] 0.3163

# Mixed model: LongBet on norms.lf and l.freq, with random intercepts for trial
# and workerid.
summary(
  lmer(LongBet ~ norms.lf + l.freq + (1 | trial) + (1 | workerid), data = d)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: LongBet ~ norms.lf + l.freq + (1 | trial) + (1 | workerid)
##    Data: d
## 
## REML criterion at convergence: 17981
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2832 -0.7451  0.0176  0.7584  2.3186 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  workerid (Intercept)  15.4     3.93   
##  trial    (Intercept)   1.6     1.26   
##  Residual             634.9    25.20   
## Number of obs: 1931, groups: workerid, 200; trial, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   53.188      3.037   17.51
## norms.lf       2.779      0.660    4.21
## l.freq        -3.860      0.624   -6.19
## 
## Correlation of Fixed Effects:
##          (Intr) nrms.l
## norms.lf -0.728       
## l.freq   -0.665  0.035

plot by quintiles

# Aggregate LongBet by quintile (n.unique and ci95 are helpers defined elsewhere).
ms <- aggregate(LongBet ~ quintile, data = d, mean)
# The worker count must aggregate workerid itself: an aggregate of LongBet has
# no `workerid` column, so `$workerid` was NULL and `ms$n` was never created.
# Rows with a missing LongBet are dropped first so the groups match `ms`.
ms$n <- aggregate(workerid ~ quintile, data = d[!is.na(d$LongBet), ], n.unique)$workerid
ms$bet_ci <- aggregate(LongBet ~ quintile, data = d, ci95)$LongBet

# Quintile means with +/- bet_ci error bars, a linear fit, and the correlation
# of the aggregated points printed on the plot.
ggplot(ms, aes(x = quintile, y = LongBet)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbar(
    aes(ymin = LongBet - bet_ci, ymax = LongBet + bet_ci),
    size = 0.2, colour = "black"
  ) +
  annotate(
    "text", x = 5, y = 25,
    label = paste("r=", round(cor(ms$quintile, ms$LongBet, use = "complete.obs"), 2))
  ) +
  scale_y_continuous(limits = c(20, 80)) +
  #scale_x_continuous(limits = c(0, 7), breaks = 1:7, labels = 1:7)  +
  theme(axis.title = element_text(size = 20), axis.text = element_text(size = 15)) +
  xlab("Complexity Norm quintile") +
  ylab("Bet to Long Word") +
  ggtitle("High Frequency meanings (words)")

plot of chunk unnamed-chunk-18

correlations with quintiles

# Correlation between norm quintiles and bets to the long word, over all rows of d
cor.test(x = d$quintile, y = d$LongBet)

## 
##  Pearson's product-moment correlation
## 
## data:  d$quintile and d$LongBet
## t = 3.808, df = 1998, p-value = 0.0001443
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04121 0.12824
## sample estimates:
##     cor 
## 0.08489

# Correlation between norm quintiles and bets to the long word, on the
# aggregated values in ms (one row per quintile at this point)
cor.test(x = ms$quintile, y = ms$LongBet)

## 
##  Pearson's product-moment correlation
## 
## data:  ms$quintile and ms$LongBet
## t = 2.315, df = 3, p-value = 0.1035
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2778  0.9862
## sample estimates:
##    cor 
## 0.8007

residual quintiles (controls for frequency)

get quintiles

###Plot

# Aggregate LongBet by residual quintile (n.unique and ci95 are helpers defined
# elsewhere).
ms.qr <- aggregate(LongBet ~ resid.quintile, data = d, mean)
# The worker count must aggregate workerid itself: an aggregate of LongBet has
# no `workerid` column, so `$workerid` was NULL and `ms.qr$n` was never created.
# Rows with a missing LongBet are dropped first so the groups match `ms.qr`.
ms.qr$n <- aggregate(workerid ~ resid.quintile, data = d[!is.na(d$LongBet), ],
                     n.unique)$workerid
ms.qr$bet_ci <- aggregate(LongBet ~ resid.quintile, data = d, ci95)$LongBet

# Residual-quintile means with +/- bet_ci error bars and a linear fit.
# NOTE(review): the printed r is computed on the trial-level data `d`, whereas
# the plain-quintile plot annotates the correlation of its aggregated points --
# confirm which one is intended here.
ggplot(ms.qr, aes(x = resid.quintile, y = LongBet)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbar(
    aes(ymin = LongBet - bet_ci, ymax = LongBet + bet_ci),
    size = 0.2, colour = "black"
  ) +
  annotate(
    "text", x = 5, y = 25,
    label = paste("r=", round(cor(d$resid.quintile, d$LongBet, use = "complete.obs"), 2))
  ) +
  scale_y_continuous(limits = c(20, 80)) +
  #scale_x_continuous(limits = c(0, 7), breaks = 1:7, labels = 1:7)  +
  theme(axis.title = element_text(size = 20), axis.text = element_text(size = 15)) +
  xlab("RESIDUAL Complexity Norm quintile") +
  ylab("Bet to Long Word") +
  ggtitle("High Frequency meanings (words)")

plot of chunk unnamed-chunk-21

residual quintiles correlations

# Residual quintile vs. bet to the long word, over all rows of d
cor.test(x = d$resid.quintile, y = d$LongBet) ### highly correlated

## 
##  Pearson's product-moment correlation
## 
## data:  d$resid.quintile and d$LongBet
## t = 3.96, df = 1910, p-value = 7.768e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0456 0.1345
## sample estimates:
##     cor 
## 0.09024

# Residual quintile vs. bet to the long word, on the aggregated values in ms.qr
cor.test(x = ms.qr$resid.quintile, y = ms.qr$LongBet)

## 
##  Pearson's product-moment correlation
## 
## data:  ms.qr$resid.quintile and ms.qr$LongBet
## t = 2.512, df = 3, p-value = 0.08678
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2156  0.9879
## sample estimates:
##    cor 
## 0.8233

(3) Novel real objects

(A) Norms Complexity norming task RT task

read in complexity norms and RT norms for objects

# Complexity norms and RT norms for the novel objects (absolute paths).
c_norms <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Experiment_9_norm/complicated1AND2_norms.csv"
)
rt_norms <- read.csv(
  "/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Experiment_30/Analysis/rt_norms.csv"
)

(B) Mapping task (adults) (Task)

read in data and format

make everything factors

merge in norms

get effect sizes

get obj conds

ratio plots

# Effect size against object ratio, one labelled point range per objCondition,
# with a linear fit and the correlation printed in red.
# NOTE(review): ymax is mapped to `cill` and ymin to `ciul`; if those names mean
# lower/upper limit they are swapped, although the drawn segment is the same.
ggplot(de, aes(x = objRatio, y = effect_size)) +
  geom_pointrange(aes(ymax = cill, ymin = ciul)) +
  geom_hline(yintercept = 0, lty = 2) +
  stat_smooth(method = "lm") +
  geom_text(aes(objRatio + .03, effect_size, label = objCondition)) +
  labs(x = "object ratio", y = "effect size") +
  theme(text = element_text(size = 20), plot.title = element_text(size = 20)) +
  ggtitle("Object ratio vs. effect size") +
  annotate(
    "text", x = .3, y = -.2, col = "red",
    label = paste("r=", round(cor(de$effect_size, de$objRatio, use = "complete.obs"), 2))
  )

plot of chunk unnamed-chunk-29

# Effect size against complexity ratio (c.Mratio), one labelled point range per
# objCondition, with a linear fit and the correlation printed in red.
# The x-axis label now names the plotted variable; it previously read
# "object ratio", copied from the object-ratio plot.
# position = "dodge" is kept as in the original; it is what triggers the
# "ymax not defined" message printed below this plot.
ggplot(de, aes(y = effect_size, x = c.Mratio)) +
  geom_pointrange(aes(ymax = cill, ymin = ciul), position = "dodge") +
  geom_hline(yintercept = 0, lty = 2) +
  stat_smooth(method = "lm") +
  geom_text(aes(c.Mratio + .02, effect_size, label = objCondition), position = "dodge") +
  ylab("effect size") +
  xlab("complexity ratio") +
  ggtitle("complexity ratio vs. effect size") +
  theme(text = element_text(size = 20), plot.title = element_text(size = 20)) +
  annotate(
    "text", x = .5, y = -.2, col = "red",
    label = paste("r=", round(cor(de$effect_size, de$c.Mratio, use = "complete.obs"), 2))
  )

## ymax not defined: adjusting position using y instead

plot of chunk unnamed-chunk-29

# Effect size against RT ratio (rt.Mratio), one labelled point range per
# objCondition, with a linear fit and the correlation printed in red.
ggplot(de, aes(x = rt.Mratio, y = effect_size)) +
  geom_pointrange(aes(ymax = cill, ymin = ciul)) +
  geom_hline(yintercept = 0, lty = 2) +
  stat_smooth(method = "lm") +
  geom_text(aes(rt.Mratio + .0008, effect_size, label = objCondition)) +
  labs(x = "RT ratio", y = "effect size") +
  ggtitle("RT ratio vs. effect size") +
  theme(text = element_text(size = 20), plot.title = element_text(size = 20)) +
  annotate(
    "text", x = .985, y = -.2, col = "red",
    label = paste("r=", round(cor(de$effect_size, de$rt.Mratio, use = "complete.obs"), 2))
  )

plot of chunk unnamed-chunk-29

correlations between effect size at complexity conditions

# Object ratio vs. effect size across the conditions in de
cor.test(x = de$objRatio, y = de$effect_size)

## 
##  Pearson's product-moment correlation
## 
## data:  de$objRatio and de$effect_size
## t = -3.051, df = 13, p-value = 0.009273
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8703 -0.1999
## sample estimates:
##    cor 
## -0.646

# Complexity ratio vs. effect size across the conditions in de
cor.test(x = de$c.Mratio, y = de$effect_size)

## 
##  Pearson's product-moment correlation
## 
## data:  de$c.Mratio and de$effect_size
## t = -3.494, df = 13, p-value = 0.00396
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8907 -0.2854
## sample estimates:
##     cor 
## -0.6959

# RT ratio vs. effect size across the conditions in de
cor.test(x = de$rt.Mratio, y = de$effect_size)

## 
##  Pearson's product-moment correlation
## 
## data:  de$rt.Mratio and de$effect_size
## t = -3.659, df = 13, p-value = 0.002887
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8972 -0.3149
## sample estimates:
##     cor 
## -0.7123

(D) Production task (labels + descriptions)

(1) Labels (Task)

read in data and prep data frame

relationship between condition and description length

# Paired t-test on log.length, complex vs. simple condition.
# NOTE(review): pairing is positional, so it relies on the two subsets having
# the same length and a matching row order -- confirm how md is sorted.
t.test(
  md[md$condition == '"complex"', "log.length"],
  md[md$condition == '"simple"', "log.length"],
  paired = TRUE
)

## 
##  Paired t-test
## 
## data:  md[md$condition == "\"complex\"", "log.length"] and md[md$condition == "\"simple\"", "log.length"]
## t = 3.735, df = 286, p-value = 0.0002269
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03846 0.12417
## sample estimates:
## mean of the differences 
##                 0.08131

# Mixed model: log.length on condition, with a by-worker random intercept and
# random slope for trial.
summary(
  lmer(log.length ~ condition + (1 + trial | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length ~ condition + (1 + trial | workerid)
##    Data: md
## 
## REML criterion at convergence: -0.4
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -4.009 -0.626  0.061  0.633  2.773 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  workerid (Intercept) 1.53e-02 0.12358      
##           trial       3.40e-06 0.00184  1.00
##  Residual             4.92e-02 0.22189      
## Number of obs: 574, groups: workerid, 59
## 
## Fixed effects:
##                   Estimate Std. Error t value
## (Intercept)         1.9350     0.0218    88.7
## condition"simple"  -0.0797     0.0185    -4.3
## 
## Correlation of Fixed Effects:
##             (Intr)
## cndtn"smpl" -0.427

relationship with complicated norms

# Look up each picture's complexity norm (c_norms$value, keyed by ratingNum).
index <- match(md$picture, c_norms$ratingNum)
md$c.norms <- c_norms$value[index]

# Per-picture mean of log.length plus the ci.high / ci.low / n.unique summaries
# (helpers defined elsewhere), attached by position.
ms <- aggregate(log.length ~ c.norms + picture, data = md, mean)
ms$cih <- aggregate(log.length ~ c.norms + picture, data = md, ci.high)$log.length
ms$cil <- aggregate(log.length ~ c.norms + picture, data = md, ci.low)$log.length
ms$n <- aggregate(workerid ~ c.norms + picture, data = md, n.unique)$workerid

# Scatter of per-picture means against complexity norms with a linear fit;
# bars run from log.length - cil to log.length + cih.
ggplot(ms, aes(x = c.norms, y = log.length)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbar(
    aes(ymax = log.length + cih, ymin = log.length - cil),
    size = 0.2, colour = "grey"
  ) +
  theme_bw() +
  labs(x = "Object Complexity Norms", y = "Log Word Length (characters)") +
  theme(axis.title = element_text(size = 20), axis.text = element_text(size = 15))

plot of chunk unnamed-chunk-33

relationship with RT norms

# Look up each picture's RT norm (rt_norms$log.rt, keyed by Answer.train_image).
index <- match(md$picture, rt_norms$Answer.train_image)
md$rt.norms <- rt_norms$log.rt[index]

# Per-picture mean of log.length plus the ci.high / ci.low / n.unique summaries
# (helpers defined elsewhere), attached by position.
ms <- aggregate(log.length ~ rt.norms + picture, data = md, mean)
ms$cih <- aggregate(log.length ~ rt.norms + picture, data = md, ci.high)$log.length
ms$cil <- aggregate(log.length ~ rt.norms + picture, data = md, ci.low)$log.length
ms$n <- aggregate(workerid ~ rt.norms + picture, data = md, n.unique)$workerid

# Scatter of per-picture means against RT norms with a linear fit; bars run
# from log.length - cil to log.length + cih.
# The x-axis label now names the plotted variable; it previously read
# "Object Complexity Norms", copied from the complexity-norms plot.
ggplot(ms, aes(x = rt.norms, y = log.length)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbar(
    aes(ymax = log.length + cih, ymin = log.length - cil),
    size = 0.2, colour = "grey"
  ) +
  theme_bw() +
  xlab("Object RT Norms") +
  ylab("Log Word Length (characters)") +
  theme(axis.title = element_text(size = 20), axis.text = element_text(size = 15))

plot of chunk unnamed-chunk-34

(2) Descriptions (Task)

read in data and prep data frame

relationship between condition and description length

#summary(lmer(length_c~condition + (1|workerid), md))
#summary(lmer(length_c~condition + trial + (1+trial|workerid), md))

# Mixed model: log.length_c on condition with a by-worker random intercept.
# NOTE(review): the predictor is written as md$condition although data = md is
# supplied; the sibling models use plain `condition`. Left as is so the printed
# coefficient names stay the same.
summary(
  lmer(log.length_c ~ md$condition + (1 | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length_c ~ md$condition + (1 | workerid)
##    Data: md
## 
## REML criterion at convergence: 863.1
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.004 -0.616  0.071  0.599  4.269 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  workerid (Intercept) 0.476    0.690   
##  Residual             0.175    0.418   
## Number of obs: 600, groups: workerid, 60
## 
## Fixed effects:
##                      Estimate Std. Error t value
## (Intercept)            3.3545     0.0922    36.4
## md$condition"simple"  -0.1128     0.0342    -3.3
## 
## Correlation of Fixed Effects:
##             (Intr)
## md$cndtn"s" -0.185

# Mixed model: log.length_c on condition and trial, with a by-worker random
# intercept and random slope for trial.
summary(
  lmer(log.length_c ~ condition + trial + (1 + trial | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length_c ~ condition + trial + (1 + trial | workerid)
##    Data: md
## 
## REML criterion at convergence: 823.1
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.185 -0.559  0.071  0.594  3.841 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  workerid (Intercept) 0.43168  0.6570        
##           trial       0.00177  0.0421   -0.02
##  Residual             0.14949  0.3866        
## Number of obs: 600, groups: workerid, 60
## 
## Fixed effects:
##                   Estimate Std. Error t value
## (Intercept)        3.54229    0.09310    38.0
## condition"simple" -0.12772    0.03260    -3.9
## trial             -0.03278    0.00773    -4.2
## 
## Correlation of Fixed Effects:
##             (Intr) cndt""
## cndtn"smpl" -0.189       
## trial       -0.252  0.031

## Density of log.length_c, one semi-transparent curve per condition
ggplot(md, aes(x = log.length_c, fill = condition)) +
  geom_density(alpha = 0.2)

plot of chunk unnamed-chunk-36

correlations with complexity norms

# Look up each picture's complexity norm (c_norms$value, keyed by ratingNum).
index <- match(md$picture, c_norms$ratingNum)
md$c.norms <- c_norms$value[index]

# Mixed model: log.length_c on c.norms, with a by-worker random intercept and
# random slope for trial.
summary(
  lmer(log.length_c ~ c.norms + (1 + trial | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length_c ~ c.norms + (1 + trial | workerid)
##    Data: md
## 
## REML criterion at convergence: 826.8
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.272 -0.551  0.069  0.587  3.893 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  workerid (Intercept) 0.44295  0.6655        
##           trial       0.00275  0.0524   -0.11
##  Residual             0.14874  0.3857        
## Number of obs: 600, groups: workerid, 60
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   3.2392     0.0944    34.3
## c.norms       0.2921     0.0691     4.2
## 
## Correlation of Fixed Effects:
##         (Intr)
## c.norms -0.346

# Mixed model: log.length_c on c.norms and trial, by-worker random intercept.
summary(
  lmer(log.length_c ~ c.norms + trial + (1 | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length_c ~ c.norms + trial + (1 | workerid)
##    Data: md
## 
## REML criterion at convergence: 836
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.062 -0.625  0.060  0.597  4.042 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  workerid (Intercept) 0.477    0.690   
##  Residual             0.165    0.406   
## Number of obs: 600, groups: workerid, 60
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  3.34180    0.10143    32.9
## c.norms      0.28220    0.07006     4.0
## trial       -0.03248    0.00577    -5.6
## 
## Correlation of Fixed Effects:
##         (Intr) c.nrms
## c.norms -0.322       
## trial   -0.304 -0.027

# Complexity norms predict length: zero-order correlation over all rows of md

cor.test(x = md$log.length_c, y = md$c.norms)

## 
##  Pearson's product-moment correlation
## 
## data:  md$log.length_c and md$c.norms
## t = 1.965, df = 598, p-value = 0.04993
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  3.457e-05 1.591e-01
## sample estimates:
##     cor 
## 0.08008

complexity norms plot

# Per-picture mean of log.length_c plus the ci.high / ci.low / n.unique
# summaries (helpers defined elsewhere), attached by position.
ms <- aggregate(log.length_c ~ c.norms + picture, data = md, mean)
ms$cih <- aggregate(log.length_c ~ c.norms + picture, data = md, ci.high)$log.length_c
ms$cil <- aggregate(log.length_c ~ c.norms + picture, data = md, ci.low)$log.length_c
ms$n <- aggregate(workerid ~ c.norms + picture, data = md, n.unique)$workerid

# Scatter of per-picture means against complexity norms with a linear fit;
# bars run from log.length_c - cil to log.length_c + cih.
ggplot(ms, aes(x = c.norms, y = log.length_c)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbar(
    aes(ymax = log.length_c + cih, ymin = log.length_c - cil),
    size = 0.2, colour = "grey"
  ) +
  theme_bw() +
  labs(x = "Object Complexity Norms", y = "Log Description Length (characters)") +
  theme(axis.title = element_text(size = 20), axis.text = element_text(size = 15))

plot of chunk unnamed-chunk-38

correlations with RT norms

# Look up each picture's RT norm (rt_norms$log.rt, keyed by Answer.train_image).
index <- match(md$picture, rt_norms$Answer.train_image)
md$rt.norms <- rt_norms$log.rt[index]

# Mixed model: log.length_c on rt.norms, with a by-worker random intercept and
# random slope for trial.
summary(
  lmer(log.length_c ~ rt.norms + (1 + trial | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length_c ~ rt.norms + (1 + trial | workerid)
##    Data: md
## 
## REML criterion at convergence: 825.6
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.306 -0.589  0.083  0.590  3.705 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  workerid (Intercept) 0.43683  0.6609        
##           trial       0.00277  0.0527   -0.08
##  Residual             0.14889  0.3859        
## Number of obs: 600, groups: workerid, 60
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   -2.262      1.373   -1.65
## rt.norms       0.770      0.187    4.12
## 
## Correlation of Fixed Effects:
##          (Intr)
## rt.norms -0.998

# Mixed model: log.length_c on rt.norms and trial, by-worker random intercept.
summary(
  lmer(log.length_c ~ rt.norms + trial + (1 | workerid), data = md)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log.length_c ~ rt.norms + trial + (1 | workerid)
##    Data: md
## 
## REML criterion at convergence: 836.3
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -3.119 -0.587  0.079  0.615  3.803 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  workerid (Intercept) 0.481    0.693   
##  Residual             0.165    0.406   
## Number of obs: 600, groups: workerid, 60
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -1.71467    1.39726   -1.23
## rt.norms     0.70745    0.19008    3.72
## trial       -0.03215    0.00578   -5.57
## 
## Correlation of Fixed Effects:
##          (Intr) rt.nrm
## rt.norms -0.998       
## trial    -0.009 -0.014

# RT norms predict length: zero-order correlation over all rows of md

cor.test(x = md$log.length_c, y = md$rt.norms)

## 
##  Pearson's product-moment correlation
## 
## data:  md$log.length_c and md$rt.norms
## t = 0.848, df = 598, p-value = 0.3968
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04551  0.11438
## sample estimates:
##     cor 
## 0.03466

rt norms plot

# Per-picture mean of log.length_c plus the ci.high / ci.low / n.unique
# summaries (helpers defined elsewhere), attached by position.
ms <- aggregate(log.length_c ~ rt.norms + picture, data = md, mean)
ms$cih <- aggregate(log.length_c ~ rt.norms + picture, data = md, ci.high)$log.length_c
ms$cil <- aggregate(log.length_c ~ rt.norms + picture, data = md, ci.low)$log.length_c
ms$n <- aggregate(workerid ~ rt.norms + picture, data = md, n.unique)$workerid

# Scatter of per-picture means against RT norms with a linear fit; bars run
# from log.length_c - cil to log.length_c + cih.
ggplot(ms, aes(x = rt.norms, y = log.length_c)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", formula = y ~ x) +
  geom_errorbar(
    aes(ymax = log.length_c + cih, ymin = log.length_c - cil),
    size = 0.2, colour = "grey"
  ) +
  theme_bw() +
  labs(x = "Object RT Norms", y = "Log Description Length (characters)") +
  theme(axis.title = element_text(size = 20), axis.text = element_text(size = 15))

plot of chunk unnamed-chunk-40

# reliable when control for random effects

Referential Complexity Analyses

M. Lewis

14 July 2014

Analyses:

read in data

merge norms to xling df

Word class distribution

(A) Correlation between all lengths

(B) Correlation between all lengths, controlling for frequency, open class only

(C) Correlation between all lengths and complexity, controlling for frequency

(D) Translation checking data

plot accuracy

ISSUES

read in data and prep variables

merge in stuff

get quintiles

aggregate by word

plot bet to long word vs. complexity norms

correlation between norms and length

plot by quintiles

correlations with quintiles

residual quintiles (controls for frequency)

get quintiles

residual quintiles correlations

(3) Novel real objects

read in complexity norms and RT norms for objects

read in data and format

make everything factors

merge in norms

get effect sizes

get obj conds

ratio plots

correlations between effect size at complexity conditions

(D) Production task (labels + descriptions)

read in data and prep data frame

relationship between condition and description length

relationship with complicated norms

relationship with RT norms

(2) Descriptions (Task)

read in data and prep data frame

relationship between condition and description length

correlations with complexity norms

complexity norms plot

correlations with RT norms

rt norms plot

(4) Geons