Complexity and Roy et al. (2015) Wordbirth entropy analyses



Analysis: Is there a relationship between the Roy et al. (2015) wordbirth measures of entropy and our measures of complexity?

Three measures of entropy – temporal, spatial, and linguistic.



Read in data, etc.

# Load the Roy et al. (2015) word-level measures and the English complexity norms.
roy <- read.csv("roy_et_al_data.csv")
eng.complexity <- read.csv("complexityNormsEnglishexp9.csv")

# Keep only the rows present in both tables; the join key is whatever column
# names the two tables share (no explicit `by` is given).
d <- inner_join(roy, eng.complexity)
d$word <- as.factor(d$word)

# Row-wise average of the three KL measures.
d$mean.KL <- with(d, rowMeans(cbind(srl.topic.KL, srl.temp.KL, srl.sp.KL)))

# Correlation matrix over the remaining columns, using complete rows only.
# Columns are excluded by position -- presumably the non-numeric ones; verify
# against the joined table if either CSV's column order changes.
m <- cor(d[, -c(1, 3, 4, 14, 17, 18)], use = "complete.obs")
corrplot(m)

# srl.sp.KL against complexity: word labels plus a linear fit, one panel per
# small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = srl.sp.KL, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

# srl.topic.KL against complexity: word labels plus a linear fit, one panel per
# small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = srl.topic.KL, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

# srl.temp.KL against complexity: word labels plus a linear fit, one panel per
# small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = srl.temp.KL, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

# mean.KL (row-wise average of the three KL measures) against complexity:
# word labels plus a linear fit, one panel per small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = mean.KL, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

# aoa against complexity: word labels plus a linear fit, one panel per
# small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = aoa, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

# sln.freq.pre against complexity: word labels plus a linear fit, one panel per
# small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = sln.freq.pre, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

# s.uttlen.pre against complexity: word labels plus a linear fit, one panel per
# small.cat level.
ggplot(data = d, mapping = aes(x = complexity, y = s.uttlen.pre, label = word)) +
  geom_text(hjust = 0, vjust = 0) +
  geom_smooth(method = "lm") +
  facet_grid(. ~ small.cat) +
  themeML

Some models

# Mixed model: complexity on srl.sp.KL plus s.cmu.phon, sln.freq.pre and
# s.uttlen.pre, with a random intercept for each cdi.cat.
summary(
  lmer(
    complexity ~ srl.sp.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre +
      (1 | cdi.cat),
    data = d
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## complexity ~ srl.sp.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre +  
##     (1 | cdi.cat)
##    Data: d
## 
## REML criterion at convergence: 218.4
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.75133 -0.66804 -0.01131  0.66170  2.48054 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  cdi.cat  (Intercept) 0.1705   0.4130  
##  Residual             0.4911   0.7008  
## Number of obs: 91, groups:  cdi.cat, 19
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)   2.71988    0.14618  18.607
## srl.sp.KL    -0.12239    0.07742  -1.581
## s.cmu.phon    0.48049    0.14743   3.259
## sln.freq.pre  0.11012    0.11436   0.963
## s.uttlen.pre -0.07002    0.12527  -0.559
## 
## Correlation of Fixed Effects:
##             (Intr) sr..KL s.cm.p sln.f.
## srl.sp.KL   -0.013                     
## s.cmu.phon   0.339 -0.005              
## sln.freq.pr -0.026 -0.241  0.450       
## s.uttlen.pr -0.248  0.410 -0.143 -0.395
# Mixed model: complexity on srl.temp.KL plus s.cmu.phon, sln.freq.pre and
# s.uttlen.pre, with a random intercept for each cdi.cat.
summary(
  lmer(
    complexity ~ srl.temp.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre +
      (1 | cdi.cat),
    data = d
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## complexity ~ srl.temp.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre +  
##     (1 | cdi.cat)
##    Data: d
## 
## REML criterion at convergence: 219
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.72558 -0.69846 -0.05592  0.61388  2.58257 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  cdi.cat  (Intercept) 0.1517   0.3895  
##  Residual             0.5037   0.7097  
## Number of obs: 91, groups:  cdi.cat, 19
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)   2.71235    0.14330  18.928
## srl.temp.KL  -0.10831    0.08568  -1.264
## s.cmu.phon    0.49020    0.14911   3.288
## sln.freq.pre  0.07396    0.11209   0.660
## s.uttlen.pre -0.02517    0.11822  -0.213
## 
## Correlation of Fixed Effects:
##             (Intr) sr..KL s.cm.p sln.f.
## srl.temp.KL  0.012                     
## s.cmu.phon   0.345 -0.081              
## sln.freq.pr -0.031 -0.064  0.472       
## s.uttlen.pr -0.259  0.237 -0.167 -0.345
# Ordinary linear model (no random effects): complexity on srl.topic.KL plus
# s.cmu.phon, sln.freq.pre and s.uttlen.pre.
summary(
  lm(
    complexity ~ srl.topic.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre,
    data = d
  )
)
## 
## Call:
## lm(formula = complexity ~ srl.topic.KL + s.cmu.phon + sln.freq.pre + 
##     s.uttlen.pre, data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39258 -0.56063 -0.04917  0.57341  2.40378 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.69086    0.10995  24.473   <2e-16 ***
## srl.topic.KL  0.05775    0.09699   0.595    0.553    
## s.cmu.phon    0.38858    0.16228   2.395    0.019 *  
## sln.freq.pre  0.07729    0.12463   0.620    0.537    
## s.uttlen.pre  0.02246    0.11528   0.195    0.846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.782 on 80 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.08181,    Adjusted R-squared:  0.0359 
## F-statistic: 1.782 on 4 and 80 DF,  p-value: 0.1406
# Ordinary linear model (no random effects): complexity on srl.temp.KL plus
# s.cmu.phon, sln.freq.pre and s.uttlen.pre.
summary(
  lm(
    complexity ~ srl.temp.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre,
    data = d
  )
)
## 
## Call:
## lm(formula = complexity ~ srl.temp.KL + s.cmu.phon + sln.freq.pre + 
##     s.uttlen.pre, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5689 -0.5431 -0.1201  0.4984  2.4805 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.68043    0.10845  24.715  < 2e-16 ***
## srl.temp.KL  -0.12425    0.08810  -1.410  0.16204    
## s.cmu.phon    0.44102    0.15410   2.862  0.00529 ** 
## sln.freq.pre  0.04555    0.11494   0.396  0.69288    
## s.uttlen.pre -0.03603    0.11464  -0.314  0.75408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7759 on 86 degrees of freedom
## Multiple R-squared:  0.1175, Adjusted R-squared:  0.0765 
## F-statistic: 2.864 on 4 and 86 DF,  p-value: 0.02798
# Ordinary linear model (no random effects): complexity on srl.sp.KL plus
# s.cmu.phon, sln.freq.pre and s.uttlen.pre.
summary(
  lm(
    complexity ~ srl.sp.KL + s.cmu.phon + sln.freq.pre + s.uttlen.pre,
    data = d
  )
)
## 
## Call:
## lm(formula = complexity ~ srl.sp.KL + s.cmu.phon + sln.freq.pre + 
##     s.uttlen.pre, data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43541 -0.48384 -0.03978  0.54140  2.29662 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.67962    0.10848  24.702  < 2e-16 ***
## srl.sp.KL    -0.11069    0.07957  -1.391  0.16776    
## s.cmu.phon    0.42633    0.15359   2.776  0.00676 ** 
## sln.freq.pre  0.06842    0.11636   0.588  0.55810    
## s.uttlen.pre -0.06104    0.12071  -0.506  0.61438    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7762 on 86 degrees of freedom
## Multiple R-squared:  0.117,  Adjusted R-squared:  0.07594 
## F-statistic: 2.849 on 4 and 86 DF,  p-value: 0.02861
# Pearson correlation between complexity and srl.topic.KL, restricted to the
# rows whose small.cat is "Nouns". The argument expressions are kept verbatim
# because cor.test echoes them in the "data:" line of its printed result.
cor.test(d[d$small.cat == "Nouns", "complexity"],
         d[d$small.cat == "Nouns", "srl.topic.KL"])
## 
##  Pearson's product-moment correlation
## 
## data:  d[d$small.cat == "Nouns", "complexity"] and d[d$small.cat == "Nouns", "srl.topic.KL"]
## t = 1.9511, df = 34, p-value = 0.05932
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01252098  0.58488136
## sample estimates:
##       cor 
## 0.3173201

None of the entropy measures is a reliable predictor of complexity, even when controlling for the other predictors in the models (s.cmu.phon, sln.freq.pre, and s.uttlen.pre). There’s a marginal relationship between srl.topic.KL and complexity, for nouns only (n = 36, p = .06, r = .32).