Iterated Referential Complexity E2



ANALYSES
       1: Word forms
             individual chains
             label length
             unique words
             transitional probability
             CV ratio
             accuracy
             Levenshtein edit distance

       2: Complexity bias
             label length
             cumulative characters removed (CCR)
             empirical distributions
             change in bias across generations
             confusion matrix

       3: Relationship between word forms and complexity bias
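
These analyses assume a setup chunk (not shown) that loads the packages used below; at minimum, something like:

library(plyr)     # ddply, transform
library(dplyr)    # %>%, group_by, summarise (loaded after plyr to avoid masking)
library(ggplot2)  # plots, qplot
library(reshape2) # melt
library(boot)     # boot, corr
library(tau)      # textcnt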



Analysis #1: Forms in the lexicon

Examine particular chains

## [1] "Chain 9"
##      [,1]      [,2]        [,3]  [,4]       [,5]        [,6]    [,7] 
## [1,] "dunobax" "bipag"     "nid" "nimimog"  "damitobup" "nagir" "gan"
## [2,] "dunobug" "bipag"     "nid" "nimimog"  "nilobup"   "niger" "gan"
## [3,] "dunbug"  "bippenbog" "nid" "nimobop"  "nilobup"   "niger" "gan"
## [4,] "dabop"   "buttenbug" "nid" "nimobop"  "nilobop"   "niger" "gar"
## [5,] "dabop"   "buttenbop" "nid" "nimbobop" "nilobop"   "niger" "gar"
## [6,] "dabop"   "bittenbop" "nir" "nilobop"  "nilop"     "niger" "dir"
## [7,] "dabop"   "bittenbop" "nir" "nilop"    "nilop"     "niger" "dir"
## [8,] "dabog"   "bittenbop" "hir" "nilop"    "nilop"     "niger" "dir"
##      [,8]          [,9]        [,10]        
## [1,] "gimunobugup" "mikupudax" "daganitobip"
## [2,] "runtunbug"   "bipoxtog"  "dipentag"   
## [3,] "runtunbug"   "ripenbog"  "dipentag"   
## [4,] "rittenbob"   "rudentag"  "dertag"     
## [5,] "bittenbob"   "rittenbog" "dertag"     
## [6,] "girbop"      "dirbop"    "rittenbog"  
## [7,] "garbog"      "dabog"     "rittenbog"  
## [8,] "garbop"      "dabog"     "rottenbog"
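
The chain matrices above come from reshaping the raw data. A sketch of how one could be rebuilt (the item-index column trialNum is hypothetical; the actual column name is not shown in this section):

# rows = generations, columns = items in the lexicon
chainMatrix <- function(da, chain) {
  d = da[da$Answer.subSeed == chain, ]
  reshape2::dcast(d, gen ~ trialNum, value.var = "guessedLabel")
}
chainMatrix(DA, 9)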



Label length

# get length dataframe that includes input language
gen_one = preProcess(1, rto_norms, co_norms)
len = data.frame(length = DA$guessedLabelLen, gen = DA$gen)
g1 = data.frame(length = gen_one$labelLen, gen = 0) 
len = rbind(len, g1) 
len$gen = as.factor(len$gen)

# aggregate over generations
ms = aggregate(length ~ gen, data=len, mean)
ms$cil <-  aggregate(length ~ gen, data=len, ci.low)$length
ms$cih <-  aggregate(length ~ gen, data=len, ci.high)$length
ms$gen = as.numeric(as.character(ms$gen))
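
ci.low and ci.high are helpers from setup code that is not shown here. Their use above (ymin = mean - cil, ymax = mean + cih) implies they return distances from the group mean rather than absolute bounds; a minimal bootstrap sketch with that contract:

boot_means <- function(x, n = 1000) replicate(n, mean(sample(x, replace = TRUE)))
ci.low  <- function(x) mean(x) - quantile(boot_means(x), 0.025)
ci.high <- function(x) quantile(boot_means(x), 0.975) - mean(x)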

#plot
lplot <- ggplot(ms, aes(y = length, x = gen)) +
  geom_point() +
  geom_pointrange(aes(ymax = length + cih, ymin=length - cil), size = 1.2) + 
  geom_line() +
  scale_x_continuous(breaks=seq(0, numGen, 1)) +
  ylim(1,8)+
  geom_abline(intercept = 7, slope = 0, color = "grey", lty = 2) +
  ylab("Mean length") +
  xlab("Generation") +
  ggtitle("Mean length") +
    theme(text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black')) 
lplot

[Figure: Mean length by generation]

cor.test(len$length, as.numeric(as.character(len$gen)))
## 
##  Pearson's product-moment correlation
## 
## data:  len$length and as.numeric(as.character(len$gen))
## t = -14.17, df = 3955, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2493 -0.1900
## sample estimates:
##     cor 
## -0.2199



Unique words

da_u <- ddply(DA, .(gen, Answer.subSeed), summarise, numUnique = length(unique(guessedLabel)))
da_ul = aggregate(numUnique ~ gen, data=da_u, mean)
da_ul$cih <- aggregate(numUnique ~ gen, data=da_u, ci.high)$numUnique
da_ul$cil <- aggregate(numUnique ~ gen, data=da_u, ci.low)$numUnique
da_ul[8,] = c(0, 10, 0, 0) # add generation-zero row: the input language has 10 unique words
 
uplot = ggplot(da_ul, aes(x = gen, y = numUnique)) + 
  geom_line() +
  geom_point(size = 4) +
  geom_pointrange(aes(ymax = numUnique + cih, ymin=numUnique - cil), size = 1.2) + 
  ylab("Number of unique words") +
  geom_abline(intercept = 10, slope = 0, color = "grey", lty = 2) +
  scale_y_continuous(limits=c(1, 12), breaks=seq(1, 12, 2))  +
  xlab("Generation") +
  ggtitle("Number of unique words") +
  scale_x_continuous(breaks=seq(0, numGen, 1))  +
    theme(text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black')) 
uplot

[Figure: Number of unique words by generation]

cor.test(da_u$numUnique, da_u$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_u$numUnique and da_u$gen
## t = -6.925, df = 348, p-value = 2.116e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4369 -0.2524
## sample estimates:
##    cor 
## -0.348



Transitional probability

  1. Language empirical model
getMeanBigramProb <- function(w, n, num_words, p.bis) {
  w = as.character(w)
  m = as.matrix(textcnt(w, n = n))

  # get bigram model
  if (missing(p.bis)){ # get language model
      # get bigram conditional probabilities 
      check_bigram = nchar(rownames(m)) == n
      check_space = !grepl("_", rownames(m))
      bis = m[check_bigram & check_space,] # keep only bigrams with no boundary characters
    
      # make contingency table to get prob(a | b)
      con.table = matrix(0, nrow =  length(letters), 
                         ncol =  length(letters), 
                         dimnames = list(letters, letters))
      
      # loop over bigrams
      for (i in 1:length(bis)){
        chars = unlist(strsplit(names(bis)[i], ""))
        con.table[chars[1], chars[2]] = con.table[chars[1], chars[2]] + bis[i]
      }
      
      # get conditional probabilities by normalizing across rows
      p.bis = prop.table(con.table, 1) 
      
      # NOTE: no smoothing is applied here; unseen bigrams get probability 0
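      # A minimal add-one (Laplace) smoothing sketch (an assumption; it is not
      # applied in this analysis) would normalize smoothed counts instead:
      #   p.bis = prop.table(con.table + 1, 1)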
   }
  
  # Now, get mean across words of mean bigram probability of each word
  ll_w = matrix(NA, ncol = num_words) # make matrix for mean prob of bigrams of word
  
  for (i in 1:length(w)){
    current_word = w[i]
    
    # make into bigrams
    ww = as.matrix(textcnt(current_word, n = n))
    check_bigram = nchar(rownames(ww)) == n
    check_space = !grepl("_", rownames(ww))
    ww_b = ww[check_bigram & check_space,] # keep only bigrams with no boundary characters
    
    # loop over bigrams 
    if (length(ww_b) > 0){ # skip single-letter words, which have no bigrams
      ll = matrix(0, nrow = length(ww_b), ncol = 1) # prob of each bigram in the word
      for (j in 1:length(ww_b)) {
        bi = unlist(strsplit(names(ww_b)[j], ""))
        ll[j] = p.bis[bi[1], bi[2]]
      }
      ll_w[i]= mean(ll) # get mean bigram probability
    }
  }
  return(mean(ll_w, na.rm = T)) # get mean across all words in chain
}
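# e.g., a toy call: fit the bigram model to a three-word lexicon and return
# the mean within-word bigram probability
#   getMeanBigramProb(c("dabop", "nilop", "gar"), n = 2, num_words = 3)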

DA = ddply(DA, .(gen, Answer.subSeed), transform, duplicate = duplicated(guessedLabel)) # flag labels duplicated within each generation x chain

b.DA = DA[,c("gen", "Answer.subSeed", "guessedLabel", "duplicate")]
gen_one = preProcess(1, rto_norms, co_norms)
g1 = data.frame(Answer.subSeed = gen_one$Answer.subSeed,  
                guessedLabel = gen_one$guessedLabel, duplicate = FALSE, gen = 0) # add g1 
b.DA = rbind(b.DA, g1)
                    
## LANGUAGE EMPIRICAL MODEL
ms <- ddply(b.DA[b.DA$duplicate == FALSE,], .(gen, Answer.subSeed),
            summarise, lls = getMeanBigramProb(guessedLabel, 2, 10))
mss <- aggregate(lls ~ gen, data=ms, mean)
mss$cil <- aggregate(lls ~ gen, data=ms, ci.low)$lls
mss$cih <-aggregate(lls ~ gen, data=ms, ci.high)$lls

bplot = ggplot(mss, aes(x = gen, y = lls)) +
  geom_line() +
  geom_point() +
  geom_pointrange(aes(ymax = lls + cih, ymin = lls - cil), size = 1.2) + 
  ylim(0,1) +
  scale_x_continuous(breaks=seq(0, numGen, 1)) +
  ylab("Mean transitional probability") +
  xlab("Generation") +
  ggtitle("Mean transitional probability") +
  geom_abline(intercept = mss[1, "lls"], slope = 0, color = "grey", lty = 2) +
  theme(text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black')) 
bplot

[Figure: Mean transitional probability by generation (language empirical model)]

cor.test(ms$lls, ms$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  ms$lls and ms$gen
## t = 12.12, df = 398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4437 0.5873
## sample estimates:
##    cor 
## 0.5192

  2. English model
## get english bigrams
eng.ngrams = read.csv('/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/data/ngrams2.csv', header = T)
eng.ngrams = eng.ngrams[,c(1:2)] # first two columns: bigram and raw count (the remaining columns subdivide counts by position)
names(eng.ngrams) = c("bigram", "count")

# add missing bigrams
eng.ngrams = rbind(eng.ngrams, 
                   data.frame(bigram = c("JQ", "QG", "QK", "QY", "QZ", "WQ", "WZ"), 
                              count=c(0,0,0,0,0,0,0)))
eng.ngrams$bigram = as.character(eng.ngrams$bigram)
eng.ngrams$bigram = tolower(eng.ngrams$bigram)

# read.csv parses the literal bigram "NA" as missing; pull out its count here and reinsert it at ["n", "a"] below
NA_count = eng.ngrams[is.na(eng.ngrams$bigram),"count"]
eng.ngramsC = eng.ngrams[!is.na(eng.ngrams$bigram),]

con.table = matrix(0, nrow = 26, ncol = 26, dimnames = list(letters, letters))
# loop over bigrams
for (i in 1:length(eng.ngramsC$bigram)){
  chars = unlist(strsplit(eng.ngramsC$bigram[i], ""))
  con.table[chars[1], chars[2]] = eng.ngramsC$count[i]
}
con.table["n", "a"] = NA_count

# get conditional probabilities by normalizing across rows
eng.bis = prop.table(con.table, 1) 

mse = ddply(DA[DA$duplicate == FALSE,], .(gen, Answer.subSeed), 
            summarise, lls = getMeanBigramProb(guessedLabel, 2, 10, eng.bis))
msse = aggregate(lls ~ gen, data=mse, mean)
msse$cil <-  aggregate(lls ~ gen, data=mse, ci.low)$lls
msse$cih <-  aggregate(lls ~ gen, data=mse, ci.high)$lls

ep = ggplot(msse, aes(x = gen, y = lls)) +
  geom_line() +
  geom_point() +
  geom_pointrange(aes(ymax = lls + cih, ymin= lls-cil), size = 1.2) + 
  ylim(0,.25) +
  scale_x_continuous(breaks=seq(0, numGen, 1)) +
  ylab("Mean bigram probability") +
  xlab("Generation") +
  ggtitle("Mean bigram probability - English Model") +
  theme(text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black')) 
ep

[Figure: Mean bigram probability by generation (English model)]

cor.test(mse$lls, mse$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  mse$lls and mse$gen
## t = 3.341, df = 348, p-value = 0.0009263
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07279 0.27601
## sample estimates:
##    cor 
## 0.1763



CV ratio

getCVRatio <- function (d) {
  # NOTE: intersect() returns unique elements, so this counts consonant and
  # vowel *types* (distinct letters in the word), not tokens
  word = as.character(d)
  consonants = c("b", "c", "d", "f", "g", "h", "j", 
                 "k","l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
  vowels = c("a", "e", "i", "o", "u")
  letters = unlist(strsplit(word, ""))
  num_vowels = length(intersect(vowels, letters))
  num_consonants = length(intersect(consonants, letters))
  
  ratio = num_consonants/num_vowels
  return(ratio)
}
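# e.g., getCVRatio("baa") returns 1 (one consonant type, one vowel type),
# even though the token consonant:vowel ratio is 1:2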

DA$CVratio = unlist(lapply(DA$guessedLabel, function(d) {getCVRatio(d)}))
DA$CVratio = unlist(lapply(DA$CVratio, function(x) replace(x, is.infinite(x), NA))) # vowel-less words give Inf; treat as NA

da_cv = aggregate(CVratio ~ gen, data=DA, mean)
da_cv$cih <- aggregate(CVratio ~ gen, data=DA, ci.high)$CVratio
da_cv$cil <- aggregate(CVratio ~ gen, data=DA, ci.low)$CVratio
 
cvplot = ggplot(da_cv, aes(x = gen, y = CVratio)) + 
  geom_line() +
  geom_point(size = 4) +
  geom_pointrange(aes(ymax = CVratio + cih, ymin=CVratio - cil), size = 1.2) + 
  ylab("CV ratio") +
  xlab("Generation") +
  scale_x_continuous(breaks=seq(0, numGen, 1))  +
    theme(text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black')) 

cvplot

[Figure: CV ratio by generation]

cvss = aggregate(CVratio ~ gen + Answer.subSeed, data=DA, mean)
cor.test(cvss$CVratio, cvss$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  cvss$CVratio and cvss$gen
## t = 1.597, df = 340, p-value = 0.1111
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01992  0.19062
## sample estimates:
##     cor 
## 0.08631
cor.test(da_cv$CVratio, da_cv$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_cv$CVratio and da_cv$gen
## t = 1.961, df = 5, p-value = 0.1072
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1862  0.9438
## sample estimates:
##    cor 
## 0.6593



Accuracy

a_mean <- ddply(DA, .(gen), function (d) {p.fc(d, "correct")})
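# p.fc and p.fc2 come from setup code that is not shown; consistent with how
# they are used here, a minimal sketch (the accuracy column and its "correct"
# coding are assumptions) might be:
#   p.fc <- function(d, level) {
#     k = sum(d$accuracy == level); n = nrow(d)
#     ci = binom.test(k, n)$conf.int # exact binomial CI
#     data.frame(p_correct = k / n, ciwl = ci[1], ciul = ci[2])
#   }
#   p.fc2 <- function(acc) mean(acc == "correct") # scalar variant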

aplot <- ggplot(a_mean, aes(y = p_correct, x = gen)) +
  geom_point() +
  geom_pointrange(aes(ymax = ciul,ymin=ciwl), size = 1.2) + 
  geom_line() +
  scale_x_continuous(breaks=seq(0, numGen, 1)) +
  ylab("Proportion correct") +
  xlab("Generation") +
  ylim(0,1)+
  ggtitle("Mean accuracy") +
    theme(text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black')) 

aplot

[Figure: Mean accuracy by generation]

# relationship between accuracy and gen
a_mean_s <- ddply(DA, .(Answer.subSeed, gen), function (d) {p.fc(d, "correct")})
cor.test(a_mean_s$p_correct, a_mean_s$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  a_mean_s$p_correct and a_mean_s$gen
## t = 9.752, df = 348, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3767 0.5418
## sample estimates:
##    cor 
## 0.4633
# relationship between accuracy and length, unique words, and transitional probability
A1 <- ddply(DA, .(Answer.subSeed, gen), summarise, 
           accuracy = p.fc2(accuracy),
           length = mean(guessedLabelLen),
           numUnique = length(unique(guessedLabel)),
           lls = getMeanBigramProb(guessedLabel, 2, 10))

m1 = lm(scale(accuracy) ~ scale(length) + scale(numUnique) + scale(lls), A1)
m2 = lm(scale(accuracy) ~ scale(length) + scale(lls), A1)
m3 = lm(scale(accuracy) ~ scale(length) + scale(numUnique), A1)

summary(m1) 
## 
## Call:
## lm(formula = scale(accuracy) ~ scale(length) + scale(numUnique) + 
##     scale(lls), data = A1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4028 -0.6781  0.0075  0.6784  2.1860 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.34e-17   4.91e-02    0.00  1.00000    
## scale(length)    -2.13e-01   5.68e-02   -3.75  0.00021 ***
## scale(numUnique) -2.15e-01   7.58e-02   -2.83  0.00488 ** 
## scale(lls)        9.81e-02   8.33e-02    1.18  0.23981    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.918 on 346 degrees of freedom
## Multiple R-squared:  0.165,  Adjusted R-squared:  0.158 
## F-statistic: 22.8 on 3 and 346 DF,  p-value: 1.71e-13
summary(m2)
## 
## Call:
## lm(formula = scale(accuracy) ~ scale(length) + scale(lls), data = A1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4574 -0.7048  0.0072  0.7030  2.2380 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4.65e-17   4.95e-02    0.00   1.0000    
## scale(length) -1.69e-01   5.52e-02   -3.06   0.0024 ** 
## scale(lls)     2.76e-01   5.52e-02    5.00    9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.927 on 347 degrees of freedom
## Multiple R-squared:  0.146,  Adjusted R-squared:  0.141 
## F-statistic: 29.6 on 2 and 347 DF,  p-value: 1.37e-12
summary(m3) # this is the reported model
## 
## Call:
## lm(formula = scale(accuracy) ~ scale(length) + scale(numUnique), 
##     data = A1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3129 -0.6784 -0.0033  0.6624  2.1473 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.24e-18   4.91e-02    0.00        1    
## scale(length)    -2.45e-01   4.98e-02   -4.92  1.3e-06 ***
## scale(numUnique) -2.82e-01   4.98e-02   -5.67  3.0e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.918 on 347 degrees of freedom
## Multiple R-squared:  0.162,  Adjusted R-squared:  0.157 
## F-statistic: 33.5 on 2 and 347 DF,  p-value: 5.12e-14
cor.test(A1$lls, A1$numUnique) # num unique and transitional probability are highly collinear
## 
##  Pearson's product-moment correlation
## 
## data:  A1$lls and A1$numUnique
## t = -20.49, df = 348, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7835 -0.6879
## sample estimates:
##     cor 
## -0.7394



Levenshtein Distance

Normalized

# normalize Levenshtein distance by the length of the longer word
DA_all$long_length = ifelse(DA_all$labelLen > DA_all$guessedLabelLen, 
                            DA_all$labelLen, DA_all$guessedLabelLen )
DA_all$lev.n = DA_all$lev / DA_all$long_length
DA_all$ins.n = DA_all$ins / DA_all$long_length 
DA_all$del.n = DA_all$del / DA_all$long_length 
DA_all$sub.n = DA_all$sub / DA_all$long_length 

da_edits <- melt(DA_all, id.vars = c("index", "gen", "workerid"), 
                 measure.vars = c("lev.n", "ins.n", "sub.n", "del.n"))
ms = aggregate(value ~ gen + variable, data = da_edits, mean)
ms$cil <- aggregate(value ~ gen + variable, data=da_edits, ci.low)$value
ms$cih <- aggregate(value ~ gen + variable, data=da_edits, ci.high)$value

# plotting parameters: heavier line weight for the overall Levenshtein measure
ms$size = ifelse(ms$variable == "lev.n", 1.2, 1)
levels(ms$variable)  = c("Levenshtein distance", "insertions", "substitutions", "deletions")

#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/lev.pdf", width = 6, height = 6)
ggplot(ms, aes(y=value, x=gen, color=variable)) +
  geom_point(size = 3) + 
  geom_line(aes(size = 4*size)) + 
  geom_pointrange(aes(ymax = value+cih, ymin=value-cil)) + 
  scale_size(range=c(0.5, 1.6)) +
  xlab("Generation") +
  ylab("Normalized Edit distance") +
  ggtitle("Levenshtein edit distance - normalized") +
  scale_x_continuous(breaks = seq(0, numGen, 1)) +
  guides(color = guide_legend(title = "Edit metric"), size=FALSE) +
  theme(legend.position=c(1,1),legend.justification=c(1,1.4),
        legend.text = element_text(size = 15),
        text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'))  

[Figure: Levenshtein edit distance by generation, normalized]

#dev.off()
da_lev = da_edits[da_edits$variable == "lev.n",]
cor.test(da_lev$value, da_lev$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_lev$value and da_lev$gen
## t = -18.75, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3339 -0.2734
## sample estimates:
##    cor 
## -0.304
da_del = da_edits[da_edits$variable == "del.n",]
cor.test(da_del$value, da_del$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_del$value and da_del$gen
## t = -10.88, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2140 -0.1495
## sample estimates:
##    cor 
## -0.182
da_ins = da_edits[da_edits$variable == "ins.n",]
cor.test(da_ins$value, da_ins$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_ins$value and da_ins$gen
## t = -4.284, df = 3455, p-value = 1.887e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10577 -0.03945
## sample estimates:
##      cor 
## -0.07269
da_sub= da_edits[da_edits$variable == "sub.n",]
cor.test(da_sub$value, da_sub$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_sub$value and da_sub$gen
## t = -16.41, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2995 -0.2376
## sample estimates:
##     cor 
## -0.2689

Unnormalized

da_edits <- melt(DA_all, id.vars = c("index", "gen", "workerid"),
                 measure.vars = c("lev", "ins", "sub", "del"))
ms = aggregate(value ~ gen + variable, data = da_edits, mean)
ms$cil <- aggregate(value ~ gen + variable, data=da_edits, ci.low)$value
ms$cih <- aggregate(value ~ gen + variable, data=da_edits, ci.high)$value

# plotting parameters: heavier line weight for the overall Levenshtein measure
ms$size = ifelse(ms$variable == "lev", 1.2, 1)
levels(ms$variable)  = c("Levenshtein distance", "insertions", "substitutions", "deletions")

#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/lev.pdf", width = 6, height = 6)
ggplot(ms, aes(y=value, x=gen, color=variable)) +
  geom_point(size = 3) + 
  geom_line(aes(size = 4*size)) + 
  geom_pointrange(aes(ymax = value+cih, ymin=value-cil)) + 
  scale_size(range=c(0.5, 1.6)) +
  xlab("Generation") +
  ylab("Number of edits") +
  ggtitle("Levenshtein edit distance - unnormalized") +
  scale_x_continuous(breaks=seq(0, numGen, 1)) +
  guides(color = guide_legend(title = "Edit metric"), size=FALSE) +
  theme(legend.position=c(1,1),legend.justification=c(1,1.4),
        legend.text = element_text(size = 15),
        text = element_text(size=20),
        plot.title=element_text(size=20, face = "bold"),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'))  

[Figure: Levenshtein edit distance by generation, unnormalized]

#dev.off()
da_lev = da_edits[da_edits$variable == "lev",]
cor.test(da_lev$value, da_lev$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_lev$value and da_lev$gen
## t = -22.82, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3905 -0.3325
## sample estimates:
##     cor 
## -0.3619
da_del = da_edits[da_edits$variable == "del",]
cor.test(da_del$value, da_del$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_del$value and da_del$gen
## t = -12.98, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2472 -0.1836
## sample estimates:
##     cor 
## -0.2157
da_ins = da_edits[da_edits$variable == "ins",]
cor.test(da_ins$value, da_ins$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_ins$value and da_ins$gen
## t = -6.532, df = 3455, p-value = 7.456e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.14325 -0.07739
## sample estimates:
##     cor 
## -0.1104
da_sub= da_edits[da_edits$variable == "sub",]
cor.test(da_sub$value, da_sub$gen)
## 
##  Pearson's product-moment correlation
## 
## data:  da_sub$value and da_sub$gen
## t = -21.13, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3675 -0.3084
## sample estimates:
##     cor 
## -0.3383




Analysis #2: Complexity bias

Complexity bias with guessed label length

getCorString <- function(d, complex, length) {
  ci = which(names(d) == complex)
  li = which(names(d) == length)
  m = round(cor(d[,ci], d[,li]),2)
  string = paste("r = ", m, sep = "")
  return(string)
}
DA$quintile2 = DA$quintile/5 # rescale quintiles 1-5 onto (0,1] to align with the c.norms axis

# get label length by quintile for each generation
MS = aggregate(guessedLabelLen ~ quintile2 + gen, data=DA, mean)
MS$cil <- aggregate(guessedLabelLen ~ quintile2 + gen, data=DA, ci.low)$guessedLabelLen
MS$cih <- aggregate(guessedLabelLen ~ quintile2 + gen, data=DA, ci.high)$guessedLabelLen

# get correlation strings for each generation (computed for reference; not drawn on the plot below)
cb_r = ddply(DA, .(gen), function (k) {lab = getCorString(k, "c.norms", "guessedLabelLen")})
cb_r$gen = 1:7
names(cb_r)[which(names(cb_r) == "V1")] = "lab"

ggplot(MS, aes(x=quintile2, y=guessedLabelLen)) +
  geom_smooth(method = "lm", aes(x=c.norms, y=guessedLabelLen), data = DA, 
                position = "identity", size = 1.5, alpha = .2) +
  geom_point(size = 3) + 
  xlab("Complexity quintile") +
  ylab("Guessed label length (chars.)") +
  ggtitle("Complexity bias across generations") +
  facet_grid( ~ gen) +
  theme(text = element_text(size=20),
        plot.title=element_text(size=20, vjust=2),
        axis.title.y=element_text(vjust=0.2),
        axis.title.x=element_text(vjust=-0.2),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'), 
        strip.background = element_rect(colour="white"),
        strip.text.x = element_text( size = 20)) +
    scale_x_continuous(breaks=c(.2,.4,.6,.8, 1), 
                            labels=c(1:5)) +
    scale_y_continuous(breaks=c(5,6,7))

[Figure: Complexity bias across generations (guessed label length)]

Complexity bias with CCR

DA$quintile2 = DA$quintile/5 - .08 # rescale quintiles and shift left so bin centers match the axis breaks below
# get label length by quintile for each generation
MSC = aggregate(removedCharactersC ~ quintile2 + gen, data=DA, mean)
MSC$cil <- aggregate(removedCharactersC ~ quintile2 + gen, data=DA, ci.low)$removedCharactersC
MSC$cih <- aggregate(removedCharactersC ~ quintile2 + gen, data=DA, ci.high)$removedCharactersC


#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/complexity_bias.pdf", width = 16, height = 6)
ggplot(MSC, aes(x=quintile2, y=removedCharactersC)) +
  geom_smooth(method = "lm", aes(x=c.norms, y=removedCharactersC), data = DA, 
                position = "identity", size = 1.5, alpha = .2) +
  geom_pointrange(aes(ymax = removedCharactersC + cih, ymin=removedCharactersC - cil), size = .7) + 
  xlab("Complexity quintile") +
  ylab("Cumulative characters removed") +
  ggtitle("Complexity bias across generations") +
  facet_grid( ~ gen) +
  theme(text = element_text(size=20),
        plot.title=element_text(size=20, vjust=2, face="bold"),
        axis.title.y=element_text(vjust=0.2),
        axis.title.x=element_text(vjust=-0.2),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'), 
        strip.background = element_rect(colour="white"),
        strip.text.x = element_text( size = 20)) +
  scale_x_continuous(breaks=c(0.12, 0.32, 0.52, 0.72, 0.92), lim = c(0,1),labels=c(1:5))

[Figure: Complexity bias across generations (cumulative characters removed)]

#dev.off()

# # generation 1 only plot
# 
# #pdf("/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Talks/Prospectives/E6a.pdf", width = 6, height = 6)
# ggplot(MSC[MSC$gen == 1,], aes(x=quintile2, y=removedCharactersC)) +
#   geom_smooth(method = "lm", aes(x=c.norms, y=removedCharactersC), data = DA[DA$gen ==1,], 
#                 position = "identity", size = 1.5, alpha = .2) +
#   geom_pointrange(aes(ymax = removedCharactersC + cih, ymin=removedCharactersC - cil), size = .7) + 
#   xlab("Complexity quintile") +
#   ylab("Number characters removed") +
#   theme(text = element_text(size=20),
#         plot.title=element_text(size=20, vjust=2, face="bold"),
#         axis.title.y=element_text(vjust=0.2),
#         axis.title.x=element_text(vjust=-0.2),
#         plot.background = element_blank(),
#         panel.grid.major = element_blank(),
#         panel.grid.minor = element_blank(),
#         panel.border = element_blank(),
#         axis.line = element_line(color = 'black'), 
#         strip.background = element_rect(colour="white"),
#         strip.text.x = element_text( size = 20)) +
#   scale_x_continuous(breaks=c(0.12, 0.32, 0.52, 0.72, 0.92), lim = c(0,1),labels=c(1:5))
# #dev.off()



Empirical distributions

Get p-values from the empirical r distribution

getEmpiricalr.labelLen <- function(d, i) {
  # boot() supplies resampled row indices i; indexing only labelLen while
  # c.norms stays fixed breaks the pairing, yielding a null distribution for r
  permLabelLen = d$labelLen[i] 
  corr(cbind(d$c.norms, permLabelLen))
}

getEmpiricalr.CR <- function(d, i) {
  permCR = d$removedCharactersC[i] 
  corr(cbind(d$c.norms, permCR))
}

getEmpiricalp <- function(d, measure) {
  if (measure == "labelLen") {
    m = boot(d, getEmpiricalr.labelLen, R = 10000)
    r_veridical = corr(cbind(d$c.norms, d$labelLen))
  } else if (measure == "removedCharactersC") { 
    m = boot(d, getEmpiricalr.CR, R = 10000)
    r_veridical = corr(cbind(d$c.norms, d$removedCharactersC))
  }
  cdf_func = ecdf(m$t) # empirical CDF of the null r distribution
  p = 1-cdf_func(abs(r_veridical)) # one-tailed p: proportion of null rs above |r|
  return(p)
}

# get dataframe with generation 0
DA.e = DA[,c("labelLen", "removedCharactersC", "gen", "c.norms")]
gen_zero = preProcess(1, rto_norms, co_norms)
g0 = data.frame(labelLen = gen_zero$G0LabelLen, gen = 0, 
                removedCharactersC = NA, c.norms = gen_zero$c.norms) 
DA.e = rbind(DA.e, g0) 

Label Length

# labelLen rs
nsamps = 10000
M = data.frame(sample = NA, gen = NA) # seed row; dropped below via M[-1,]
m = matrix(nrow = nsamps, ncol = 2)
for (i in 0:7) {
  d = DA.e[DA.e$gen == i,]
  samps = boot(d, getEmpiricalr.labelLen, R = nsamps)
  m[,1] = samps$t
  m[,2] = i
  colnames(m) = c("sample", "gen")
  M = rbind(M, m)
}

# plot r distribution
ggplot(M[-1,], aes(x = sample)) + 
  geom_histogram(fill =  "gray") + 
  facet_grid(~ gen) +
    theme(axis.title.y=element_text(vjust=0.2),
        axis.title.x=element_text(vjust=-0.2),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'), 
        strip.background = element_rect(colour="white")) +
  geom_vline(xintercept = 0, color = "red") +
    ggtitle("r distribution for label length")

[Figure: Empirical r distributions for label length, by generation]

# labelLen ps 
ps = ddply(DA.e, .(gen), function (d) {getEmpiricalp(d, "labelLen")})
rs = ddply(DA.e, .(gen), function (d) {k = corr(cbind(d$c.norms,d$labelLen))})
rs$estimate = "r"
ps$estimate = "empirical_p"
es = rbind(ps,rs)
names(es)[2] = "value"

# plot ps and veridical rs
ggplot(es, aes(x = gen, y = value, group = estimate, color = estimate)) +
geom_line() +
theme(axis.title.y=element_text(vjust=0.2),
        axis.title.x=element_text(vjust=-0.2),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'), 
        strip.background = element_rect(colour="white")) +
      scale_x_continuous(breaks=seq(0, numGen, 1)) +
      geom_hline(yintercept = .05, color = "grey") +
ggtitle("correlation between length and complexity norms \nand empirical p-value")

[Figure: r (label length vs. complexity norms) and empirical p-value, by generation]

# collapsing across all generations
getEmpiricalp(DA.e[DA.e$gen>0,], "labelLen") 
## [1] 0.0162

CCR

# CCR rs
nsamps = 10000
M = data.frame(sample = NA, gen = NA) # seed row; dropped below via M[-1,]
m = matrix(nrow = nsamps, ncol = 2)
for (i in 1:7) {
  d = DA.e[DA.e$gen == i,]
  samps = boot(d, getEmpiricalr.CR, R = nsamps)
  m[,1] = samps$t
  m[,2] = i
  colnames(m) = c("sample", "gen")
  M = rbind(M, m)
}

# plot empirical r distributions
ggplot(M[-1,], aes(x = sample)) + 
  geom_histogram(fill =  "gray") + 
  facet_grid(~ gen) +
    theme(axis.title.y=element_text(vjust=0.2),
        axis.title.x=element_text(vjust=-0.2),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'), 
        strip.background = element_rect(colour="white")) +
  geom_vline(xintercept = 0, color = "red") +
  ggtitle("CCR empirical r distribution")

[Figure: CCR empirical r distributions, by generation]

# CCR ps
ps = ddply(DA.e[DA.e$gen > 0,], .(gen), function (d) {getEmpiricalp(d, "removedCharactersC")})
rs = ddply(DA.e[DA.e$gen > 0,], .(gen), function (d) {corr(cbind(d$c.norms,d$removedCharactersC))})
rs$estimate = "r"
ps$estimate = "empirical_p"
es = rbind(ps, rs)
names(es)[2] = "value"

ggplot(es, aes(x = gen, y = value, group = estimate, color = estimate)) +
geom_line() +
xlab("generation") +
theme(text = element_text(size=15),
        plot.title=element_text(size=15, vjust=2),
        axis.title.y=element_text(vjust=0.2),
        axis.title.x=element_text(vjust=-0.2),
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = 'black'), 
        strip.background = element_rect(colour="white")) +
        scale_y_continuous(breaks=seq(-.2, .1, .05)) +
        scale_x_continuous(breaks=seq(1, numGen, 1)) +
        geom_hline(yintercept = .05, color = "grey") +
ggtitle("r (cumulative chars. removed vs. complexity norms) \n and associated empirical p-value")

[Figure: r (CCR vs. complexity norms) and empirical p-value, by generation]

# collapsing across all generations (an empirical p of exactly 0 means no null sample exceeded |r|, i.e., p < 1/10000)
p = getEmpiricalp(DA.e[DA.e$gen>0,], "removedCharactersC") 
sprintf("%.100f",p)
## [1] "0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"



Change in bias across generations

# mean complexity bias across generations
cor.test(DA$c.norms, DA$guessedLabelLen)
## 
##  Pearson's product-moment correlation
## 
## data:  DA$c.norms and DA$guessedLabelLen
## t = 3.165, df = 3455, p-value = 0.001562
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02047 0.08696
## sample estimates:
##     cor 
## 0.05377
cor.test(DA$c.norms, DA$removedCharactersC)
## 
##  Pearson's product-moment correlation
## 
## data:  DA$c.norms and DA$removedCharactersC
## t = -6.445, df = 3455, p-value = 1.312e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.14182 -0.07594
## sample estimates:
##    cor 
## -0.109
# averaging across chains
# coefsS returns standardized betas, which equal Pearson's r for a single predictor;
# used here rather than cor() or scale() with coef()
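# a minimal equivalent (a sketch; the real helper comes from setup code not shown):
#   coefsS <- function(x, y) coef(lm(scale(y) ~ scale(x)))[2]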

# averaging across chains, the bias at each generation
ms <-  DA %>% 
  group_by(gen) %>%
   summarise(beta = coefsS(c.norms, guessedLabelLen), 
             accuracy = p.fc2(accuracy), 
             var_length = sd(guessedLabelLen),
             guessedLabelLen = mean(guessedLabelLen),
             removedCharactersC  = mean(removedCharactersC), 
             removedCharacters = mean(removedCharacters))

# variability in bias  across chains
msss <-  DA %>% 
  group_by(Answer.subSeed) %>%
   summarise(beta = coefsS(c.norms, guessedLabelLen), 
             accuracy = p.fc2(accuracy), 
             var_length = sd(guessedLabelLen),
             guessedLabelLen = mean(guessedLabelLen),
             removedCharactersC  = mean(removedCharactersC), 
             removedCharacters = mean(removedCharacters))

mean(msss$beta)
## [1] 0.04547
sd(msss$beta)
## [1] 0.2652
# variability in decrease across chains
mss <-  DA %>% 
  group_by(gen, Answer.subSeed) %>%
   summarise(beta = coefsS(c.norms, guessedLabelLen), 
             accuracy = p.fc2(accuracy), 
             var_length = sd(guessedLabelLen),
             guessedLabelLen = mean(guessedLabelLen),
             removedCharactersC  = mean(removedCharactersC), 
             removedCharacters = mean(removedCharacters))

crs = ddply(mss, .(Answer.subSeed), summarise, change_r = cor(gen, beta))
mean(crs$change_r, na.rm = T)
## [1] 0.004277
sd(crs$change_r, na.rm = T)
## [1] 0.6896
# trying to quantify the decrease in bias
cor.test(mss$beta, mss$gen) # continuously
## 
##  Pearson's product-moment correlation
## 
## data:  mss$beta and mss$gen
## t = -0.5642, df = 345, p-value = 0.573
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.13521  0.07516
## sample estimates:
##      cor 
## -0.03036
mss = as.data.frame(mss)  # first vs. last generation
mssC = mss[mss$Answer.subSeed != 43 & mss$Answer.subSeed != 45,] # exclude chains 43 and 45 so generation-1 and generation-7 betas pair up
t.test(mssC[mssC$gen == 1,"beta"], mssC[mssC$gen == 7,"beta"], paired = TRUE)
## 
##  Paired t-test
## 
## data:  mssC[mssC$gen == 1, "beta"] and mssC[mssC$gen == 7, "beta"]
## t = 0.7297, df = 47, p-value = 0.4692
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.06955  0.14871
## sample estimates:
## mean of the differences 
##                 0.03958
# mean complexity bias across generations with lev
cor.test(DA$c.norms, DA$lev)
## 
##  Pearson's product-moment correlation
## 
## data:  DA$c.norms and DA$lev
## t = 2.669, df = 3455, p-value = 0.007644
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01204 0.07858
## sample estimates:
##     cor 
## 0.04536



Confusion matrix (relationship between word length and meaning complexity)
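
The confusion matrix itself is not reproduced here. A hypothetical sketch of how it could be tabulated, cross-classifying binned guessed-label lengths against the complexity quintiles:

table(length_bin = cut(DA$guessedLabelLen, 5, labels = 1:5),
      complexity = DA$quintile)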




Analysis #3: Complexity bias and accuracy

# aggregate over generations for each chain (accuracy and bias)
# get complexity bias at each chain - CCR

ms <-  DA %>% 
  group_by(gen, Answer.subSeed) %>%
   summarise(rs = coefsS(c.norms, removedCharactersC), 
             accuracy = p.fc2(accuracy), 
             lev = mean(lev))

mss <- ms %>% 
  group_by(Answer.subSeed) %>%
   summarise(CB_beta = coefsS(gen, rs), 
             accuracy = mean(accuracy),
             lev = mean(lev))

cor.test(mss$CB_beta, mss$accuracy)
## 
##  Pearson's product-moment correlation
## 
## data:  mss$CB_beta and mss$accuracy
## t = -2.798, df = 48, p-value = 0.007385
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5912 -0.1073
## sample estimates:
##     cor 
## -0.3744
cor.test(mss$CB_beta, mss$lev)
## 
##  Pearson's product-moment correlation
## 
## data:  mss$CB_beta and mss$lev
## t = 3.227, df = 48, p-value = 0.00226
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1630 0.6269
## sample estimates:
##    cor 
## 0.4222
#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/change_plot.pdf", width = 7, height = 6)

# plot change in complexity bias against mean edit distance
qplot(lev,CB_beta, position=position_dodge(),
      data=mss, geom="point", 
      ylab="Change in complexity bias \nacross generations (Pearson's r)", 
      xlab="Mean Levenshtein edit distance") +
      geom_point(position=position_dodge(.9)) +
      geom_smooth(method = "lm", size = 1.3, alpha = .2) +
      ggtitle("Complexity bias change vs. Edit distance") +
      ylim(-1,1.25) +
      annotate("text", x=4, y=-.4, color = "red", size = 7,
           label=paste("r=", round(cor(mss$lev, mss$CB_beta), 2))) +
      theme(text = element_text(size=20),
          plot.title=element_text(size=20, face = "bold"),
          plot.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          axis.line = element_line(color = 'black'))

[Figure: Complexity bias change vs. edit distance]

#dev.off()
## get complexity bias at each chain - LENGTH
ms <-  DA %>% 
  group_by(gen, Answer.subSeed) %>%
   summarise(rs = coefsS(c.norms, guessedLabelLen), 
             accuracy = p.fc2(accuracy), 
             lev = mean(lev))
mss <- ms %>% 
  group_by(Answer.subSeed) %>%
   summarise(CB_beta = coefsS(gen, rs), 
             accuracy = mean(accuracy),
             lev = mean(lev))

cor.test(mss$CB_beta, mss$accuracy)
## 
##  Pearson's product-moment correlation
## 
## data:  mss$CB_beta and mss$accuracy
## t = 2.101, df = 48, p-value = 0.04091
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01291 0.52607
## sample estimates:
##    cor 
## 0.2902
cor.test(mss$CB_beta, mss$lev)
## 
##  Pearson's product-moment correlation
## 
## data:  mss$CB_beta and mss$lev
## t = -1.818, df = 48, p-value = 0.07537
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.49700  0.02645
## sample estimates:
##     cor 
## -0.2538