ANALYSES
       1: Word forms
             individual chain
             label length
             unique words
             transitional probability
             CV ratio
             accuracy
             levenshtein edit distance
       2: Complexity bias
             label length
             cumulative characters removed (CCR)
             empirical distributions
             change in bias across generations
             confusion matrix
       3: Relationship between word forms and complexity bias
## [1] "Chain 9"
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "dunobax" "bipag" "nid" "nimimog" "damitobup" "nagir" "gan"
## [2,] "dunobug" "bipag" "nid" "nimimog" "nilobup" "niger" "gan"
## [3,] "dunbug" "bippenbog" "nid" "nimobop" "nilobup" "niger" "gan"
## [4,] "dabop" "buttenbug" "nid" "nimobop" "nilobop" "niger" "gar"
## [5,] "dabop" "buttenbop" "nid" "nimbobop" "nilobop" "niger" "gar"
## [6,] "dabop" "bittenbop" "nir" "nilobop" "nilop" "niger" "dir"
## [7,] "dabop" "bittenbop" "nir" "nilop" "nilop" "niger" "dir"
## [8,] "dabog" "bittenbop" "hir" "nilop" "nilop" "niger" "dir"
## [,8] [,9] [,10]
## [1,] "gimunobugup" "mikupudax" "daganitobip"
## [2,] "runtunbug" "bipoxtog" "dipentag"
## [3,] "runtunbug" "ripenbog" "dipentag"
## [4,] "rittenbob" "rudentag" "dertag"
## [5,] "bittenbob" "rittenbog" "dertag"
## [6,] "girbop" "dirbop" "rittenbog"
## [7,] "garbog" "dabog" "rittenbog"
## [8,] "garbop" "dabog" "rottenbog"
# get length dataframe that includes input language
gen_one = preProcess(1, rto_norms, co_norms)
len = data.frame(length = DA$guessedLabelLen, gen = DA$gen)
g1 = data.frame(length = gen_one$labelLen, gen = 0)
len = rbind(len, g1)
len$gen = as.factor(len$gen)
# aggregate over generations
ms = aggregate(length ~ gen, data=len, mean)
ms$cil <- aggregate(length ~ gen, data=len, ci.low)$length
ms$cih <- aggregate(length ~ gen, data=len, ci.high)$length
ms$gen = as.numeric(as.character(ms$gen))
#plot
lplot <- ggplot(ms, aes(y = length, x = gen)) +
geom_point() +
geom_pointrange(aes(ymax = length + cih, ymin=length - cil), size = 1.2) +
geom_line() +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
ylim(1,8)+
geom_abline(intercept = 7, slope = 0, color = "grey", lty = 2) +
ylab("Mean length") +
xlab("Generation") +
ggtitle("Mean length") +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
lplot
cor.test(len$length, as.numeric(as.character(len$gen)))
##
## Pearson's product-moment correlation
##
## data: len$length and as.numeric(as.character(len$gen))
## t = -14.17, df = 3955, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2493 -0.1900
## sample estimates:
## cor
## -0.2199
da_u <- ddply(DA, .(gen, Answer.subSeed), summarise, numUnique = length(unique(guessedLabel)))
da_ul = aggregate(numUnique ~ gen, data=da_u, mean)
da_ul$cih <- aggregate(numUnique ~ gen, data=da_u, ci.high)$numUnique
da_ul$cil <- aggregate(numUnique ~ gen, data=da_u, ci.low)$numUnique
da_ul[8,]= c(0, 10, 0, 0) # add zero generation data
uplot = ggplot(da_ul, aes(x = gen, y = numUnique)) +
geom_line(position="dodge") +
geom_point(size = 4) +
geom_pointrange(aes(ymax = numUnique + cih, ymin=numUnique - cil), size = 1.2) +
ylab("Number of unique words") +
geom_abline(intercept = 10, slope = 0, color = "grey", lty = 2) +
scale_y_continuous(limits=c(1, 12), breaks=seq(1, 12, 2)) +
xlab("Generation") +
ggtitle("Number of unique words") +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
uplot
cor.test(da_u$numUnique, da_u$gen)
##
## Pearson's product-moment correlation
##
## data: da_u$numUnique and da_u$gen
## t = -6.925, df = 348, p-value = 2.116e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4369 -0.2524
## sample estimates:
## cor
## -0.348
getMeanBigramProb <- function(w, n, num_words, p.bis) {
w = as.character(w)
m = as.matrix(textcnt(w, n = n))
# get bigram model
if (missing(p.bis)){ # get language model
# get bigram conditional probabilities
check_bigram = lapply(rownames(m), nchar) == n
check_space = unlist(lapply(rownames(m), function (x) {!grepl("_", x)}))
bis = m[check_bigram & check_space,] # get bigrams with no space characters only
# make contingency table to get prob(a | b)
con.table = matrix(0, nrow = length(letters),
ncol = length(letters),
dimnames = list(letters, letters))
# loop over bigrams
for (i in 1:length(bis)){
chars = unlist(strsplit(names(bis)[i], ""))
con.table[chars[1], chars[2]] = con.table[chars[1], chars[2]] + bis[i]
}
# get conditional probabilities by normalizing across rows
p.bis = prop.table(con.table, 1)
# THIS IS WHERE I SHOULD SMOOTH
}
# Now, get mean across words of mean bigram probability of each word
ll_w = matrix(NA, ncol = num_words) # make matrix for mean prob of bigrams of word
for (i in 1:length(w)){
current_word = w[i]
# make into bigrams
ww = as.matrix(textcnt(current_word, n = n))
check_bigram = lapply(rownames(ww), nchar) == n
check_space = unlist(lapply(rownames(ww), function (x) {!grepl("_", x)}))
ww_b = ww[check_bigram & check_space,] # get bigrams with no space characters only
# loop over bigrams
if (length(ww_b) > 0){ # gets rid of single letter words
ll = matrix(0, nrow = length(ww_b), 1) # make matrix for prob of each bigram in the word
for (j in 1:length(ww_b)[1]) {
bi = unlist(strsplit(names(ww_b)[j], ""))
ll[j] = p.bis[bi[1], bi[2]]
}
ll_w[i]= mean(ll) # get mean bigram probability
}
}
return(mean(ll_w, na.rm = T)) # get mean across all words in chain
}
DA = ddply(DA, .(gen, Answer.subSeed), transform, duplicate = duplicated(guessedLabel)) # add duplicate row column
b.DA = DA[,c("gen", "Answer.subSeed", "guessedLabel", "duplicate")]
gen_one = preProcess(1, rto_norms, co_norms)
g1 = data.frame(Answer.subSeed = gen_one$Answer.subSeed,
guessedLabel = gen_one$guessedLabel, duplicate = FALSE, gen = 0) # add g1
b.DA = rbind(b.DA, g1)
## LANGUAGE EMPIRICAL MODEL
ms <- ddply(b.DA[b.DA$duplicate == FALSE,], .(gen, Answer.subSeed),
summarise, lls = getMeanBigramProb(guessedLabel, 2, 10))
mss <- aggregate(lls ~ gen, data=ms, mean)
mss$cil <- aggregate(lls ~ gen, data=ms, ci.low)$lls
mss$cih <-aggregate(lls ~ gen, data=ms, ci.high)$lls
bplot = ggplot(mss, aes(x = gen, y = lls)) +
geom_line() +
geom_point() +
geom_pointrange(aes(ymax = lls + cih, ymin = lls - cil), size = 1.2) +
ylim(0,1) +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
ylab("Mean transitional probability") +
xlab("Generation") +
ggtitle("Mean transitional probability") +
geom_abline(intercept = mss[1, "lls"], slope = 0, color = "grey", lty = 2) +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
bplot
cor.test(ms$lls, ms$gen)
##
## Pearson's product-moment correlation
##
## data: ms$lls and ms$gen
## t = 12.12, df = 398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4437 0.5873
## sample estimates:
## cor
## 0.5192
## get english bigrams
eng.ngrams = read.csv('/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/data/ngrams2.csv', header = T)
eng.ngrams = eng.ngrams[,c(1:2)] # the first row raw counts, the rest is subdivided by position
names(eng.ngrams) = c("bigram", "count")
# add missing bigrams
eng.ngrams = rbind(eng.ngrams,
data.frame(bigram = c("JQ", "QG", "QK", "QY", "QZ", "WQ", "WZ"),
count=c(0,0,0,0,0,0,0)))
eng.ngrams$bigram = as.character(eng.ngrams$bigram)
eng.ngrams$bigram = tolower(eng.ngrams$bigram)
# take out 'na' bigram and add it back in later
NA_count = eng.ngrams[is.na(eng.ngrams$bigram),"count"]
eng.ngramsC = eng.ngrams[!is.na(eng.ngrams$bigram),]
con.table = matrix(0, nrow = 26, ncol = 26, dimnames = list(letters, letters))
# loop over bigrams
for (i in 1:length(eng.ngramsC$bigram)){
chars = unlist(strsplit(eng.ngramsC$bigram[i], ""))
con.table[chars[1], chars[2]] = eng.ngramsC$count[i]
}
con.table["n", "a"] = NA_count
# get conditional probabilities by normalizing across rows
eng.bis = prop.table(con.table, 1)
mse = ddply(DA[DA$duplicate == FALSE,], .(gen, Answer.subSeed),
summarise, lls = getMeanBigramProb(guessedLabel, 2, 10, eng.bis))
msse = aggregate(lls ~ gen, data=mse, mean)
msse$cil <- aggregate(lls ~ gen, data=mse, ci.low)$lls
msse$cih <- aggregate(lls ~ gen, data=mse, ci.high)$lls
ep = ggplot(msse, aes(x = gen, y = lls)) +
geom_line() +
geom_point() +
geom_pointrange(aes(ymax = lls + cih, ymin= lls-cil), size = 1.2) +
ylim(0,.25) +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
ylab("Mean bigram probability") +
xlab("Generation") +
ggtitle("Mean bigram probability - English Model") +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
ep
cor.test(mse$lls, mse$gen)
##
## Pearson's product-moment correlation
##
## data: mse$lls and mse$gen
## t = 3.341, df = 348, p-value = 0.0009263
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07279 0.27601
## sample estimates:
## cor
## 0.1763
getCVRatio <- function (d) {
word = as.character(d)
consonants = c("b", "c", "d", "f", "g", "h", "j",
"k","l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
vowels = c("a", "e", "i", "o", "u")
letters = unlist(strsplit(word, ""))
num_vowels = length(intersect(vowels, letters))
num_consonants = length(intersect(consonants, letters))
ratio = num_consonants/num_vowels
return(ratio)
}
DA$CVratio = unlist(lapply(DA$guessedLabel, function(d) {getCVRatio(d)}))
DA$CVratio = unlist(lapply(DA$CVratio, function(x) replace(x, is.infinite(x),NA)))
da_cv = aggregate(CVratio ~ gen, data=DA, mean)
da_cv$cih <- aggregate(CVratio ~ gen, data=DA, ci.high)$CVratio
da_cv$cil <- aggregate(CVratio ~ gen, data=DA, ci.low)$CVratio
cvplot = ggplot(da_cv, aes(x = gen, y = CVratio)) +
geom_line(position="dodge") +
geom_point(size = 4) +
geom_pointrange(aes(ymax = CVratio + cih, ymin=CVratio - cil), size = 1.2) +
ylab("CV ratio") +
xlab("Generation") +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
cvplot
cvss = aggregate(CVratio ~ gen + Answer.subSeed, data=DA, mean)
cor.test(cvss$CVratio, cvss$gen)
##
## Pearson's product-moment correlation
##
## data: cvss$CVratio and cvss$gen
## t = 1.597, df = 340, p-value = 0.1111
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01992 0.19062
## sample estimates:
## cor
## 0.08631
cor.test(da_cv$CVratio, da_cv$gen)
##
## Pearson's product-moment correlation
##
## data: da_cv$CVratio and da_cv$gen
## t = 1.961, df = 5, p-value = 0.1072
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1862 0.9438
## sample estimates:
## cor
## 0.6593
a_mean <- ddply(DA, .(gen), function (d) {p.fc(d, "correct")})
aplot <- ggplot(a_mean, aes(y = p_correct, x = gen)) +
geom_point() +
geom_pointrange(aes(ymax = ciul,ymin=ciwl), size = 1.2) +
geom_line() +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
ylab("Proportion correct") +
xlab("Generation") +
ylim(0,1)+
ggtitle("Mean accuracy") +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
aplot
# relationship between accuracy and gen
a_mean_s <- ddply(DA, .(Answer.subSeed, gen), function (d) {p.fc(d, "correct")})
cor.test(a_mean_s$p_correct, a_mean_s$gen)
##
## Pearson's product-moment correlation
##
## data: a_mean_s$p_correct and a_mean_s$gen
## t = 9.752, df = 348, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3767 0.5418
## sample estimates:
## cor
## 0.4633
# relationship between accuracy and length and unique
A1 <- ddply(DA, .(Answer.subSeed, gen), summarise,
accuracy = p.fc2(accuracy),
length = mean(guessedLabelLen),
numUnique = length(unique(guessedLabel)),
lls = getMeanBigramProb(guessedLabel, 2, 10))
m1 = lm(scale(accuracy) ~ scale(length) + scale(numUnique) + scale(lls), A1)
m2 = lm(scale(accuracy) ~ scale(length) + scale(lls), A1)
m3 = lm(scale(accuracy) ~ scale(length) + scale(numUnique), A1)
summary(m1)
##
## Call:
## lm(formula = scale(accuracy) ~ scale(length) + scale(numUnique) +
## scale(lls), data = A1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4028 -0.6781 0.0075 0.6784 2.1860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.34e-17 4.91e-02 0.00 1.00000
## scale(length) -2.13e-01 5.68e-02 -3.75 0.00021 ***
## scale(numUnique) -2.15e-01 7.58e-02 -2.83 0.00488 **
## scale(lls) 9.81e-02 8.33e-02 1.18 0.23981
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.918 on 346 degrees of freedom
## Multiple R-squared: 0.165, Adjusted R-squared: 0.158
## F-statistic: 22.8 on 3 and 346 DF, p-value: 1.71e-13
summary(m2)
##
## Call:
## lm(formula = scale(accuracy) ~ scale(length) + scale(lls), data = A1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4574 -0.7048 0.0072 0.7030 2.2380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.65e-17 4.95e-02 0.00 1.0000
## scale(length) -1.69e-01 5.52e-02 -3.06 0.0024 **
## scale(lls) 2.76e-01 5.52e-02 5.00 9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.927 on 347 degrees of freedom
## Multiple R-squared: 0.146, Adjusted R-squared: 0.141
## F-statistic: 29.6 on 2 and 347 DF, p-value: 1.37e-12
summary(m3) # this is the reported model
##
## Call:
## lm(formula = scale(accuracy) ~ scale(length) + scale(numUnique),
## data = A1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3129 -0.6784 -0.0033 0.6624 2.1473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.24e-18 4.91e-02 0.00 1
## scale(length) -2.45e-01 4.98e-02 -4.92 1.3e-06 ***
## scale(numUnique) -2.82e-01 4.98e-02 -5.67 3.0e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.918 on 347 degrees of freedom
## Multiple R-squared: 0.162, Adjusted R-squared: 0.157
## F-statistic: 33.5 on 2 and 347 DF, p-value: 5.12e-14
cor.test(A1$lls, A1$numUnique) #num unique and transitional are highly colinear
##
## Pearson's product-moment correlation
##
## data: A1$lls and A1$numUnique
## t = -20.49, df = 348, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7835 -0.6879
## sample estimates:
## cor
## -0.7394
Normalized
# normalize Levenshetein distance by length of longer word
DA_all$long_length = ifelse(DA_all$labelLen > DA_all$guessedLabelLen,
DA_all$labelLen, DA_all$guessedLabelLen )
DA_all$lev.n = DA_all$lev / DA_all$long_length
DA_all$ins.n = DA_all$ins / DA_all$long_length
DA_all$del.n = DA_all$del / DA_all$long_length
DA_all$sub.n = DA_all$sub / DA_all$long_length
da_edits <- melt(DA_all, id.vars = c("index", "gen", "workerid"),
measure.vars = c("lev.n", "ins.n", "sub.n", "del.n"))
ms = aggregate(value ~ gen + variable, data = da_edits, mean)
ms$cil <- aggregate(value ~ gen + variable, data=da_edits, ci.low)$value
ms$cih <- aggregate(value ~ gen + variable, data=da_edits, ci.high)$value
# stuff for plotting
ms$size = ifelse(ms$variable == "lev.n", 1.2, 1)
levels(ms$variable) = c("Levenshtein distance", "insertions", "substitutions", "deletions")
#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/lev.pdf", width = 6, height = 6)
ggplot(ms, aes(y=value, x=gen, color=variable)) +
geom_point(size = 3) +
geom_line(aes(size = 4*size)) +
geom_pointrange(aes(ymax = value+cih, ymin=value-cil)) +
scale_size(range=c(0.5, 1.6)) +
xlab("Generation") +
ylab("Normalized Edit distance") +
ggtitle("Levenshtein edit distance - normalized") +
scale_x_continuous(breaks = seq(0, numGen, 1)) +
guides(color = guide_legend(title = "Edit metric"), size=FALSE) +
theme(legend.position=c(1,1),legend.justification=c(1,1.4),
legend.text = element_text(size = 15),
text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
#dev.off()
da_lev = da_edits[da_edits$variable == "lev.n",]
cor.test(da_lev$value, da_lev$gen)
##
## Pearson's product-moment correlation
##
## data: da_lev$value and da_lev$gen
## t = -18.75, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3339 -0.2734
## sample estimates:
## cor
## -0.304
da_del = da_edits[da_edits$variable == "del.n",]
cor.test(da_del$value, da_del$gen)
##
## Pearson's product-moment correlation
##
## data: da_del$value and da_del$gen
## t = -10.88, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2140 -0.1495
## sample estimates:
## cor
## -0.182
da_ins = da_edits[da_edits$variable == "ins.n",]
cor.test(da_ins$value, da_ins$gen)
##
## Pearson's product-moment correlation
##
## data: da_ins$value and da_ins$gen
## t = -4.284, df = 3455, p-value = 1.887e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10577 -0.03945
## sample estimates:
## cor
## -0.07269
da_sub= da_edits[da_edits$variable == "sub.n",]
cor.test(da_sub$value, da_sub$gen)
##
## Pearson's product-moment correlation
##
## data: da_sub$value and da_sub$gen
## t = -16.41, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2995 -0.2376
## sample estimates:
## cor
## -0.2689
Unnormalized
da_edits <- melt(DA_all, id.vars = c("index", "gen", "workerid"),
measure.vars = c("lev", "ins", "sub", "del"))
ms = aggregate(value ~ gen + variable, data = da_edits, mean)
ms$cil <- aggregate(value ~ gen + variable, data=da_edits, ci.low)$value
ms$cih <- aggregate(value ~ gen + variable, data=da_edits, ci.high)$value
# stuff for plotting
ms$size = ifelse(ms$variable == "lev", 1.2, 1)
levels(ms$variable) = c("Levenshtein distance", "insertions", "substitutions", "deletions")
#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/lev.pdf", width = 6, height = 6)
ggplot(ms, aes(y=value, x=gen, color=variable)) +
geom_point(size = 3) +
geom_line(aes(size = 4*size)) +
geom_pointrange(aes(ymax = value+cih, ymin=value-cil)) +
scale_size(range=c(0.5, 1.6)) +
xlab("Generation") +
ylab("Number of edits") +
ggtitle("Levenshtein edit distance - unnormalized") +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
guides(color = guide_legend(title = "Edit metric"), size=FALSE) +
theme(legend.position=c(1,1),legend.justification=c(1,1.4),
legend.text = element_text(size = 15),
text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
#dev.off()
da_lev = da_edits[da_edits$variable == "lev",]
cor.test(da_lev$value, da_lev$gen)
##
## Pearson's product-moment correlation
##
## data: da_lev$value and da_lev$gen
## t = -22.82, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3905 -0.3325
## sample estimates:
## cor
## -0.3619
da_del = da_edits[da_edits$variable == "del",]
cor.test(da_del$value, da_del$gen)
##
## Pearson's product-moment correlation
##
## data: da_del$value and da_del$gen
## t = -12.98, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2472 -0.1836
## sample estimates:
## cor
## -0.2157
da_ins = da_edits[da_edits$variable == "ins",]
cor.test(da_ins$value, da_ins$gen)
##
## Pearson's product-moment correlation
##
## data: da_ins$value and da_ins$gen
## t = -6.532, df = 3455, p-value = 7.456e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.14325 -0.07739
## sample estimates:
## cor
## -0.1104
da_sub= da_edits[da_edits$variable == "sub",]
cor.test(da_sub$value, da_sub$gen)
##
## Pearson's product-moment correlation
##
## data: da_sub$value and da_sub$gen
## t = -21.13, df = 3455, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3675 -0.3084
## sample estimates:
## cor
## -0.3383
getCorString <- function(d, complex, length) {
ci = which(names(d) == complex)
li = which(names(d) == length)
m = round(cor(d[,ci], d[,li]),2)
string = paste("r = ", m, sep = "")
return(string)
}
DA$quintile2 = DA$quintile/5
# get label length by quintile for each generation
MS = aggregate(guessedLabelLen ~ quintile2 + gen, data=DA, mean)
MS$cil <- aggregate(guessedLabelLen ~ quintile2 + gen, data=DA, ci.low)$guessedLabelLen
MS$cih <- aggregate(guessedLabelLen ~ quintile2 + gen, data=DA, ci.high)$guessedLabelLen
# get cor strings
cb_r = ddply(DA, .(gen), function (k) {lab = getCorString(k, "c.norms", "guessedLabelLen")})
cb_r$gen = 1:7
names(cb_r)[which(names(cb_r) == "V1")] = "lab"
ggplot(MS, aes(x=quintile2, y=guessedLabelLen)) +
geom_smooth(method = "lm", aes(x=c.norms, y=guessedLabelLen), data = DA,
position = "identity", size = 1.5, alpha = .2) +
geom_point(size = 3) +
xlab("Complexity quintile") +
ylab("Guessed label length (chars.)") +
ggtitle("Complexity bias across generations") +
facet_grid( ~ gen) +
theme(text = element_text(size=20),
plot.title=element_text(size=20, vjust=2),
axis.title.y=element_text(vjust=0.2),
axis.title.x=element_text(vjust=-0.2),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
strip.background = element_rect(colour="white"),
strip.text.x = element_text( size = 20)) +
scale_x_continuous(breaks=c(.2,.4,.6,.8, 1),
labels=c(1:5)) +
scale_y_continuous(breaks=c(5,6,7))
DA$quintile2 = DA$quintile/5 -.08
# get label length by quintile for each generation
MSC = aggregate(removedCharactersC ~ quintile2 + gen, data=DA, mean)
MSC$cil <- aggregate(removedCharactersC ~ quintile2 + gen, data=DA, ci.low)$removedCharactersC
MSC$cih <- aggregate(removedCharactersC ~ quintile2 + gen, data=DA, ci.high)$removedCharactersC
#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/complexity_bias.pdf", width = 16, height = 6)
ggplot(MSC, aes(x=quintile2, y=removedCharactersC)) +
geom_smooth(method = "lm", aes(x=c.norms, y=removedCharactersC), data = DA,
position = "identity", size = 1.5, alpha = .2) +
geom_pointrange(aes(ymax = removedCharactersC + cih, ymin=removedCharactersC - cil), size = .7) +
xlab("Complexity quintile") +
ylab("Cumulative characters removed") +
ggtitle("Complexity bias across generations") +
facet_grid( ~ gen) +
theme(text = element_text(size=20),
plot.title=element_text(size=20, vjust=2, face="bold"),
axis.title.y=element_text(vjust=0.2),
axis.title.x=element_text(vjust=-0.2),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
strip.background = element_rect(colour="white"),
strip.text.x = element_text( size = 20)) +
scale_x_continuous(breaks=c(0.12, 0.32, 0.52, 0.72, 0.92), lim = c(0,1),labels=c(1:5))
#dev.off()
# # generation 1 only plot
#
# #pdf("/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Talks/Prospectives/E6a.pdf", width = 6, height = 6)
# ggplot(MSC[MSC$gen == 1,], aes(x=quintile2, y=removedCharactersC)) +
# geom_smooth(method = "lm", aes(x=c.norms, y=removedCharactersC), data = DA[DA$gen ==1,],
# position = "identity", size = 1.5, alpha = .2) +
# geom_pointrange(aes(ymax = removedCharactersC + cih, ymin=removedCharactersC - cil), size = .7) +
# xlab("Complexity quintile") +
# ylab("Number characters removed") +
# theme(text = element_text(size=20),
# plot.title=element_text(size=20, vjust=2, face="bold"),
# axis.title.y=element_text(vjust=0.2),
# axis.title.x=element_text(vjust=-0.2),
# plot.background = element_blank(),
# panel.grid.major = element_blank(),
# panel.grid.minor = element_blank(),
# panel.border = element_blank(),
# axis.line = element_line(color = 'black'),
# strip.background = element_rect(colour="white"),
# strip.text.x = element_text( size = 20)) +
# scale_x_continuous(breaks=c(0.12, 0.32, 0.52, 0.72, 0.92), lim = c(0,1),labels=c(1:5))
# #dev.off()
Get p value based on empirical r distribution
getEmpiricalr.labelLen <- function(d, i) {
permLabelLen = d$labelLen[i]
corr(cbind(d$c.norms, permLabelLen))
}
getEmpiricalr.CR <- function(d, i) {
permCR= d$removedCharactersC[i]
corr(cbind(d$c.norms, permCR))
}
getEmpiricalp <- function(d, measure) {
if (measure == "labelLen") {
m = boot(d, getEmpiricalr.labelLen, R = 10000)
r_veridical = corr(cbind(d$c.norms, d$labelLen))
} else if (measure == "removedCharactersC") {
m = boot(d, getEmpiricalr.CR, R = 10000)
r_veridical = corr(cbind(d$c.norms, d$removedCharactersC))
}
cdf_func = ecdf(m$t) # cdf function (go from z to q)
p = 1-cdf_func(abs(r_veridical))
return(p)
}
# get dataframe with generation 0
DA.e = DA[,c("labelLen", "removedCharactersC", "gen", "c.norms")]
gen_zero = preProcess(1, rto_norms, co_norms)
g0 = data.frame(labelLen = gen_zero$G0LabelLen, gen = 0,
removedCharactersC = NA, c.norms = gen_zero$c.norms)
DA.e = rbind(DA.e, g0)
Label Length
# labelLen rs
nsamps = 10000
M = data.frame(sample = NA, gen = NA)
m = matrix(nrow = nsamps, ncol = 2)
for (i in 0:7) {
d = DA.e[DA.e$gen == i,]
samps = boot(d, getEmpiricalr.labelLen, R = nsamps)
m[,1] = samps$t
m[,2] = i
colnames(m) = c("sample", "gen")
M = rbind(M, m)
}
# plot r distribution
ggplot(M[-1,], aes(x = sample)) +
geom_histogram(fill = "gray") +
facet_grid(~ gen) +
theme(axis.title.y=element_text(vjust=0.2),
axis.title.x=element_text(vjust=-0.2),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
strip.background = element_rect(colour="white")) +
geom_vline(xintercept = 0, color = "red") +
ggtitle("r distribution for label length")
# labelLen ps
ps = ddply(DA.e, .(gen), function (d) {getEmpiricalp(d, "labelLen")})
rs = ddply(DA.e, .(gen), function (d) {k = corr(cbind(d$c.norms,d$labelLen))})
rs$estimate = "r"
ps$estimate = "empirical_p"
es = rbind(ps,rs)
names(es)[2] = "value"
# plot ps and veridcal rs
ggplot(es, aes(x = gen, y = value, group = estimate, color = estimate)) +
geom_line() +
theme(axis.title.y=element_text(vjust=0.2),
axis.title.x=element_text(vjust=-0.2),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
strip.background = element_rect(colour="white")) +
scale_x_continuous(breaks=seq(0, numGen, 1)) +
geom_hline(yintercept = .05, color = "grey") +
ggtitle("correlation between length and complexity norms \nand empirical p-value")
# collapsing across all generations
getEmpiricalp(DA.e[DA.e$gen>0,], "labelLen")
## [1] 0.0162
CCR
# CCR rs
nsamps = 10000
M = data.frame(sample = NA, gen = NA)
m = matrix(nrow = nsamps, ncol = 2)
for (i in 1:7) {
d = DA.e[DA.e$gen == i,]
samps = boot(d, getEmpiricalr.CR, R = nsamps)
m[,1] = samps$t
m[,2] = i
colnames(m) = c("sample", "gen")
M = rbind(M, m)
}
# plot empirical r distributions
ggplot(M[-1,], aes(x = sample)) +
geom_histogram(fill = "gray") +
facet_grid(~ gen) +
theme(axis.title.y=element_text(vjust=0.2),
axis.title.x=element_text(vjust=-0.2),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
strip.background = element_rect(colour="white")) +
geom_vline(xintercept = 0, color = "red") +
ggtitle("CCR empirical r distribution")
# CR ps
ps = ddply(DA.e[DA.e$gen > 0,], .(gen), function (d) {getEmpiricalp(d, "removedCharactersC")})
rs = ddply(DA.e[DA.e$gen > 0,], .(gen), function (d) {corr(cbind(d$c.norms,d$removedCharactersC))})
rs$estimate = "r"
ps$estimate = "empirical_p"
es = rbind(ps, rs)
names(es)[2] = "value"
ggplot(es, aes(x = gen, y = value, group = estimate, color = estimate)) +
geom_line() +
xlab("generation") +
theme(text = element_text(size=15),
plot.title=element_text(size=15, vjust=2),
axis.title.y=element_text(vjust=0.2),
axis.title.x=element_text(vjust=-0.2),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
strip.background = element_rect(colour="white")) +
scale_y_continuous(breaks=seq(-.2, .1, .05)) +
scale_x_continuous(breaks=seq(1, numGen, 1)) +
geom_hline(yintercept = .05, color = "grey") +
ggtitle("r (cumulative chars. removed vs. complexity norms) \n and associated empirical p-value")
# collapsing across all generations
p = getEmpiricalp(DA.e[DA.e$gen>0,], "removedCharactersC")
sprintf("%.100f",p)
## [1] "0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
# mean complexity bias across generations
cor.test(DA$c.norms, DA$guessedLabelLen)
##
## Pearson's product-moment correlation
##
## data: DA$c.norms and DA$guessedLabelLen
## t = 3.165, df = 3455, p-value = 0.001562
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02047 0.08696
## sample estimates:
## cor
## 0.05377
cor.test(DA$c.norms, DA$removedCharactersC)
##
## Pearson's product-moment correlation
##
## data: DA$c.norms and DA$removedCharactersC
## t = -6.445, df = 3455, p-value = 1.312e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.14182 -0.07594
## sample estimates:
## cor
## -0.109
# averaging across chains
# coefsS returns standardized betas, which are the same as pearson's r
# using this rather than cor or scale() with coeff
# averaging across chains, the bias at each generation
ms <- DA %>%
group_by(gen) %>%
summarise(beta = coefsS(c.norms, guessedLabelLen),
accuracy = p.fc2(accuracy),
var_length = sd(guessedLabelLen),
guessedLabelLen = mean(guessedLabelLen),
removedCharactersC = mean(removedCharactersC),
removedCharacters = mean(removedCharacters))
# variability in bias across chains
msss <- DA %>%
group_by(Answer.subSeed) %>%
summarise(beta = coefsS(c.norms, guessedLabelLen),
accuracy = p.fc2(accuracy),
var_length = sd(guessedLabelLen),
guessedLabelLen = mean(guessedLabelLen),
removedCharactersC = mean(removedCharactersC),
removedCharacters = mean(removedCharacters))
mean(msss$beta)
## [1] 0.04547
sd(msss$beta)
## [1] 0.2652
# variability in decrease across chains
mss <- DA %>%
group_by(gen, Answer.subSeed) %>%
summarise(beta = coefsS(c.norms, guessedLabelLen),
accuracy = p.fc2(accuracy),
var_length = sd(guessedLabelLen),
guessedLabelLen = mean(guessedLabelLen),
removedCharactersC = mean(removedCharactersC),
removedCharacters = mean(removedCharacters))
crs = ddply(mss, .(Answer.subSeed), summarise, change_r = cor(gen, beta))
mean(crs$change_r, na.rm = T)
## [1] 0.004277
sd(crs$change_r, na.rm = T)
## [1] 0.6896
# trying to quantifiy the decrease in bias
cor.test(mss$beta, mss$gen) # continuously
##
## Pearson's product-moment correlation
##
## data: mss$beta and mss$gen
## t = -0.5642, df = 345, p-value = 0.573
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.13521 0.07516
## sample estimates:
## cor
## -0.03036
mss = as.data.frame(mss) # first vs. last generation
mssC = mss[mss$gen != 43 | mss$gen != 45,]
t.test(mssC[mssC$gen == 1,"beta"], mssC[mssC$gen == 7,"beta"], paired = TRUE)
##
## Paired t-test
##
## data: mssC[mssC$gen == 1, "beta"] and mssC[mssC$gen == 7, "beta"]
## t = 0.7297, df = 47, p-value = 0.4692
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.06955 0.14871
## sample estimates:
## mean of the differences
## 0.03958
# mean complexity bias across generations with lev
cor.test(DA$c.norms, DA$lev)
##
## Pearson's product-moment correlation
##
## data: DA$c.norms and DA$lev
## t = 2.669, df = 3455, p-value = 0.007644
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01204 0.07858
## sample estimates:
## cor
## 0.04536
# aggregate over generations for each chain (accuracy and bias)
# get complexity bias at each chain - CCR
ms <- DA %>%
group_by(gen, Answer.subSeed) %>%
summarise(rs = coefsS(c.norms, removedCharactersC),
accuracy = p.fc2(accuracy),
lev = mean(lev))
mss <- ms %>%
group_by(Answer.subSeed) %>%
summarise(CB_beta = coefsS(gen, rs),
accuracy = mean(accuracy),
lev = mean(lev))
cor.test(mss$CB_beta, mss$accuracy)
##
## Pearson's product-moment correlation
##
## data: mss$CB_beta and mss$accuracy
## t = -2.798, df = 48, p-value = 0.007385
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5912 -0.1073
## sample estimates:
## cor
## -0.3744
cor.test(mss$CB_beta, mss$lev)
##
## Pearson's product-moment correlation
##
## data: mss$CB_beta and mss$lev
## t = 3.227, df = 48, p-value = 0.00226
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1630 0.6269
## sample estimates:
## cor
## 0.4222
#pdf("/Documents/GRADUATE_SCHOOL/Projects/iterated_RC/iterated_rc_cogsci/writeup/figs/change_plot.pdf", width = 7, height = 6)
# plot correlation between accuracy and change beta
qplot(lev,CB_beta, position=position_dodge(),
data=mss, geom="point",
ylab="Change in complexity bias \nacross generations (Pearson's r)",
xlab="Mean Levenshtein edit distance") +
geom_point(position=position_dodge(.9)) +
geom_smooth(method = "lm", size = 1.3,, alpha = .2) +
ggtitle("Complexity bias change vs. Edit distance") +
ylim(-1,1.25) +
annotate("text", x=4, y=-.4, color = "red", size = 7,
label=paste("r=", round(cor(mss$lev, mss$CB_beta), 2))) +
theme(text = element_text(size=20),
plot.title=element_text(size=20, face = "bold"),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'))
#dev.off()
## get complexity bias at each chain - LENGTH
ms <- DA %>%
group_by(gen, Answer.subSeed) %>%
summarise(rs = coefsS(c.norms, guessedLabelLen),
accuracy = p.fc2(accuracy),
lev = mean(lev))
mss <- ms %>%
group_by(Answer.subSeed) %>%
summarise(CB_beta = coefsS(gen, rs),
accuracy = mean(accuracy),
lev = mean(lev))
cor.test(mss$CB_beta, mss$accuracy)
##
## Pearson's product-moment correlation
##
## data: mss$CB_beta and mss$accuracy
## t = 2.101, df = 48, p-value = 0.04091
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01291 0.52607
## sample estimates:
## cor
## 0.2902
cor.test(mss$CB_beta, mss$lev)
##
## Pearson's product-moment correlation
##
## data: mss$CB_beta and mss$lev
## t = -1.818, df = 48, p-value = 0.07537
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.49700 0.02645
## sample estimates:
## cor
## -0.2538