#packages
library(tidyverse)
library(cowplot)
library(knitr)
library(htmltools)
library(glue)
theme_set(theme_cowplot())

# Build an HTML <audio> player element for a single audio file.
#
# file: path or URL, used verbatim as the `src` of the nested <source> tag.
# type: audio format, one of "wav" (default), "mp3" or "ogg". It selects the
#       MIME type written to the <source> tag's `type` attribute.
#
# Returns the htmltools tag object for
#   <audio controls><source src=... type=.../></audio>
# Errors (via match.arg) when `type` is not one of the supported formats.
html_tag_audio <- function(file, type = c("wav", "mp3", "ogg")) {
  # match.arg() reads its choices from the default above, so every supported
  # format must be listed there; the first entry ("wav") is the default.
  type <- match.arg(type)
  # The MIME subtype is not always the file extension (mp3 is audio/mpeg).
  mime_type <- switch(type,
    wav = "audio/wav",
    mp3 = "audio/mpeg",
    ogg = "audio/ogg"
  )
  htmltools::tags$audio(
    # NA makes htmltools emit a bare boolean attribute: <audio controls>
    controls = NA,
    htmltools::tags$source(src = file, type = mime_type)
  )
}

# Load the pitch table first so it is available for the join below.
pitch <- read.csv("pitch_lists.csv")
# pitch$word <- pitch$item

# Read the word-property table, attach the pitch columns, and shorten the two
# phonotactic-probability column names in one pipeline.
# The join has no `by`, so dplyr matches on every column name the two tables
# share (NOTE(review): presumably just `word` -- confirm against the CSVs).
d <- read.csv("WordProperties - phonotactic probability.csv") %>%
  left_join(pitch) %>%
  rename(
    seg_pp = adult_corpus_avg_segment_phonotactic_probability,
    bi_pp = adult_corpus_avg_biphone_phonotactic.probability
  )


# Word lists for the pilot, and per-word membership columns on `d`.
# Earlier version of the lists, kept for reference:
# list1=c("toma","manu","fiffin","kita","modi","regli", "tosip","coodle")
# list2=c("blicket","sarel","boskot","koba","tever", "chatten","jefa",
#         "fuppy")
list1 <- c(
  "toma", "manu", "fiffin", "modi",
  "kita", "chatten", "gazzer", "jefa"
)
list2 <- c(
  "blicket", "sarel", "koba", "tever",
  "boskot", "regli", "doopy", "pizer"
)

# Label every word with the list it belongs to; list1 is checked first, and
# words on neither list fall through to "not_used".
d$new_list <- case_when(
  d$word %in% list1 ~ "list1",
  d$word %in% list2 ~ "list2",
  TRUE ~ "not_used"
)
# `used` is 1 for words on either list and 0.5 for the rest.
d$used <- if_else(d$new_list == "not_used", 0.5, 1)

# Descriptive statistics of segment (`seg_pp`) and biphone (`bi_pp`)
# phonotactic probability, one row per value of `new_list`:
# mean / median / SD for each measure, followed by the extremes.
sum_d_phon <- summarize(
  group_by(d, new_list),
  mean_seg = mean(seg_pp),
  median_seg = median(seg_pp),
  sd_seg = sd(seg_pp),
  mean_bi = mean(bi_pp),
  median_bi = median(bi_pp),
  sd_bi = sd(bi_pp),
  max_seg = max(seg_pp),
  min_seg = min(seg_pp),
  max_bi = max(bi_pp),
  min_bi = min(bi_pp)
)

# Descriptive statistics of the three F0 columns (`f0_mean`, `f0_min`,
# `f0_max`), one row per value of `new_list`: mean / median / SD of each.
sum_d_pitch <- summarize(
  group_by(d, new_list),
  mean_f0_mean = mean(f0_mean),
  median_f0_mean = median(f0_mean),
  sd_f0_mean = sd(f0_mean),
  mean_f0_min = mean(f0_min),
  median_f0_min = median(f0_min),
  sd_f0_min = sd(f0_min),
  mean_f0_max = mean(f0_max),
  median_f0_max = median(f0_max),
  sd_f0_max = sd(f0_max)
)

The Words

Below are the lists of words I am proposing for List 1 and List 2 in the pilot study. There are 8 words in each list. Note that you can also play the audio files directly in the browser.

# Build one HTML <audio> snippet per word (paste0() is vectorised over the
# word vector, so no explicit loop is needed), then tabulate each list's
# words next to their players.
audio_list_1 <- paste0(
  '<audio controls><source src="audio/', list1,
  '.wav" type="audio/wav"/></audio>'
)
audio_list_2 <- paste0(
  '<audio controls><source src="audio/', list2,
  '.wav" type="audio/wav"/></audio>'
)
list_d <- data.frame(
  list_1 = list1,
  audio_list_1 = audio_list_1,
  list_2 = list2,
  audio_list_2 = audio_list_2
)

kable(list_d)
list_1 audio_list_1 list_2 audio_list_2
toma blicket
manu sarel
fiffin koba
modi tever
kita boskot
chatten regli
gazzer doopy
jefa pizer

Suggested test items (PICK TWO PAIRS)

Here are specifically the set of items that I was thinking of pulling out as test items. The idea is to select two items from each list that will be the test items (novel or familiar, depending on which list the child heard). It’s not super easy, because it’s even more difficult to match the paired items when there are just 2 novel and 2 familiar test items (while avoiding similar sounds, etc.). It doesn’t need to be perfect, since we will counterbalance across lists (and hence across which items are novel/familiar), but it would be good if no test item stands out too prominently (at least that’s how I’ve been thinking about it).

I think manu vs. sarel are a good pair. Then it gets a bit tricky. I like toma vs. blicket, but unfortunately blicket does have a noticeably higher peak than the other items. Other possibilities that sound pretty ok to me are kita vs. boskot and fiffin vs. koba. I think the [d] in doopy is pretty prominent, otherwise I would like gazzer vs. doopy best. What do y’all think?

# Candidate test items; position i in one vector is paired with position i
# in the other when the table below is built.
test_list_1 <- c("manu", "toma", "fiffin", "kita", "gazzer")
test_list_2 <- c("sarel", "blicket", "koba", "boskot", "doopy")

# Build one HTML <audio> snippet per candidate (paste0() is vectorised, so no
# explicit loop is needed) and tabulate the pairs with their players.
# This overwrites audio_list_1, audio_list_2 and list_d.
audio_list_1 <- paste0(
  '<audio controls><source src="audio/', test_list_1,
  '.wav" type="audio/wav"/></audio>'
)
audio_list_2 <- paste0(
  '<audio controls><source src="audio/', test_list_2,
  '.wav" type="audio/wav"/></audio>'
)
list_d <- data.frame(
  test_list_1 = test_list_1,
  audio_list_1 = audio_list_1,
  test_list_2 = test_list_2,
  audio_list_2 = audio_list_2
)

kable(list_d)
test_list_1 audio_list_1 test_list_2 audio_list_2
manu sarel
toma blicket
fiffin koba
kita boskot
gazzer doopy

Summarize Properties of Words By List

Phonotactic probability

The table below compares the two lists in terms of phonotactic probability (seg==segment (roughly speaking, phone) probability; bi == biphone probability).

Below, I’m showing the mean, SD, median, maximum, and minimum values (both segment and biphone phonotactic probability) for the items I currently have assigned to list 1 and list 2.

The only notable difference is that one word in list 1 (manu) has somewhat higher biphone probability (the maximum value among the set). I still stuck with this word since I wanted to err on the side of including phonotactically probable words. I did not select “kaki” for the list (see plot below to see why this is another possible option) since it ended up being quite close to a real word (~“cocky”), which I only realized after recording. If the higher maximum value is a concern, though, it would be easy to adjust this.

# Show the phonotactic summary for the two selected lists only.
sum_d_phon %>%
  filter(new_list != "not_used") %>%
  kable(digits = 5)
new_list mean_seg median_seg sd_seg mean_bi median_bi sd_bi max_seg min_seg max_bi min_bi
list1 0.05255 0.05135 0.00926 0.00365 0.00314 0.00208 0.0676 0.0411 0.0083 0.00173
list2 0.05284 0.05300 0.00979 0.00342 0.00334 0.00167 0.0660 0.0386 0.0065 0.00127

T-tests on average segment and biphone phonotactic probability

We can do a quick t-test to show that the lists don’t appear to differ on segment or biphone phonotactic probability, on average.

Segment phonotactic probability

# Two-sample t-test (equal variances assumed) comparing list 1 and list 2 on
# segment phonotactic probability.
# The two argument expressions are kept exactly as written because t.test()
# echoes them in the "data:" line of its printed result.
t.test(
  filter(d, new_list == "list1")$seg_pp,
  filter(d, new_list == "list2")$seg_pp,
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$seg_pp and filter(d, new_list == "list2")$seg_pp
## t = -0.060342, df = 14, p-value = 0.9527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.010506308  0.009931308
## sample estimates:
## mean of x mean of y 
## 0.0525500 0.0528375

Biphone phonotactic probability

# Two-sample t-test (equal variances assumed) comparing list 1 and list 2 on
# biphone phonotactic probability.
# The two argument expressions are kept exactly as written because t.test()
# echoes them in the "data:" line of its printed result.
t.test(
  filter(d, new_list == "list1")$bi_pp,
  filter(d, new_list == "list2")$bi_pp,
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$bi_pp and filter(d, new_list == "list2")$bi_pp
## t = 0.24478, df = 14, p-value = 0.8102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.001795016  0.002257516
## sample estimates:
##  mean of x  mean of y 
## 0.00365125 0.00342000

Plotting the segment and biphone phonotactic probability

Since we are matching the words on two dimensions (segment and biphone phonotactic probability), this makes selection a little more complex and a little more difficult to visualize. Here’s how the selected words are distributed in the 2-D space of segment and biphone phonotactic probability.

# Scatter of every word in (segment, biphone) phonotactic-probability space.
# Words on list 1 or list 2 are drawn as points with boxed labels; words on
# neither list are added as half-transparent points with plain-text labels.
# Colour encodes `new_list` in all layers.
ggplot(
  filter(d, new_list != "not_used"),
  aes(seg_pp, bi_pp, label = word, color = new_list)
) +
  geom_point() +
  geom_label() +
  geom_point(data = filter(d, new_list == "not_used"), alpha = 0.5) +
  geom_text(data = filter(d, new_list == "not_used"), alpha = 0.5) +
  xlab("average segment probability") +
  # Label typo fixed: "phonotactive" -> "phonotactic".
  ylab("average biphone phonotactic probability") +
  # NOTE(review): a numeric legend.position is reported as deprecated in
  # recent ggplot2 releases (replaced by legend.position.inside); left as-is
  # so the plot still renders on older versions -- confirm installed version.
  theme(legend.position = c(0.2, 0.8))

Pitch

# Show the pitch (F0) summary for the two selected lists only.
sum_d_pitch %>%
  filter(new_list != "not_used") %>%
  kable(digits = 5)
new_list mean_f0_mean median_f0_mean sd_f0_mean mean_f0_min median_f0_min sd_f0_min mean_f0_max median_f0_max sd_f0_max
list1 202.6463 205.255 10.24293 131.0775 129.775 4.12751 345.2575 345.645 33.32560
list2 204.0200 199.580 19.58810 133.1037 133.765 3.72697 341.4100 331.220 49.95472

Mean F0

# Two-sample t-test (equal variances assumed) comparing list 1 and list 2 on
# mean F0.
# The two argument expressions are kept exactly as written because t.test()
# echoes them in the "data:" line of its printed result.
t.test(
  filter(d, new_list == "list1")$f0_mean,
  filter(d, new_list == "list2")$f0_mean,
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$f0_mean and filter(d, new_list == "list2")$f0_mean
## t = -0.17578, df = 14, p-value = 0.863
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.13556  15.38806
## sample estimates:
## mean of x mean of y 
##  202.6463  204.0200

Min F0

# Two-sample t-test (equal variances assumed) comparing list 1 and list 2 on
# minimum F0.
# The two argument expressions are kept exactly as written because t.test()
# echoes them in the "data:" line of its printed result.
t.test(
  filter(d, new_list == "list1")$f0_min,
  filter(d, new_list == "list2")$f0_min,
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$f0_min and filter(d, new_list == "list2")$f0_min
## t = -1.0306, df = 14, p-value = 0.3202
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.243271  2.190771
## sample estimates:
## mean of x mean of y 
##  131.0775  133.1037

Max F0

# Two-sample t-test (equal variances assumed) comparing list 1 and list 2 on
# maximum F0.
# The two argument expressions are kept exactly as written because t.test()
# echoes them in the "data:" line of its printed result.
t.test(
  filter(d, new_list == "list1")$f0_max,
  filter(d, new_list == "list2")$f0_max,
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$f0_max and filter(d, new_list == "list2")$f0_max
## t = 0.18122, df = 14, p-value = 0.8588
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -41.68864  49.38364
## sample estimates:
## mean of x mean of y 
##  345.2575  341.4100