Data loading and cleaning

Read in the per-participant result files and consolidate them into a single data frame.

# Load every result file found in ../production-results and stack the
# per-file tables into a single tibble, d.raw.
results_dir <- "../production-results"
files <- dir(results_dir)

# Build one data frame per file and bind them in a single call; growing
# d.raw inside a loop would re-copy the accumulated table on every pass.
per_file <- lapply(files, function(f) {
  jd <- fromJSON(paste(readLines(file.path(results_dir, f)), collapse = ""))
  answers <- jd$answers$data
  data.frame(workerid = jd$WorkerId,
             rating = as.numeric(answers$rating),
             file = answers$filename,
             condition = answers$condition,
             age = answers$age,
             language = answers$homelang,
             # only the first race entry is kept
             race = answers$race[1],
             education = answers$education,
             children = answers$children)
})

# as_tibble() keeps d.raw a tibble even when the directory is empty.
d.raw <- as_tibble(bind_rows(per_file))

Exclude non-compliant participants and non-English speakers.

# Per-worker compliance flag, computed from the "choose" check trials only.
# The number embedded in the file name (../choose_<n>.wav) is compared with
# the rating given; a worker is flagged TRUE only if every such trial matches.
compliant <- d.raw %>%
  filter(grepl("choose", file)) %>%
  mutate(
    file = str_replace(file, "\\.\\./choose_", ""),
    file = str_replace(file, "\\.wav", ""),
    file = as.numeric(file),
    compliant = rating == file
  ) %>%
  group_by(workerid) %>%
  summarise(compliant = mean(compliant) == 1)

# Rows from workers whose reported home language mentions English
# ("English" or "english" anywhere in the free-text answer).
english <- d.raw %>%
  group_by(workerid, language) %>%
  filter(str_detect(language, "[eE]nglish")) %>%
  distinct()

# Ids of workers who passed every "choose" check trial. Membership in
# compliant$workerid alone is not enough: that table has a row for every
# worker with check trials, whether they passed or not. which() also drops
# workers whose flag is NA.
compliant_ids <- compliant$workerid[which(compliant$compliant)]

# Keep compliant, English-speaking workers and drop the check trials
# themselves, leaving only the real rating trials.
d <- d.raw %>%
  filter(workerid %in% compliant_ids,
         workerid %in% english$workerid,
         !str_detect(file, pattern = "choose_"))

Merge in filename data.

# Attach per-file metadata to every rating row. The join keys are left
# implicit (all columns shared by d and file_info) -- presumably just `file`;
# confirm against file_key.csv.
file_info <- read_csv("../wavs/file_key.csv")

d <- d %>%
  left_join(file_info)

Distribution across conditions.

# Number of distinct raters (workers) per condition.
# The condition/worker pairs are de-duplicated explicitly: a bare distinct()
# on a grouped frame compares every column in current dplyr, which would
# count rating trials instead of workers.
d %>%
  distinct(condition, workerid) %>%
  count(condition) %>%
  kable
condition n
accent 14
affect 8
idsness 10
naturalness 18
noise 13

Main analysis

Major summary histogram.

# Histogram of per-file mean ratings, faceted by register (rows) and
# condition (columns). `object` is recoded to a has-object flag before
# averaging.
ggplot(
  d %>%
    mutate(object = !is.na(object)) %>%
    group_by(file, condition, register, object) %>%
    summarise(rating = mean(rating)),
  aes(x = rating)
) +
  geom_histogram(binwidth = 1) +
  facet_grid(register ~ condition) +
  theme_bw()

Check correlations between raters.

# For each condition, summarise pairwise correlations between raters:
# mean, minimum and maximum over all pairs of distinct workers.
d %>%
  split(.$condition) %>%
  map_df(function(x) {
    # One column per worker, one row per file, then worker-by-worker cor().
    cors <- x %>%
      select(workerid, rating, file) %>%
      spread(workerid, rating) %>%
      select(-file) %>%
      cor
    # Drop each rater's correlation with themself by position rather than by
    # value, so a genuine off-diagonal correlation of 1 is not discarded and
    # a diagonal that is not exactly 1 is still removed.
    diag(cors) <- NA
    tibble(condition = x$condition[1],
           cor = mean(cors, na.rm = TRUE),
           min = min(cors, na.rm = TRUE),
           max = max(cors, na.rm = TRUE))
  }) %>%
  kable(digits = 2)
condition cor min max
accent 0.19 -0.12 0.54
affect 0.32 0.01 0.65
idsness 0.48 0.03 0.71
naturalness 0.05 -0.67 0.70
noise 0.42 0.12 0.75

Filtering analysis

As described by Melanie in an email 8/2/16.

Step 1: Select items that meet the following criteria only

Step 2: Match across IDS and ADS on similarity for:

Step 3: Re-check according to some criterion of “sameness” (not sure what that should be)

# Average ratings per file within each condition, put the conditions in
# columns, and keep files that pass every threshold:
#   accent < 3, naturalness > 4, noise < 4, and an idsness score on the
#   matching side of 4 for the file's register (IDS above, ADS below).
filtered <- d %>%
  group_by(condition, register, object, baby_id, recording_id, file) %>%
  summarize(rating = mean(rating)) %>%
  spread(condition, rating) %>%
  filter(accent < 3) %>%
  filter(naturalness > 4) %>%
  filter(noise < 4) %>%
  filter((register == "ADS" & idsness < 4) |
           (register == "IDS" & idsness > 4))

filtered
## Source: local data frame [163 x 10]
## Groups: register, object, baby_id, recording_id [163]
## 
##    register object     baby_id    recording_id
##       <chr>  <chr>       <chr>           <chr>
## 1       ADS   ball   1266_1081    994965918577
## 2       ADS   ball     2077_12 582301715046901
## 3       ADS   ball    3214_461   6955604692438
## 4       ADS   ball    3214_464   7267987032388
## 5       ADS   ball 3314 at 104             602
## 6       ADS   ball 3314 at 115             838
## 7       ADS   ball    4510_341  74610580792717
## 8       ADS  block    1810_458   0540644809489
## 9       ADS  block       4_184     73879609345
## 10      ADS    cup    2077_442   0700898098337
## # ... with 153 more rows, and 6 more variables: file <chr>, accent <dbl>,
## #   affect <dbl>, idsness <dbl>, naturalness <dbl>, noise <dbl>

Output.

# Save the filtered item table next to the wav files.
filtered %>%
  write_csv("../wavs/filtered_and_normed.csv")