Code is here: https://github.com/tca2/uncertainty-team-audio-data-processing

Loading, setting up

library(tidyverse)
library(skimr)

# Read the turn-level output file; `d` is the data used by every step below.
d <- readr::read_csv(file = "Uncertainty_Output.csv")

Descriptive stats for duration

# Summary statistics (missingness, mean, sd, quantiles) for turn duration.
# The one-column tibble is still piped into skim() so the summary header
# reports the data the same way as before.
d["duration"] %>%
  skim()
Data summary
Name Piped data
Number of rows 2912
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
duration 0 1 1.44 1.69 0.05 0.35 0.88 1.91 15.07 ▇▁▁▁▁

Histogram: length of turns

entire distribution

# Histogram of every turn's duration, using 80 bins across the full range.
ggplot(d, aes(x = duration)) +
  geom_histogram(bins = 80) +
  labs(x = "Duration (seconds)") +
  theme_bw()

turns lasting one second or less only

# Histogram restricted to turns that last at most one second.
#
# The subset is taken with filter() instead of xlim(): scale limits convert
# out-of-range values to NA *and* drop any bar whose edges extend past the
# limits, so the first/last bins can silently vanish from the plot.
# binwidth = 1 / 25 with boundary = 0 gives 25 bins that tile [0, 1] exactly;
# coord_cartesian() then fixes the visible range without discarding data.
d %>%
  filter(between(duration, 0, 1)) %>%
  ggplot(aes(x = duration)) +
  geom_histogram(binwidth = 1 / 25, boundary = 0) +
  xlab("Duration (seconds)") +
  theme_bw() +
  coord_cartesian(xlim = c(0, 1))

Scatter plot

x-axis is the start time of the turn

y-axis is the length of the turn

If overplotting makes it hard to see all of the data points, we may need to “bin” or “group” the turns; some experimentation with how this is displayed is likely needed.

# Turn duration against turn start time; points are semi-transparent so that
# dense regions show up darker.
ggplot(d, aes(x = starttime, y = duration)) +
  geom_point(alpha = 0.2) +
  theme_bw()

Mean turn duration in 10-second bins

# Assign every turn to a fixed-width time bin based on its start time.
#
# d            data frame with numeric columns `starttime` and `end_time`
#              (both in seconds).
# bin_duration width of each bin in seconds; a single positive number.
#
# Returns `d` with two added columns:
#   break_ids      factor giving the interval the start time falls in,
#                  e.g. "[0,10]", "(10,20]"
#   break_ids_dbl  integer index of that interval (1 = first bin)
bin_the_data <- function(d, bin_duration = 10) {
  stopifnot(
    is.numeric(bin_duration),
    length(bin_duration) == 1,
    is.finite(bin_duration),
    bin_duration > 0
  )

  # Use the latest time anywhere in the data rather than the last row, so the
  # bins cover every turn even when the rows are not sorted by time.
  observed_times <- c(d$end_time, d$starttime)
  observed_times <- observed_times[is.finite(observed_times)]
  last_time <- if (length(observed_times) > 0) max(observed_times) else 0

  # Number of bins, including the final "incomplete" one; at least one bin so
  # that cut() always receives two distinct breaks.
  n_bins <- max(1, ceiling(last_time / bin_duration))

  # Bins start at 0 (not 1) so turns that begin within the first second are
  # binned instead of becoming NA, and so bin_duration = 1 does not produce
  # duplicated breaks.
  breaks <- (0:n_bins) * bin_duration

  # include.lowest = TRUE closes the first interval on the left, so a start
  # time of exactly 0 lands in bin 1.
  d$break_ids <- cut(
    d$starttime,
    breaks = breaks,
    dig.lab = 4,
    include.lowest = TRUE
  )
  d$break_ids_dbl <- as.integer(d$break_ids)
  d
}

# Bin the turns into 10-second windows and show the first rows of the result.
binned_data_10_seconds <- bin_the_data(d, bin_duration = 10)

print(binned_data_10_seconds)
## # A tibble: 2,912 × 7
##    segmentfile  starttime end_time count_frames duration break_ids break_ids_dbl
##    <chr>            <dbl>    <dbl>        <dbl>    <dbl> <fct>             <int>
##  1 /Users/euge…      3.56     3.73           28   0.17   (1,10]                1
##  2 /Users/euge…     15.1     15.2            24   0.130  (10,20]               2
##  3 /Users/euge…     23.7     23.9            37   0.260  (20,30]               3
##  4 /Users/euge…     24.6     24.9            37   0.260  (20,30]               3
##  5 /Users/euge…     25.5     26.2            83   0.720  (20,30]               3
##  6 /Users/euge…     28.1     28.3            23   0.120  (20,30]               3
##  7 /Users/euge…     28.5     29.1            66   0.550  (20,30]               3
##  8 /Users/euge…     29.5     29.6            20   0.0900 (20,30]               3
##  9 /Users/euge…     34.7     35.0            41   0.300  (30,40]               4
## 10 /Users/euge…     38.0     38.2            26   0.150  (30,40]               4
## # … with 2,902 more rows
# Average turn duration inside each 10-second bin, plotted against the bin
# index (so the x-axis runs in recording order).
binned_data_10_seconds %>%
  group_by(break_ids_dbl) %>%
  summarise(mean_duration = mean(duration)) %>%
  ggplot(mapping = aes(x = break_ids_dbl, y = mean_duration)) +
  geom_point() +
  theme_bw()

histogram: # of turns

10 second segments

# Distribution of the number of turns per 10-second segment.
#
# Counting on the factor `break_ids` with .drop = FALSE keeps segments in
# which no turn started (n = 0). Counting on the integer id alone would leave
# those segments out, making the histogram look busier than the recording is.
binned_data_10_seconds %>%
  count(break_ids, .drop = FALSE) %>%
  ggplot(aes(x = n)) +
  geom_histogram(bins = 40) +
  xlab("Number of turns for each 10-second segment") +
  theme_bw()

# Same analysis with 10-minute (600-second) segments.
binned_data_10_minutes <- bin_the_data(d, 60 * 10)

# Distribution of the number of turns per 10-minute segment.
# .drop = FALSE on the factor `break_ids` keeps segments with no turns (n = 0)
# so that empty segments are represented in the histogram.
binned_data_10_minutes %>%
  count(break_ids, .drop = FALSE) %>%
  ggplot(aes(x = n)) +
  geom_histogram(bins = 40) +
  xlab("Number of turns for each 10-minute segment") +
  theme_bw()