VAD Validation

Code is here: https://github.com/tca2/uncertainty-team-audio-data-processing

Loading, setting up

library(tidyverse)
library(skimr)

# Load the VAD output table; every chunk below works from `d`.
d <- readr::read_csv(file = "Uncertainty_Output.csv")

Descriptive stats for duration

# Summarise the distribution of the `duration` column only (missingness,
# mean, sd, quantiles and an inline histogram, as shown in the output below).
d %>% 
  select(duration) %>% 
  skim()

Data summary
Name	Piped data
Number of rows	2912
Number of columns	1
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
duration	0	1	1.44	1.69	0.05	0.35	0.88	1.91	15.07	▇▁▁▁▁

Histogram: length of turns

entire distribution

# Histogram of `duration` across all rows of `d`, drawn with 80 bins.
ggplot(d, aes(x = duration)) +
  geom_histogram(bins = 80) +
  labs(x = "Duration (seconds)") +
  theme_bw()

turns lasting one second or less only (duration axis limited to 0–1 seconds)

# Same histogram as above, but with the duration axis restricted to the
# 0-1 second range and 25 bins, to zoom in on the shortest turns.
ggplot(d, aes(x = duration)) +
  geom_histogram(bins = 25) +
  labs(x = "Duration (seconds)") +
  xlim(0, 1) +
  theme_bw()

Scatter plot

x-axis is the start time of the turn

y-axis is the length of the turn

We might need to “bin” or “group” the data cleverly if overplotting makes it hard to see all of the data points, so some experimentation with how this is displayed is likely needed.

# Scatter plot of turn start time (x) against turn duration (y).
# Points are semi-transparent so that overlapping points remain visible.
ggplot(d, aes(x = starttime, y = duration)) +
  geom_point(alpha = 0.2) +
  theme_bw()

10 second bins

# Assign every row of `d` to a fixed-width time bin based on its start time.
#
# Arguments:
#   d            data frame with numeric columns `starttime` and `end_time`
#                (both in seconds).
#   bin_duration width of each bin in seconds; a single positive number.
#
# Returns `d` with two added columns:
#   break_ids      factor labelling the interval containing `starttime`,
#                  e.g. "[0,10]", "(10,20]", ...
#   break_ids_dbl  integer index of that interval (1 = first bin).
#
# Bins start at 0 and the first bin is closed on the left, so turns that
# start anywhere in the first second (including exactly at 0) are binned
# rather than silently becoming NA. The number of bins is derived from the
# largest `end_time`, so the rows of `d` do not have to be sorted.
bin_the_data <- function(d, bin_duration = 10) {
  stopifnot(
    is.numeric(bin_duration),
    length(bin_duration) == 1L,
    !is.na(bin_duration),
    bin_duration > 0
  )

  # Latest observed end time; fall back to 0 when there is nothing to bin.
  end_times <- d$end_time[!is.na(d$end_time)]
  last_end <- if (length(end_times) > 0) max(end_times) else 0

  # Number of bins, counting the final "incomplete" bin; always at least one
  # so that `cut()` receives two or more break points.
  n_bins <- max(1, ceiling(last_end / bin_duration))

  # Break points 0, bin_duration, 2 * bin_duration, ...; built by
  # multiplication so they are unique for any positive bin width.
  breaks <- (0:n_bins) * bin_duration

  d$break_ids <- cut(
    d$starttime,
    breaks = breaks,
    include.lowest = TRUE, # keep a start time of exactly 0 in the first bin
    dig.lab = 4
  )
  d$break_ids_dbl <- as.integer(d$break_ids)
  d
}

# Bin the turns by start time into 10-second windows, then display the result.
binned_data_10_seconds <- bin_the_data(d, bin_duration = 10)

binned_data_10_seconds

## # A tibble: 2,912 × 7
##    segmentfile  starttime end_time count_frames duration break_ids break_ids_dbl
##    <chr>            <dbl>    <dbl>        <dbl>    <dbl> <fct>             <int>
##  1 /Users/euge…      3.56     3.73           28   0.17   (1,10]                1
##  2 /Users/euge…     15.1     15.2            24   0.130  (10,20]               2
##  3 /Users/euge…     23.7     23.9            37   0.260  (20,30]               3
##  4 /Users/euge…     24.6     24.9            37   0.260  (20,30]               3
##  5 /Users/euge…     25.5     26.2            83   0.720  (20,30]               3
##  6 /Users/euge…     28.1     28.3            23   0.120  (20,30]               3
##  7 /Users/euge…     28.5     29.1            66   0.550  (20,30]               3
##  8 /Users/euge…     29.5     29.6            20   0.0900 (20,30]               3
##  9 /Users/euge…     34.7     35.0            41   0.300  (30,40]               4
## 10 /Users/euge…     38.0     38.2            26   0.150  (30,40]               4
## # … with 2,902 more rows

# Mean turn duration within each 10-second bin, plotted against the bin index.
binned_data_10_seconds %>%
  group_by(break_ids_dbl) %>%
  summarise(mean_duration = mean(duration)) %>%
  ggplot(mapping = aes(x = break_ids_dbl, y = mean_duration)) +
  geom_point() +
  theme_bw()

Histogram: number of turns

10 second segments

# Count the turns in each 10-second bin (column `n`), then show how those
# per-bin counts are distributed.
binned_data_10_seconds %>%
  count(break_ids_dbl) %>%
  ggplot(mapping = aes(x = n)) +
  geom_histogram(bins = 40) +
  labs(x = "Number of turns for each 10-second segment") +
  theme_bw()

# Repeat the turn-count histogram with 10-minute (600-second) bins.
binned_data_10_minutes <- bin_the_data(d, bin_duration = 10 * 60)

# Count the turns in each 10-minute bin (column `n`), then show how those
# per-bin counts are distributed.
binned_data_10_minutes %>%
  count(break_ids_dbl) %>%
  ggplot(mapping = aes(x = n)) +
  geom_histogram(bins = 40) +
  labs(x = "Number of turns for each 10-minute segment") +
  theme_bw()