Code is here: https://github.com/tca2/uncertainty-team-audio-data-processing
library(tidyverse)
library(skimr)
# Load the output table (one row per turn) from the working directory.
d <- read_csv(file = "Uncertainty_Output.csv")

# Summary statistics for the duration column only.
d %>%
  select(duration) %>%
  skim()
Name | Piped data |
Number of rows | 2912 |
Number of columns | 1 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
duration | 0 | 1 | 1.44 | 1.69 | 0.05 | 0.35 | 0.88 | 1.91 | 15.07 | ▇▁▁▁▁ |
# Distribution of turn durations over the full range of the data.
ggplot(d, mapping = aes(x = duration)) +
  geom_histogram(bins = 80) +
  labs(x = "Duration (seconds)") +
  theme_bw()
# Same histogram, limited to durations between 0 and 1 second.
# Limits set on the x scale remove observations outside the range
# (ggplot2 reports them in a warning) rather than just zooming the view.
ggplot(d, mapping = aes(x = duration)) +
  geom_histogram(bins = 25) +
  labs(x = "Duration (seconds)") +
  scale_x_continuous(limits = c(0, 1)) +
  theme_bw()
The x-axis is the start time of the turn.
The y-axis is the length (duration) of the turn.
It may be hard to see all of the data points, so we might need to “bin” or “group” them cleverly; some experimentation with how this is displayed is likely needed.
# Turn duration against turn start time; points are semi-transparent so that
# overlapping turns remain visible.
ggplot(d, mapping = aes(x = starttime, y = duration)) +
  geom_point(alpha = 0.2) +
  theme_bw()
# Assign every row of `d` to a fixed-width time bin based on its start time.
#
# d            data frame with numeric columns `starttime` and `end_time`
# bin_duration width of each bin in seconds (a single positive number)
#
# Returns `d` with two added columns:
#   break_ids     factor of interval labels produced by cut(); the first bin is
#                 closed on both sides, e.g. "[0,10]", later ones "(10,20]", ...
#   break_ids_dbl integer index of the bin (1 = first bin)
bin_the_data <- function(d, bin_duration = 10) {
  stopifnot(
    is.numeric(bin_duration),
    length(bin_duration) == 1L,
    !is.na(bin_duration),
    bin_duration > 0
  )

  # Use the latest time present in the data instead of the last row, so the
  # result does not depend on the rows being sorted by time. The extra 0 keeps
  # max() well-defined for empty input.
  last_time <- max(c(d$end_time, d$starttime, 0), na.rm = TRUE)

  # Number of bins, inclusive of the last "incomplete" bin; at least one so
  # that cut() always receives two or more breaks.
  times <- max(1, ceiling(last_time / bin_duration))

  # Bin edges start at 0 (not 1): together with include.lowest = TRUE this
  # keeps turns that start within the first second from falling outside every
  # bin, and avoids duplicated breaks when bin_duration is 1.
  breaks <- (0:times) * bin_duration

  # Label each row with its bin, then store the bin index as an integer.
  d$break_ids <- cut(
    d$starttime,
    breaks = breaks,
    include.lowest = TRUE,
    dig.lab = 4
  )
  d$break_ids_dbl <- as.integer(d$break_ids)
  d
}
# Bin the turns into 10-second segments and display the result.
binned_data_10_seconds <- bin_the_data(d, bin_duration = 10)
binned_data_10_seconds
## # A tibble: 2,912 × 7
## segmentfile starttime end_time count_frames duration break_ids break_ids_dbl
## <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int>
## 1 /Users/euge… 3.56 3.73 28 0.17 (1,10] 1
## 2 /Users/euge… 15.1 15.2 24 0.130 (10,20] 2
## 3 /Users/euge… 23.7 23.9 37 0.260 (20,30] 3
## 4 /Users/euge… 24.6 24.9 37 0.260 (20,30] 3
## 5 /Users/euge… 25.5 26.2 83 0.720 (20,30] 3
## 6 /Users/euge… 28.1 28.3 23 0.120 (20,30] 3
## 7 /Users/euge… 28.5 29.1 66 0.550 (20,30] 3
## 8 /Users/euge… 29.5 29.6 20 0.0900 (20,30] 3
## 9 /Users/euge… 34.7 35.0 41 0.300 (30,40] 4
## 10 /Users/euge… 38.0 38.2 26 0.150 (30,40] 4
## # … with 2,902 more rows
# Mean turn duration within each 10-second bin, plotted against the bin index.
binned_data_10_seconds %>%
  group_by(break_ids_dbl) %>%
  summarise(mean_duration = mean(duration)) %>%
  ggplot(mapping = aes(x = break_ids_dbl, y = mean_duration)) +
  geom_point() +
  theme_bw()
# Histogram of the number of turns that fall in each 10-second bin.
# count() returns only the bin indices that occur in the data, so bins
# containing no turns are not represented here.
binned_data_10_seconds %>%
  count(break_ids_dbl) %>%
  ggplot(mapping = aes(x = n)) +
  geom_histogram(bins = 40) +
  labs(x = "Number of turns for each 10-second segment") +
  theme_bw()
# Re-bin the turns into 10-minute (600-second) segments, then plot the
# distribution of the number of turns per segment.
binned_data_10_minutes <- bin_the_data(d, bin_duration = 10 * 60)

binned_data_10_minutes %>%
  count(break_ids_dbl) %>%
  ggplot(mapping = aes(x = n)) +
  geom_histogram(bins = 40) +
  labs(x = "Number of turns for each 10-minute segment") +
  theme_bw()