library(tidyverse)
library(here)
# Load the aligned prosodic-feature table from the project's data/aligning
# folder (path resolved relative to the project root by here()).
prosodic_features <-
  here("data", "aligning", "joined-data-prosodic-uncertainty-aligned.csv") %>%
  read_csv()

# Load the aligned, merged content log from the same folder.
content_log <-
  here(
    "data",
    "aligning",
    "LogClass_IG__T103 14-04-22 Content Log Merged-aligned.csv"
  ) %>%
  read_csv()

Note that the values in frame_time_minutes_aligned (e.g., approximately 2.96) are expressed in the units of the UT. For example, a frame_time_minutes value of 0.059 corresponds to a UT time of approximately 2.96. Thus, the frame_time_minutes_aligned variable represents each time stamp in terms of the UT, and it can be used to join these data with other data that are on the UT.

Here’s a brief look at the two variables:

# Print the original and the UT-aligned time stamp columns side by side.
select(
  prosodic_features,
  frame_time_minutes,
  frame_time_minutes_aligned
)
## # A tibble: 2,913 × 2
##    frame_time_minutes frame_time_minutes_aligned
##                 <dbl>                      <dbl>
##  1             0.0593                       2.96
##  2             0.251                        3.15
##  3             0.394                        3.29
##  4             0.410                        3.31
##  5             0.426                        3.33
##  6             0.469                        3.37
##  7             0.475                        3.38
##  8             0.492                        3.39
##  9             0.579                        3.48
## 10             0.634                        3.53
## # … with 2,903 more rows

Next, we can filter the prosodic features data to include only the segment with the small group we are focused on:

4 minutes, 50 seconds = 4.8333333 minutes; 25 minutes, 50 seconds = 25.8333333 minutes.

# Keep only the frames whose UT-aligned time stamp falls strictly between
# 4 min 50 s (~4.83 minutes) and 25 min 50 s (~25.83 minutes). Both bounds are
# exclusive. Multiple conditions passed to filter() are combined with AND.
prosodic_features_filtered <- prosodic_features %>%
  filter(
    frame_time_minutes_aligned > (4 + 50 / 60), # later than 4:50 (~4.83)
    frame_time_minutes_aligned < (25 + 50 / 60) # earlier than 25:50 (~25.83)
  )

Let’s look just at the key variables:

# Reduce the filtered table to the aligned time stamp, the columns spanning
# F0_SMA through pcm_loudness_sma (positional range, so it depends on the
# column order of the input), and the original time stamp.
prosodic_features_filtered_key_vars <- select(
  prosodic_features_filtered,
  frame_time_minutes_aligned,
  F0_SMA:pcm_loudness_sma,
  frame_time_minutes
)

# Print a per-column summary of the key-variable table (missing counts, mean,
# sd, quantiles and an inline histogram). The piped form is kept on purpose:
# the summary header reports the data name as "Piped data" for this call shape.
prosodic_features_filtered_key_vars %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 550
Number of columns 4
_______________________
Column type frequency:
numeric 4
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
frame_time_minutes_aligned 0 1 15.15 6.27 4.84 9.52 15.08 20.79 25.78 ▇▆▇▆▇
F0_SMA 0 1 74.92 54.37 0.00 40.68 65.08 97.60 359.13 ▇▅▁▁▁
pcm_loudness_sma 0 1 0.16 0.06 0.06 0.12 0.15 0.19 0.42 ▆▇▃▁▁
frame_time_minutes 0 1 12.25 6.27 1.94 6.62 12.18 17.89 22.88 ▇▆▇▆▇

Let’s write this file so we can use it in other analyses.

# Save the key-variable table as CSV. The path is relative, so the file lands
# in the current working directory rather than under the here() project root.
prosodic_features_filtered_key_vars %>%
  write_csv("prosodic-features-filtered-key-vars.csv")

Visualizations

Let’s visualize these variables for the roughly 20 minute segment:

F0_SMA

# Scatter plot of F0_SMA against the UT-aligned time stamp.
p <- ggplot(
  data = prosodic_features_filtered_key_vars,
  mapping = aes(x = frame_time_minutes_aligned, y = F0_SMA)
) +
  geom_point()

# Render the static ggplot as an interactive plotly widget.
plotly::ggplotly(p)

pcm_loudness_sma

# Scatter plot of pcm_loudness_sma against the UT-aligned time stamp.
p <- ggplot(
  data = prosodic_features_filtered_key_vars,
  mapping = aes(x = frame_time_minutes_aligned, y = pcm_loudness_sma)
) +
  geom_point()

# Render the static ggplot as an interactive plotly widget.
plotly::ggplotly(p)

visualizing content log

needs help

4 minutes, 50 seconds = 4.8333333 minutes; 25 minutes, 50 seconds = 25.8333333 minutes.

# Keep the code label and start time of each content-log entry, restricted to
# entries starting strictly between 4 min 50 s and 25 min 50 s (the same
# window applied to the prosodic features).
content_log_segmented <- content_log %>%
  select(code01, starttime_minutes) %>%
  filter(
    starttime_minutes > (4 + 50 / 60),
    starttime_minutes < (25 + 50 / 60)
  )

# Strip plot of content-log entries: every entry sits at y = 1, positioned by
# its start time and coloured by its code. The x axis is pinned to the
# 4:50-25:50 window.
ggplot(
  data = content_log_segmented,
  mapping = aes(x = starttime_minutes, y = 1, color = code01)
) +
  xlim(4 + 50 / 60, 25 + 50 / 60) +
  geom_point()

pcm_loudness and content log

trying to viz content log + prosodic features - pcm_loudness

# Loudness over time, with the content-log entries overlaid along y = 0 as
# large crosses coloured by code.
ggplot(
  data = prosodic_features_filtered_key_vars,
  mapping = aes(x = frame_time_minutes_aligned, y = pcm_loudness_sma)
) +
  geom_point() +
  geom_point(
    data = content_log_segmented,
    mapping = aes(x = starttime_minutes, y = 0, color = code01),
    shape = 3,
    size = 4
  )

F0_SMA and content log

trying to viz content log + prosodic features - F0_SMA

# F0_SMA over time, with the content-log entries overlaid along y = 0 as
# large crosses coloured by code.
ggplot(
  data = prosodic_features_filtered_key_vars,
  mapping = aes(x = frame_time_minutes_aligned, y = F0_SMA)
) +
  geom_point() +
  geom_point(
    data = content_log_segmented,
    mapping = aes(x = starttime_minutes, y = 0, color = code01),
    shape = 3,
    size = 4
  )