Setup

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(here)

## here() starts at /Users/brialong/Documents/GitHub/babyview-pipeline

library(googlesheets4)
library(lubridate)
library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.3.2

library(knitr)

Load family demographics, IDENTIFIABLE (!) with birthdate

# I don't have the correct permsissions to do this somehow, reading in manually
# 
# families <- read_sheet('https://docs.google.com/file/d/1EbozDZBPTlaV6Sv9Eeho0CkcVhH6JvsT/edit?filetype=msexcel')

families = read_csv(file=here::here('data/BabyView Demographic Sheet.csv')) %>%
  select(sid, date_birth, ethnicity, gender, parent_ed) %>%
  rename(birthdate = date_birth) %>%
  rename(subject_id = sid) %>%
  filter(!is.na(subject_id))

## New names:
## Rows: 1000 Columns: 28
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (12): experimenter, location, study_name, sid, date_test, date_birth, hi... dbl
## (4): protocol, weeks_old, months_old, parent_ed lgl (12): RA, ...18, ...19,
## ...20, ...21, ...22, ...23, ...24, ...25, ...26,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`

# families_deidentified <- families %>%
#   select(subject_id, birthdate) %>%
#   mutate(birthdate = mdy(birthdate)) %>%
#   mutate(birthdate = round_date(birthdate, unit="month")) %>%
# select(-birthdate)
# # 
# write_csv(families_deidentified, file='subids_deidentified.csv')

note that if you use de-identified data, demographics stuff won’t work

# families = read_csv(file=('subids_deidentified.csv'))

Load session tracking data

# ongoing data collection
ongoing_session_durations <- read_sheet('https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=754020357#gid=754020357',sheet='Ongoing_data_collection'
) %>%
  select(-Time, -Notes) %>% # causing join errors because incompatible types
  left_join(families, by=c('subject_id')) %>%
  mutate(cohort = 'ongoing')

## ! Using an auto-discovered, cached token.

##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.

##   See gargle's "Non-interactive auth" vignette for more details:

##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>

## ℹ The googlesheets4 package is using a cached token for 'bria@stanford.edu'.

## Auto-refreshing stale OAuth token.

## ✔ Reading from "BabyView Session Tracking".

## ✔ Range ''Ongoing_data_collection''.

# release 1
release_1_session_durations <- read_sheet('https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=1883822719#gid=1883822719', sheet='Main_Release_1_Corrected') %>%
  select(-Time, -Notes) %>%
  left_join(families, by=c('subject_id')) %>%
  mutate(cohort = 'release_1')

## ✔ Reading from "BabyView Session Tracking".

## ✔ Range ''Main_Release_1_Corrected''.

Do some annoying work to get dates out of nested frames

all_session_durations <- release_1_session_durations %>%
  full_join(ongoing_session_durations) %>%
  filter(!is.na(Date)) %>%
  filter(map_lgl(Date, ~ !is.null(.x)))  %>%
  mutate(date_column = map_chr(Date, ~ as.character(.x[1])))  %>%
  filter(date_column!='NA') %>%
  mutate(date_tested = ymd(date_column))

## Joining with `by = join_by(subject_id, video_id, Week, Date, date_inferred,
## License, `Blackout Portions`, Processed_date, Status, Duration, birthdate,
## ethnicity, gender, parent_ed, cohort)`

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_tested = ymd(date_column)`.
## Caused by warning:
## !  8 failed to parse.

Calculate descriptives from release 1

##
seconds_recorded_first_release = sum(release_1_session_durations$Duration, na.rm=TRUE)
hours_recorded_first_release = (seconds_recorded_first_release/60)/60
##
unprocessed_videos = sum(is.na(release_1_session_durations$Duration))

In the first BV-main release, we have 454.9892639 processed hours, and 2 unprocessed videos (maybe mismatched in some way).

##
seconds_recorded_ongoing = sum(ongoing_session_durations$Duration, na.rm=TRUE)
hours_recorded_ongoing = (seconds_recorded_ongoing/60)/60
##
unprocessed_videos = sum(is.na(ongoing_session_durations$Duration))

In the ongoing data collection pipeline, we have 427.6400556 processed hours, and 234 unprocessed videos.

blackout_to_dos = sum(!is.na(all_session_durations$`Blackout Portions`))

All together, this makes 882.6293194 hours of video, with 15 videos that have some portion redacted, and length(all_session_durations$video_id) videos total.

Wrangle data for plot

Look at number of hours by each subject

hours_by_subject <- all_session_durations %>% 
  group_by(subject_id) %>%
  summarize(num_hours = (sum(Duration, na.rm=TRUE)/60)/60) %>%
  arrange(-num_hours) 

hours_by_subject

## # A tibble: 31 × 2
##    subject_id num_hours
##    <chr>          <dbl>
##  1 00400001        91.0
##  2 00320001        74.0
##  3 00370002        70.9
##  4 00400002        63.0
##  5 00820001        54.3
##  6 00240001        51.7
##  7 00320003        51.4
##  8 00510002        45.0
##  9 00420001        41.5
## 10 00500001        38.3
## # ℹ 21 more rows

Make datastructure for plotting – need to calculate ages during recording

main_plot_temp <- all_session_durations %>%
  filter(!is.na(Processed_date)) %>%
  filter(!is.na(Duration)) %>%
  rowwise() %>%
  mutate(age_in_days_during_video = as.numeric(difftime(ymd(date_tested), mdy(birthdate), units='days'))) %>%
  filter(age_in_days_during_video>0) %>%
  filter(!age_in_days_during_video>10000) %>% # some years = 2204
  mutate(age_in_months_during_video = age_in_days_during_video/30.44)

by_subject <- main_plot_temp %>%
  group_by(subject_id, ethnicity, gender, parent_ed) %>%
  summarize(total_recorded = (sum(Duration, na.rm=TRUE)/60)/60)

## `summarise()` has grouped output by 'subject_id', 'ethnicity', 'gender'. You
## can override using the `.groups` argument.

Histogram of age in days during recording

hist(main_plot_temp$age_in_days_during_video)

Histogram of age in months during recording

hist(main_plot_temp$age_in_months_during_video)

Calculate cumulative time spent recording by age in months during recording

main_plot_age_bin <- main_plot_temp %>%
  filter(!is.na(age_in_months_during_video)) %>%
  filter(!is.na(Duration)) %>%
  mutate(duration_in_minutes = sum(Duration)/60) %>%
  group_by(subject_id) %>%
  arrange(age_in_days_during_video) %>%
  mutate(minutes_cumulative = cumsum(duration_in_minutes)) %>%
  select(subject_id, minutes_cumulative, age_in_days_during_video, ethnicity, gender, parent_ed)

Make sure this matches up

check <- main_plot_temp %>%
  filter(!is.na(Duration)) %>%
  group_by(subject_id) %>%
  summarize(hours = (sum(Duration)/60)/60)

sum(check$hours)

## [1] 872.9098

Main plot of age by duration, finally

ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
  geom_point(alpha=.6, size=.5) +
  theme(legend.position='right') +
  ylab('Cumulative hours of videos') +
  xlab('Age (in months) during recording')  +
  theme(legend.position = 'none') +
  xlim(0,36)

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

  # geom_line(aes(group=subject_id)) +
  # facet_wrap(~subject_id)

# ggsave('age_by_duration.pdf', width=4, units='in')

Recording by basic demographics

Do some faceting by by ethnicity

ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
  geom_point(alpha=.6, size=.5) +
  theme(legend.position='right') +
  ylab('Cumulative hours of videos') +
  xlab('Age (in months) during recording')  +
  theme(legend.position = 'none') +
  geom_line(aes(group=subject_id)) +
  facet_wrap(~ethnicity)

Do some faceting by by gender

ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
  geom_point(alpha=.6, size=.5) +
  theme(legend.position='right') +
  ylab('Cumulative hours of videos') +
  xlab('Age (in months) during recording')  +
  theme(legend.position = 'none') +
  geom_line(aes(group=subject_id)) +
  facet_wrap(~gender)

Do some faceting by parent ed

ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
  geom_point(alpha=.6, size=.5) +
  theme(legend.position='right') +
  ylab('Cumulative hours of videos') +
  xlab('Age (in months) during recording')  +
  theme(legend.position = 'none') +
  geom_line(aes(group=subject_id)) +
  facet_wrap(~parent_ed)

Caclulate rate of data collection and plot it

date_started = "2023-03-29"

time_spent <- main_plot_temp %>%
  mutate(date_tested = lubridate::ymd(date_tested)) %>%
  mutate(time_since_starting = as.numeric(difftime(ymd(date_tested), ymd(date_started), units='weeks'))) %>%
  mutate(rounded_weeks = round(time_since_starting)) %>%
  group_by(rounded_weeks) %>%
  summarize(num_subs = length(unique(subject_id)), num_hours = (sum(Duration, na.rm=TRUE)/60)/60)

ggplot(data=time_spent, aes(x=rounded_weeks, y=num_hours, size=num_subs)) +
  geom_point(alpha=.2) +
  geom_smooth(method='lm') +
  theme_few() +
  xlab('Weeks since beginning data collection') +
  ylab('Hours per week collected')

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

ggplot(data=time_spent %>% filter(num_subs>10), aes(x=rounded_weeks, y=num_hours, size=num_subs)) +
  geom_point(alpha=.2) +
  geom_smooth(method='lm') +
  theme_few() +
  xlab('Weeks since beginning data collection') +
  ylab('Hours per week collected')

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

enrollment visualization for new pipeline

Bria Long

2024-10-25