Setup
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/brialong/Documents/GitHub/babyview-pipeline
library(googlesheets4)
library(lubridate)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.2
library(knitr)
Load family demographics, IDENTIFIABLE (!) with birthdate
# I don't have the correct permissions to do this somehow, reading in manually
#
# families <- read_sheet('https://docs.google.com/file/d/1EbozDZBPTlaV6Sv9Eeho0CkcVhH6JvsT/edit?filetype=msexcel')
families = read_csv(file=here::here('data/BabyView Demographic Sheet.csv')) %>%
select(sid, date_birth, ethnicity, gender, parent_ed) %>%
rename(birthdate = date_birth) %>%
rename(subject_id = sid) %>%
filter(!is.na(subject_id))
## New names:
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...27`
## • `` -> `...28`
## Rows: 1000 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): experimenter, location, study_name, sid, date_test, date_birth, hi...
## dbl  (4): protocol, weeks_old, months_old, parent_ed
## lgl (12): RA, ...18, ...19, ...20, ...21, ...22, ...23, ...24, ...25, ...26,...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# families_deidentified <- families %>%
# select(subject_id, birthdate) %>%
# mutate(birthdate = mdy(birthdate)) %>%
# mutate(birthdate = round_date(birthdate, unit="month")) %>%
# select(-birthdate)
# #
# write_csv(families_deidentified, file='subids_deidentified.csv')
Note that if you use the de-identified data, the demographic analyses below
won't work.
# families = read_csv(file=('subids_deidentified.csv'))
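If you do work from the de-identified file, one option (a sketch; the column names are assumptions based on the commented-out export above) is to add empty demographic columns so the select/join calls below still run, though the age and demographic breakdowns will stay empty without real birthdates:

# Sketch: de-identified fallback with placeholder demographic columns
# (assumes subids_deidentified.csv contains a subject_id column)
families <- read_csv(file = 'subids_deidentified.csv', show_col_types = FALSE) %>%
  mutate(birthdate = NA_character_,
         ethnicity = NA_character_,
         gender = NA_character_,
         parent_ed = NA_real_)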
Load session tracking data
# ongoing data collection
ongoing_session_durations <- read_sheet('https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=754020357#gid=754020357', sheet='Ongoing_data_collection') %>%
select(-Time, -Notes) %>% # these columns cause join errors because of incompatible types (see the sketch below for an alternative)
left_join(families, by=c('subject_id')) %>%
mutate(cohort = 'ongoing')
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for 'bria@stanford.edu'.
## Auto-refreshing stale OAuth token.
## ✔ Reading from "BabyView Session Tracking".
## ✔ Range ''Ongoing_data_collection''.
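An alternative to dropping Time and Notes before the join (a sketch; it assumes read_sheet() returns them as list-columns, which is typical for mixed-type sheet columns, and flatten_chr_col is a hypothetical helper):

# Sketch: flatten incompatible columns to character instead of dropping them
flatten_chr_col <- function(x) {
  if (is.list(x)) map_chr(x, ~ if (length(.x) == 0) NA_character_ else as.character(.x[1])) else as.character(x)
}
# read_sheet(...) %>%
#   mutate(across(c(Time, Notes), flatten_chr_col)) %>%
#   left_join(families, by = 'subject_id') %>%
#   mutate(cohort = 'ongoing')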
# release 1
release_1_session_durations <- read_sheet('https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=1883822719#gid=1883822719', sheet='Main_Release_1_Corrected') %>%
select(-Time, -Notes) %>%
left_join(families, by=c('subject_id')) %>%
mutate(cohort = 'release_1')
## ✔ Reading from "BabyView Session Tracking".
## ✔ Range ''Main_Release_1_Corrected''.
Do some annoying work to get dates out of nested list-columns
all_session_durations <- release_1_session_durations %>%
full_join(ongoing_session_durations) %>%
filter(!is.na(Date)) %>%
filter(map_lgl(Date, ~ !is.null(.x))) %>%
mutate(date_column = map_chr(Date, ~ as.character(.x[1]))) %>%
filter(date_column!='NA') %>%
mutate(date_tested = ymd(date_column))
## Joining with `by = join_by(subject_id, video_id, Week, Date, date_inferred,
## License, `Blackout Portions`, Processed_date, Status, Duration, birthdate,
## ethnicity, gender, parent_ed, cohort)`
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_tested = ymd(date_column)`.
## Caused by warning:
## ! 8 failed to parse.
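A quick way to see which of the 8 entries failed to parse (a sketch using the columns built above):

# Sketch: rows whose Date string could not be parsed by ymd()
all_session_durations %>%
  filter(is.na(date_tested)) %>%
  select(subject_id, video_id, date_column)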
Calculate descriptives from release 1
##
seconds_recorded_first_release = sum(release_1_session_durations$Duration, na.rm=TRUE)
hours_recorded_first_release = (seconds_recorded_first_release/60)/60
##
unprocessed_videos = sum(is.na(release_1_session_durations$Duration))
In the first BV-main release, we have 454.9892639 processed hours
and 2 unprocessed videos (possibly mismatched in some way).
##
seconds_recorded_ongoing = sum(ongoing_session_durations$Duration, na.rm=TRUE)
hours_recorded_ongoing = (seconds_recorded_ongoing/60)/60
##
unprocessed_videos = sum(is.na(ongoing_session_durations$Duration))
In the ongoing data collection pipeline, we have 427.6400556
processed hours and 234 unprocessed videos.
blackout_to_dos = sum(!is.na(all_session_durations$`Blackout Portions`))
Altogether, this makes 882.6293194 hours of video, with 15 videos
that have some portion redacted, and
length(all_session_durations$video_id) videos total (computed in the chunk below).
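A small chunk that reproduces the combined total and evaluates the count referenced just above:

# Combined hours across both cohorts, plus the total number of video rows
total_hours <- hours_recorded_first_release + hours_recorded_ongoing
total_videos <- length(all_session_durations$video_id)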
Wrangle data for plot
Look at number of hours by each subject
hours_by_subject <- all_session_durations %>%
group_by(subject_id) %>%
summarize(num_hours = (sum(Duration, na.rm=TRUE)/60)/60) %>%
arrange(-num_hours)
hours_by_subject
## # A tibble: 31 × 2
## subject_id num_hours
## <chr> <dbl>
## 1 00400001 91.0
## 2 00320001 74.0
## 3 00370002 70.9
## 4 00400002 63.0
## 5 00820001 54.3
## 6 00240001 51.7
## 7 00320003 51.4
## 8 00510002 45.0
## 9 00420001 41.5
## 10 00500001 38.3
## # ℹ 21 more rows
Make the data structure for plotting; we need to calculate ages during
recording
main_plot_temp <- all_session_durations %>%
filter(!is.na(Processed_date)) %>%
filter(!is.na(Duration)) %>%
rowwise() %>%
mutate(age_in_days_during_video = as.numeric(difftime(ymd(date_tested), mdy(birthdate), units='days'))) %>%
filter(age_in_days_during_video>0) %>%
filter(!age_in_days_during_video>10000) %>% # drop rows where the birth year is a typo (e.g., 2204)
mutate(age_in_months_during_video = age_in_days_during_video/30.44)
by_subject <- main_plot_temp %>%
group_by(subject_id, ethnicity, gender, parent_ed) %>%
summarize(total_recorded = (sum(Duration, na.rm=TRUE)/60)/60)
## `summarise()` has grouped output by 'subject_id', 'ethnicity', 'gender'. You
## can override using the `.groups` argument.
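Dividing days by 30.44 is an approximation; a calendar-aware alternative (a sketch using the same birthdate and date_tested columns as above) is to divide a lubridate interval by months(1):

# Sketch: fractional age in months from a lubridate interval
main_plot_temp %>%
  mutate(age_in_months_calendar = interval(mdy(birthdate), date_tested) / months(1)) %>%
  select(subject_id, date_tested, age_in_months_during_video, age_in_months_calendar)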
Histogram of age in days during recording
hist(main_plot_temp$age_in_days_during_video)

Histogram of age in months during recording
hist(main_plot_temp$age_in_months_during_video)

Calculate cumulative time spent recording by age in months during
recording
main_plot_age_bin <- main_plot_temp %>%
filter(!is.na(age_in_months_during_video)) %>%
filter(!is.na(Duration)) %>%
mutate(duration_in_minutes = Duration/60) %>% # per-video duration in minutes
group_by(subject_id) %>%
arrange(age_in_days_during_video) %>%
mutate(minutes_cumulative = cumsum(duration_in_minutes)) %>%
select(subject_id, minutes_cumulative, age_in_days_during_video, ethnicity, gender, parent_ed)
Make sure this matches up
check <- main_plot_temp %>%
filter(!is.na(Duration)) %>%
group_by(subject_id) %>%
summarize(hours = (sum(Duration)/60)/60)
sum(check$hours)
## [1] 872.9098
Main plot of age by duration, finally
ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
geom_point(alpha=.6, size=.5) +
ylab('Cumulative hours of videos') +
xlab('Age (in months) during recording') +
theme(legend.position = 'none') +
xlim(0,36)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# geom_line(aes(group=subject_id)) +
# facet_wrap(~subject_id)
# ggsave('age_by_duration.pdf', width=4, units='in')
Recording by basic demographics
Do some faceting by ethnicity
ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
geom_point(alpha=.6, size=.5) +
ylab('Cumulative hours of videos') +
xlab('Age (in months) during recording') +
theme(legend.position = 'none') +
geom_line(aes(group=subject_id)) +
facet_wrap(~ethnicity)

Do some faceting by gender
ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
geom_point(alpha=.6, size=.5) +
ylab('Cumulative hours of videos') +
xlab('Age (in months) during recording') +
theme(legend.position = 'none') +
geom_line(aes(group=subject_id)) +
facet_wrap(~gender)

Do some faceting by parent ed
ggplot(main_plot_age_bin, aes(x=age_in_days_during_video/30.44, y=minutes_cumulative/60, col=subject_id)) +
geom_point(alpha=.6, size=.5) +
ylab('Cumulative hours of videos') +
xlab('Age (in months) during recording') +
theme(legend.position = 'none') +
geom_line(aes(group=subject_id)) +
facet_wrap(~parent_ed)

Calculate the rate of data collection and plot it
date_started = "2023-03-29"
time_spent <- main_plot_temp %>%
mutate(date_tested = lubridate::ymd(date_tested)) %>%
mutate(time_since_starting = as.numeric(difftime(ymd(date_tested), ymd(date_started), units='weeks'))) %>%
mutate(rounded_weeks = round(time_since_starting)) %>%
group_by(rounded_weeks) %>%
summarize(num_subs = length(unique(subject_id)), num_hours = (sum(Duration, na.rm=TRUE)/60)/60)
ggplot(data=time_spent, aes(x=rounded_weeks, y=num_hours, size=num_subs)) +
geom_point(alpha=.2) +
geom_smooth(method='lm') +
theme_few() +
xlab('Weeks since beginning data collection') +
ylab('Hours per week collected')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
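The dropped-aesthetic and linewidth warnings come from size being set in the global aes() and inherited by geom_smooth(); a sketch that maps size to the points only avoids both:

ggplot(data=time_spent, aes(x=rounded_weeks, y=num_hours)) +
  geom_point(aes(size=num_subs), alpha=.2) + # size only affects the points now
  geom_smooth(method='lm') +
  theme_few() +
  xlab('Weeks since beginning data collection') +
  ylab('Hours per week collected')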

ggplot(data=time_spent %>% filter(num_subs>10), aes(x=rounded_weeks, y=num_hours, size=num_subs)) +
geom_point(alpha=.2) +
geom_smooth(method='lm') +
theme_few() +
xlab('Weeks since beginning data collection') +
ylab('Hours per week collected')
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
