library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/brialong/Documents/GitHub/babyview-dataset
library(googlesheets4)
library(lubridate)
library(ggthemes)
library(knitr)
Note: the source demographics file is IDENTIFIABLE (!) because it contains exact birthdates, so the code that reads it is commented out below.
Here we use a deidentified version (birthdates rounded to the nearest month) to check rough ages.
# families = read_csv(file=here::here('data/demographics/demographics_2025.csv')) %>%
# as_tibble() %>%
# filter(study_name == 'BabyView') %>% # no pilots
# select(sid, date_birth, ethnicity, gender, num_lang, parent_ed) %>%
# rename(birthdate = date_birth) %>%
# rename(subject_id = sid) %>%
# filter(!is.na(subject_id))
# We collected data from `r length(unique(families$subject_id))`.
# In our sample, `r sum(families$num_lang>1)` children are exposed to more than one language.
# families_short <- families %>%
# select(subject_id, birthdate, num_lang)
# families_deidentified <- families %>%
# select(subject_id, birthdate, num_lang) %>%
# mutate(birthdate = mdy(birthdate)) %>%
# mutate(birthdate = round_date(birthdate, unit="month"))
# write_csv(families_deidentified, file=here::here('data/release_2.0/subids_deidentified.csv'))
Load in deidentified info
# Deidentified per-subject table; the file path is relative to the working directory.
families_short <- read_csv("subids_deidentified.csv")
## Rows: 41 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): subject_id
## dbl (1): num_lang
## date (1): birthdate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Ongoing data collection: rows of the tracking sheet whose Status is "Uploaded",
# joined with the deidentified subject table.
ongoing_session_durations <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=754020357#gid=754020357",
  sheet = "Ongoing_data_collection"
) %>%
  filter(Status == "Uploaded") %>%
  # Keep only the columns used downstream. This also leaves out `Notes`, which
  # previously caused join errors because of incompatible types.
  select(
    subject_id, Upload_fname, Date,
    exclude = `Blackout Portions`,
    Duration
  ) %>%
  left_join(families_short, by = "subject_id") %>%
  mutate(
    cohort = "ongoing",
    # Keep everything before ".zip". `fixed()` makes the match literal; as a
    # bare pattern ".zip" is a regex in which "." matches any character.
    video_name = str_split_fixed(Upload_fname, fixed(".zip"), 2)[, 1]
  ) %>%
  # Date arrives as a list column for this sheet: drop NULL cells, then parse
  # the first element of each cell. ymd() yields NA (with a warning) for any
  # cell that is not in year-month-day form.
  filter(map_lgl(Date, ~ !is.null(.x))) %>%
  mutate(date_tested = ymd(map_chr(Date, ~ as.character(.x[1])))) %>%
  select(-Date)
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for 'brlong@ucsd.edu'.
## ✔ Reading from "BabyView Session Tracking".
## ✔ Range ''Ongoing_data_collection''.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_tested = ymd(date_column)`.
## Caused by warning:
## ! 8 failed to parse.
## Not missing video names
# sum(is.na(ongoing_session_durations$Upload_fname))
# Release 1: rows of the corrected release-1 sheet flagged as present in the
# storage bucket, joined with the deidentified subject table.
release_1_session_durations <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=1883822719#gid=1883822719",
  sheet = "Main_Release_1_Corrected"
) %>%
  filter(Vid_In_Storage_Bucket == "Y") %>% # filter to uploaded vids
  # `exclude` is non-missing when there was anything to be excluded
  select(
    subject_id, Upload_fname, Date,
    exclude = `Blackout Portions`,
    Duration, `grace notes`
  ) %>%
  left_join(families_short, by = "subject_id") %>%
  mutate(
    cohort = "release_1",
    # Keep everything before ".zip". `fixed()` makes the match literal; as a
    # bare pattern ".zip" is a regex in which "." matches any character.
    video_name = str_split_fixed(Upload_fname, fixed(".zip"), 2)[, 1]
  ) %>%
  # Date arrives as a list column for this sheet: drop NULL cells, then parse
  # the first element of each cell. ymd() yields NA (with a warning) for any
  # cell that is not in year-month-day form.
  filter(map_lgl(Date, ~ !is.null(.x))) %>%
  mutate(date_tested = ymd(map_chr(Date, ~ as.character(.x[1])))) %>%
  select(-Date)
## ✔ Reading from "BabyView Session Tracking".
## ✔ Range ''Main_Release_1_Corrected''.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_tested = ymd(date_column)`.
## Caused by warning:
## ! 46 failed to parse.
## Still missing 1090 video names as of 10pm on 2/10/25
release_1_session_durations %>%
  pull(Upload_fname) %>%
  is.na() %>%
  sum()
## [1] 1090
# Luna, release 1: rows of the Luna_V1_Corrected sheet flagged as present in the
# storage bucket, joined with the deidentified subject table.
luna_release_1_session_durations <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=1883822719#gid=1883822719",
  sheet = "Luna_V1_Corrected"
) %>%
  filter(Vid_In_Storage_Bucket == "Y") %>%
  # `exclude` is non-missing when there was anything to be excluded
  select(
    subject_id, Upload_fname, Date,
    exclude = `Blackout Portions`,
    Duration
  ) %>%
  left_join(families_short, by = "subject_id") %>%
  mutate(
    cohort = "ego_single",
    # Keep everything before ".zip". `fixed()` makes the match literal; as a
    # bare pattern ".zip" is a regex in which "." matches any character.
    video_name = str_split_fixed(Upload_fname, fixed(".zip"), 2)[, 1]
  ) %>%
  # Date arrives as a list column for this sheet: drop NULL cells, then parse
  # the first element of each cell. ymd() yields NA (with a warning) for any
  # cell that is not in year-month-day form.
  filter(map_lgl(Date, ~ !is.null(.x))) %>%
  mutate(date_tested = ymd(map_chr(Date, ~ as.character(.x[1])))) %>%
  select(-Date)
## ✔ Reading from "BabyView Session Tracking".
## ✔ Range ''Luna_V1_Corrected''.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_tested = ymd(date_column)`.
## Caused by warning:
## ! 2 failed to parse.
## Not missing luna names
luna_release_1_session_durations %>%
  pull(Upload_fname) %>%
  is.na() %>%
  sum()
## [1] 0
# Luna, release 2: rows of the Luna_Round_2_Ongoing sheet whose Status is
# "Uploaded", joined with the deidentified subject table.
luna_release_2_session_durations <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1mAti9dBNUqgNQQIIsnPb5Hu59ovKCUh9LSYOcQvzt2U/edit?gid=1883822719#gid=1883822719",
  sheet = "Luna_Round_2_Ongoing"
) %>%
  filter(Status == "Uploaded") %>%
  # `exclude` is non-missing when there was anything to be excluded
  select(
    subject_id, Upload_fname, Date,
    exclude = `Delete?`,
    Duration
  ) %>%
  left_join(families_short, by = "subject_id") %>%
  mutate(
    cohort = "ego_single",
    # Keep everything before ".zip". `fixed()` makes the match literal; as a
    # bare pattern ".zip" is a regex in which "." matches any character.
    video_name = str_split_fixed(Upload_fname, fixed(".zip"), 2)[, 1]
  ) %>%
  # Unlike the other sheets, Date is passed to ymd() directly here (no list
  # unwrapping). A single ymd() call suffices; the result was previously
  # parsed a second time.
  mutate(date_tested = ymd(Date)) %>%
  select(-Date)
## ✔ Reading from "BabyView Session Tracking".
## ✔ Range ''Luna_Round_2_Ongoing''.
# Count of Luna release-2 rows with no upload filename
luna_release_2_session_durations %>%
  pull(Upload_fname) %>%
  is.na() %>%
  sum()
## [1] 0
# Combine the four per-sheet tables by successive natural full joins (keys are
# all columns the two sides share), then derive the child's age at recording.
all_sessions <- list(
  release_1_session_durations,
  ongoing_session_durations,
  luna_release_1_session_durations,
  luna_release_2_session_durations
) %>%
  reduce(full_join) %>%
  # note actual birthdays are in a different date format, but this works here
  mutate(
    age_in_days_during_video = as.numeric(
      difftime(ymd(date_tested), ymd(birthdate), units = "days")
    )
  ) %>%
  select(-birthdate)
## Joining with `by = join_by(subject_id, Upload_fname, exclude, Duration,
## birthdate, num_lang, cohort, video_name, date_tested)`
## Joining with `by = join_by(subject_id, Upload_fname, exclude, Duration,
## birthdate, num_lang, cohort, video_name, date_tested)`
## Joining with `by = join_by(subject_id, Upload_fname, exclude, Duration,
## birthdate, num_lang, cohort, video_name, date_tested)`
# Copy video_name into `filename`, the key used to join against the release
# file list, and sort by it.
to_join <- all_sessions %>%
  mutate(filename = video_name) %>%
  arrange(filename)
Here’s the file list from Khais’ pull on GCP, try to join
# Start from the included-videos file list and attach session metadata wherever
# a filename matches; unmatched files keep NA in the metadata columns.
joined_release_2 <- here::here("data/included_videos.csv") %>%
  read_csv() %>%
  left_join(to_join, by = "filename")
## Rows: 5041 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): filename
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Files for which the join produced a subject_id
merged <- filter(joined_release_2, !is.na(subject_id))

# Filenames for which the join produced no subject_id
no_merge <- select(
  filter(joined_release_2, is.na(subject_id)),
  filename
)
# write_csv(no_merge, file=here::here('data/no_merge_vids.csv'))
How far did we get? We got 2941 videos out of the 5041 we were hoping to match. We are still missing information about 2100 files. We are still missing filenames from release 1 for 1090 files.
# Files whose name contains "LUNA" (a logical `luna` column is kept as the flag)
ego_single_child <- joined_release_2 %>%
  mutate(luna = str_detect(filename, "LUNA")) %>%
  filter(luna)

# The subset of those files that has no subject_id after the join
ego_single_child_no_merge <- ego_single_child %>%
  filter(is.na(subject_id))
None of the missing files are luna files.
# Files in the release list whose joined session row has a non-missing
# `exclude` value.
to_delete_release2 <- joined_release_2 %>%
  filter(!is.na(exclude)) %>%
  select(filename, subject_id, date_tested, Duration)

# these videos were recorded but then manually deleted and are not in the release or in the raw/storage buckets
# Sessions flagged for exclusion in the tracking sheets that have no counterpart
# in to_delete_release2. The join keys are spelled out; they are the same three
# columns the implicit natural join used, so `filename` is not part of the
# match and no longer needs renaming out of the way.
# NOTE(review): matching on video_name == filename may be the intended, more
# precise key — confirm before relying on this list.
to_delete_from_spreadhseet <- all_sessions %>%
  filter(!is.na(exclude)) %>%
  anti_join(
    to_delete_release2,
    by = c("subject_id", "Duration", "date_tested")
  )
## Joining with `by = join_by(subject_id, Duration, date_tested)`
# write_csv(to_delete_release2, file=here::here('data/to_delete_blackout_vids.csv'))