Dataset

The time spent per user per day was calculated as the difference between that user's first and last event timestamps on that day.

The daily time differences were then summed for each unique user (study_id) to get the total time spent per user.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(knitr)

## Work from the project folder so the relative CSV path below resolves.
setwd("/Users/owad/Desktop/Work")

## Load the cleaned events export, then derive each event's calendar date
## from `timestamp`; the per-day grouping further down keys on this column.
## NOTE(review): `date` is taken from `timestamp`, not `ppt_timestamp`. If
## `ppt_timestamp` is the participant's local time, confirm which day
## boundary is intended.
df <- read.csv("CLEANED_Events Tab.csv")
df$date <- as.Date(df$timestamp)
head(df)
##   event_id              name           timestamp                       note
## 1      817 StartedEnrollment 2019-10-03 04:41:02   /enrollment/get-started/
## 2      820   ScreenerStarted 2019-10-03 04:41:03     /enrollment/step-one/1
## 3      826 ScreenerCompleted 2019-10-03 04:41:30    /enrollment/step-one/36
## 4      829   AcceptedConsent 2019-10-03 04:42:27       /enrollment/consent/
## 5      832    AccountCreated 2019-10-03 04:43:01 /enrollment/account-setup/
## 6      833      UserLoggedIn 2019-10-03 04:43:01 /enrollment/account-setup/
##   study_id       ppt_timestamp Real       date
## 1        7 2019-10-03 00:41:02    1 2019-10-03
## 2        7 2019-10-03 00:41:03    1 2019-10-03
## 3        7 2019-10-03 00:41:30    1 2019-10-03
## 4        7 2019-10-03 00:42:27    1 2019-10-03
## 5        7 2019-10-03 00:43:01    1 2019-10-03
## 6        7 2019-10-03 00:43:01    1 2019-10-03

Summary Table

## Minutes between each user's earliest and latest event on each day.
## min()/max() pick the extremes regardless of row order, whereas
## first()/last() silently depend on the file already being time-sorted.
## (Zero-padded "YYYY-MM-DD HH:MM:SS" strings sort chronologically.)
## A fixed `tz` keeps a daylight-saving shift in the session's local time
## zone from adding or removing an hour when the strings are parsed.
df_summary <- df %>%
    group_by(study_id, date) %>%
    summarize(time_diff = difftime(max(timestamp), min(timestamp),
                                   tz = "UTC", units = "mins"))
## `summarise()` has grouped output by 'study_id'. You can override using the
## `.groups` argument.
## Time Spent per Day per User (preview of the first six rows)
head(df_summary, n = 6L)
## # A tibble: 6 × 3
## # Groups:   study_id [1]
##   study_id date       time_diff     
##      <int> <date>     <drtn>        
## 1        7 2019-10-03  24.06667 mins
## 2        7 2020-01-01   0.00000 mins
## 3        7 2020-01-06   0.00000 mins
## 4        7 2020-01-07 788.35000 mins
## 5        7 2020-03-30   0.00000 mins
## 6        7 2020-08-10   0.00000 mins
## Collapse the per-day durations into a single total per user.
df2 <- summarise(
    group_by(df_summary, study_id),
    total_time = sum(time_diff)
)

## Total Time spent per User
head(df2, n = 6L)
## # A tibble: 6 × 2
##   study_id total_time    
##      <int> <drtn>        
## 1        7 825.03333 mins
## 2        8  12.90000 mins
## 3        9  50.90000 mins
## 4       10 209.26667 mins
## 5       11  82.18333 mins
## 6       12 552.25000 mins

Summary Statistics for User Time

## Distribution of total time per user: mean, standard deviation, quartiles.
sum_stat <- df2 %>%
    summarize(mean = mean(total_time),
        ## sd() returns a bare number for difftime input; re-attach the
        ## units so this column is labelled like the others.
        std = as.difftime(sd(total_time), units = units(total_time)),
        "25%" = quantile(total_time, probs = 0.25),
        "50%" = quantile(total_time, probs = 0.5),
        "75%" = quantile(total_time, probs = 0.75))
## One alignment code per column (the original passed four codes for the
## five columns of this table).
kable(sum_stat, caption = "Statistics Table", align = rep("c", ncol(sum_stat)))
Statistics Table
mean std 25% 50% 75%
404.1509 mins 466.0054 25.14167 mins 227.7667 mins 641.1333 mins