The time spent per user per day was calculated as the difference between the first and the last event timestamp recorded for that user on that day.
These daily time differences were then summed for each unique user (study_id) to get the total time spent per user.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(knitr)
# Load the cleaned event log and tag every event with its calendar date.
# NOTE(review): setwd() ties the script to one machine's directory layout;
# it is kept because later steps may rely on the working directory.
setwd("/Users/owad/Desktop/Work")
df <- mutate(
  read.csv("CLEANED_Events Tab.csv"),
  # as.Date() keeps only the date part of the timestamp.
  date = as.Date(timestamp)
)
# Preview the first rows of the loaded data.
head(df)
## event_id name timestamp note
## 1 817 StartedEnrollment 2019-10-03 04:41:02 /enrollment/get-started/
## 2 820 ScreenerStarted 2019-10-03 04:41:03 /enrollment/step-one/1
## 3 826 ScreenerCompleted 2019-10-03 04:41:30 /enrollment/step-one/36
## 4 829 AcceptedConsent 2019-10-03 04:42:27 /enrollment/consent/
## 5 832 AccountCreated 2019-10-03 04:43:01 /enrollment/account-setup/
## 6 833 UserLoggedIn 2019-10-03 04:43:01 /enrollment/account-setup/
## study_id ppt_timestamp Real date
## 1 7 2019-10-03 00:41:02 1 2019-10-03
## 2 7 2019-10-03 00:41:03 1 2019-10-03
## 3 7 2019-10-03 00:41:30 1 2019-10-03
## 4 7 2019-10-03 00:42:27 1 2019-10-03
## 5 7 2019-10-03 00:43:01 1 2019-10-03
## 6 7 2019-10-03 00:43:01 1 2019-10-03
# Time spent per user per day: the span, in minutes, between the earliest and
# the latest event timestamp inside each (study_id, date) group.
# min()/max() replace first()/last() so the span no longer depends on the row
# order of `df`, which nothing above sorts.
# NOTE(review): timestamps are parsed with an explicit UTC zone so a local
# daylight-saving shift cannot distort a daily span -- confirm that the
# `timestamp` column is recorded in UTC.
df_summary <- df %>%
  mutate(event_time = as.POSIXct(timestamp, tz = "UTC")) %>%
  group_by(study_id, date) %>%
  summarize(
    time_diff = difftime(max(event_time), min(event_time), units = "mins")
  )
## `summarise()` has grouped output by 'study_id'. You can override using the
## `.groups` argument.
## Time Spent per Day per User
head(df_summary)
## # A tibble: 6 × 3
## # Groups: study_id [1]
## study_id date time_diff
## <int> <date> <drtn>
## 1 7 2019-10-03 24.06667 mins
## 2 7 2020-01-01 0.00000 mins
## 3 7 2020-01-06 0.00000 mins
## 4 7 2020-01-07 788.35000 mins
## 5 7 2020-03-30 0.00000 mins
## 6 7 2020-08-10 0.00000 mins
# Total time per user: add up every daily span that belongs to one study_id.
df2 <- summarize(
  group_by(df_summary, study_id),
  total_time = sum(time_diff)
)
## Total Time spent per User
head(df2)
## # A tibble: 6 × 2
## study_id total_time
## <int> <drtn>
## 1 7 825.03333 mins
## 2 8 12.90000 mins
## 3 9 50.90000 mins
## 4 10 209.26667 mins
## 5 11 82.18333 mins
## 6 12 552.25000 mins
# Summary statistics of the total time per user: mean, standard deviation and
# the 25th / 50th / 75th percentiles, returned as a one-row table.
# NOTE(review): sd() yields a plain number, so the `std` column prints without
# the "mins" suffix that the other columns carry.
sum_stat <- df2 %>%
  summarize(
    mean = mean(total_time),
    std = sd(total_time),
    "25%" = quantile(total_time, probs = 0.25),
    "50%" = quantile(total_time, probs = 0.5),
    "75%" = quantile(total_time, probs = 0.75)
  )
# Render the table with every column centred. The alignment vector is sized
# from the table itself so it always has exactly one entry per column (the
# previous hard-coded vector listed four entries for five columns).
kable(
  sum_stat,
  caption = "Statistics Table",
  align = rep("c", ncol(sum_stat))
)
mean | std | 25% | 50% | 75% |
---|---|---|---|---|
404.1509 mins | 466.0054 | 25.14167 mins | 227.7667 mins | 641.1333 mins |