Step: Library packages and load data
Github for project: https://github.com/mhanauer/spring_health_assignment
library(pacman)
p_load(dplyr, tidyverse, janitor, ggplot2, tsibble, formattable)
data_spring_health <- read_csv("~/spring_health_assignment/spring_health_take_home_df.csv")
## New names:
## * `` -> ...1
## Rows: 3269 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): member_id_hashed, questionnaire_kind, PHQ9_q1, PHQ9_q2, PHQ9_q3, ...
## dbl (2): ...1, first_number_in_member_hash
## dttm (2): assessment_created_at, questionnaire_created_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Step: Clean data sets 1. Lower case names 2. Make outcome variables numeric and reclassify NAs 3. Create a group id to track events by member over time
data_spring_health_clean = data_spring_health %>%
clean_names() %>%
mutate_at(vars(phq9_score, sds_days_unproductive), funs(na_if(., "N/A"))) %>%
mutate_at(vars(phq9_score, sds_days_unproductive), funs(as.numeric(.))) %>%
group_by(member_id_hashed, questionnaire_kind) %>%
mutate(event = row_number()) %>%
ungroup() %>%
group_by(member_id_hashed) %>%
mutate(id = cur_group_id()) %>%
ungroup()
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Step: Q1 How many individuals used our platform Assuming the number of participants is the unique number of member_id_hashed. It seems like each person takes both the SDS and PHQ-9.
number_of_participants = data_spring_health_clean %>%
distinct(member_id_hashed) %>%
count()
number_of_participants
## # A tibble: 1 × 1
## n
## <int>
## 1 1166
Step: Q2 What is the average number of times that a member interacts with the platform. Assuming member_id_hashed is the member id and is unique for each member. So grouping them by member ID and then taking the mean.
avg_number_of_interactions = data_spring_health_clean %>%
group_by(member_id_hashed) %>%
count() %>%
ungroup() %>%
summarise(mean_interactions = mean(n)) %>%
mutate(mean_interactions = round(mean_interactions, 2))
avg_number_of_interactions
## # A tibble: 1 × 1
## mean_interactions
## <dbl>
## 1 2.8
Step: Q3 What is the distribution of baseline PHQ9 total scores for members on the platform. Q3a Please include a visualization
ggplot(data_spring_health_clean, aes(x=as.numeric(phq9_score))) +
geom_histogram(binwidth=1) +
labs(y= "Count of PHQ-9 total scores", x = "PHQ-9 total scores")
## Warning: Removed 1656 rows containing non-finite values (stat_bin).
Step: Q3b Please calculate summary statistics
summary(data_spring_health_clean$phq9_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 4.000 7.000 8.108 11.000 26.000 1656
Step: Q4 What is the average change in total PHQ-9 scores for members using the platform
average_change_phq9 = data_spring_health_clean %>%
filter(questionnaire_kind == "PHQ9") %>%
dplyr::select(member_id_hashed, phq9_score, id, event) %>%
arrange(desc(id), (event)) %>%
ungroup() %>%
group_by(id) %>%
# Only keep values with at least two events
filter(n() > 1) %>%
group_by(id) %>%
mutate(phq9_score_change = tsibble::difference(phq9_score)) %>%
ungroup() %>%
summarise(avg_phq9_score_change = mean(phq9_score_change, na.rm = TRUE)) %>%
mutate(avg_phq9_score_change = round(avg_phq9_score_change, 2))
average_change_phq9
## # A tibble: 1 × 1
## avg_phq9_score_change
## <dbl>
## 1 0.09
Step: Q5 What is the average change in total PHQ9 score for depressed individuals using the platform Assuming depression at baseline means excluding none or N/A at baseline and removing those individuals.
average_change_phq9_depressed = data_spring_health_clean %>%
filter(questionnaire_kind == "PHQ9") %>%
mutate(filter_non_dep_baseline = if_else(event == 1 & (phq9_acuity == "N/A" | phq9_acuity == "none"), "drop", "keep")) %>%
filter(filter_non_dep_baseline == "keep") %>%
dplyr::select(member_id_hashed, phq9_score, id) %>%
arrange(desc(id)) %>%
ungroup() %>%
group_by(id) %>%
# Only keep values with at least two responses
filter( n() > 1 ) %>%
mutate(phq9_score_change = tsibble::difference(phq9_score)) %>%
ungroup() %>%
summarise(avg_phq9_score_change = mean(phq9_score_change, na.rm = TRUE)) %>%
mutate(avg_phq9_score_change = round(avg_phq9_score_change, 2))
average_change_phq9_depressed
## # A tibble: 1 × 1
## avg_phq9_score_change
## <dbl>
## 1 -0.35
Step: Q6 What is the average change in total workplace productivity (SDS_days_unproductive) for members interacting with the platform
avg_work_productivity_fun = function(data = "data_spring_health_clean") {
data_out = data_spring_health_clean %>%
dplyr::select(member_id_hashed, id, event, sds_days_unproductive) %>%
arrange(desc(id), (event)) %>%
ungroup() %>%
group_by(id) %>%
# Only keep values with at least two responses
filter(n() > 1) %>%
mutate(sds_days_score_change = tsibble::difference(sds_days_unproductive)) %>%
ungroup() %>%
summarise(sds_days_score_change = mean(sds_days_score_change, na.rm = TRUE)) %>%
mutate(sds_days_score_change = round(sds_days_score_change, 2))
return(data_out)
}
avg_work_productivity = avg_work_productivity_fun()
avg_work_productivity
## # A tibble: 1 × 1
## sds_days_score_change
## <dbl>
## 1 0.28
Step: Q7a Please explore and explain the relationship between symptomatic improvement and functional improvement amongst members who interacted with the Spring platform.
Recreate data sets to allow for non-parametric tests
phq_9_improvement = data_spring_health_clean %>%
filter(questionnaire_kind == "PHQ9") %>%
dplyr::select(member_id_hashed, phq9_score, id, event) %>%
arrange(desc(id), (event)) %>%
ungroup() %>%
group_by(id) %>%
# Only keep values with at least two responses
filter( n() > 1 ) %>%
mutate(phq9_score_change = tsibble::difference(phq9_score)) %>%
ungroup() %>%
dplyr::select(phq9_score_change)
sds_improvement = data_spring_health_clean %>%
dplyr::select(member_id_hashed, id, event, sds_days_unproductive) %>%
arrange(desc(id), (event)) %>%
ungroup() %>%
group_by(id) %>%
# Only keep values with at least two responses
filter( n() > 1 ) %>%
mutate(sds_days_score_change = tsibble::difference(sds_days_unproductive)) %>%
ungroup() %>%
dplyr::select(sds_days_score_change)
Step: Conduct non-parametric Wilcox tests to evaluate statistical differences
phq9_results = wilcox.test(phq_9_improvement$phq9_score_change, mu = 0)
phq9_results
##
## Wilcoxon signed rank test with continuity correction
##
## data: phq_9_improvement$phq9_score_change
## V = 40178, p-value = 0.7668
## alternative hypothesis: true location is not equal to 0
sds_results = wilcox.test(sds_improvement$sds_days_score_change, mu = 0)
sds_results
##
## Wilcoxon signed rank test with continuity correction
##
## data: sds_improvement$sds_days_score_change
## V = 898, p-value = 0.1717
## alternative hypothesis: true location is not equal to 0
Q7b Do you think that members benefit from interacting with the Spring platform? Why?
According to my analysis, both PHQ-9 and unproductive SDS days average change are not statistically significantly different from zero indicating there is no evidence the Spring Health platform is effective at reducing these two outcomes.