Step: Library packages and load data

Github for project: https://github.com/mhanauer/spring_health_assignment

library(pacman)
p_load(dplyr, tidyverse, janitor, ggplot2, tsibble, formattable)
data_spring_health <- read_csv("~/spring_health_assignment/spring_health_take_home_df.csv")
## New names:
## * `` -> ...1
## Rows: 3269 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (23): member_id_hashed, questionnaire_kind, PHQ9_q1, PHQ9_q2, PHQ9_q3, ...
## dbl   (2): ...1, first_number_in_member_hash
## dttm  (2): assessment_created_at, questionnaire_created_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Step: Clean data sets 1. Lower case names 2. Make outcome variables numeric and reclassify NAs 3. Create a group id to track events by member over time

data_spring_health_clean =  data_spring_health %>%
  clean_names() %>%
  mutate_at(vars(phq9_score, sds_days_unproductive), funs(na_if(., "N/A"))) %>%
  mutate_at(vars(phq9_score, sds_days_unproductive), funs(as.numeric(.))) %>%
  group_by(member_id_hashed, questionnaire_kind) %>%
  mutate(event = row_number()) %>%
  ungroup() %>%
  group_by(member_id_hashed) %>%
  mutate(id = cur_group_id()) %>%
  ungroup()
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

Step: Q1 How many individuals used our platform Assuming the number of participants is the unique number of member_id_hashed. It seems like each person takes both the SDS and PHQ-9.

number_of_participants = data_spring_health_clean %>%
  distinct(member_id_hashed) %>%
  count()

number_of_participants
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1166

Step: Q2 What is the average number of times that a member interacts with the platform. Assuming member_id_hashed is the member id and is unique for each member. So grouping them by member ID and then taking the mean.

avg_number_of_interactions = data_spring_health_clean %>%
  group_by(member_id_hashed) %>%
  count() %>%
  ungroup() %>%
  summarise(mean_interactions = mean(n)) %>%
  mutate(mean_interactions = round(mean_interactions, 2))

avg_number_of_interactions
## # A tibble: 1 × 1
##   mean_interactions
##               <dbl>
## 1               2.8

Step: Q3 What is the distribution of baseline PHQ9 total scores for members on the platform. Q3a Please include a visualization

ggplot(data_spring_health_clean, aes(x=as.numeric(phq9_score))) + 
  geom_histogram(binwidth=1) + 
  labs(y= "Count of PHQ-9 total scores", x = "PHQ-9 total scores")
## Warning: Removed 1656 rows containing non-finite values (stat_bin).

Step: Q3b Please calculate summary statistics

summary(data_spring_health_clean$phq9_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   4.000   7.000   8.108  11.000  26.000    1656

Step: Q4 What is the average change in total PHQ-9 scores for members using the platform

average_change_phq9 = data_spring_health_clean %>%
  filter(questionnaire_kind == "PHQ9") %>%
  dplyr::select(member_id_hashed, phq9_score, id, event) %>%
  arrange(desc(id), (event)) %>%
  ungroup() %>%
  group_by(id) %>%
  # Only keep values with at least two events
  filter(n() > 1) %>%
  group_by(id) %>%
  mutate(phq9_score_change = tsibble::difference(phq9_score)) %>%
  ungroup() %>%
  summarise(avg_phq9_score_change = mean(phq9_score_change, na.rm = TRUE)) %>%
  mutate(avg_phq9_score_change = round(avg_phq9_score_change, 2))


average_change_phq9
## # A tibble: 1 × 1
##   avg_phq9_score_change
##                   <dbl>
## 1                  0.09

Step: Q5 What is the average change in total PHQ9 score for depressed individuals using the platform Assuming depression at baseline means excluding none or N/A at baseline and removing those individuals.

average_change_phq9_depressed = data_spring_health_clean %>%
  filter(questionnaire_kind == "PHQ9") %>%
  mutate(filter_non_dep_baseline = if_else(event == 1 & (phq9_acuity == "N/A" | phq9_acuity == "none"), "drop", "keep")) %>%
  filter(filter_non_dep_baseline == "keep") %>%
  dplyr::select(member_id_hashed, phq9_score, id) %>%
  arrange(desc(id)) %>%
  ungroup() %>%
  group_by(id) %>%
  # Only keep values with at least two responses
  filter( n() > 1 ) %>%
  mutate(phq9_score_change = tsibble::difference(phq9_score)) %>%
  ungroup() %>%
  summarise(avg_phq9_score_change = mean(phq9_score_change, na.rm = TRUE)) %>%
  mutate(avg_phq9_score_change = round(avg_phq9_score_change,  2))
   
average_change_phq9_depressed
## # A tibble: 1 × 1
##   avg_phq9_score_change
##                   <dbl>
## 1                 -0.35

Step: Q6 What is the average change in total workplace productivity (SDS_days_unproductive) for members interacting with the platform

avg_work_productivity_fun = function(data = "data_spring_health_clean") {
  data_out = data_spring_health_clean %>%
    dplyr::select(member_id_hashed, id, event, sds_days_unproductive) %>%
    arrange(desc(id), (event)) %>%
    ungroup() %>%
    group_by(id) %>%
    # Only keep values with at least two responses
    filter(n() > 1) %>%
    mutate(sds_days_score_change = tsibble::difference(sds_days_unproductive)) %>%
    ungroup() %>%
    summarise(sds_days_score_change = mean(sds_days_score_change, na.rm = TRUE)) %>%
    mutate(sds_days_score_change = round(sds_days_score_change, 2))
  return(data_out)
}


avg_work_productivity = avg_work_productivity_fun()
avg_work_productivity
## # A tibble: 1 × 1
##   sds_days_score_change
##                   <dbl>
## 1                  0.28

Step: Q7a Please explore and explain the relationship between symptomatic improvement and functional improvement amongst members who interacted with the Spring platform.

Recreate data sets to allow for non-parametric tests

phq_9_improvement = data_spring_health_clean %>%
  filter(questionnaire_kind == "PHQ9") %>%
  dplyr::select(member_id_hashed, phq9_score, id, event) %>%
  arrange(desc(id), (event)) %>%
  ungroup() %>%
  group_by(id) %>%
  # Only keep values with at least two responses
  filter( n() > 1 ) %>%
  mutate(phq9_score_change = tsibble::difference(phq9_score)) %>%
  ungroup() %>%
  dplyr::select(phq9_score_change)

sds_improvement = data_spring_health_clean %>%
  dplyr::select(member_id_hashed, id, event, sds_days_unproductive) %>%
  arrange(desc(id), (event)) %>%
  ungroup() %>%
  group_by(id) %>%
  # Only keep values with at least two responses
  filter( n() > 1 ) %>%
  mutate(sds_days_score_change = tsibble::difference(sds_days_unproductive)) %>%
  ungroup() %>%
  dplyr::select(sds_days_score_change)

Step: Conduct non-parametric Wilcox tests to evaluate statistical differences

phq9_results =  wilcox.test(phq_9_improvement$phq9_score_change, mu = 0)
phq9_results
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  phq_9_improvement$phq9_score_change
## V = 40178, p-value = 0.7668
## alternative hypothesis: true location is not equal to 0
sds_results = wilcox.test(sds_improvement$sds_days_score_change, mu = 0)
sds_results
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sds_improvement$sds_days_score_change
## V = 898, p-value = 0.1717
## alternative hypothesis: true location is not equal to 0

Q7b Do you think that members benefit from interacting with the Spring platform? Why?

According to my analysis, both PHQ-9 and unproductive SDS days average change are not statistically significantly different from zero indicating there is no evidence the Spring Health platform is effective at reducing these two outcomes.