knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE, tidy = TRUE)
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(ggthemes))
suppressPackageStartupMessages(require(lubridate))
suppressPackageStartupMessages(require(stringr))

# Might just be a problem on my system, but I have to run this code
# to set the timezone so lubridate will work
Sys.setenv(TZ="America/Chicago")

Import & format data

masterDf <- readr::read_csv("~/Downloads/spring_health_take_home_df.csv") %>%
  select(-X1) %>% # drop the first column
  
  # Rename variables to shorter names
  rename(mid=member_id_hashed, asmt_time=assessment_created_at, 
         qst_kind=questionnaire_kind, qst_time=questionnaire_created_at) %>%
  
  # I'm assuming that it's okay to simplify to just the date (not datetime),
  # but this might not be the case (e.g. if people are taking the PHQ9 / SDS
  # multiple times a day, or around midnight)
  mutate(
    asmt_time = as.Date(asmt_time),
    qst_time  = as.Date(qst_time)
  )
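
If you want to sanity-check that date-only assumption, one quick diagnostic (just a sketch, using the renamed columns above) is to count how often a member answered the same questionnaire kind more than once on a single date:

# Responses that would be collapsed by the date-only simplification:
# (mid, qst_kind, qst_time) combinations with more than one row
masterDf %>%
  count(mid, qst_kind, qst_time) %>%
  filter(n > 1)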

# Separate these out into unique data frames to avoid problems with NAs.
# You can join them back together with the dplyr::*_join() functions
PHQ <- masterDf %>% filter(qst_kind=="PHQ9")
SDS <- masterDf %>% filter(qst_kind=="SDS")
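
If you ever need them back in one table, here's a sketch of the rejoin (this assumes mid + asmt_time uniquely identifies an assessment, which is worth verifying on your end):

# Hypothetical rejoin: pair the PHQ9 and SDS rows that came from the
# same assessment; the suffixes disambiguate the shared qst_time column
paired <- full_join(
  PHQ %>% select(mid, asmt_time, qst_time, starts_with("PHQ9")),
  SDS %>% select(mid, asmt_time, qst_time, starts_with("SDS")),
  by     = c("mid", "asmt_time"),
  suffix = c(".phq", ".sds")
)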

Corrected code

This should work to recreate the data frame (at least on my system). FYI, I changed the names of the variables:

PHQ_overview <- PHQ %>%
  group_by(mid) %>%
  arrange(qst_time) %>%
  summarise(
    nResponses = n(),    # find the total number of responses per user
    Base.score   = first(PHQ9_score),
    Recent.score = last(PHQ9_score),
    Delta.score  = Recent.score - Base.score) %>%
  ungroup()

This should let you get the baseline summary stats:

summary(PHQ_overview$Base.score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.000   7.000   8.021  11.000  26.000
summary(PHQ_overview$Delta.score)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -17.0000   0.0000   0.0000  -0.2147   0.0000  15.0000

You can use a similar idea to make the graphics: include other variables (in this case Base.acuity = first(PHQ9_acuity)) in your summarise() call.

PHQ %>%
  group_by(mid) %>%
  arrange(qst_time) %>%
  summarise(
    Base.score  = first(PHQ9_score),
    Base.acuity = first(PHQ9_acuity)
  ) %>%
  mutate(
    Base.acuity = factor(
      Base.acuity,
      ordered = TRUE,
      levels  = c("none", "mild", "moderate", "moderately severe", "severe")
    )
  ) %>%
  ggplot(aes(x = Base.score, fill = Base.acuity)) +
  geom_histogram(color = "black") +
  scale_fill_brewer(palette = "Reds") +
  labs(
    x     = "Baseline PHQ9 score",
    y     = "# of members",
    title = "Baseline PHQ9 scores",
    fill  = "Baseline acuity"
  )

SDS score

So it looks like one of the problems is that some of the SDS responses are incomplete (i.e., a member answered some of the questions, but not all). See the snapshot below from a member I picked at random:

SDS %>%
  arrange(qst_time) %>%
  filter(mid == "1c3f48e629a99ca2a05d4f38692409cc3b5e778ff9a49c3563c15353e56fc37b") %>%
  select(asmt_time, starts_with("SDS")) %>%
  knitr::kable()
| asmt_time  | SDS_days_missed | SDS_work_impact | SDS_home_life_impact | SDS_days_unproductive | SDS_social_life_impact | SDS_risk | SDS_positive |
|:-----------|----------------:|----------------:|---------------------:|----------------------:|-----------------------:|:---------|:-------------|
| 2018-01-30 |              NA |               7 |                    4 |                    NA |                      7 | Low      | TRUE         |
| 2018-07-06 |              NA |               6 |                    6 |                    NA |                      6 | Low      | TRUE         |
| 2019-03-20 |               1 |               6 |                    5 |                     2 |                      8 | Low      | TRUE         |
| 2019-04-18 |               0 |               6 |                    6 |                     2 |                      8 | Low      | TRUE         |
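
To see how widespread the missingness is (beyond this one member), here's a quick count of the NAs in each SDS column:

# Number of missing values per SDS_* column across all responses
SDS %>%
  select(starts_with("SDS")) %>%
  is.na() %>%
  colSums()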

You should be able to get around this problem by filtering out the NAs for this specific column (SDS_days_unproductive) before the summarise() on the SDS data frame:

SDS_productivity <- SDS %>%
  filter(!is.na(SDS_days_unproductive)) %>% # ADD THIS LINE
  group_by(mid) %>%
  arrange(qst_time) %>%
  summarise(
    nResponses = n(),    
    Base.score   = first(SDS_days_unproductive),
    Recent.score = last(SDS_days_unproductive),
    Delta.score  = Recent.score - Base.score) %>%
  ungroup()

So your mean productivity score is μ = -0.026096.
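
(I'm assuming that figure is the mean of the delta column, i.e. something like the following.)

# Presumed source of the -0.026096 figure: the average change in
# unproductive days between a member's first and last response
mean(SDS_productivity$Delta.score)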

Now your problem with cor.test() is that the two vectors aren't the same length:

nrow(PHQ_overview)
## [1] 1160
nrow(SDS_productivity)
## [1] 958
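
For reference, that length mismatch is exactly why the original call errored out; cor.test() refuses vectors of different lengths:

# Reproduces the original failure (1160 vs. 958 elements):
# cor.test(PHQ_overview$Delta.score, SDS_productivity$Delta.score)
# Error in cor.test.default(...): 'x' and 'y' must have the same length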

Ideally, you’d join the PHQ9 & SDS results together by mid, which lets you do a pairwise analysis. For simplicity’s sake, I’m going to remake these two data frames below (with new names for the baseline/recent/delta scores):

rm(PHQ_overview)
rm(SDS_productivity)

# Remake the PHQ9 score df
PHQ_score <- PHQ %>%
  filter(!is.na(PHQ9_score)) %>% # Just to be consistent
  group_by(mid) %>%
  arrange(qst_time) %>%
  summarise(
    PHQ.n      = n(),    
    PHQ.base   = first(PHQ9_score),
    PHQ.recent = last(PHQ9_score),
    PHQ.Delta  = PHQ.recent - PHQ.base) %>%
  ungroup()


# Remake the SDS - Productivity df
unprdDays <- SDS %>%
  filter(!is.na(SDS_days_unproductive)) %>% # This is the line I added
  group_by(mid) %>%
  arrange(qst_time) %>%
  summarise(
    unprdDays.n      = n(),    
    unprdDays.base   = first(SDS_days_unproductive),
    unprdDays.recent = last(SDS_days_unproductive),
    unprdDays.Delta  = unprdDays.recent - unprdDays.base) %>%
  ungroup()

Since we’re looking at the correlation of the deltas, we also only want to include members who have more than one response; otherwise you’re counting a bunch of 0s from people with a single response (for them, first() and last() return the same row, so the delta is always 0).

PHQ_score <- PHQ_score %>% filter(PHQ.n > 1)
unprdDays <- unprdDays %>% filter(unprdDays.n > 1)

Finally, we’ll join the two data frames together. You could use full_join() or inner_join(); I’ll pick inner_join() because it keeps only the members present in both tables, so we don’t end up with NAs.

joinedDf <- inner_join(PHQ_score, unprdDays, by = "mid")

Now you should be able to run cor.test() with the new data frame (note the df = 209 in the output: Pearson’s test uses n − 2 degrees of freedom, so 211 paired members made it through the join):

cor.test(~PHQ.Delta + unprdDays.Delta, data = joinedDf)
## 
##  Pearson's product-moment correlation
## 
## data:  PHQ.Delta and unprdDays.Delta
## t = 6.6939, df = 209, p-value = 1.964e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3022570 0.5254217
## sample estimates:
##       cor 
## 0.4201719
joinedDf %>%
  ggplot(aes(x = PHQ.Delta, y = unprdDays.Delta)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(method = lm)