knitr::opts_chunk$set(collapse = TRUE, warning = FALSE, message = FALSE, tidy = TRUE)
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(ggthemes))
suppressPackageStartupMessages(require(lubridate))
suppressPackageStartupMessages(require(stringr))
# Might just be a problem on my system, but I have to run this code
# to set the timezone so lubridate will work
Sys.setenv(TZ="America/Chicago")
masterDf <- readr::read_csv("~/Downloads/spring_health_take_home_df.csv") %>%
select(-X1) %>% # drop the first column
# Rename variables to shorter names
rename(mid=member_id_hashed, asmt_time=assessment_created_at,
qst_kind=questionnaire_kind, qst_time=questionnaire_created_at) %>%
# I'm assuming it's okay to simplify to just the date (not datetime),
# but this might not be the case (e.g. if people take the PHQ9 / SDS
# multiple times a day, or around midnight) -- see the quick check below
mutate(
asmt_time = as.Date(asmt_time),
qst_time = as.Date(qst_time)
)
# Separate these out into unique dfs to avoid problems with NAs. You can
# join them back together with the dplyr *_join() functions
PHQ <- masterDf %>% filter(qst_kind=="PHQ9")
SDS <- masterDf %>% filter(qst_kind=="SDS")
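Before going further, it's worth checking the date-only assumption from the chunk above. A minimal sketch: if this returns any rows, some member filled out the same questionnaire more than once on the same date, and first()/last() later on could pick an arbitrary one.
# Any member / questionnaire / date combination with more than one row?
masterDf %>%
  count(mid, qst_kind, qst_time) %>%
  filter(n > 1)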
This should work to recreate the data frame (at least on my system). FYI, I changed the variable names above. Note that arrange(qst_time) before summarise() puts each member's responses in date order, so first() and last() pick out the baseline and most recent scores.
PHQ_overview <- PHQ %>%
group_by(mid) %>%
arrange(qst_time) %>%
summarise(
nResponses = n(), # find the total number of responses per user
Base.score = first(PHQ9_score),
Recent.score = last(PHQ9_score),
Delta.score = Recent.score - Base.score) %>%
ungroup()
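One thing to keep in mind (we'll come back to it): members with a single response have Delta.score == 0 by construction. A quick count, just as a sanity check:
# Members who only responded once contribute a delta of exactly 0
sum(PHQ_overview$nResponses == 1)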
This should let you get the baseline summary stats:
summary(PHQ_overview$Base.score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 4.000 7.000 8.021 11.000 26.000
summary(PHQ_overview$Delta.score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17.0000 0.0000 0.0000 -0.2147 0.0000 15.0000
You can use a similar idea to make the graphics by including other variables in your summarise() call (in this case, Base.acuity = first(PHQ9_acuity)):
PHQ %>%
  group_by(mid) %>%
  arrange(qst_time) %>%
  summarise(Base.score = first(PHQ9_score), Base.acuity = first(PHQ9_acuity)) %>%
  mutate(Base.acuity = factor(Base.acuity, ordered = TRUE,
    levels = c("none", "mild", "moderate", "moderately severe", "severe"))) %>%
  ggplot(aes(x = Base.score, fill = Base.acuity)) +
  geom_histogram(color = "black") +
  scale_fill_brewer(palette = "Reds") +
  labs(x = "Baseline PHQ9 score", y = "# of members",
       title = "Baseline PHQ9 scores", fill = "Baseline acuity")
So it looks like one of the problems is that some of the SDS responses are incomplete (i.e., members answered some questions but not all). See the snapshot below from a member I picked at random:
SDS %>% arrange(qst_time) %>% filter(mid == "1c3f48e629a99ca2a05d4f38692409cc3b5e778ff9a49c3563c15353e56fc37b") %>%
select(asmt_time, starts_with("SDS")) %>% knitr::kable()
| asmt_time | SDS_days_missed | SDS_work_impact | SDS_home_life_impact | SDS_days_unproductive | SDS_social_life_impact | SDS_risk | SDS_positive |
|---|---|---|---|---|---|---|---|
| 2018-01-30 | NA | 7 | 4 | NA | 7 | Low | TRUE |
| 2018-07-06 | NA | 6 | 6 | NA | 6 | Low | TRUE |
| 2019-03-20 | 1 | 6 | 5 | 2 | 8 | Low | TRUE |
| 2019-04-18 | 0 | 6 | 6 | 2 | 8 | Low | TRUE |
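To see how widespread the missingness is (not just for this one member), you can count the NAs per column. A sketch, assuming dplyr >= 1.0 for across():
# Count missing values in each SDS column across all responses
SDS %>% summarise(across(starts_with("SDS"), ~ sum(is.na(.x))))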
You should be able to get around this by filtering out the NAs in this specific column (SDS_days_unproductive) before the summarise() on the SDS df:
SDS_productivity <- SDS %>%
filter(!is.na(SDS_days_unproductive)) %>% # ADD THIS LINE
group_by(mid) %>%
arrange(qst_time) %>%
summarise(
nResponses = n(),
Base.score = first(SDS_days_unproductive),
Recent.score = last(SDS_days_unproductive),
Delta.score = Recent.score - Base.score) %>%
ungroup()
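For reference, here's where the number below comes from (I'm assuming the "productivity score" in question is the delta column):
# Average change in unproductive days from baseline to most recent response
mean(SDS_productivity$Delta.score)
## [1] -0.026096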
So your mean change in unproductive days is μ = -0.026096.
Now your problem with the cor.test() is that your vectors are not the same length:
nrow(PHQ_overview)
## [1] 1160
nrow(SDS_productivity)
## [1] 958
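If you want to see exactly who accounts for the mismatch, an anti_join() is a quick sketch; the difference here is members who have a PHQ9 summary but no complete SDS productivity data.
# Members summarised for PHQ9 but absent from the SDS productivity table
anti_join(PHQ_overview, SDS_productivity, by = "mid") %>% nrow()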
Ideally, you'd join the PHQ9 & SDS results together by mid, which lets you do pairwise analysis. For simplicity's sake, I'm going to remake the two data frames below (with new names for the baseline / recent / delta scores).
rm(PHQ_overview)
rm(SDS_productivity)
# Remake the PHQ9 score df
PHQ_score <- PHQ %>%
filter(!is.na(PHQ9_score)) %>% # Just to be consistent
group_by(mid) %>%
arrange(qst_time) %>%
summarise(
PHQ.n = n(),
PHQ.base = first(PHQ9_score),
PHQ.recent = last(PHQ9_score),
PHQ.Delta = PHQ.recent - PHQ.base) %>%
ungroup()
# Remake the SDS - Productivity df
unprdDays <- SDS %>%
filter(!is.na(SDS_days_unproductive)) %>% # This is the line I added
group_by(mid) %>%
arrange(qst_time) %>%
summarise(
unprdDays.n = n(),
unprdDays.base = first(SDS_days_unproductive),
unprdDays.recent = last(SDS_days_unproductive),
unprdDays.Delta = unprdDays.recent - unprdDays.base) %>%
ungroup()
Since we're looking at the correlation of the deltas, we also only want to include members who have more than one response (otherwise you're counting a bunch of zeros from people who only have a single response):
PHQ_score <- PHQ_score %>% filter(PHQ.n > 1)
unprdDays <- unprdDays %>% filter(unprdDays.n > 1)
Finally, we'll join the two dfs together. You could use full_join() or inner_join(); I'll pick inner_join() because it keeps only members present in both tables, so we avoid the NAs a full_join() would introduce.
joinedDf <- inner_join(PHQ_score, unprdDays, by = "mid")
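As a sanity check on the join: Pearson's r has df = n - 2, so the df = 209 reported below implies the joined table should have 211 members.
# Should equal the cor.test degrees of freedom below plus two
nrow(joinedDf)
## [1] 211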
Now you should be able to run cor.test() with the new df:
cor.test(~PHQ.Delta + unprdDays.Delta, data = joinedDf)
##
## Pearson's product-moment correlation
##
## data: PHQ.Delta and unprdDays.Delta
## t = 6.6939, df = 209, p-value = 1.964e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3022570 0.5254217
## sample estimates:
## cor
## 0.4201719
joinedDf %>%
  ggplot(aes(x = PHQ.Delta, y = unprdDays.Delta)) +
  geom_jitter(alpha = 0.3) + # jitter to reduce overplotting of integer deltas
  geom_smooth(method = "lm")
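The positive slope lines up with the correlation estimate above (r ≈ 0.42): members whose PHQ9 scores dropped between baseline and their most recent assessment also tended to report fewer unproductive days.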