Codebase Untidy Mental Health Data

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.1     ✔ readr     2.1.6
✔ ggplot2   4.0.1     ✔ stringr   1.6.0
✔ lubridate 1.9.4     ✔ tibble    3.3.1
✔ purrr     1.2.1     ✔ tidyr     1.3.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(stringr)
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
raw_data <- read.csv("https://raw.githubusercontent.com/Jeovany97/Data-607/refs/heads/main/Project%202/Untidy%20Mental%20Health%20Data/untidy_mental_health_data.csv", na.strings = c("", "NA", "n/a", "missing", "NA ", "-", "none", "None"))

Tidying the data

tidy_data <- raw_data %>%
  # Rename for consistency
  rename_all(tolower) %>%
  rename(participant_id = recordid) %>%
  
  # Normalize: Separate City and State
  separate(city_state, into = c("city", "state"), sep = "[,;] ?", fill = "right") %>%
  
  # Normalize: Robust Extraction of PHQ and GAD
  # We extract all numeric sequences and pick the first for PHQ and the last for GAD
  mutate(
    temp_nums = str_extract_all(screeningscores, "\\d+"),
    phq_score = map_dbl(temp_nums, ~ as.numeric(.x[1])),
    # GAD is either the second number in ScreeningScores OR found in the Jan column
    gad_initial = map_dbl(temp_nums, ~ as.numeric(.x[2]))
  ) %>%
  
  # Reshape: Pivot monthly columns to long format
  pivot_longer(
    cols = jan:may,
    names_to = "month",
    values_to = "monthly_raw"
  ) %>%
  
  # Normalize: Extract GAD scores that were hidden in monthly columns (e.g., "GAD=7=9")
  mutate(
    gad_monthly = as.numeric(str_extract(monthly_raw, "(?<=GAD[:=]7[:=])\\d+|(?<=GAD[:=])\\d+")),
    # If the monthly value is just a number (like 140), it's a different metric
    other_metric = ifelse(is.na(gad_monthly), as.numeric(monthly_raw), NA)
  ) %>%
  
  # Combine GAD sources into one variable
  mutate(gad_score = coalesce(gad_initial, gad_monthly)) %>%
  
  # Clean up: Remove temporary columns and extract numbers from text strings
  mutate(
    sessions = as.numeric(str_extract(sessions, "\\d+")),
    workhours = as.numeric(str_extract(workhours, "\\d+")),
    sleep_hours = as.numeric(str_extract(sleep, "[\\d.]+"))
  ) %>%
  select(-temp_nums, -gad_initial, -gad_monthly, -screeningscores, -sleep)

# Check the results
glimpse(tidy_data)
Rows: 900
Columns: 21
$ participant_id  <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, …
$ participant     <chr> "Participant_1", "Participant_1", "Participant_1", "Pa…
$ agegroup        <chr> "unknown", "unknown", "unknown", "unknown", "unknown",…
$ gender          <chr> "F", "F", "F", "F", "F", NA, NA, NA, NA, NA, "M", "M",…
$ city            <chr> "Phoenix", "Phoenix", "Phoenix", "Phoenix", "Phoenix",…
$ state           <chr> "AZ", "AZ", "AZ", "AZ", "AZ", "TX", "TX", "TX", "TX", …
$ therapytype     <chr> "Group", "Group", "Group", "Group", "Group", "ACT", "A…
$ sessions        <dbl> 18, 18, 18, 18, 18, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ medication      <chr> NA, NA, NA, NA, NA, "SNRI", "SNRI", "SNRI", "SNRI", "S…
$ diagnosisstatus <chr> "Undiagnosed", "Undiagnosed", "Undiagnosed", "Undiagno…
$ workhours       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ stressscale     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ insurance       <chr> "Private", "Private", "Private", "Private", "Private",…
$ surveydate      <chr> "2024-03-12", "2024-03-12", "2024-03-12", "2024-03-12"…
$ notes           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ phq_score       <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, …
$ month           <chr> "jan", "feb", "mar", "apr", "may", "jan", "feb", "mar"…
$ monthly_raw     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ other_metric    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ gad_score       <dbl> 9, 9, 9, 9, 9, 2, 2, 2, 2, 2, 20, 20, 20, 20, 20, 24, …
$ sleep_hours     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Anaysis. Looking at the relationship between Treatment Type and Average PHQ Scores (Depression) to see if certain therapies correlate with lower symptom reporting

analysis_summary <- tidy_data %>%
  group_by(therapytype) %>%
  summarise(
    avg_phq = mean(phq_score, na.rm = TRUE),
    avg_gad = mean(gad_score, na.rm = TRUE),
    sample_size = n_distinct(participant_id)
  ) %>%
  drop_na(therapytype)

print(analysis_summary)
# A tibble: 4 × 4
  therapytype avg_phq avg_gad sample_size
  <chr>         <dbl>   <dbl>       <int>
1 ACT               9    12.7          27
2 CBT               9    12            37
3 DBT               9    14.5          29
4 Group             9    14.9          29