project

title: “Project 2 — Data Tidying” author: “Shawn Ganz” date: today format: html: toc: true code-fold: true theme: flatly execute: echo: true warning: false —

Approach

I’ll use the following data:

https://brightspace.cuny.edu/d2l/le/1154241/discussions/threads/4168991/View
- df_1
https://brightspace.cuny.edu/d2l/le/1154241/discussions/threads/4206056/View
- df_2
https://brightspace.cuny.edu/d2l/le/1154241/discussions/threads/4224007/View
- df_3
- Wasn’t wide, so I have to make it wide

df <- data.frame(
  Quarter = c("Q1","Q2","Q3","Q4"),
  Lakers = c(28,32,25,30),
  Celtics = c(31,29,33,28),
  Warriors = c(25,27,35,30)
)

write.csv(df, "data_03.csv", row.names = FALSE)

put csv’s onto github read csv in project

url -> "github_raw_link"
library(tidyverse)
df <- read_csv("url")

Analysis for df_1

Fix the brackets, there are too many.

Summary table
- religion
- income bracket
- n values
ggplot
- x = income brackets
- y = n
- fill = religion
Description
- Talk about which ones has the most per bracket
- Maybe even

Analysis for df_2

The only real wide portion of the dataset is the months. So, make that into months using pivot_longer() and try and fix the craziness of screeningscores and pivot_wider() PHQ and GAD. Limit selection to a couple of rows and then see if there’s anything to analyze.

Could show off age, month, and values.

Summary table
- group age, month
- summarize
  - values
ggplot
- x = age
- y = values
- cols
Description
- findings

Analysis for df_3

Simple one, will just pivot_longer() team names into new column, values to score.

Summary table
- entire dataset
ggplot
- x = quarter
- y = scores
- fill = team
- cols
Description
- review the chart

Codebase

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

url_1 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_01.csv"
url_2 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_02.csv"
url_3 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_03.csv"

df_1 <- read_csv(url_1)

Rows: 14 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): Religion
dbl (10): <$10k, $10-20k, $20-30k, $30-40k, $40-50k, $50-75k, $75-100k, $100...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df_2 <- read_csv(url_2)

Rows: 180 Columns: 21
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (20): Participant, AgeGroup, Gender, City_State, ScreeningScores, Jan, F...
dbl  (1): RecordID

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df_3 <- read_csv(url_3)

Rows: 4 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Quarter
dbl (3): Lakers, Celtics, Warriors

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#df_all <- c(df_1, df_2, df_3)
#glimpse(df_all <- c(df_1, df_2, df_3))

glimpse(df_1)

Rows: 14
Columns: 11
$ Religion             <chr> "Agnostic", "Atheist", "Buddhist", "Catholic", "E…
$ `<$10k`              <dbl> 27, 12, 27, 418, 575, 9, 228, 20, 19, 289, 34, 23…
$ `$10-20k`            <dbl> 34, 27, 21, 617, 869, 7, 244, 27, 19, 495, 42, 23…
$ `$20-30k`            <dbl> 60, 37, 30, 732, 1064, 9, 236, 24, 25, 619, 37, 1…
$ `$30-40k`            <dbl> 81, 52, 34, 670, 982, 11, 238, 24, 25, 655, 48, 1…
$ `$40-50k`            <dbl> 76, 39, 33, 638, 881, 13, 197, 21, 30, 651, 51, 1…
$ `$50-75k`            <dbl> 137, 81, 58, 1116, 1486, 34, 212, 30, 73, 1107, 1…
$ `$75-100k`           <dbl> 102, 76, 62, 949, 949, 47, 156, 15, 59, 939, 87, …
$ `$100-150k`          <dbl> 109, 59, 39, 792, 723, 48, 156, 11, 87, 792, 96, …
$ `>$150k`             <dbl> 84, 74, 53, 792, 414, 54, 78, 6, 151, 753, 64, 41…
$ `Don't know/refused` <dbl> 96, 76, 54, 1163, 1529, 37, 339, 37, 87, 1096, 10…

glimpse(df_2)

Rows: 180
Columns: 21
$ RecordID        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ Participant     <chr> "Participant_1", "Participant_2", "Participant_3", "Pa…
$ AgeGroup        <chr> "unknown", "unknown", "26-35", "36-45", "36-45", "18-2…
$ Gender          <chr> "F", "n/a", "M", "Female", "Female", "Female", "Other"…
$ City_State      <chr> "Phoenix, AZ", "Houston, TX", "Chicago, IL", "Houston,…
$ ScreeningScores <chr> "PHQ9:9,GAD7:0", "PHQ-9=2;GAD-7=9", "PHQ-9=20;GAD-7=13…
$ Jan             <chr> NA, NA, "-", "n/a", NA, "missing", "n/a", "140", "miss…
$ Feb             <chr> "-", "missing", "missing", "n/a", NA, NA, NA, "119", "…
$ Mar             <chr> "-", "n/a", "n/a", NA, "n/a", NA, NA, "111", "n/a", NA…
$ Apr             <chr> "n/a", "n/a", "-", NA, "n/a", NA, "n/a", "-", NA, NA, …
$ May             <chr> "-", "n/a", NA, "missing", "missing", NA, NA, "missing…
$ TherapyType     <chr> "Group", "ACT", "n/a", "CBT", "Group", "CBT", "CBT", "…
$ Sessions        <chr> "18", NA, "n/a", "n/a", "9", "8", "n/a", NA, "n/a", NA…
$ Medication      <chr> NA, "SNRI", "None", "None", NA, "None", "SSRI", "SSRI"…
$ DiagnosisStatus <chr> "Undiagnosed", "-", "Pending", "Undiagnosed", "-", NA,…
$ WorkHours       <chr> NA, NA, NA, "n/a", "36h", "n/a", "33 hours", "57h", NA…
$ Sleep           <chr> "missing", NA, "missing", "missing", NA, NA, "8.7 hour…
$ StressScale     <chr> NA, NA, NA, NA, "10", NA, NA, "7", "2", "3", "n/a", "n…
$ Insurance       <chr> "Private", "-", "Public", NA, "Public", "Public", "-",…
$ SurveyDate      <chr> "2024-03-12", "2025-05-21", "03-30-2024", "2025-06-14"…
$ Notes           <chr> "missing", "missing", NA, "Follow-up needed", "Increas…

glimpse(df_3)

Rows: 4
Columns: 4
$ Quarter  <chr> "Q1", "Q2", "Q3", "Q4"
$ Lakers   <dbl> 28, 32, 25, 30
$ Celtics  <dbl> 31, 29, 33, 28
$ Warriors <dbl> 25, 27, 35, 30

df_1 |>
  mutate(
  low = rowSums(across(c(`<$10k`, `$10-20k`, `$20-30k`, `$30-40k`, `$40-50k`))),
  mid = rowSums(across(c(, `$50-75k`, `$75-100k`))),
  high = rowSums(across(c(`$100-150k`, `>$150k`)))) |>
  select(Religion, low, mid, high) |>
  pivot_longer(cols = -Religion, names_to = "income_range", values_to = "n")

# A tibble: 42 × 3
   Religion income_range     n
   <chr>    <chr>        <dbl>
 1 Agnostic low            278
 2 Agnostic mid            239
 3 Agnostic high           193
 4 Atheist  low            167
 5 Atheist  mid            157
 6 Atheist  high           133
 7 Buddhist low            145
 8 Buddhist mid            120
 9 Buddhist high            92
10 Catholic low           3075
# ℹ 32 more rows

df_2 |>
  select(RecordID, AgeGroup, ScreeningScores, TherapyType, Jan, Feb, Mar, Apr, May) |>
  rename(id = RecordID, scores = ScreeningScores, age = AgeGroup) |>
  pivot_longer(cols = c(Jan, Feb, Mar, Apr, May), names_to = "month", values_to = "values") |>
  mutate(scores = str_replace(scores, ",\\s*", ";"),
         scores = str_replace_all(scores, "[:\\-=]", ""),
         scores = str_replace(scores, "GAD7", ""),
         scores = str_replace(scores, "PHQ9", ""))|> 
  separate(col = scores, into = c("PHQ", "GAD7"), sep = ";") |>
  mutate(values = replace(values, values %in% c("-", "missing", "", "n/a"), NA))

# A tibble: 900 × 7
      id age     PHQ   GAD7  TherapyType month values
   <dbl> <chr>   <chr> <chr> <chr>       <chr> <chr> 
 1     1 unknown 9     0     Group       Jan   <NA>  
 2     1 unknown 9     0     Group       Feb   <NA>  
 3     1 unknown 9     0     Group       Mar   <NA>  
 4     1 unknown 9     0     Group       Apr   <NA>  
 5     1 unknown 9     0     Group       May   <NA>  
 6     2 unknown 2     9     ACT         Jan   <NA>  
 7     2 unknown 2     9     ACT         Feb   <NA>  
 8     2 unknown 2     9     ACT         Mar   <NA>  
 9     2 unknown 2     9     ACT         Apr   <NA>  
10     2 unknown 2     9     ACT         May   <NA>  
# ℹ 890 more rows

df_3 |> rename(quarter = Quarter) |>
  pivot_longer(cols = -quarter, names_to = "teams", values_to = "scores") |>
  arrange(teams)

# A tibble: 12 × 3
   quarter teams    scores
   <chr>   <chr>     <dbl>
 1 Q1      Celtics      31
 2 Q2      Celtics      29
 3 Q3      Celtics      33
 4 Q4      Celtics      28
 5 Q1      Lakers       28
 6 Q2      Lakers       32
 7 Q3      Lakers       25
 8 Q4      Lakers       30
 9 Q1      Warriors     25
10 Q2      Warriors     27
11 Q3      Warriors     35
12 Q4      Warriors     30