Project 2 — Data Tidying

Author

Shawn Ganz

Published

March 8, 2026

Approach

I’ll use the following data:

Dataset_1
- df_1
Dataset_2
- df_2
Dataset_3
- df_3
- Wasn’t wide, so I have to make it wide

Code

df <- data.frame(
  Quarter = c("Q1","Q2","Q3","Q4"),
  Lakers = c(28,32,25,30),
  Celtics = c(31,29,33,28),
  Warriors = c(25,27,35,30)
)

write.csv(df, "data_03.csv", row.names = FALSE)

put csv’s onto github read csv in project

url -> "github_raw_link"
library(tidyverse)
df <- read_csv("url")

Analysis for df_1

Fix the brackets, there are too many.

Summary table
- religion
- income bracket
- n values
ggplot
- x = income brackets
- y = n
- fill = religion
Description
- Talk about which ones has the most per bracket
- Maybe even

Analysis for df_2

The only real wide portion of the dataset is the months. So, make that into months using pivot_longer() and try and fix the craziness of screeningscores and pivot_wider() PHQ and GAD. Limit selection to a couple of rows and then see if there’s anything to analyze.

Could show off age, month, and values.

Summary table
- group age, month
- summarize
  - values
ggplot
- x = age
- y = values
- cols
Description
- findings

Analysis for df_3

Simple one, will just pivot_longer() team names into new column, values to score.

Summary table
- entire dataset
ggplot
- x = quarter
- y = scores
- fill = team
- cols
Description
- review the chart

Codebase

Reading

Code

library(tidyverse)
url_1 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_01.csv"
url_2 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_02.csv"
url_3 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_03.csv"

df_1 <- read_csv(url_1)
df_2 <- read_csv(url_2)
df_3 <- read_csv(url_3)

#df_all <- c(df_1, df_2, df_3)
#glimpse(df_all <- c(df_1, df_2, df_3))

glimpse(df_1)

Rows: 14
Columns: 11
$ Religion             <chr> "Agnostic", "Atheist", "Buddhist", "Catholic", "E…
$ `<$10k`              <dbl> 27, 12, 27, 418, 575, 9, 228, 20, 19, 289, 34, 23…
$ `$10-20k`            <dbl> 34, 27, 21, 617, 869, 7, 244, 27, 19, 495, 42, 23…
$ `$20-30k`            <dbl> 60, 37, 30, 732, 1064, 9, 236, 24, 25, 619, 37, 1…
$ `$30-40k`            <dbl> 81, 52, 34, 670, 982, 11, 238, 24, 25, 655, 48, 1…
$ `$40-50k`            <dbl> 76, 39, 33, 638, 881, 13, 197, 21, 30, 651, 51, 1…
$ `$50-75k`            <dbl> 137, 81, 58, 1116, 1486, 34, 212, 30, 73, 1107, 1…
$ `$75-100k`           <dbl> 102, 76, 62, 949, 949, 47, 156, 15, 59, 939, 87, …
$ `$100-150k`          <dbl> 109, 59, 39, 792, 723, 48, 156, 11, 87, 792, 96, …
$ `>$150k`             <dbl> 84, 74, 53, 792, 414, 54, 78, 6, 151, 753, 64, 41…
$ `Don't know/refused` <dbl> 96, 76, 54, 1163, 1529, 37, 339, 37, 87, 1096, 10…

Code

glimpse(df_2)

Rows: 180
Columns: 21
$ RecordID        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ Participant     <chr> "Participant_1", "Participant_2", "Participant_3", "Pa…
$ AgeGroup        <chr> "unknown", "unknown", "26-35", "36-45", "36-45", "18-2…
$ Gender          <chr> "F", "n/a", "M", "Female", "Female", "Female", "Other"…
$ City_State      <chr> "Phoenix, AZ", "Houston, TX", "Chicago, IL", "Houston,…
$ ScreeningScores <chr> "PHQ9:9,GAD7:0", "PHQ-9=2;GAD-7=9", "PHQ-9=20;GAD-7=13…
$ Jan             <chr> NA, NA, "-", "n/a", NA, "missing", "n/a", "140", "miss…
$ Feb             <chr> "-", "missing", "missing", "n/a", NA, NA, NA, "119", "…
$ Mar             <chr> "-", "n/a", "n/a", NA, "n/a", NA, NA, "111", "n/a", NA…
$ Apr             <chr> "n/a", "n/a", "-", NA, "n/a", NA, "n/a", "-", NA, NA, …
$ May             <chr> "-", "n/a", NA, "missing", "missing", NA, NA, "missing…
$ TherapyType     <chr> "Group", "ACT", "n/a", "CBT", "Group", "CBT", "CBT", "…
$ Sessions        <chr> "18", NA, "n/a", "n/a", "9", "8", "n/a", NA, "n/a", NA…
$ Medication      <chr> NA, "SNRI", "None", "None", NA, "None", "SSRI", "SSRI"…
$ DiagnosisStatus <chr> "Undiagnosed", "-", "Pending", "Undiagnosed", "-", NA,…
$ WorkHours       <chr> NA, NA, NA, "n/a", "36h", "n/a", "33 hours", "57h", NA…
$ Sleep           <chr> "missing", NA, "missing", "missing", NA, NA, "8.7 hour…
$ StressScale     <chr> NA, NA, NA, NA, "10", NA, NA, "7", "2", "3", "n/a", "n…
$ Insurance       <chr> "Private", "-", "Public", NA, "Public", "Public", "-",…
$ SurveyDate      <chr> "2024-03-12", "2025-05-21", "03-30-2024", "2025-06-14"…
$ Notes           <chr> "missing", "missing", NA, "Follow-up needed", "Increas…

Code

glimpse(df_3)

Rows: 4
Columns: 4
$ Quarter  <chr> "Q1", "Q2", "Q3", "Q4"
$ Lakers   <dbl> 28, 32, 25, 30
$ Celtics  <dbl> 31, 29, 33, 28
$ Warriors <dbl> 25, 27, 35, 30

Cleaning

Code

religion_df <- df_1 |>
  rename(religion = Religion) |>
  mutate(
  low = rowSums(across(c(`<$10k`, `$10-20k`, `$20-30k`, `$30-40k`, `$40-50k`))),
  mid = rowSums(across(c(, `$50-75k`, `$75-100k`))),
  high = rowSums(across(c(`$100-150k`, `>$150k`)))) |>
  select(religion, low, mid, high) |>
  pivot_longer(cols = -religion, names_to = "income_range", values_to = "n")
glimpse(religion_df)

Rows: 42
Columns: 3
$ religion     <chr> "Agnostic", "Agnostic", "Agnostic", "Atheist", "Atheist",…
$ income_range <chr> "low", "mid", "high", "low", "mid", "high", "low", "mid",…
$ n            <dbl> 278, 239, 193, 167, 157, 133, 145, 120, 92, 3075, 2065, 1…

Code

medical_df <- df_2 |>
  select(RecordID, AgeGroup, ScreeningScores, TherapyType, Jan, Feb, Mar, Apr, May) |>
  rename(id = RecordID, scores = ScreeningScores, age = AgeGroup) |>
  pivot_longer(cols = c(Jan, Feb, Mar, Apr, May), names_to = "month", values_to = "values") |>
  mutate(scores = str_replace(scores, ",\\s*", ";"),
         scores = str_replace_all(scores, "[:\\-=]", ""),
         scores = str_replace(scores, "GAD7", ""),
         scores = str_replace(scores, "PHQ9", ""))|> 
  separate(col = scores, into = c("PHQ", "GAD7"), sep = ";") |>
  mutate(values = replace(values, values %in% c("-", "missing", "", "n/a"), NA)) |>
  mutate(value = as.integer(values), PHQ = as.integer(PHQ), GAD7 = as.integer(GAD7)) |>
  select(-values) |>
  mutate(TherapyType = ifelse(TherapyType == "n/a", "None", TherapyType),
         month = factor(month, levels = c("Jan", "Feb","Mar", "Apr", "May")))
glimpse(medical_df)

Rows: 900
Columns: 7
$ id          <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4…
$ age         <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "un…
$ PHQ         <int> 9, 9, 9, 9, 9, 2, 2, 2, 2, 2, 20, 20, 20, 20, 20, 24, 24, …
$ GAD7        <int> 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 13, 13, 13, 13, 13, 13, 13, …
$ TherapyType <chr> "Group", "Group", "Group", "Group", "Group", "ACT", "ACT",…
$ month       <fct> Jan, Feb, Mar, Apr, May, Jan, Feb, Mar, Apr, May, Jan, Feb…
$ value       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Code

basketball_df <- df_3 |> rename(quarter = Quarter) |>
  pivot_longer(cols = -quarter, names_to = "teams", values_to = "scores") |>
  mutate(quarter = factor(quarter, levels = c("Q1","Q2","Q3","Q4"))) |>
  arrange(teams)
glimpse(basketball_df)

Rows: 12
Columns: 3
$ quarter <fct> Q1, Q2, Q3, Q4, Q1, Q2, Q3, Q4, Q1, Q2, Q3, Q4
$ teams   <chr> "Celtics", "Celtics", "Celtics", "Celtics", "Lakers", "Lakers"…
$ scores  <dbl> 31, 29, 33, 28, 28, 32, 25, 30, 25, 27, 35, 30

Religion Summarize + ggplot

Code

library(ggthemes)

sum_religion <- religion_df |> 
  mutate(income_range = factor(income_range, levels = c("low", "mid", "high"))) |>
  group_by(religion, income_range) |>
  summarize(n = sum(n))

sum_religion |>
  ggplot(aes(x = income_range, y = n, fill = religion)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Religion counts by income range",
    x = "income range",
    y = "count")

Code

religion_df |> group_by(religion) |>
  summarize(n = sum(n)) |>
  arrange(desc(n)) |>
  slice_head(n = 3)

# A tibble: 3 × 2
  religion                n
  <chr>               <dbl>
1 Evangelical          7943
2 Catholic             6724
3 Mainline Protestant  6300

Code

religion_df |> group_by(religion) |>
  summarize(n = sum(n)) |>
  arrange(n) |>
  slice_head(n = 3)

# A tibble: 3 × 2
  religion              n
  <chr>             <dbl>
1 Jehovah's Witness   178
2 Hindu               232
3 Muslim              234

According to the religion data, there are more patrons that are of the low income range ($0 - $50k), than the other income ranges. One notable pattern is that Orthodox has more patrons that are high income than mid income that can be visually determined.

The religions with the highest count of patrons: - Evangelical (7943) - Catholic (6724) - Mainline Protestant (6300)

The religions with the lowest count of patrons: - Jehovah’s Witness (178) - Hindu (232) - Muslim (234)

Medical Summarize + ggplot

Code

sum_medical <- medical_df |> 
  rename(therapy = TherapyType) |>
  filter(!is.na(value)) |>
  group_by(therapy, month) |>
  summarize(n = sum(value),
            mean_phq = mean(PHQ, na.rm = TRUE),
            mean_gad = mean(GAD7, na.rm = TRUE),
            .groups = "drop")

glimpse(sum_medical)

Rows: 25
Columns: 5
$ therapy  <chr> "ACT", "ACT", "ACT", "ACT", "ACT", "CBT", "CBT", "CBT", "CBT"…
$ month    <fct> Jan, Feb, Mar, Apr, May, Jan, Feb, Mar, Apr, May, Jan, Feb, M…
$ n        <int> 650, 527, 756, 633, 268, 545, 728, 415, 646, 576, 427, 430, 6…
$ mean_phq <dbl> 12.166667, 9.600000, 12.428571, 17.400000, 17.500000, 11.0000…
$ mean_gad <dbl> 15.500000, 10.400000, 11.428571, 12.600000, 3.500000, 8.40000…

Code

sum_medical |>
  group_by(month, therapy) |>
  ggplot(aes(month, mean_phq, color = therapy, group = therapy)) +
  geom_line() +
  geom_point() + 
  labs(title = "Mean PHQ over time",
       x = "Month",
       y = "Mean PHQ Score")

Code

sum_medical |>
  group_by(month, therapy) |>
  ggplot(aes(month, mean_gad, color = therapy, group = therapy)) +
  geom_line() +
  geom_point() +
  labs(title = "Mean GAD over time",
       x = "Month",
       y = "Mean GAD Score")

We can see the highest Mean PHQ score was in April with the Group therapy, which also had the largest drop in the following month. We can see the same in the GAD ggplot, where the highest GAD score was in January with the ACT therapy, followed by the largest drop.

Basketball Summarize + ggplot

Code

basketball_df |>
  ggplot(aes(quarter, scores, fill = teams)) +
  geom_col(position = "dodge") +
  labs(title = "Scores Per Team Per Quarter",
       x = "Quarters",
       y = "Score")

Code

basketball_df |>
  group_by(teams) |>
  slice_max(scores, n = 1, with_ties = FALSE) |>
  select(teams, quarter, scores)

# A tibble: 3 × 3
# Groups:   teams [3]
  teams    quarter scores
  <chr>    <fct>    <dbl>
1 Celtics  Q3          33
2 Lakers   Q2          32
3 Warriors Q3          35

From the ggplot we can see that the data is pretty average, where all four quarters had somewhat equal scoring. When we look at the summarization table, we can see that the Celtics highest scoring quarter was Q3, Warriors was Q3, and Lakers was Q2.