df <- data.frame(
Quarter = c("Q1","Q2","Q3","Q4"),
Lakers = c(28,32,25,30),
Celtics = c(31,29,33,28),
Warriors = c(25,27,35,30)
)
write.csv(df, "data_03.csv", row.names = FALSE)title: “Project 2 — Data Tidying” author: “Shawn Ganz” date: today format: html: toc: true code-fold: true theme: flatly execute: echo: true warning: false —
Approach
I’ll use the following data:
- https://brightspace.cuny.edu/d2l/le/1154241/discussions/threads/4168991/View
- df_1
- https://brightspace.cuny.edu/d2l/le/1154241/discussions/threads/4206056/View
- df_2
- https://brightspace.cuny.edu/d2l/le/1154241/discussions/threads/4224007/View
- df_3
- Wasn’t wide, so I have to make it wide
put csv’s onto github read csv in project
url -> "github_raw_link"library(tidyverse)df <- read_csv("url")
Analysis for df_1
Fix the brackets, there are too many.
- Summary table
- religion
- income bracket
- n values
- ggplot
- x = income brackets
- y = n
- fill = religion
- Description
- Talk about which ones has the most per bracket
- Maybe even
Analysis for df_2
The only real wide portion of the dataset is the months. So, make that into months using pivot_longer() and try and fix the craziness of screeningscores and pivot_wider() PHQ and GAD. Limit selection to a couple of rows and then see if there’s anything to analyze.
Could show off age, month, and values.
- Summary table
- group age, month
- summarize
- values
- ggplot
- x = age
- y = values
- cols
- Description
- findings
Analysis for df_3
Simple one, will just pivot_longer() team names into new column, values to score.
- Summary table
- entire dataset
- ggplot
- x = quarter
- y = scores
- fill = team
- cols
- Description
- review the chart
Codebase
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.2.0 ✔ readr 2.2.0
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.2 ✔ tibble 3.3.1
✔ lubridate 1.9.5 ✔ tidyr 1.3.2
✔ purrr 1.2.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
url_1 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_01.csv"
url_2 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_02.csv"
url_3 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_03.csv"
df_1 <- read_csv(url_1)Rows: 14 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Religion
dbl (10): <$10k, $10-20k, $20-30k, $30-40k, $40-50k, $50-75k, $75-100k, $100...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_2 <- read_csv(url_2)Rows: 180 Columns: 21
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (20): Participant, AgeGroup, Gender, City_State, ScreeningScores, Jan, F...
dbl (1): RecordID
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_3 <- read_csv(url_3)Rows: 4 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Quarter
dbl (3): Lakers, Celtics, Warriors
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#df_all <- c(df_1, df_2, df_3)
#glimpse(df_all <- c(df_1, df_2, df_3))
glimpse(df_1)Rows: 14
Columns: 11
$ Religion <chr> "Agnostic", "Atheist", "Buddhist", "Catholic", "E…
$ `<$10k` <dbl> 27, 12, 27, 418, 575, 9, 228, 20, 19, 289, 34, 23…
$ `$10-20k` <dbl> 34, 27, 21, 617, 869, 7, 244, 27, 19, 495, 42, 23…
$ `$20-30k` <dbl> 60, 37, 30, 732, 1064, 9, 236, 24, 25, 619, 37, 1…
$ `$30-40k` <dbl> 81, 52, 34, 670, 982, 11, 238, 24, 25, 655, 48, 1…
$ `$40-50k` <dbl> 76, 39, 33, 638, 881, 13, 197, 21, 30, 651, 51, 1…
$ `$50-75k` <dbl> 137, 81, 58, 1116, 1486, 34, 212, 30, 73, 1107, 1…
$ `$75-100k` <dbl> 102, 76, 62, 949, 949, 47, 156, 15, 59, 939, 87, …
$ `$100-150k` <dbl> 109, 59, 39, 792, 723, 48, 156, 11, 87, 792, 96, …
$ `>$150k` <dbl> 84, 74, 53, 792, 414, 54, 78, 6, 151, 753, 64, 41…
$ `Don't know/refused` <dbl> 96, 76, 54, 1163, 1529, 37, 339, 37, 87, 1096, 10…
glimpse(df_2)Rows: 180
Columns: 21
$ RecordID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ Participant <chr> "Participant_1", "Participant_2", "Participant_3", "Pa…
$ AgeGroup <chr> "unknown", "unknown", "26-35", "36-45", "36-45", "18-2…
$ Gender <chr> "F", "n/a", "M", "Female", "Female", "Female", "Other"…
$ City_State <chr> "Phoenix, AZ", "Houston, TX", "Chicago, IL", "Houston,…
$ ScreeningScores <chr> "PHQ9:9,GAD7:0", "PHQ-9=2;GAD-7=9", "PHQ-9=20;GAD-7=13…
$ Jan <chr> NA, NA, "-", "n/a", NA, "missing", "n/a", "140", "miss…
$ Feb <chr> "-", "missing", "missing", "n/a", NA, NA, NA, "119", "…
$ Mar <chr> "-", "n/a", "n/a", NA, "n/a", NA, NA, "111", "n/a", NA…
$ Apr <chr> "n/a", "n/a", "-", NA, "n/a", NA, "n/a", "-", NA, NA, …
$ May <chr> "-", "n/a", NA, "missing", "missing", NA, NA, "missing…
$ TherapyType <chr> "Group", "ACT", "n/a", "CBT", "Group", "CBT", "CBT", "…
$ Sessions <chr> "18", NA, "n/a", "n/a", "9", "8", "n/a", NA, "n/a", NA…
$ Medication <chr> NA, "SNRI", "None", "None", NA, "None", "SSRI", "SSRI"…
$ DiagnosisStatus <chr> "Undiagnosed", "-", "Pending", "Undiagnosed", "-", NA,…
$ WorkHours <chr> NA, NA, NA, "n/a", "36h", "n/a", "33 hours", "57h", NA…
$ Sleep <chr> "missing", NA, "missing", "missing", NA, NA, "8.7 hour…
$ StressScale <chr> NA, NA, NA, NA, "10", NA, NA, "7", "2", "3", "n/a", "n…
$ Insurance <chr> "Private", "-", "Public", NA, "Public", "Public", "-",…
$ SurveyDate <chr> "2024-03-12", "2025-05-21", "03-30-2024", "2025-06-14"…
$ Notes <chr> "missing", "missing", NA, "Follow-up needed", "Increas…
glimpse(df_3)Rows: 4
Columns: 4
$ Quarter <chr> "Q1", "Q2", "Q3", "Q4"
$ Lakers <dbl> 28, 32, 25, 30
$ Celtics <dbl> 31, 29, 33, 28
$ Warriors <dbl> 25, 27, 35, 30
df_1 |>
mutate(
low = rowSums(across(c(`<$10k`, `$10-20k`, `$20-30k`, `$30-40k`, `$40-50k`))),
mid = rowSums(across(c(, `$50-75k`, `$75-100k`))),
high = rowSums(across(c(`$100-150k`, `>$150k`)))) |>
select(Religion, low, mid, high) |>
pivot_longer(cols = -Religion, names_to = "income_range", values_to = "n")# A tibble: 42 × 3
Religion income_range n
<chr> <chr> <dbl>
1 Agnostic low 278
2 Agnostic mid 239
3 Agnostic high 193
4 Atheist low 167
5 Atheist mid 157
6 Atheist high 133
7 Buddhist low 145
8 Buddhist mid 120
9 Buddhist high 92
10 Catholic low 3075
# ℹ 32 more rows
df_2 |>
select(RecordID, AgeGroup, ScreeningScores, TherapyType, Jan, Feb, Mar, Apr, May) |>
rename(id = RecordID, scores = ScreeningScores, age = AgeGroup) |>
pivot_longer(cols = c(Jan, Feb, Mar, Apr, May), names_to = "month", values_to = "values") |>
mutate(scores = str_replace(scores, ",\\s*", ";"),
scores = str_replace_all(scores, "[:\\-=]", ""),
scores = str_replace(scores, "GAD7", ""),
scores = str_replace(scores, "PHQ9", ""))|>
separate(col = scores, into = c("PHQ", "GAD7"), sep = ";") |>
mutate(values = replace(values, values %in% c("-", "missing", "", "n/a"), NA))# A tibble: 900 × 7
id age PHQ GAD7 TherapyType month values
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 unknown 9 0 Group Jan <NA>
2 1 unknown 9 0 Group Feb <NA>
3 1 unknown 9 0 Group Mar <NA>
4 1 unknown 9 0 Group Apr <NA>
5 1 unknown 9 0 Group May <NA>
6 2 unknown 2 9 ACT Jan <NA>
7 2 unknown 2 9 ACT Feb <NA>
8 2 unknown 2 9 ACT Mar <NA>
9 2 unknown 2 9 ACT Apr <NA>
10 2 unknown 2 9 ACT May <NA>
# ℹ 890 more rows
df_3 |> rename(quarter = Quarter) |>
pivot_longer(cols = -quarter, names_to = "teams", values_to = "scores") |>
arrange(teams)# A tibble: 12 × 3
quarter teams scores
<chr> <chr> <dbl>
1 Q1 Celtics 31
2 Q2 Celtics 29
3 Q3 Celtics 33
4 Q4 Celtics 28
5 Q1 Lakers 28
6 Q2 Lakers 32
7 Q3 Lakers 25
8 Q4 Lakers 30
9 Q1 Warriors 25
10 Q2 Warriors 27
11 Q3 Warriors 35
12 Q4 Warriors 30