Code
df <- data.frame(
Quarter = c("Q1","Q2","Q3","Q4"),
Lakers = c(28,32,25,30),
Celtics = c(31,29,33,28),
Warriors = c(25,27,35,30)
)
write.csv(df, "data_03.csv", row.names = FALSE)I’ll use the following data:
df <- data.frame(
Quarter = c("Q1","Q2","Q3","Q4"),
Lakers = c(28,32,25,30),
Celtics = c(31,29,33,28),
Warriors = c(25,27,35,30)
)
write.csv(df, "data_03.csv", row.names = FALSE)put csv’s onto github read csv in project
url -> "github_raw_link"library(tidyverse)df <- read_csv("url")Analysis for df_1
Fix the brackets, there are too many.
Analysis for df_2
The only real wide portion of the dataset is the months. So, make that into months using pivot_longer() and try and fix the craziness of screeningscores and pivot_wider() PHQ and GAD. Limit selection to a couple of rows and then see if there’s anything to analyze.
Could show off age, month, and values.
Analysis for df_3
Simple one, will just pivot_longer() team names into new column, values to score.
Reading
library(tidyverse)
url_1 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_01.csv"
url_2 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_02.csv"
url_3 <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/project_02/data_03.csv"
df_1 <- read_csv(url_1)
df_2 <- read_csv(url_2)
df_3 <- read_csv(url_3)
#df_all <- c(df_1, df_2, df_3)
#glimpse(df_all <- c(df_1, df_2, df_3))
glimpse(df_1)Rows: 14
Columns: 11
$ Religion <chr> "Agnostic", "Atheist", "Buddhist", "Catholic", "E…
$ `<$10k` <dbl> 27, 12, 27, 418, 575, 9, 228, 20, 19, 289, 34, 23…
$ `$10-20k` <dbl> 34, 27, 21, 617, 869, 7, 244, 27, 19, 495, 42, 23…
$ `$20-30k` <dbl> 60, 37, 30, 732, 1064, 9, 236, 24, 25, 619, 37, 1…
$ `$30-40k` <dbl> 81, 52, 34, 670, 982, 11, 238, 24, 25, 655, 48, 1…
$ `$40-50k` <dbl> 76, 39, 33, 638, 881, 13, 197, 21, 30, 651, 51, 1…
$ `$50-75k` <dbl> 137, 81, 58, 1116, 1486, 34, 212, 30, 73, 1107, 1…
$ `$75-100k` <dbl> 102, 76, 62, 949, 949, 47, 156, 15, 59, 939, 87, …
$ `$100-150k` <dbl> 109, 59, 39, 792, 723, 48, 156, 11, 87, 792, 96, …
$ `>$150k` <dbl> 84, 74, 53, 792, 414, 54, 78, 6, 151, 753, 64, 41…
$ `Don't know/refused` <dbl> 96, 76, 54, 1163, 1529, 37, 339, 37, 87, 1096, 10…
glimpse(df_2)Rows: 180
Columns: 21
$ RecordID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ Participant <chr> "Participant_1", "Participant_2", "Participant_3", "Pa…
$ AgeGroup <chr> "unknown", "unknown", "26-35", "36-45", "36-45", "18-2…
$ Gender <chr> "F", "n/a", "M", "Female", "Female", "Female", "Other"…
$ City_State <chr> "Phoenix, AZ", "Houston, TX", "Chicago, IL", "Houston,…
$ ScreeningScores <chr> "PHQ9:9,GAD7:0", "PHQ-9=2;GAD-7=9", "PHQ-9=20;GAD-7=13…
$ Jan <chr> NA, NA, "-", "n/a", NA, "missing", "n/a", "140", "miss…
$ Feb <chr> "-", "missing", "missing", "n/a", NA, NA, NA, "119", "…
$ Mar <chr> "-", "n/a", "n/a", NA, "n/a", NA, NA, "111", "n/a", NA…
$ Apr <chr> "n/a", "n/a", "-", NA, "n/a", NA, "n/a", "-", NA, NA, …
$ May <chr> "-", "n/a", NA, "missing", "missing", NA, NA, "missing…
$ TherapyType <chr> "Group", "ACT", "n/a", "CBT", "Group", "CBT", "CBT", "…
$ Sessions <chr> "18", NA, "n/a", "n/a", "9", "8", "n/a", NA, "n/a", NA…
$ Medication <chr> NA, "SNRI", "None", "None", NA, "None", "SSRI", "SSRI"…
$ DiagnosisStatus <chr> "Undiagnosed", "-", "Pending", "Undiagnosed", "-", NA,…
$ WorkHours <chr> NA, NA, NA, "n/a", "36h", "n/a", "33 hours", "57h", NA…
$ Sleep <chr> "missing", NA, "missing", "missing", NA, NA, "8.7 hour…
$ StressScale <chr> NA, NA, NA, NA, "10", NA, NA, "7", "2", "3", "n/a", "n…
$ Insurance <chr> "Private", "-", "Public", NA, "Public", "Public", "-",…
$ SurveyDate <chr> "2024-03-12", "2025-05-21", "03-30-2024", "2025-06-14"…
$ Notes <chr> "missing", "missing", NA, "Follow-up needed", "Increas…
glimpse(df_3)Rows: 4
Columns: 4
$ Quarter <chr> "Q1", "Q2", "Q3", "Q4"
$ Lakers <dbl> 28, 32, 25, 30
$ Celtics <dbl> 31, 29, 33, 28
$ Warriors <dbl> 25, 27, 35, 30
Cleaning
religion_df <- df_1 |>
rename(religion = Religion) |>
mutate(
low = rowSums(across(c(`<$10k`, `$10-20k`, `$20-30k`, `$30-40k`, `$40-50k`))),
mid = rowSums(across(c(, `$50-75k`, `$75-100k`))),
high = rowSums(across(c(`$100-150k`, `>$150k`)))) |>
select(religion, low, mid, high) |>
pivot_longer(cols = -religion, names_to = "income_range", values_to = "n")
glimpse(religion_df)Rows: 42
Columns: 3
$ religion <chr> "Agnostic", "Agnostic", "Agnostic", "Atheist", "Atheist",…
$ income_range <chr> "low", "mid", "high", "low", "mid", "high", "low", "mid",…
$ n <dbl> 278, 239, 193, 167, 157, 133, 145, 120, 92, 3075, 2065, 1…
medical_df <- df_2 |>
select(RecordID, AgeGroup, ScreeningScores, TherapyType, Jan, Feb, Mar, Apr, May) |>
rename(id = RecordID, scores = ScreeningScores, age = AgeGroup) |>
pivot_longer(cols = c(Jan, Feb, Mar, Apr, May), names_to = "month", values_to = "values") |>
mutate(scores = str_replace(scores, ",\\s*", ";"),
scores = str_replace_all(scores, "[:\\-=]", ""),
scores = str_replace(scores, "GAD7", ""),
scores = str_replace(scores, "PHQ9", ""))|>
separate(col = scores, into = c("PHQ", "GAD7"), sep = ";") |>
mutate(values = replace(values, values %in% c("-", "missing", "", "n/a"), NA)) |>
mutate(value = as.integer(values), PHQ = as.integer(PHQ), GAD7 = as.integer(GAD7)) |>
select(-values) |>
mutate(TherapyType = ifelse(TherapyType == "n/a", "None", TherapyType),
month = factor(month, levels = c("Jan", "Feb","Mar", "Apr", "May")))
glimpse(medical_df)Rows: 900
Columns: 7
$ id <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4…
$ age <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "un…
$ PHQ <int> 9, 9, 9, 9, 9, 2, 2, 2, 2, 2, 20, 20, 20, 20, 20, 24, 24, …
$ GAD7 <int> 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 13, 13, 13, 13, 13, 13, 13, …
$ TherapyType <chr> "Group", "Group", "Group", "Group", "Group", "ACT", "ACT",…
$ month <fct> Jan, Feb, Mar, Apr, May, Jan, Feb, Mar, Apr, May, Jan, Feb…
$ value <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
basketball_df <- df_3 |> rename(quarter = Quarter) |>
pivot_longer(cols = -quarter, names_to = "teams", values_to = "scores") |>
mutate(quarter = factor(quarter, levels = c("Q1","Q2","Q3","Q4"))) |>
arrange(teams)
glimpse(basketball_df)Rows: 12
Columns: 3
$ quarter <fct> Q1, Q2, Q3, Q4, Q1, Q2, Q3, Q4, Q1, Q2, Q3, Q4
$ teams <chr> "Celtics", "Celtics", "Celtics", "Celtics", "Lakers", "Lakers"…
$ scores <dbl> 31, 29, 33, 28, 28, 32, 25, 30, 25, 27, 35, 30
Religion Summarize + ggplot
library(ggthemes)
sum_religion <- religion_df |>
mutate(income_range = factor(income_range, levels = c("low", "mid", "high"))) |>
group_by(religion, income_range) |>
summarize(n = sum(n))
sum_religion |>
ggplot(aes(x = income_range, y = n, fill = religion)) +
geom_col() +
coord_flip() +
labs(
title = "Religion counts by income range",
x = "income range",
y = "count")religion_df |> group_by(religion) |>
summarize(n = sum(n)) |>
arrange(desc(n)) |>
slice_head(n = 3)# A tibble: 3 × 2
religion n
<chr> <dbl>
1 Evangelical 7943
2 Catholic 6724
3 Mainline Protestant 6300
religion_df |> group_by(religion) |>
summarize(n = sum(n)) |>
arrange(n) |>
slice_head(n = 3)# A tibble: 3 × 2
religion n
<chr> <dbl>
1 Jehovah's Witness 178
2 Hindu 232
3 Muslim 234
According to the religion data, there are more patrons that are of the low income range ($0 - $50k), than the other income ranges. One notable pattern is that Orthodox has more patrons that are high income than mid income that can be visually determined.
The religions with the highest count of patrons: - Evangelical (7943) - Catholic (6724) - Mainline Protestant (6300)
The religions with the lowest count of patrons: - Jehovah’s Witness (178) - Hindu (232) - Muslim (234)
Medical Summarize + ggplot
sum_medical <- medical_df |>
rename(therapy = TherapyType) |>
filter(!is.na(value)) |>
group_by(therapy, month) |>
summarize(n = sum(value),
mean_phq = mean(PHQ, na.rm = TRUE),
mean_gad = mean(GAD7, na.rm = TRUE),
.groups = "drop")
glimpse(sum_medical)Rows: 25
Columns: 5
$ therapy <chr> "ACT", "ACT", "ACT", "ACT", "ACT", "CBT", "CBT", "CBT", "CBT"…
$ month <fct> Jan, Feb, Mar, Apr, May, Jan, Feb, Mar, Apr, May, Jan, Feb, M…
$ n <int> 650, 527, 756, 633, 268, 545, 728, 415, 646, 576, 427, 430, 6…
$ mean_phq <dbl> 12.166667, 9.600000, 12.428571, 17.400000, 17.500000, 11.0000…
$ mean_gad <dbl> 15.500000, 10.400000, 11.428571, 12.600000, 3.500000, 8.40000…
sum_medical |>
group_by(month, therapy) |>
ggplot(aes(month, mean_phq, color = therapy, group = therapy)) +
geom_line() +
geom_point() +
labs(title = "Mean PHQ over time",
x = "Month",
y = "Mean PHQ Score")sum_medical |>
group_by(month, therapy) |>
ggplot(aes(month, mean_gad, color = therapy, group = therapy)) +
geom_line() +
geom_point() +
labs(title = "Mean GAD over time",
x = "Month",
y = "Mean GAD Score")We can see the highest Mean PHQ score was in April with the Group therapy, which also had the largest drop in the following month. We can see the same in the GAD ggplot, where the highest GAD score was in January with the ACT therapy, followed by the largest drop.
Basketball Summarize + ggplot
basketball_df |>
ggplot(aes(quarter, scores, fill = teams)) +
geom_col(position = "dodge") +
labs(title = "Scores Per Team Per Quarter",
x = "Quarters",
y = "Score")basketball_df |>
group_by(teams) |>
slice_max(scores, n = 1, with_ties = FALSE) |>
select(teams, quarter, scores)# A tibble: 3 × 3
# Groups: teams [3]
teams quarter scores
<chr> <fct> <dbl>
1 Celtics Q3 33
2 Lakers Q2 32
3 Warriors Q3 35
From the ggplot we can see that the data is pretty average, where all four quarters had somewhat equal scoring. When we look at the summarization table, we can see that the Celtics highest scoring quarter was Q3, Warriors was Q3, and Lakers was Q2.