Setup and Data Load
library(tidyverse)
df <- read_csv("C:/Users/shelb/OneDrive/Desktop/Datasets/Durham City Homelessness/homelessness_shelter_data.csv")
# Create clean_season and beds_rate immediately after load
df <- df |>
mutate(
month_num = as.integer(format(as.Date(date), "%m")),
clean_season = case_when(
month_num %in% c(12, 1, 2) ~ "Winter",
month_num %in% c(3, 4, 5) ~ "Spring",
month_num %in% c(6, 7, 8) ~ "Summer",
month_num %in% c(9, 10, 11) ~ "Autumn"
),
beds_rate = occupied_beds / total_capacity * 100
)
glimpse(df)
#> Rows: 1,000
#> Columns: 17
#> $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
#> $ date <date> 2023-10-05, 2023-08-01, 2025-04-27, 2024-10-05, 202…
#> $ shelter_name <chr> "Sunrise Shelter", "Safe Haven", "Second Chance", "H…
#> $ city <chr> "Dallas", "Dallas", "San Jose", "San Jose", "San Ant…
#> $ state <chr> "TX", "TX", "CA", "CA", "TX", "CA", "AZ", "TX", "TX"…
#> $ total_capacity <dbl> 85, 128, 190, 115, 155, 81, 261, 105, 241, 181, 152,…
#> $ occupied_beds <dbl> 62, 117, 144, 7, 65, 61, 108, 83, 179, 96, 134, 147,…
#> $ available_beds <dbl> 23, 11, 46, 108, 90, 20, 153, 22, 62, 85, 18, 123, 6…
#> $ occupancy_rate <dbl> 72.9, 91.4, 75.8, 6.1, 41.9, 75.3, 41.4, 79.0, 74.3,…
#> $ average_age <dbl> 19, 58, 56, 63, 21, 41, 26, 51, 21, 24, 54, 37, 47, …
#> $ male_percentage <dbl> 43, 45, 52, 48, 48, 51, 44, 52, 67, 56, 61, 63, 66, …
#> $ female_percentage <dbl> 57, 55, 48, 52, 52, 49, 56, 48, 33, 44, 39, 37, 34, …
#> $ season <chr> "Spring", "Summer", "Summer", "Autumn", "Summer", "A…
#> $ notes <chr> "High demand", "No issues reported", "Excellent cond…
#> $ month_num <int> 10, 8, 4, 10, 9, 9, 6, 11, 7, 2, 2, 4, 11, 8, 5, 1, …
#> $ clean_season <chr> "Autumn", "Summer", "Spring", "Autumn", "Autumn", "A…
#> $ beds_rate <dbl> 72.941176, 91.406250, 75.789474, 6.086957, 41.935484…
date_min <- format(min(as.Date(df$date)), "%B %Y")
date_max <- format(max(as.Date(df$date)), "%B %Y")
n_records <- nrow(df)
n_shelters <- df |> distinct(shelter_name, city) |> nrow()
n_cities <- n_distinct(df$city)
n_states <- n_distinct(df$state)
cat("Date Range:", date_min, "to", date_max, "\n")
#> Date Range: July 2023 to July 2025
cat("Total Records:", n_records, "\n")
#> Total Records: 1000
cat("Unique Shelters:", n_shelters, "\n")
#> Unique Shelters: 100
cat("Cities:", n_cities, "\n")
#> Cities: 10
cat("States:", n_states, "\n")
#> States: 6
Data Quality Checks
# Missing values by column
df |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
filter(n_missing > 0)
#> # A tibble: 1 × 2
#> variable n_missing
#> <chr> <int>
#> 1 notes 87
# Occupancy rate column vs beds-calculated rate
df |>
mutate(
calc_rate = round(occupied_beds / total_capacity * 100, 1),
rate_mismatch = abs(occupancy_rate - calc_rate) > 1
) |>
summarise(
n_mismatches = sum(rate_mismatch),
pct_mismatch = round(mean(rate_mismatch) * 100, 1)
)
#> # A tibble: 1 × 2
#> n_mismatches pct_mismatch
#> <int> <dbl>
#> 1 0 0
# Occupied + available beds check
df |>
mutate(bed_check = (occupied_beds + available_beds) == total_capacity) |>
summarise(
n_mismatch = sum(!bed_check),
pct_mismatch = round(mean(!bed_check) * 100, 1)
)
#> # A tibble: 1 × 2
#> n_mismatch pct_mismatch
#> <int> <dbl>
#> 1 0 0
# Extreme occupancy records
df |>
filter(beds_rate == 0 | beds_rate == 100) |>
select(date, shelter_name, city, total_capacity, occupied_beds, beds_rate, notes) |>
arrange(beds_rate)
#> # A tibble: 15 × 7
#> date shelter_name city total_capacity occupied_beds beds_rate notes
#> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
#> 1 2024-10-15 Second Chance Phil… 52 0 0 Rece…
#> 2 2025-03-19 Sunrise Shelter San … 162 0 0 Staf…
#> 3 2024-04-19 Second Chance San … 51 0 0 Over…
#> 4 2023-12-03 Recovery Resid… San … 254 0 0 Over…
#> 5 2025-04-16 Safe Haven Phil… 73 0 0 No i…
#> 6 2024-07-06 HomeSafe Chic… 136 0 0 <NA>
#> 7 2024-06-08 HomeSafe San … 271 271 100 Rece…
#> 8 2025-02-18 HomeSafe San … 68 68 100 Unde…
#> 9 2024-01-10 New Beginnings San … 153 153 100 Exce…
#> 10 2025-01-30 Shelter Plus New … 64 64 100 Over…
#> 11 2024-09-11 Shelter Plus Dall… 116 116 100 Over…
#> 12 2024-03-19 Harbor Home Los … 124 124 100 Need…
#> 13 2024-01-12 Sunrise Shelter Phoe… 294 294 100 Staf…
#> 14 2025-07-11 New Beginnings San … 92 92 100 Over…
#> 15 2023-08-25 Shelter Plus San … 59 59 100 Rece…
# Confirm clean_season assignments
df |>
mutate(month = format(as.Date(date), "%B")) |>
distinct(clean_season, month_num, month) |>
arrange(clean_season, month_num)
#> # A tibble: 12 × 3
#> clean_season month_num month
#> <chr> <int> <chr>
#> 1 Autumn 9 September
#> 2 Autumn 10 October
#> 3 Autumn 11 November
#> 4 Spring 3 March
#> 5 Spring 4 April
#> 6 Spring 5 May
#> 7 Summer 6 June
#> 8 Summer 7 July
#> 9 Summer 8 August
#> 10 Winter 1 January
#> 11 Winter 2 February
#> 12 Winter 12 December
Descriptive Statistics
df |>
summarise(
mean_occupancy = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
median_occupancy = round(median(beds_rate), 1),
sd_occupancy = round(sd(beds_rate), 1),
min_occupancy = round(min(beds_rate), 1),
max_occupancy = round(max(beds_rate), 1)
)
#> # A tibble: 1 × 5
#> mean_occupancy median_occupancy sd_occupancy min_occupancy max_occupancy
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 51.3 51.8 28.9 0 100
df |>
mutate(
occupancy_group = case_when(
beds_rate < 20 ~ "Under 20%",
beds_rate < 40 ~ "20-39%",
beds_rate < 60 ~ "40-59%",
beds_rate < 80 ~ "60-79%",
beds_rate < 95 ~ "80-94%",
TRUE ~ "95-100%"
),
occupancy_group = factor(occupancy_group, levels = c(
"Under 20%", "20-39%", "40-59%", "60-79%", "80-94%", "95-100%"
))
) |>
count(occupancy_group) |>
mutate(pct = round(n / sum(n) * 100, 1)) |>
rename(Group = occupancy_group, Count = n, Percent = pct)
#> # A tibble: 6 × 3
#> Group Count Percent
#> <fct> <int> <dbl>
#> 1 Under 20% 185 18.5
#> 2 20-39% 201 20.1
#> 3 40-59% 188 18.8
#> 4 60-79% 226 22.6
#> 5 80-94% 154 15.4
#> 6 95-100% 46 4.6
Available Beds
df |>
summarise(
mean_available = round(mean(available_beds), 1),
median_available = round(median(available_beds), 1),
records_zero_beds = sum(available_beds == 0),
pct_zero_beds = round(mean(available_beds == 0) * 100, 1),
records_under10 = sum(available_beds < 10),
pct_under10 = round(mean(available_beds < 10) * 100, 1)
)
#> # A tibble: 1 × 6
#> mean_available median_available records_zero_beds pct_zero_beds
#> <dbl> <dbl> <int> <dbl>
#> 1 87.2 71 9 0.9
#> # ℹ 2 more variables: records_under10 <int>, pct_under10 <dbl>
City-Level Summary
df |>
group_by(city, state) |>
summarise(
n_records = n(),
n_shelters = n_distinct(shelter_name),
occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
mean_available = round(mean(available_beds), 1),
total_capacity = sum(total_capacity),
total_occupied = sum(occupied_beds),
total_available = sum(available_beds),
.groups = "drop"
) |>
arrange(desc(occ_rate)) |>
print(width = Inf)
#> # A tibble: 10 × 9
#> city state n_records n_shelters occ_rate mean_available
#> <chr> <chr> <int> <int> <dbl> <dbl>
#> 1 San Jose CA 90 10 55.3 85.2
#> 2 Chicago IL 115 10 53.7 83.7
#> 3 Phoenix AZ 82 10 53.1 88
#> 4 Dallas TX 101 10 52.3 80.8
#> 5 San Diego CA 97 10 52.2 88.9
#> 6 San Antonio TX 108 10 51 84.6
#> 7 Houston TX 104 10 50.9 88.7
#> 8 Philadelphia PA 93 10 49.6 89.2
#> 9 Los Angeles CA 116 10 49.2 89.9
#> 10 New York NY 94 10 45.3 94.5
#> total_capacity total_occupied total_available
#> <dbl> <dbl> <dbl>
#> 1 17169 9501 7668
#> 2 20804 11179 9625
#> 3 15390 8174 7216
#> 4 17094 8938 8156
#> 5 18023 9404 8619
#> 6 18633 9496 9137
#> 7 18775 9552 9223
#> 8 16462 8171 8291
#> 9 20537 10106 10431
#> 10 16236 7353 8883
city_season <- df |>
group_by(city, state, clean_season) |>
summarise(
occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
.groups = "drop"
) |>
pivot_wider(names_from = clean_season, values_from = occ_rate)
city_overall <- df |>
group_by(city, state) |>
summarise(
Overall = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
.groups = "drop"
)
city_table <- left_join(city_season, city_overall, by = c("city", "state")) |>
select(city, state, Winter, Spring, Summer, Autumn, Overall) |>
arrange(state, city) |>
rename(City = city, State = state)
knitr::kable(city_table, caption = "Beds-Based Occupancy Rate (%) by City and Season")
Beds-Based Occupancy Rate (%) by City and Season
| Phoenix |
AZ |
54.1 |
57.4 |
50.3 |
49.9 |
53.1 |
| Los Angeles |
CA |
45.2 |
50.6 |
46.4 |
53.6 |
49.2 |
| San Diego |
CA |
56.2 |
55.1 |
43.5 |
56.9 |
52.2 |
| San Jose |
CA |
54.5 |
55.8 |
57.5 |
53.8 |
55.3 |
| Chicago |
IL |
51.6 |
57.5 |
55.8 |
51.1 |
53.7 |
| New York |
NY |
47.6 |
51.8 |
44.4 |
35.5 |
45.3 |
| Philadelphia |
PA |
58.4 |
53.9 |
44.4 |
45.8 |
49.6 |
| Dallas |
TX |
47.1 |
56.3 |
49.8 |
55.0 |
52.3 |
| Houston |
TX |
46.5 |
46.4 |
50.6 |
59.9 |
50.9 |
| San Antonio |
TX |
54.7 |
56.4 |
45.9 |
47.6 |
51.0 |
Shelter-Level Analysis
shelter_avg <- df |>
group_by(shelter_name, city, state) |>
summarise(
n = n(),
occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
.groups = "drop"
) |>
filter(n >= 5)
cat("--- Top 10 Highest Occupancy (Beds-Based) ---\n")
#> --- Top 10 Highest Occupancy (Beds-Based) ---
shelter_avg |>
arrange(desc(occ_rate)) |>
slice_head(n = 10) |>
print(width = Inf)
#> # A tibble: 10 × 5
#> shelter_name city state n occ_rate
#> <chr> <chr> <chr> <int> <dbl>
#> 1 Sunrise Shelter Phoenix AZ 6 72.8
#> 2 Recovery Residence San Diego CA 9 72.2
#> 3 New Beginnings San Jose CA 9 69
#> 4 Hope House San Jose CA 9 68.9
#> 5 Safe Haven San Jose CA 6 68.8
#> 6 Pathway Place Philadelphia PA 12 67
#> 7 Pathway Place San Antonio TX 10 66.1
#> 8 Sunrise Shelter San Antonio TX 11 66
#> 9 Shelter Plus Los Angeles CA 15 65.7
#> 10 Hope House Philadelphia PA 6 65.2
cat("\n--- Top 10 Lowest Occupancy (Beds-Based) ---\n")
#>
#> --- Top 10 Lowest Occupancy (Beds-Based) ---
shelter_avg |>
arrange(occ_rate) |>
slice_head(n = 10) |>
print(width = Inf)
#> # A tibble: 10 × 5
#> shelter_name city state n occ_rate
#> <chr> <chr> <chr> <int> <dbl>
#> 1 HomeSafe New York NY 9 21.5
#> 2 Hope House Los Angeles CA 7 32
#> 3 Recovery Residence San Jose CA 8 32.1
#> 4 Sunrise Shelter Philadelphia PA 8 34.2
#> 5 Second Chance New York NY 7 34.8
#> 6 Harbor Home Los Angeles CA 13 35.4
#> 7 Pathway Place Dallas TX 7 36.2
#> 8 Hope House Phoenix AZ 5 37.8
#> 9 Sunrise Shelter San Diego CA 12 38.2
#> 10 Shelter Plus San Antonio TX 11 38.7
df |>
group_by(shelter_name, city, state) |>
summarise(
n_records = n(),
occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
sd_occ = round(sd(beds_rate), 1),
min_occ = round(min(beds_rate), 1),
max_occ = round(max(beds_rate), 1),
.groups = "drop"
) |>
filter(n_records >= 5) |>
arrange(desc(sd_occ)) |>
print(width = Inf)
#> # A tibble: 98 × 8
#> shelter_name city state n_records occ_rate sd_occ min_occ
#> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl>
#> 1 Pathway Place San Jose CA 12 60.4 39.1 0.3
#> 2 Recovery Residence Phoenix AZ 12 53.5 36.4 4.4
#> 3 Harbor Home Philadelphia PA 7 41.4 35.7 9.7
#> 4 HomeSafe Phoenix AZ 8 47 35.4 0.6
#> 5 Hope House San Diego CA 8 41.9 34.8 1.6
#> 6 Harbor Home Dallas TX 10 48 34.7 6.7
#> 7 Second Chance San Diego CA 8 59.3 34.7 15.4
#> 8 Sunrise Shelter Houston TX 10 49.2 34.6 3
#> 9 Pathway Place Dallas TX 7 36.2 34.5 6
#> 10 Safe Haven Dallas TX 13 55.6 34.5 9.1
#> 11 Hope House Chicago IL 11 54.3 34.4 4.9
#> 12 Shelter Plus Dallas TX 11 57.2 34.3 8.1
#> 13 New Beginnings New York NY 10 44.8 34.1 1
#> 14 Safe Haven Houston TX 5 49.3 34 4.5
#> 15 New Beginnings Houston TX 9 58.9 33.8 7.5
#> 16 New Beginnings Philadelphia PA 11 58.5 33.8 5.1
#> 17 Shelter Plus Philadelphia PA 11 43.9 33.5 1
#> 18 Second Chance Philadelphia PA 7 51 33.4 0
#> 19 Second Chance Los Angeles CA 5 49.5 33.3 10.4
#> 20 Sunrise Shelter San Diego CA 12 38.2 32.9 0
#> 21 Recovery Residence San Diego CA 9 72.2 32.8 8.2
#> 22 Sunrise Shelter Los Angeles CA 19 43.8 32.4 3.8
#> 23 Safe Haven New York NY 8 52.6 32.3 11.3
#> 24 Harbor Home San Jose CA 11 39.6 32.2 0.5
#> 25 Safe Haven Chicago IL 7 54.2 32.1 5.3
#> 26 Second Chance Phoenix AZ 6 41.2 32.1 2.2
#> 27 Sunrise Shelter San Antonio TX 11 66 32.1 4.7
#> 28 HomeSafe Chicago IL 9 51.5 31.8 0
#> 29 Safe Haven Phoenix AZ 9 49.2 31.8 1.8
#> 30 Shelter Plus San Jose CA 7 56.9 31.8 0.7
#> 31 Shelter Plus San Antonio TX 11 38.7 31.5 2.4
#> 32 Harbor Home San Diego CA 11 52.7 31.4 2.8
#> 33 Second Chance New York NY 7 34.8 31.4 5.7
#> 34 New Beginnings Dallas TX 9 47.7 31.3 2.3
#> 35 Sunrise Shelter Philadelphia PA 8 34.2 31.3 2.9
#> 36 Pathway Place Houston TX 5 61 31.2 8.5
#> 37 HomeSafe Dallas TX 14 49.1 30.9 0.8
#> 38 Pathway Place Los Angeles CA 11 57.8 30.7 2.6
#> 39 Shelter Plus Houston TX 14 48.5 30.7 0.5
#> 40 Harbor Home Los Angeles CA 13 35.4 30.6 6.6
#> 41 HomeSafe San Jose CA 9 54.5 30.6 5.8
#> 42 Second Chance San Antonio TX 13 47.8 30.5 6.3
#> 43 New Beginnings San Antonio TX 18 40.3 30.3 0.8
#> 44 New Beginnings San Diego CA 9 45.5 30.1 11.7
#> 45 HomeSafe Philadelphia PA 15 45.7 29.9 4.7
#> 46 Pathway Place New York NY 9 39.5 29.5 5.1
#> 47 Sunrise Shelter Chicago IL 16 55 29.5 1.1
#> 48 Recovery Residence San Jose CA 8 32.1 28.6 0
#> 49 Shelter Plus Phoenix AZ 8 58.4 28.6 27.4
#> 50 Harbor Home Phoenix AZ 9 65.1 28.5 1.3
#> 51 HomeSafe San Diego CA 8 54.3 28.5 19.2
#> 52 Safe Haven Los Angeles CA 11 45.3 28.5 1.4
#> 53 New Beginnings San Jose CA 9 69 28.3 20
#> 54 Safe Haven Philadelphia PA 9 40.1 28.2 0
#> 55 Recovery Residence San Antonio TX 10 61.3 28.1 10.5
#> 56 Recovery Residence Philadelphia PA 7 42 28 8.1
#> 57 Sunrise Shelter Dallas TX 11 49.2 27.9 5.1
#> 58 Pathway Place Phoenix AZ 8 48.4 27.7 18.4
#> 59 Sunrise Shelter Phoenix AZ 6 72.8 27.7 35.4
#> 60 New Beginnings Chicago IL 19 55.3 27.5 3.8
#> 61 Recovery Residence Los Angeles CA 12 54.1 27.4 22
#> 62 Pathway Place Chicago IL 9 58.2 27.3 8.9
#> 63 Recovery Residence Houston TX 10 43.4 27.3 15.1
#> 64 Sunrise Shelter New York NY 11 51.4 27.3 3.2
#> 65 Sunrise Shelter San Jose CA 9 52.5 27.2 8.4
#> 66 Shelter Plus Los Angeles CA 15 65.7 26.6 8.5
#> 67 Pathway Place San Antonio TX 10 66.1 26.5 20
#> 68 Harbor Home San Antonio TX 15 49 26.2 2.9
#> 69 Second Chance San Jose CA 10 53.2 26.2 0
#> 70 Hope House New York NY 12 44.2 26 6.3
#> 71 Second Chance Houston TX 11 64.2 25.8 13
#> 72 Shelter Plus Chicago IL 9 53.4 25.8 13.6
#> 73 Shelter Plus San Diego CA 15 54.6 25.7 11.4
#> 74 HomeSafe Houston TX 11 52 25.6 19
#> 75 Hope House Phoenix AZ 5 37.8 25.6 0.4
#> 76 Recovery Residence Chicago IL 11 56.1 25.5 23
#> 77 Recovery Residence New York NY 9 63.7 25.4 22.6
#> 78 HomeSafe Los Angeles CA 11 50.9 25.2 23.8
#> 79 Second Chance Chicago IL 9 56.8 25 24.9
#> 80 Harbor Home Chicago IL 15 45.6 24.6 2.7
#> 81 Harbor Home New York NY 11 42.7 24.6 14.5
#> 82 Hope House Los Angeles CA 7 32 24.6 3.2
#> 83 Hope House Houston TX 17 49.6 24.3 10.6
#> 84 Safe Haven San Diego CA 11 54.3 24.2 7
#> 85 Hope House San Antonio TX 13 51.1 24.1 12.9
#> 86 Shelter Plus New York NY 8 62.9 23.9 31.1
#> 87 Second Chance Dallas TX 7 53.4 23.2 23.6
#> 88 Harbor Home Houston TX 12 39 23 4.6
#> 89 Hope House Philadelphia PA 6 65.2 22.5 25
#> 90 Pathway Place San Diego CA 6 44.5 22.4 20.2
#> 91 Hope House Dallas TX 12 61.9 21.6 14.6
#> 92 New Beginnings Phoenix AZ 11 52.5 21.6 12.7
#> 93 Pathway Place Philadelphia PA 12 67 21.5 16.1
#> 94 New Beginnings Los Angeles CA 12 53.7 21 12.7
#> 95 Recovery Residence Dallas TX 7 59.4 19.9 25.3
#> 96 Safe Haven San Jose CA 6 68.8 18.8 33.1
#> 97 Hope House San Jose CA 9 68.9 17.6 44.1
#> 98 HomeSafe New York NY 9 21.5 12.3 3.9
#> max_occ
#> <dbl>
#> 1 98.7
#> 2 98.2
#> 3 94.6
#> 4 97.6
#> 5 96.6
#> 6 96
#> 7 96.6
#> 8 98.1
#> 9 93.8
#> 10 98.3
#> 11 95.6
#> 12 100
#> 13 94.6
#> 14 75.6
#> 15 96.1
#> 16 99.2
#> 17 90.2
#> 18 85.7
#> 19 86.8
#> 20 94.6
#> 21 96.9
#> 22 96.1
#> 23 98.2
#> 24 93.2
#> 25 91.5
#> 26 79.7
#> 27 99.1
#> 28 80.5
#> 29 85.4
#> 30 95.5
#> 31 100
#> 32 95.8
#> 33 79.7
#> 34 92.6
#> 35 85.7
#> 36 87.7
#> 37 96.7
#> 38 98.1
#> 39 94.5
#> 40 100
#> 41 100
#> 42 95.9
#> 43 100
#> 44 92.6
#> 45 96.4
#> 46 93.9
#> 47 98.8
#> 48 79.3
#> 49 89.9
#> 50 94.7
#> 51 100
#> 52 87.2
#> 53 100
#> 54 92.2
#> 55 93.1
#> 56 77.4
#> 57 94.9
#> 58 90.3
#> 59 100
#> 60 95
#> 61 94.2
#> 62 94.4
#> 63 88.2
#> 64 96.4
#> 65 90.9
#> 66 97.5
#> 67 95
#> 68 76.5
#> 69 86.1
#> 70 89
#> 71 97.8
#> 72 94.4
#> 73 83.2
#> 74 95.6
#> 75 68.9
#> 76 98.2
#> 77 86.8
#> 78 98.5
#> 79 92.5
#> 80 85.8
#> 81 87.5
#> 82 78.5
#> 83 90.8
#> 84 97.8
#> 85 93.3
#> 86 100
#> 87 79.4
#> 88 76.1
#> 89 91
#> 90 78.1
#> 91 94.8
#> 92 80.4
#> 93 97.2
#> 94 86.8
#> 95 87.1
#> 96 84.2
#> 97 92.2
#> 98 35.4
Seasonal Analysis
df |>
group_by(clean_season) |>
summarise(
n_records = n(),
occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
median_occ = round(median(beds_rate), 1),
mean_available = round(mean(available_beds), 1),
.groups = "drop"
) |>
arrange(desc(occ_rate))
#> # A tibble: 4 × 5
#> clean_season n_records occ_rate median_occ mean_available
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 Spring 241 54 57.6 81.7
#> 2 Autumn 264 51.2 51.5 88.9
#> 3 Winter 237 51.2 52.7 85.1
#> 4 Summer 258 49 46 92.7
Notes Analysis
df |>
count(notes) |>
arrange(desc(n)) |>
mutate(pct = round(n / sum(n) * 100, 1)) |>
rename(Note = notes, Count = n, Percent = pct)
#> # A tibble: 9 × 3
#> Note Count Percent
#> <chr> <int> <dbl>
#> 1 Overcrowded 130 13
#> 2 Needs maintenance 120 12
#> 3 Recently renovated 119 11.9
#> 4 High demand 113 11.3
#> 5 Staff shortage 112 11.2
#> 6 No issues reported 107 10.7
#> 7 Under inspection 107 10.7
#> 8 Excellent condition 105 10.5
#> 9 <NA> 87 8.7
df |>
mutate(
sentiment = case_when(
notes == "Excellent condition" ~ "Positive",
notes == "Recently renovated" ~ "Positive",
notes == "No issues reported" ~ "Positive",
notes == "Overcrowded" ~ "Negative",
notes == "Needs maintenance" ~ "Negative",
notes == "Staff shortage" ~ "Negative",
notes == "Under inspection" ~ "Concerning",
notes == "High demand" ~ "Concerning",
is.na(notes) ~ "Not Reported",
TRUE ~ "Uncategorized"
)
) |>
group_by(sentiment) |>
summarise(
n = n(),
pct = round(n() / nrow(df) * 100, 1),
.groups = "drop"
) |>
arrange(desc(n)) |>
rename(Sentiment = sentiment, Count = n, Percent = pct)
#> # A tibble: 4 × 3
#> Sentiment Count Percent
#> <chr> <int> <dbl>
#> 1 Negative 362 36.2
#> 2 Positive 331 33.1
#> 3 Concerning 220 22
#> 4 Not Reported 87 8.7
Demographics
gender_data <- tibble(
gender = c("Male", "Female"),
pct = c(54.6, 45.4)
)
ggplot(gender_data, aes(x = "", y = pct, fill = gender)) +
geom_col(width = 1) +
coord_polar(theta = "y") +
geom_text(aes(label = paste0(pct, "%")),
position = position_stack(vjust = 0.5),
size = 5, color = "white", fontface = "bold") +
scale_fill_manual(values = c("Male" = "#5B2C8D", "Female" = "#4DC8C8")) +
labs(title = "Average Gender Distribution Across All Records", fill = NULL) +
theme_void() +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold"))

ggplot(df, aes(x = average_age)) +
geom_histogram(binwidth = 5, fill = "#5B2C8D", color = "white") +
geom_text(stat = "bin", binwidth = 5,
aes(label = after_stat(ifelse(count > 0, count, ""))),
vjust = -0.5, size = 3.5) +
labs(title = "Distribution of Average Age Across Shelter Records",
x = "Average Age",
y = "Number of Records") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", hjust = 0.5))

cat("Overall Average Age\n")
#> Overall Average Age
df |> summarise(avg_age = round(mean(average_age), 1))
#> # A tibble: 1 × 1
#> avg_age
#> <dbl>
#> 1 42
cat("\nAverage Age by City\n")
#>
#> Average Age by City
df |>
group_by(city, state) |>
summarise(avg_age = round(mean(average_age), 1), .groups = "drop") |>
arrange(desc(avg_age)) |>
rename(City = city, State = state, `Avg Age` = avg_age)
#> # A tibble: 10 × 3
#> City State `Avg Age`
#> <chr> <chr> <dbl>
#> 1 Phoenix AZ 43.2
#> 2 San Antonio TX 43.2
#> 3 San Diego CA 43.2
#> 4 Houston TX 42.9
#> 5 Los Angeles CA 42.7
#> 6 Philadelphia PA 42.4
#> 7 Dallas TX 41.8
#> 8 Chicago IL 41.7
#> 9 New York NY 39.9
#> 10 San Jose CA 39
cat("\nAverage Age by Season\n")
#>
#> Average Age by Season
df |>
group_by(clean_season) |>
summarise(avg_age = round(mean(average_age), 1), .groups = "drop") |>
arrange(desc(avg_age)) |>
rename(Season = clean_season, `Avg Age` = avg_age)
#> # A tibble: 4 × 2
#> Season `Avg Age`
#> <chr> <dbl>
#> 1 Spring 43.5
#> 2 Autumn 42
#> 3 Summer 41.4
#> 4 Winter 41.3