Setup and Data Load

library(tidyverse)
df <- read_csv("C:/Users/shelb/OneDrive/Desktop/Datasets/Durham City Homelessness/homelessness_shelter_data.csv")

# Create clean_season and beds_rate immediately after load
df <- df |>
  mutate(
    month_num    = as.integer(format(as.Date(date), "%m")),
    clean_season = case_when(
      month_num %in% c(12, 1, 2)  ~ "Winter",
      month_num %in% c(3, 4, 5)   ~ "Spring",
      month_num %in% c(6, 7, 8)   ~ "Summer",
      month_num %in% c(9, 10, 11) ~ "Autumn"
    ),
    beds_rate = occupied_beds / total_capacity * 100
  )

glimpse(df)
#> Rows: 1,000
#> Columns: 17
#> $ id                <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
#> $ date              <date> 2023-10-05, 2023-08-01, 2025-04-27, 2024-10-05, 202…
#> $ shelter_name      <chr> "Sunrise Shelter", "Safe Haven", "Second Chance", "H…
#> $ city              <chr> "Dallas", "Dallas", "San Jose", "San Jose", "San Ant…
#> $ state             <chr> "TX", "TX", "CA", "CA", "TX", "CA", "AZ", "TX", "TX"…
#> $ total_capacity    <dbl> 85, 128, 190, 115, 155, 81, 261, 105, 241, 181, 152,…
#> $ occupied_beds     <dbl> 62, 117, 144, 7, 65, 61, 108, 83, 179, 96, 134, 147,…
#> $ available_beds    <dbl> 23, 11, 46, 108, 90, 20, 153, 22, 62, 85, 18, 123, 6…
#> $ occupancy_rate    <dbl> 72.9, 91.4, 75.8, 6.1, 41.9, 75.3, 41.4, 79.0, 74.3,…
#> $ average_age       <dbl> 19, 58, 56, 63, 21, 41, 26, 51, 21, 24, 54, 37, 47, …
#> $ male_percentage   <dbl> 43, 45, 52, 48, 48, 51, 44, 52, 67, 56, 61, 63, 66, …
#> $ female_percentage <dbl> 57, 55, 48, 52, 52, 49, 56, 48, 33, 44, 39, 37, 34, …
#> $ season            <chr> "Spring", "Summer", "Summer", "Autumn", "Summer", "A…
#> $ notes             <chr> "High demand", "No issues reported", "Excellent cond…
#> $ month_num         <int> 10, 8, 4, 10, 9, 9, 6, 11, 7, 2, 2, 4, 11, 8, 5, 1, …
#> $ clean_season      <chr> "Autumn", "Summer", "Spring", "Autumn", "Autumn", "A…
#> $ beds_rate         <dbl> 72.941176, 91.406250, 75.789474, 6.086957, 41.935484…
date_min   <- format(min(as.Date(df$date)), "%B %Y")
date_max   <- format(max(as.Date(df$date)), "%B %Y")
n_records  <- nrow(df)
n_shelters <- df |> distinct(shelter_name, city) |> nrow()
n_cities   <- n_distinct(df$city)
n_states   <- n_distinct(df$state)

cat("Date Range:", date_min, "to", date_max, "\n")
#> Date Range: July 2023 to July 2025
cat("Total Records:", n_records, "\n")
#> Total Records: 1000
cat("Unique Shelters:", n_shelters, "\n")
#> Unique Shelters: 100
cat("Cities:", n_cities, "\n")
#> Cities: 10
cat("States:", n_states, "\n")
#> States: 6

Data Quality Checks

# Missing values by column
df |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
  filter(n_missing > 0)
#> # A tibble: 1 × 2
#>   variable n_missing
#>   <chr>        <int>
#> 1 notes           87
# Occupancy rate column vs beds-calculated rate
df |>
  mutate(
    calc_rate     = round(occupied_beds / total_capacity * 100, 1),
    rate_mismatch = abs(occupancy_rate - calc_rate) > 1
  ) |>
  summarise(
    n_mismatches = sum(rate_mismatch),
    pct_mismatch = round(mean(rate_mismatch) * 100, 1)
  )
#> # A tibble: 1 × 2
#>   n_mismatches pct_mismatch
#>          <int>        <dbl>
#> 1            0            0
# Occupied + available beds check
df |>
  mutate(bed_check = (occupied_beds + available_beds) == total_capacity) |>
  summarise(
    n_mismatch   = sum(!bed_check),
    pct_mismatch = round(mean(!bed_check) * 100, 1)
  )
#> # A tibble: 1 × 2
#>   n_mismatch pct_mismatch
#>        <int>        <dbl>
#> 1          0            0
# Extreme occupancy records
df |>
  filter(beds_rate == 0 | beds_rate == 100) |>
  select(date, shelter_name, city, total_capacity, occupied_beds, beds_rate, notes) |>
  arrange(beds_rate)
#> # A tibble: 15 × 7
#>    date       shelter_name    city  total_capacity occupied_beds beds_rate notes
#>    <date>     <chr>           <chr>          <dbl>         <dbl>     <dbl> <chr>
#>  1 2024-10-15 Second Chance   Phil…             52             0         0 Rece…
#>  2 2025-03-19 Sunrise Shelter San …            162             0         0 Staf…
#>  3 2024-04-19 Second Chance   San …             51             0         0 Over…
#>  4 2023-12-03 Recovery Resid… San …            254             0         0 Over…
#>  5 2025-04-16 Safe Haven      Phil…             73             0         0 No i…
#>  6 2024-07-06 HomeSafe        Chic…            136             0         0 <NA> 
#>  7 2024-06-08 HomeSafe        San …            271           271       100 Rece…
#>  8 2025-02-18 HomeSafe        San …             68            68       100 Unde…
#>  9 2024-01-10 New Beginnings  San …            153           153       100 Exce…
#> 10 2025-01-30 Shelter Plus    New …             64            64       100 Over…
#> 11 2024-09-11 Shelter Plus    Dall…            116           116       100 Over…
#> 12 2024-03-19 Harbor Home     Los …            124           124       100 Need…
#> 13 2024-01-12 Sunrise Shelter Phoe…            294           294       100 Staf…
#> 14 2025-07-11 New Beginnings  San …             92            92       100 Over…
#> 15 2023-08-25 Shelter Plus    San …             59            59       100 Rece…
# Confirm clean_season assignments
df |>
  mutate(month = format(as.Date(date), "%B")) |>
  distinct(clean_season, month_num, month) |>
  arrange(clean_season, month_num)
#> # A tibble: 12 × 3
#>    clean_season month_num month    
#>    <chr>            <int> <chr>    
#>  1 Autumn               9 September
#>  2 Autumn              10 October  
#>  3 Autumn              11 November 
#>  4 Spring               3 March    
#>  5 Spring               4 April    
#>  6 Spring               5 May      
#>  7 Summer               6 June     
#>  8 Summer               7 July     
#>  9 Summer               8 August   
#> 10 Winter               1 January  
#> 11 Winter               2 February 
#> 12 Winter              12 December

Descriptive Statistics

df |>
  summarise(
    mean_occupancy   = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    median_occupancy = round(median(beds_rate), 1),
    sd_occupancy     = round(sd(beds_rate), 1),
    min_occupancy    = round(min(beds_rate), 1),
    max_occupancy    = round(max(beds_rate), 1)
  )
#> # A tibble: 1 × 5
#>   mean_occupancy median_occupancy sd_occupancy min_occupancy max_occupancy
#>            <dbl>            <dbl>        <dbl>         <dbl>         <dbl>
#> 1           51.3             51.8         28.9             0           100
df |>
  mutate(
    occupancy_group = case_when(
      beds_rate < 20 ~ "Under 20%",
      beds_rate < 40 ~ "20-39%",
      beds_rate < 60 ~ "40-59%",
      beds_rate < 80 ~ "60-79%",
      beds_rate < 95 ~ "80-94%",
      TRUE           ~ "95-100%"
    ),
    occupancy_group = factor(occupancy_group, levels = c(
      "Under 20%", "20-39%", "40-59%", "60-79%", "80-94%", "95-100%"
    ))
  ) |>
  count(occupancy_group) |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  rename(Group = occupancy_group, Count = n, Percent = pct)
#> # A tibble: 6 × 3
#>   Group     Count Percent
#>   <fct>     <int>   <dbl>
#> 1 Under 20%   185    18.5
#> 2 20-39%      201    20.1
#> 3 40-59%      188    18.8
#> 4 60-79%      226    22.6
#> 5 80-94%      154    15.4
#> 6 95-100%      46     4.6

Available Beds

df |>
  summarise(
    mean_available    = round(mean(available_beds), 1),
    median_available  = round(median(available_beds), 1),
    records_zero_beds = sum(available_beds == 0),
    pct_zero_beds     = round(mean(available_beds == 0) * 100, 1),
    records_under10   = sum(available_beds < 10),
    pct_under10       = round(mean(available_beds < 10) * 100, 1)
  )
#> # A tibble: 1 × 6
#>   mean_available median_available records_zero_beds pct_zero_beds
#>            <dbl>            <dbl>             <int>         <dbl>
#> 1           87.2               71                 9           0.9
#> # ℹ 2 more variables: records_under10 <int>, pct_under10 <dbl>

City-Level Summary

df |>
  group_by(city, state) |>
  summarise(
    n_records       = n(),
    n_shelters      = n_distinct(shelter_name),
    occ_rate        = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    mean_available  = round(mean(available_beds), 1),
    total_capacity  = sum(total_capacity),
    total_occupied  = sum(occupied_beds),
    total_available = sum(available_beds),
    .groups = "drop"
  ) |>
  arrange(desc(occ_rate)) |>
  print(width = Inf)
#> # A tibble: 10 × 9
#>    city         state n_records n_shelters occ_rate mean_available
#>    <chr>        <chr>     <int>      <int>    <dbl>          <dbl>
#>  1 San Jose     CA           90         10     55.3           85.2
#>  2 Chicago      IL          115         10     53.7           83.7
#>  3 Phoenix      AZ           82         10     53.1           88  
#>  4 Dallas       TX          101         10     52.3           80.8
#>  5 San Diego    CA           97         10     52.2           88.9
#>  6 San Antonio  TX          108         10     51             84.6
#>  7 Houston      TX          104         10     50.9           88.7
#>  8 Philadelphia PA           93         10     49.6           89.2
#>  9 Los Angeles  CA          116         10     49.2           89.9
#> 10 New York     NY           94         10     45.3           94.5
#>    total_capacity total_occupied total_available
#>             <dbl>          <dbl>           <dbl>
#>  1          17169           9501            7668
#>  2          20804          11179            9625
#>  3          15390           8174            7216
#>  4          17094           8938            8156
#>  5          18023           9404            8619
#>  6          18633           9496            9137
#>  7          18775           9552            9223
#>  8          16462           8171            8291
#>  9          20537          10106           10431
#> 10          16236           7353            8883
city_season <- df |>
  group_by(city, state, clean_season) |>
  summarise(
    occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    .groups = "drop"
  ) |>
  pivot_wider(names_from = clean_season, values_from = occ_rate)

city_overall <- df |>
  group_by(city, state) |>
  summarise(
    Overall = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    .groups = "drop"
  )

city_table <- left_join(city_season, city_overall, by = c("city", "state")) |>
  select(city, state, Winter, Spring, Summer, Autumn, Overall) |>
  arrange(state, city) |>
  rename(City = city, State = state)

knitr::kable(city_table, caption = "Beds-Based Occupancy Rate (%) by City and Season")
Beds-Based Occupancy Rate (%) by City and Season
City State Winter Spring Summer Autumn Overall
Phoenix AZ 54.1 57.4 50.3 49.9 53.1
Los Angeles CA 45.2 50.6 46.4 53.6 49.2
San Diego CA 56.2 55.1 43.5 56.9 52.2
San Jose CA 54.5 55.8 57.5 53.8 55.3
Chicago IL 51.6 57.5 55.8 51.1 53.7
New York NY 47.6 51.8 44.4 35.5 45.3
Philadelphia PA 58.4 53.9 44.4 45.8 49.6
Dallas TX 47.1 56.3 49.8 55.0 52.3
Houston TX 46.5 46.4 50.6 59.9 50.9
San Antonio TX 54.7 56.4 45.9 47.6 51.0

Shelter-Level Analysis

shelter_avg <- df |>
  group_by(shelter_name, city, state) |>
  summarise(
    n        = n(),
    occ_rate = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    .groups = "drop"
  ) |>
  filter(n >= 5)

cat("--- Top 10 Highest Occupancy (Beds-Based) ---\n")
#> --- Top 10 Highest Occupancy (Beds-Based) ---
shelter_avg |>
  arrange(desc(occ_rate)) |>
  slice_head(n = 10) |>
  print(width = Inf)
#> # A tibble: 10 × 5
#>    shelter_name       city         state     n occ_rate
#>    <chr>              <chr>        <chr> <int>    <dbl>
#>  1 Sunrise Shelter    Phoenix      AZ        6     72.8
#>  2 Recovery Residence San Diego    CA        9     72.2
#>  3 New Beginnings     San Jose     CA        9     69  
#>  4 Hope House         San Jose     CA        9     68.9
#>  5 Safe Haven         San Jose     CA        6     68.8
#>  6 Pathway Place      Philadelphia PA       12     67  
#>  7 Pathway Place      San Antonio  TX       10     66.1
#>  8 Sunrise Shelter    San Antonio  TX       11     66  
#>  9 Shelter Plus       Los Angeles  CA       15     65.7
#> 10 Hope House         Philadelphia PA        6     65.2
cat("\n--- Top 10 Lowest Occupancy (Beds-Based) ---\n")
#> 
#> --- Top 10 Lowest Occupancy (Beds-Based) ---
shelter_avg |>
  arrange(occ_rate) |>
  slice_head(n = 10) |>
  print(width = Inf)
#> # A tibble: 10 × 5
#>    shelter_name       city         state     n occ_rate
#>    <chr>              <chr>        <chr> <int>    <dbl>
#>  1 HomeSafe           New York     NY        9     21.5
#>  2 Hope House         Los Angeles  CA        7     32  
#>  3 Recovery Residence San Jose     CA        8     32.1
#>  4 Sunrise Shelter    Philadelphia PA        8     34.2
#>  5 Second Chance      New York     NY        7     34.8
#>  6 Harbor Home        Los Angeles  CA       13     35.4
#>  7 Pathway Place      Dallas       TX        7     36.2
#>  8 Hope House         Phoenix      AZ        5     37.8
#>  9 Sunrise Shelter    San Diego    CA       12     38.2
#> 10 Shelter Plus       San Antonio  TX       11     38.7
df |>
  group_by(shelter_name, city, state) |>
  summarise(
    n_records = n(),
    occ_rate  = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    sd_occ    = round(sd(beds_rate), 1),
    min_occ   = round(min(beds_rate), 1),
    max_occ   = round(max(beds_rate), 1),
    .groups = "drop"
  ) |>
  filter(n_records >= 5) |>
  arrange(desc(sd_occ)) |>
  print(width = Inf)
#> # A tibble: 98 × 8
#>    shelter_name       city         state n_records occ_rate sd_occ min_occ
#>    <chr>              <chr>        <chr>     <int>    <dbl>  <dbl>   <dbl>
#>  1 Pathway Place      San Jose     CA           12     60.4   39.1     0.3
#>  2 Recovery Residence Phoenix      AZ           12     53.5   36.4     4.4
#>  3 Harbor Home        Philadelphia PA            7     41.4   35.7     9.7
#>  4 HomeSafe           Phoenix      AZ            8     47     35.4     0.6
#>  5 Hope House         San Diego    CA            8     41.9   34.8     1.6
#>  6 Harbor Home        Dallas       TX           10     48     34.7     6.7
#>  7 Second Chance      San Diego    CA            8     59.3   34.7    15.4
#>  8 Sunrise Shelter    Houston      TX           10     49.2   34.6     3  
#>  9 Pathway Place      Dallas       TX            7     36.2   34.5     6  
#> 10 Safe Haven         Dallas       TX           13     55.6   34.5     9.1
#> 11 Hope House         Chicago      IL           11     54.3   34.4     4.9
#> 12 Shelter Plus       Dallas       TX           11     57.2   34.3     8.1
#> 13 New Beginnings     New York     NY           10     44.8   34.1     1  
#> 14 Safe Haven         Houston      TX            5     49.3   34       4.5
#> 15 New Beginnings     Houston      TX            9     58.9   33.8     7.5
#> 16 New Beginnings     Philadelphia PA           11     58.5   33.8     5.1
#> 17 Shelter Plus       Philadelphia PA           11     43.9   33.5     1  
#> 18 Second Chance      Philadelphia PA            7     51     33.4     0  
#> 19 Second Chance      Los Angeles  CA            5     49.5   33.3    10.4
#> 20 Sunrise Shelter    San Diego    CA           12     38.2   32.9     0  
#> 21 Recovery Residence San Diego    CA            9     72.2   32.8     8.2
#> 22 Sunrise Shelter    Los Angeles  CA           19     43.8   32.4     3.8
#> 23 Safe Haven         New York     NY            8     52.6   32.3    11.3
#> 24 Harbor Home        San Jose     CA           11     39.6   32.2     0.5
#> 25 Safe Haven         Chicago      IL            7     54.2   32.1     5.3
#> 26 Second Chance      Phoenix      AZ            6     41.2   32.1     2.2
#> 27 Sunrise Shelter    San Antonio  TX           11     66     32.1     4.7
#> 28 HomeSafe           Chicago      IL            9     51.5   31.8     0  
#> 29 Safe Haven         Phoenix      AZ            9     49.2   31.8     1.8
#> 30 Shelter Plus       San Jose     CA            7     56.9   31.8     0.7
#> 31 Shelter Plus       San Antonio  TX           11     38.7   31.5     2.4
#> 32 Harbor Home        San Diego    CA           11     52.7   31.4     2.8
#> 33 Second Chance      New York     NY            7     34.8   31.4     5.7
#> 34 New Beginnings     Dallas       TX            9     47.7   31.3     2.3
#> 35 Sunrise Shelter    Philadelphia PA            8     34.2   31.3     2.9
#> 36 Pathway Place      Houston      TX            5     61     31.2     8.5
#> 37 HomeSafe           Dallas       TX           14     49.1   30.9     0.8
#> 38 Pathway Place      Los Angeles  CA           11     57.8   30.7     2.6
#> 39 Shelter Plus       Houston      TX           14     48.5   30.7     0.5
#> 40 Harbor Home        Los Angeles  CA           13     35.4   30.6     6.6
#> 41 HomeSafe           San Jose     CA            9     54.5   30.6     5.8
#> 42 Second Chance      San Antonio  TX           13     47.8   30.5     6.3
#> 43 New Beginnings     San Antonio  TX           18     40.3   30.3     0.8
#> 44 New Beginnings     San Diego    CA            9     45.5   30.1    11.7
#> 45 HomeSafe           Philadelphia PA           15     45.7   29.9     4.7
#> 46 Pathway Place      New York     NY            9     39.5   29.5     5.1
#> 47 Sunrise Shelter    Chicago      IL           16     55     29.5     1.1
#> 48 Recovery Residence San Jose     CA            8     32.1   28.6     0  
#> 49 Shelter Plus       Phoenix      AZ            8     58.4   28.6    27.4
#> 50 Harbor Home        Phoenix      AZ            9     65.1   28.5     1.3
#> 51 HomeSafe           San Diego    CA            8     54.3   28.5    19.2
#> 52 Safe Haven         Los Angeles  CA           11     45.3   28.5     1.4
#> 53 New Beginnings     San Jose     CA            9     69     28.3    20  
#> 54 Safe Haven         Philadelphia PA            9     40.1   28.2     0  
#> 55 Recovery Residence San Antonio  TX           10     61.3   28.1    10.5
#> 56 Recovery Residence Philadelphia PA            7     42     28       8.1
#> 57 Sunrise Shelter    Dallas       TX           11     49.2   27.9     5.1
#> 58 Pathway Place      Phoenix      AZ            8     48.4   27.7    18.4
#> 59 Sunrise Shelter    Phoenix      AZ            6     72.8   27.7    35.4
#> 60 New Beginnings     Chicago      IL           19     55.3   27.5     3.8
#> 61 Recovery Residence Los Angeles  CA           12     54.1   27.4    22  
#> 62 Pathway Place      Chicago      IL            9     58.2   27.3     8.9
#> 63 Recovery Residence Houston      TX           10     43.4   27.3    15.1
#> 64 Sunrise Shelter    New York     NY           11     51.4   27.3     3.2
#> 65 Sunrise Shelter    San Jose     CA            9     52.5   27.2     8.4
#> 66 Shelter Plus       Los Angeles  CA           15     65.7   26.6     8.5
#> 67 Pathway Place      San Antonio  TX           10     66.1   26.5    20  
#> 68 Harbor Home        San Antonio  TX           15     49     26.2     2.9
#> 69 Second Chance      San Jose     CA           10     53.2   26.2     0  
#> 70 Hope House         New York     NY           12     44.2   26       6.3
#> 71 Second Chance      Houston      TX           11     64.2   25.8    13  
#> 72 Shelter Plus       Chicago      IL            9     53.4   25.8    13.6
#> 73 Shelter Plus       San Diego    CA           15     54.6   25.7    11.4
#> 74 HomeSafe           Houston      TX           11     52     25.6    19  
#> 75 Hope House         Phoenix      AZ            5     37.8   25.6     0.4
#> 76 Recovery Residence Chicago      IL           11     56.1   25.5    23  
#> 77 Recovery Residence New York     NY            9     63.7   25.4    22.6
#> 78 HomeSafe           Los Angeles  CA           11     50.9   25.2    23.8
#> 79 Second Chance      Chicago      IL            9     56.8   25      24.9
#> 80 Harbor Home        Chicago      IL           15     45.6   24.6     2.7
#> 81 Harbor Home        New York     NY           11     42.7   24.6    14.5
#> 82 Hope House         Los Angeles  CA            7     32     24.6     3.2
#> 83 Hope House         Houston      TX           17     49.6   24.3    10.6
#> 84 Safe Haven         San Diego    CA           11     54.3   24.2     7  
#> 85 Hope House         San Antonio  TX           13     51.1   24.1    12.9
#> 86 Shelter Plus       New York     NY            8     62.9   23.9    31.1
#> 87 Second Chance      Dallas       TX            7     53.4   23.2    23.6
#> 88 Harbor Home        Houston      TX           12     39     23       4.6
#> 89 Hope House         Philadelphia PA            6     65.2   22.5    25  
#> 90 Pathway Place      San Diego    CA            6     44.5   22.4    20.2
#> 91 Hope House         Dallas       TX           12     61.9   21.6    14.6
#> 92 New Beginnings     Phoenix      AZ           11     52.5   21.6    12.7
#> 93 Pathway Place      Philadelphia PA           12     67     21.5    16.1
#> 94 New Beginnings     Los Angeles  CA           12     53.7   21      12.7
#> 95 Recovery Residence Dallas       TX            7     59.4   19.9    25.3
#> 96 Safe Haven         San Jose     CA            6     68.8   18.8    33.1
#> 97 Hope House         San Jose     CA            9     68.9   17.6    44.1
#> 98 HomeSafe           New York     NY            9     21.5   12.3     3.9
#>    max_occ
#>      <dbl>
#>  1    98.7
#>  2    98.2
#>  3    94.6
#>  4    97.6
#>  5    96.6
#>  6    96  
#>  7    96.6
#>  8    98.1
#>  9    93.8
#> 10    98.3
#> 11    95.6
#> 12   100  
#> 13    94.6
#> 14    75.6
#> 15    96.1
#> 16    99.2
#> 17    90.2
#> 18    85.7
#> 19    86.8
#> 20    94.6
#> 21    96.9
#> 22    96.1
#> 23    98.2
#> 24    93.2
#> 25    91.5
#> 26    79.7
#> 27    99.1
#> 28    80.5
#> 29    85.4
#> 30    95.5
#> 31   100  
#> 32    95.8
#> 33    79.7
#> 34    92.6
#> 35    85.7
#> 36    87.7
#> 37    96.7
#> 38    98.1
#> 39    94.5
#> 40   100  
#> 41   100  
#> 42    95.9
#> 43   100  
#> 44    92.6
#> 45    96.4
#> 46    93.9
#> 47    98.8
#> 48    79.3
#> 49    89.9
#> 50    94.7
#> 51   100  
#> 52    87.2
#> 53   100  
#> 54    92.2
#> 55    93.1
#> 56    77.4
#> 57    94.9
#> 58    90.3
#> 59   100  
#> 60    95  
#> 61    94.2
#> 62    94.4
#> 63    88.2
#> 64    96.4
#> 65    90.9
#> 66    97.5
#> 67    95  
#> 68    76.5
#> 69    86.1
#> 70    89  
#> 71    97.8
#> 72    94.4
#> 73    83.2
#> 74    95.6
#> 75    68.9
#> 76    98.2
#> 77    86.8
#> 78    98.5
#> 79    92.5
#> 80    85.8
#> 81    87.5
#> 82    78.5
#> 83    90.8
#> 84    97.8
#> 85    93.3
#> 86   100  
#> 87    79.4
#> 88    76.1
#> 89    91  
#> 90    78.1
#> 91    94.8
#> 92    80.4
#> 93    97.2
#> 94    86.8
#> 95    87.1
#> 96    84.2
#> 97    92.2
#> 98    35.4

Seasonal Analysis

df |>
  group_by(clean_season) |>
  summarise(
    n_records      = n(),
    occ_rate       = round(sum(occupied_beds) / sum(total_capacity) * 100, 1),
    median_occ     = round(median(beds_rate), 1),
    mean_available = round(mean(available_beds), 1),
    .groups = "drop"
  ) |>
  arrange(desc(occ_rate))
#> # A tibble: 4 × 5
#>   clean_season n_records occ_rate median_occ mean_available
#>   <chr>            <int>    <dbl>      <dbl>          <dbl>
#> 1 Spring             241     54         57.6           81.7
#> 2 Autumn             264     51.2       51.5           88.9
#> 3 Winter             237     51.2       52.7           85.1
#> 4 Summer             258     49         46             92.7

Notes Analysis

df |>
  count(notes) |>
  arrange(desc(n)) |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  rename(Note = notes, Count = n, Percent = pct)
#> # A tibble: 9 × 3
#>   Note                Count Percent
#>   <chr>               <int>   <dbl>
#> 1 Overcrowded           130    13  
#> 2 Needs maintenance     120    12  
#> 3 Recently renovated    119    11.9
#> 4 High demand           113    11.3
#> 5 Staff shortage        112    11.2
#> 6 No issues reported    107    10.7
#> 7 Under inspection      107    10.7
#> 8 Excellent condition   105    10.5
#> 9 <NA>                   87     8.7
df |>
  mutate(
    sentiment = case_when(
      notes == "Excellent condition" ~ "Positive",
      notes == "Recently renovated"  ~ "Positive",
      notes == "No issues reported"  ~ "Positive",
      notes == "Overcrowded"         ~ "Negative",
      notes == "Needs maintenance"   ~ "Negative",
      notes == "Staff shortage"      ~ "Negative",
      notes == "Under inspection"    ~ "Concerning",
      notes == "High demand"         ~ "Concerning",
      is.na(notes)                   ~ "Not Reported",
      TRUE                           ~ "Uncategorized"
    )
  ) |>
  group_by(sentiment) |>
  summarise(
    n   = n(),
    pct = round(n() / nrow(df) * 100, 1),
    .groups = "drop"
  ) |>
  arrange(desc(n)) |>
  rename(Sentiment = sentiment, Count = n, Percent = pct)
#> # A tibble: 4 × 3
#>   Sentiment    Count Percent
#>   <chr>        <int>   <dbl>
#> 1 Negative       362    36.2
#> 2 Positive       331    33.1
#> 3 Concerning     220    22  
#> 4 Not Reported    87     8.7

Demographics

gender_data <- tibble(
  gender = c("Male", "Female"),
  pct    = c(54.6, 45.4)
)

ggplot(gender_data, aes(x = "", y = pct, fill = gender)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(pct, "%")),
            position = position_stack(vjust = 0.5),
            size = 5, color = "white", fontface = "bold") +
  scale_fill_manual(values = c("Male" = "#5B2C8D", "Female" = "#4DC8C8")) +
  labs(title = "Average Gender Distribution Across All Records", fill = NULL) +
  theme_void() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5, face = "bold"))

ggplot(df, aes(x = average_age)) +
  geom_histogram(binwidth = 5, fill = "#5B2C8D", color = "white") +
  geom_text(stat = "bin", binwidth = 5,
            aes(label = after_stat(ifelse(count > 0, count, ""))),
            vjust = -0.5, size = 3.5) +
  labs(title = "Distribution of Average Age Across Shelter Records",
       x = "Average Age",
       y = "Number of Records") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

cat("Overall Average Age\n")
#> Overall Average Age
df |> summarise(avg_age = round(mean(average_age), 1))
#> # A tibble: 1 × 1
#>   avg_age
#>     <dbl>
#> 1      42
cat("\nAverage Age by City\n")
#> 
#> Average Age by City
df |>
  group_by(city, state) |>
  summarise(avg_age = round(mean(average_age), 1), .groups = "drop") |>
  arrange(desc(avg_age)) |>
  rename(City = city, State = state, `Avg Age` = avg_age)
#> # A tibble: 10 × 3
#>    City         State `Avg Age`
#>    <chr>        <chr>     <dbl>
#>  1 Phoenix      AZ         43.2
#>  2 San Antonio  TX         43.2
#>  3 San Diego    CA         43.2
#>  4 Houston      TX         42.9
#>  5 Los Angeles  CA         42.7
#>  6 Philadelphia PA         42.4
#>  7 Dallas       TX         41.8
#>  8 Chicago      IL         41.7
#>  9 New York     NY         39.9
#> 10 San Jose     CA         39
cat("\nAverage Age by Season\n")
#> 
#> Average Age by Season
df |>
  group_by(clean_season) |>
  summarise(avg_age = round(mean(average_age), 1), .groups = "drop") |>
  arrange(desc(avg_age)) |>
  rename(Season = clean_season, `Avg Age` = avg_age)
#> # A tibble: 4 × 2
#>   Season `Avg Age`
#>   <chr>      <dbl>
#> 1 Spring      43.5
#> 2 Autumn      42  
#> 3 Summer      41.4
#> 4 Winter      41.3