Setup and Data Load

library(tidyverse)

df <- read_csv("C:/Users/shelb/OneDrive/Desktop/Datasets/Durham City Homelessness/homelessness_shelter_data.csv")

glimpse(df)
#> Rows: 1,000
#> Columns: 14
#> $ id                <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
#> $ date              <date> 2023-10-05, 2023-08-01, 2025-04-27, 2024-10-05, 202…
#> $ shelter_name      <chr> "Sunrise Shelter", "Safe Haven", "Second Chance", "H…
#> $ city              <chr> "Dallas", "Dallas", "San Jose", "San Jose", "San Ant…
#> $ state             <chr> "TX", "TX", "CA", "CA", "TX", "CA", "AZ", "TX", "TX"…
#> $ total_capacity    <dbl> 85, 128, 190, 115, 155, 81, 261, 105, 241, 181, 152,…
#> $ occupied_beds     <dbl> 62, 117, 144, 7, 65, 61, 108, 83, 179, 96, 134, 147,…
#> $ available_beds    <dbl> 23, 11, 46, 108, 90, 20, 153, 22, 62, 85, 18, 123, 6…
#> $ occupancy_rate    <dbl> 72.9, 91.4, 75.8, 6.1, 41.9, 75.3, 41.4, 79.0, 74.3,…
#> $ average_age       <dbl> 19, 58, 56, 63, 21, 41, 26, 51, 21, 24, 54, 37, 47, …
#> $ male_percentage   <dbl> 43, 45, 52, 48, 48, 51, 44, 52, 67, 56, 61, 63, 66, …
#> $ female_percentage <dbl> 57, 55, 48, 52, 52, 49, 56, 48, 33, 44, 39, 37, 34, …
#> $ season            <chr> "Spring", "Summer", "Summer", "Autumn", "Summer", "A…
#> $ notes             <chr> "High demand", "No issues reported", "Excellent cond…
n_records   <- nrow(df)
n_cities    <- n_distinct(df$city)
n_states    <- n_distinct(df$state)
n_shelters  <- n_distinct(df$shelter_name)
date_min    <- format(min(as.Date(df$date)), "%B %Y")
date_max    <- format(max(as.Date(df$date)), "%B %Y")

Descriptive Statistics

df |> 
  summarise(
    mean_occupancy   = round(mean(occupancy_rate), 1),
    median_occupancy = round(median(occupancy_rate), 1),
    sd_occupancy     = round(sd(occupancy_rate), 1),
    min_occupancy    = min(occupancy_rate),
    max_occupancy    = max(occupancy_rate)
  )
#> # A tibble: 1 × 5
#>   mean_occupancy median_occupancy sd_occupancy min_occupancy max_occupancy
#>            <dbl>            <dbl>        <dbl>         <dbl>         <dbl>
#> 1           51.2             51.8         28.9             0           100

Occupancy Distribution

df |>
  mutate(
    occupancy_group = case_when(
      occupancy_rate < 20  ~ "Under 20%",
      occupancy_rate < 40  ~ "20-39%",
      occupancy_rate < 60  ~ "40-59%",
      occupancy_rate < 80  ~ "60-79%",
      occupancy_rate < 95  ~ "80-94%",
      TRUE                 ~ "95-100%"
    ),
    occupancy_group = factor(occupancy_group, levels = c(
      "Under 20%", "20-39%", "40-59%", "60-79%", "80-94%", "95-100%"
    ))
  ) |>
  count(occupancy_group) |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  rename(Group = occupancy_group, Count = n, Percent = pct)
#> # A tibble: 6 × 3
#>   Group     Count Percent
#>   <fct>     <int>   <dbl>
#> 1 Under 20%   185    18.5
#> 2 20-39%      201    20.1
#> 3 40-59%      188    18.8
#> 4 60-79%      226    22.6
#> 5 80-94%      152    15.2
#> 6 95-100%      48     4.8

City-Level Summary

df |>
  group_by(city, state) |>
  summarise(
    n_records      = n(),
    n_shelters      = n_distinct(shelter_name),
    mean_occupancy = round(mean(occupancy_rate), 1),
    mean_available = round(mean(available_beds), 1),
    total_capacity = sum(total_capacity),
    total_occupied = sum(occupied_beds),
    total_available = sum(available_beds),
    .groups = "drop"
  ) |>
  arrange(state, city) |> 
  print(width = Inf)
#> # A tibble: 10 × 9
#>    city         state n_records n_shelters mean_occupancy mean_available
#>    <chr>        <chr>     <int>      <int>          <dbl>          <dbl>
#>  1 Phoenix      AZ           82         10           50.4           88  
#>  2 Los Angeles  CA          116         10           49.5           89.9
#>  3 San Diego    CA           97         10           52.7           88.9
#>  4 San Jose     CA           90         10           54.3           85.2
#>  5 Chicago      IL          115         10           54.2           83.7
#>  6 New York     NY           94         10           46.9           94.5
#>  7 Philadelphia PA           93         10           49.7           89.2
#>  8 Dallas       TX          101         10           52.2           80.8
#>  9 Houston      TX          104         10           50.4           88.7
#> 10 San Antonio  TX          108         10           51.5           84.6
#>    total_capacity total_occupied total_available
#>             <dbl>          <dbl>           <dbl>
#>  1          15390           8174            7216
#>  2          20537          10106           10431
#>  3          18023           9404            8619
#>  4          17169           9501            7668
#>  5          20804          11179            9625
#>  6          16236           7353            8883
#>  7          16462           8171            8291
#>  8          17094           8938            8156
#>  9          18775           9552            9223
#> 10          18633           9496            9137

Seasonal Occupancy

df |>
  group_by(season) |>
  summarise(
    n_records      = n(),
    mean_occupancy = round(mean(occupancy_rate), 1),
    median_occupancy = round(median(occupancy_rate), 1),
    mean_available = round(mean(available_beds), 1),
    .groups = "drop"
  ) |>
  arrange(desc(mean_occupancy))
#> # A tibble: 4 × 5
#>   season n_records mean_occupancy median_occupancy mean_available
#>   <chr>      <int>          <dbl>            <dbl>          <dbl>
#> 1 Winter       248           53.6             54.8           83  
#> 2 Autumn       252           51.3             54.7           86.4
#> 3 Summer       244           50.6             47.9           88.6
#> 4 Spring       256           49.4             50.9           90.8

Available Beds

df |>
  summarise(
    total_records        = n(),
    mean_available       = round(mean(available_beds), 1),
    median_available     = round(median(available_beds), 1),
    records_zero_beds    = sum(available_beds == 0),
    pct_zero_beds        = round(mean(available_beds == 0) * 100, 1),
    records_under10      = sum(available_beds < 10),
    pct_under10          = round(mean(available_beds < 10) * 100, 1)
  )
#> # A tibble: 1 × 7
#>   total_records mean_available median_available records_zero_beds pct_zero_beds
#>           <int>          <dbl>            <dbl>             <int>         <dbl>
#> 1          1000           87.2               71                 9           0.9
#> # ℹ 2 more variables: records_under10 <int>, pct_under10 <dbl>

Shelter-Level Variance

df |>
  group_by(shelter_name, city, state) |>
  summarise(
    n_records      = n(),
    mean_occupancy = round(mean(occupancy_rate), 1),
    sd_occupancy   = round(sd(occupancy_rate), 1),
    min_occupancy  = round(min(occupancy_rate), 1),
    max_occupancy  = round(max(occupancy_rate), 1),
    .groups = "drop"
  ) |>
  filter(n_records >= 5) |>
  arrange(state, city) |>
  print(width = Inf)
#> # A tibble: 98 × 8
#>    shelter_name       city    state n_records mean_occupancy sd_occupancy
#>    <chr>              <chr>   <chr>     <int>          <dbl>        <dbl>
#>  1 Harbor Home        Phoenix AZ            9           62.9         28.5
#>  2 HomeSafe           Phoenix AZ            8           39.7         35.4
#>  3 Hope House         Phoenix AZ            5           43.2         25.6
#>  4 New Beginnings     Phoenix AZ           11           51.1         21.6
#>  5 Pathway Place      Phoenix AZ            8           48.9         27.8
#>  6 Recovery Residence Phoenix AZ           12           41.8         36.4
#>  7 Safe Haven         Phoenix AZ            9           51.2         31.8
#>  8 Second Chance      Phoenix AZ            6           39.6         32.1
#>  9 Shelter Plus       Phoenix AZ            8           61.9         28.6
#> 10 Sunrise Shelter    Phoenix AZ            6           64.7         27.7
#>    min_occupancy max_occupancy
#>            <dbl>         <dbl>
#>  1           1.3          94.7
#>  2           0.6          97.6
#>  3           0.4          68.9
#>  4          12.7          80.4
#>  5          18.4          90.3
#>  6           4.4          98.2
#>  7           1.8          85.4
#>  8           2.2          79.7
#>  9          27.4          89.9
#> 10          35.4         100  
#> # ℹ 88 more rows

Notes / Comments Distribution

df |>
  count(notes) |>
  arrange(desc(n)) |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  rename(Note = notes, Count = n, Percent = pct)
#> # A tibble: 9 × 3
#>   Note                Count Percent
#>   <chr>               <int>   <dbl>
#> 1 Overcrowded           130    13  
#> 2 Needs maintenance     120    12  
#> 3 Recently renovated    119    11.9
#> 4 High demand           113    11.3
#> 5 Staff shortage        112    11.2
#> 6 No issues reported    107    10.7
#> 7 Under inspection      107    10.7
#> 8 Excellent condition   105    10.5
#> 9 <NA>                   87     8.7

Notes Sentiment Analysis

df |>
  mutate(
    sentiment = case_when(
      notes == "Excellent condition"  ~ "Positive",
      notes == "Recently renovated"   ~ "Positive",
      notes == "No issues reported"   ~ "Positive",
      notes == "Overcrowded"          ~ "Negative",
      notes == "Needs maintenance"    ~ "Negative",
      notes == "Staff shortage"       ~ "Negative",
      notes == "Under inspection"     ~ "Concerning",
      notes == "High demand"          ~ "Concerning",
      is.na(notes)                    ~ "Not Reported",
      TRUE                            ~ "Uncategorized"
    )
  ) |>
  group_by(sentiment) |>
  summarise(
    n = n(),
    pct = round(n() / nrow(df) * 100, 1),
    .groups = "drop"
  ) |>
  arrange(desc(n)) |>
  rename(Sentiment = sentiment, Count = n, Percent = pct)
#> # A tibble: 4 × 3
#>   Sentiment    Count Percent
#>   <chr>        <int>   <dbl>
#> 1 Negative       362    36.2
#> 2 Positive       331    33.1
#> 3 Concerning     220    22  
#> 4 Not Reported    87     8.7

Data Quality and Outlier Check

# Missing values by column
df |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
  filter(n_missing > 0)
#> # A tibble: 1 × 2
#>   variable n_missing
#>   <chr>        <int>
#> 1 notes           87
# Occupancy rate vs calculated rate
df |>
  mutate(
    calc_rate     = round(occupied_beds / total_capacity * 100, 1),
    rate_mismatch = abs(occupancy_rate - calc_rate) > 1
  ) |>
  summarise(
    n_mismatches  = sum(rate_mismatch),
    pct_mismatch  = round(mean(rate_mismatch) * 100, 1)
  )
#> # A tibble: 1 × 2
#>   n_mismatches pct_mismatch
#>          <int>        <dbl>
#> 1            0            0
# Records where occupied + available does not equal total capacity
df |>
  mutate(bed_check = (occupied_beds + available_beds) == total_capacity) |>
  summarise(
    n_mismatch    = sum(!bed_check),
    pct_mismatch  = round(mean(!bed_check) * 100, 1)
  )
#> # A tibble: 1 × 2
#>   n_mismatch pct_mismatch
#>        <int>        <dbl>
#> 1          0            0
# Extreme occupancy values
df |>
  filter(occupancy_rate == 0 | occupancy_rate == 100) |>
  select(date, shelter_name, city, total_capacity, occupied_beds, occupancy_rate, notes) |>
  arrange(occupancy_rate)
#> # A tibble: 15 × 7
#>    date       shelter_name     city  total_capacity occupied_beds occupancy_rate
#>    <date>     <chr>            <chr>          <dbl>         <dbl>          <dbl>
#>  1 2024-10-15 Second Chance    Phil…             52             0              0
#>  2 2025-03-19 Sunrise Shelter  San …            162             0              0
#>  3 2024-04-19 Second Chance    San …             51             0              0
#>  4 2023-12-03 Recovery Reside… San …            254             0              0
#>  5 2025-04-16 Safe Haven       Phil…             73             0              0
#>  6 2024-07-06 HomeSafe         Chic…            136             0              0
#>  7 2024-06-08 HomeSafe         San …            271           271            100
#>  8 2025-02-18 HomeSafe         San …             68            68            100
#>  9 2024-01-10 New Beginnings   San …            153           153            100
#> 10 2025-01-30 Shelter Plus     New …             64            64            100
#> 11 2024-09-11 Shelter Plus     Dall…            116           116            100
#> 12 2024-03-19 Harbor Home      Los …            124           124            100
#> 13 2024-01-12 Sunrise Shelter  Phoe…            294           294            100
#> 14 2025-07-11 New Beginnings   San …             92            92            100
#> 15 2023-08-25 Shelter Plus     San …             59            59            100
#> # ℹ 1 more variable: notes <chr>