Setup and Data Load
library(tidyverse)
df <- read_csv("C:/Users/shelb/OneDrive/Desktop/Datasets/Durham City Homelessness/homelessness_shelter_data.csv")
glimpse(df)
#> Rows: 1,000
#> Columns: 14
#> $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
#> $ date <date> 2023-10-05, 2023-08-01, 2025-04-27, 2024-10-05, 202…
#> $ shelter_name <chr> "Sunrise Shelter", "Safe Haven", "Second Chance", "H…
#> $ city <chr> "Dallas", "Dallas", "San Jose", "San Jose", "San Ant…
#> $ state <chr> "TX", "TX", "CA", "CA", "TX", "CA", "AZ", "TX", "TX"…
#> $ total_capacity <dbl> 85, 128, 190, 115, 155, 81, 261, 105, 241, 181, 152,…
#> $ occupied_beds <dbl> 62, 117, 144, 7, 65, 61, 108, 83, 179, 96, 134, 147,…
#> $ available_beds <dbl> 23, 11, 46, 108, 90, 20, 153, 22, 62, 85, 18, 123, 6…
#> $ occupancy_rate <dbl> 72.9, 91.4, 75.8, 6.1, 41.9, 75.3, 41.4, 79.0, 74.3,…
#> $ average_age <dbl> 19, 58, 56, 63, 21, 41, 26, 51, 21, 24, 54, 37, 47, …
#> $ male_percentage <dbl> 43, 45, 52, 48, 48, 51, 44, 52, 67, 56, 61, 63, 66, …
#> $ female_percentage <dbl> 57, 55, 48, 52, 52, 49, 56, 48, 33, 44, 39, 37, 34, …
#> $ season <chr> "Spring", "Summer", "Summer", "Autumn", "Summer", "A…
#> $ notes <chr> "High demand", "No issues reported", "Excellent cond…
n_records <- nrow(df)
n_cities <- n_distinct(df$city)
n_states <- n_distinct(df$state)
n_shelters <- n_distinct(df$shelter_name)
date_min <- format(min(as.Date(df$date)), "%B %Y")
date_max <- format(max(as.Date(df$date)), "%B %Y")
Descriptive Statistics
df |>
summarise(
mean_occupancy = round(mean(occupancy_rate), 1),
median_occupancy = round(median(occupancy_rate), 1),
sd_occupancy = round(sd(occupancy_rate), 1),
min_occupancy = min(occupancy_rate),
max_occupancy = max(occupancy_rate)
)
#> # A tibble: 1 × 5
#> mean_occupancy median_occupancy sd_occupancy min_occupancy max_occupancy
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 51.2 51.8 28.9 0 100
Occupancy Distribution
df |>
mutate(
occupancy_group = case_when(
occupancy_rate < 20 ~ "Under 20%",
occupancy_rate < 40 ~ "20-39%",
occupancy_rate < 60 ~ "40-59%",
occupancy_rate < 80 ~ "60-79%",
occupancy_rate < 95 ~ "80-94%",
TRUE ~ "95-100%"
),
occupancy_group = factor(occupancy_group, levels = c(
"Under 20%", "20-39%", "40-59%", "60-79%", "80-94%", "95-100%"
))
) |>
count(occupancy_group) |>
mutate(pct = round(n / sum(n) * 100, 1)) |>
rename(Group = occupancy_group, Count = n, Percent = pct)
#> # A tibble: 6 × 3
#> Group Count Percent
#> <fct> <int> <dbl>
#> 1 Under 20% 185 18.5
#> 2 20-39% 201 20.1
#> 3 40-59% 188 18.8
#> 4 60-79% 226 22.6
#> 5 80-94% 152 15.2
#> 6 95-100% 48 4.8
City-Level Summary
df |>
group_by(city, state) |>
summarise(
n_records = n(),
n_shelters = n_distinct(shelter_name),
mean_occupancy = round(mean(occupancy_rate), 1),
mean_available = round(mean(available_beds), 1),
total_capacity = sum(total_capacity),
total_occupied = sum(occupied_beds),
total_available = sum(available_beds),
.groups = "drop"
) |>
arrange(state, city) |>
print(width = Inf)
#> # A tibble: 10 × 9
#> city state n_records n_shelters mean_occupancy mean_available
#> <chr> <chr> <int> <int> <dbl> <dbl>
#> 1 Phoenix AZ 82 10 50.4 88
#> 2 Los Angeles CA 116 10 49.5 89.9
#> 3 San Diego CA 97 10 52.7 88.9
#> 4 San Jose CA 90 10 54.3 85.2
#> 5 Chicago IL 115 10 54.2 83.7
#> 6 New York NY 94 10 46.9 94.5
#> 7 Philadelphia PA 93 10 49.7 89.2
#> 8 Dallas TX 101 10 52.2 80.8
#> 9 Houston TX 104 10 50.4 88.7
#> 10 San Antonio TX 108 10 51.5 84.6
#> total_capacity total_occupied total_available
#> <dbl> <dbl> <dbl>
#> 1 15390 8174 7216
#> 2 20537 10106 10431
#> 3 18023 9404 8619
#> 4 17169 9501 7668
#> 5 20804 11179 9625
#> 6 16236 7353 8883
#> 7 16462 8171 8291
#> 8 17094 8938 8156
#> 9 18775 9552 9223
#> 10 18633 9496 9137
Seasonal Occupancy
df |>
group_by(season) |>
summarise(
n_records = n(),
mean_occupancy = round(mean(occupancy_rate), 1),
median_occupancy = round(median(occupancy_rate), 1),
mean_available = round(mean(available_beds), 1),
.groups = "drop"
) |>
arrange(desc(mean_occupancy))
#> # A tibble: 4 × 5
#> season n_records mean_occupancy median_occupancy mean_available
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 Winter 248 53.6 54.8 83
#> 2 Autumn 252 51.3 54.7 86.4
#> 3 Summer 244 50.6 47.9 88.6
#> 4 Spring 256 49.4 50.9 90.8
Available Beds
df |>
summarise(
total_records = n(),
mean_available = round(mean(available_beds), 1),
median_available = round(median(available_beds), 1),
records_zero_beds = sum(available_beds == 0),
pct_zero_beds = round(mean(available_beds == 0) * 100, 1),
records_under10 = sum(available_beds < 10),
pct_under10 = round(mean(available_beds < 10) * 100, 1)
)
#> # A tibble: 1 × 7
#> total_records mean_available median_available records_zero_beds pct_zero_beds
#> <int> <dbl> <dbl> <int> <dbl>
#> 1 1000 87.2 71 9 0.9
#> # ℹ 2 more variables: records_under10 <int>, pct_under10 <dbl>
Shelter-Level Variance
df |>
group_by(shelter_name, city, state) |>
summarise(
n_records = n(),
mean_occupancy = round(mean(occupancy_rate), 1),
sd_occupancy = round(sd(occupancy_rate), 1),
min_occupancy = round(min(occupancy_rate), 1),
max_occupancy = round(max(occupancy_rate), 1),
.groups = "drop"
) |>
filter(n_records >= 5) |>
arrange(state, city) |>
print(width = Inf)
#> # A tibble: 98 × 8
#> shelter_name city state n_records mean_occupancy sd_occupancy
#> <chr> <chr> <chr> <int> <dbl> <dbl>
#> 1 Harbor Home Phoenix AZ 9 62.9 28.5
#> 2 HomeSafe Phoenix AZ 8 39.7 35.4
#> 3 Hope House Phoenix AZ 5 43.2 25.6
#> 4 New Beginnings Phoenix AZ 11 51.1 21.6
#> 5 Pathway Place Phoenix AZ 8 48.9 27.8
#> 6 Recovery Residence Phoenix AZ 12 41.8 36.4
#> 7 Safe Haven Phoenix AZ 9 51.2 31.8
#> 8 Second Chance Phoenix AZ 6 39.6 32.1
#> 9 Shelter Plus Phoenix AZ 8 61.9 28.6
#> 10 Sunrise Shelter Phoenix AZ 6 64.7 27.7
#> min_occupancy max_occupancy
#> <dbl> <dbl>
#> 1 1.3 94.7
#> 2 0.6 97.6
#> 3 0.4 68.9
#> 4 12.7 80.4
#> 5 18.4 90.3
#> 6 4.4 98.2
#> 7 1.8 85.4
#> 8 2.2 79.7
#> 9 27.4 89.9
#> 10 35.4 100
#> # ℹ 88 more rows
Notes Sentiment Analysis
df |>
mutate(
sentiment = case_when(
notes == "Excellent condition" ~ "Positive",
notes == "Recently renovated" ~ "Positive",
notes == "No issues reported" ~ "Positive",
notes == "Overcrowded" ~ "Negative",
notes == "Needs maintenance" ~ "Negative",
notes == "Staff shortage" ~ "Negative",
notes == "Under inspection" ~ "Concerning",
notes == "High demand" ~ "Concerning",
is.na(notes) ~ "Not Reported",
TRUE ~ "Uncategorized"
)
) |>
group_by(sentiment) |>
summarise(
n = n(),
pct = round(n() / nrow(df) * 100, 1),
.groups = "drop"
) |>
arrange(desc(n)) |>
rename(Sentiment = sentiment, Count = n, Percent = pct)
#> # A tibble: 4 × 3
#> Sentiment Count Percent
#> <chr> <int> <dbl>
#> 1 Negative 362 36.2
#> 2 Positive 331 33.1
#> 3 Concerning 220 22
#> 4 Not Reported 87 8.7
Data Quality and Outlier Check
# Missing values by column
df |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
filter(n_missing > 0)
#> # A tibble: 1 × 2
#> variable n_missing
#> <chr> <int>
#> 1 notes 87
# Occupancy rate vs calculated rate
df |>
mutate(
calc_rate = round(occupied_beds / total_capacity * 100, 1),
rate_mismatch = abs(occupancy_rate - calc_rate) > 1
) |>
summarise(
n_mismatches = sum(rate_mismatch),
pct_mismatch = round(mean(rate_mismatch) * 100, 1)
)
#> # A tibble: 1 × 2
#> n_mismatches pct_mismatch
#> <int> <dbl>
#> 1 0 0
# Records where occupied + available does not equal total capacity
df |>
mutate(bed_check = (occupied_beds + available_beds) == total_capacity) |>
summarise(
n_mismatch = sum(!bed_check),
pct_mismatch = round(mean(!bed_check) * 100, 1)
)
#> # A tibble: 1 × 2
#> n_mismatch pct_mismatch
#> <int> <dbl>
#> 1 0 0
# Extreme occupancy values
df |>
filter(occupancy_rate == 0 | occupancy_rate == 100) |>
select(date, shelter_name, city, total_capacity, occupied_beds, occupancy_rate, notes) |>
arrange(occupancy_rate)
#> # A tibble: 15 × 7
#> date shelter_name city total_capacity occupied_beds occupancy_rate
#> <date> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 2024-10-15 Second Chance Phil… 52 0 0
#> 2 2025-03-19 Sunrise Shelter San … 162 0 0
#> 3 2024-04-19 Second Chance San … 51 0 0
#> 4 2023-12-03 Recovery Reside… San … 254 0 0
#> 5 2025-04-16 Safe Haven Phil… 73 0 0
#> 6 2024-07-06 HomeSafe Chic… 136 0 0
#> 7 2024-06-08 HomeSafe San … 271 271 100
#> 8 2025-02-18 HomeSafe San … 68 68 100
#> 9 2024-01-10 New Beginnings San … 153 153 100
#> 10 2025-01-30 Shelter Plus New … 64 64 100
#> 11 2024-09-11 Shelter Plus Dall… 116 116 100
#> 12 2024-03-19 Harbor Home Los … 124 124 100
#> 13 2024-01-12 Sunrise Shelter Phoe… 294 294 100
#> 14 2025-07-11 New Beginnings San … 92 92 100
#> 15 2023-08-25 Shelter Plus San … 59 59 100
#> # ℹ 1 more variable: notes <chr>