Key Variables are:
PNSC_15 - Feeling of safety - walking alone in neighbourhood NEI_05F
- Neighbourhood issues - people using or dealing drugs PRSPGNDR - Gender
of reference person PHHTTINC - Total income of household
Clean Variables
table(chs_data$PRSPGNDR)
##
## 1 2
## 18244 22744
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Clean and Recode the Gender Variable (PRSPGNDR)
chs_cleaned <- chs_data %>%
mutate(
PRSPGNDR = case_when(
PRSPGNDR == "1" ~ "Male",
PRSPGNDR == "2" ~ "Female",
TRUE ~ NA_character_ # Remove missing values
),
# Create ordered factor for proper sorting
PRSPGNDR = factor(PRSPGNDR,
levels = c("Male", "Female"))
)
# Check Table
table(chs_cleaned$PRSPGNDR)
##
## Male Female
## 18244 22744
table(chs_data$PHHTTINC)
##
## -60000 -57500 -15000 -10500 -5750 -275
## 1 2 1 1 1 1
## 0 775 800 825 850 875
## 5 2 1 1 1 3
## 925 950 975 1025 1050 1300
## 1 1 1 1 2 7
## 1350 1400 1450 1500 1650 1700
## 2 6 2 2 1 1
## 1750 1900 1950 2000 2100 2200
## 1 1 1 4 5 11
## 2300 2400 2500 2600 2700 3400
## 2 1 4 1 1 1
## 3600 3700 3800 3900 4000 4200
## 1 1 1 1 3 3
## 4300 4400 4500 4600 4800 5000
## 3 1 2 1 2 1
## 5250 5500 5750 6000 6250 6500
## 1 4 3 8 4 4
## 6750 7000 7250 7500 7750 8000
## 4 5 10 7 18 22
## 8250 8500 8750 9000 9250 9500
## 21 16 13 11 14 12
## 9750 10000 10250 10500 11000 11500
## 18 43 1 45 71 76
## 12000 12500 13000 13500 14000 14500
## 69 55 48 69 77 65
## 15000 15500 16000 16500 17000 17500
## 71 79 93 94 83 62
## 18000 18500 19000 19500 20000 20500
## 72 92 126 71 233 6
## 21000 22000 23000 24000 25000 26000
## 466 746 798 636 530 426
## 27000 28000 29000 30000 31000 32000
## 368 321 293 266 249 268
## 33000 34000 35000 36000 37000 38000
## 247 292 294 301 323 294
## 39000 40000 41000 42000 43000 44000
## 334 321 294 269 320 271
## 45000 46000 47000 47500 48000 49000
## 287 289 244 87 258 221
## 50000 51000 52500 55000 57500 60000
## 468 37 555 620 633 624
## 62500 65000 67500 70000 72500 75000
## 618 584 580 558 481 503
## 77500 80000 82500 85000 87500 90000
## 501 535 469 507 466 462
## 92500 95000 97500 1e+05 102500 105000
## 450 548 406 599 51 612
## 110000 115000 120000 125000 130000 135000
## 674 607 543 586 482 480
## 140000 145000 150000 155000 160000 165000
## 438 431 393 344 316 295
## 170000 175000 180000 185000 190000 195000
## 256 260 258 186 250 147
## 2e+05 205000 210000 220000 230000 240000
## 259 17 266 233 208 204
## 250000 260000 270000 280000 290000 3e+05
## 160 124 111 73 65 57
## 310000 320000 330000 340000 350000 360000
## 55 45 49 32 41 19
## 370000 380000 390000 4e+05 410000 420000
## 20 23 33 30 22 11
## 430000 440000 450000 460000 470000 480000
## 5 2 7 10 8 11
## 490000 5e+05 510000 525000 550000 575000
## 7 11 1 15 26 18
## 6e+05 625000 650000 675000 7e+05 725000
## 29 20 10 12 10 2
## 750000 975000 1e+06 1025000 1150000 1200000
## 4 9 7 3 1 4
## 1250000 1300000 99999999999
## 4 2 7471
chs_data$income <- chs_data$PHHTTINC
# Remove negative values (replace with NA)
chs_data$income[chs_data$income < 0] <- NA
# Remove extremely high values
chs_data$income[chs_data$income > 1500000] <- NA
# View a summary of the cleaned income variable
summary(chs_data$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 36000 67500 88289 115000 1300000 7478
library(tidyverse)
# Clean and Recode the Safety Variable (PNSC_15)
chs_cleaned <- chs_cleaned %>%
mutate(
PNSC_15 = case_when(
PNSC_15 == "1" ~ "Very Safe",
PNSC_15 == "2" ~ "Reasonably Safe",
PNSC_15 == "3" ~ "Unsafe",
PNSC_15 == "4" ~ "Do not walk alone",
TRUE ~ NA_character_ # Remove missing values
),
# Create ordered factor for proper sorting
PNSC_15 = factor(PNSC_15,
levels = c("Very Safe", "Reasonably Safe", "Unsafe", "Do not walk alone"))
)
# Check Table
table(chs_cleaned$PNSC_15)
##
## Very Safe Reasonably Safe Unsafe Do not walk alone
## 12758 16800 5565 5828
table(chs_data$NEI_05F)
##
## 1 2 3 4 9
## 2681 3477 6723 27868 239
library(tidyverse)
# Clean and Recode the Dealing and Usage of Drugs Variable (NEI_05F)
chs_cleaned <- chs_cleaned %>%
mutate(
NEI_05F = case_when(
NEI_05F == "1" ~ "A big problem",
NEI_05F == "2" ~ "A moderate problem",
NEI_05F == "3" ~ "A small problem",
NEI_05F == "4" ~ "Not a problem",
NEI_05F == "9" ~ "Not stated",
TRUE ~ NA_character_ # Remove missing values
),
# Create ordered factor for proper sorting
NEI_05F = factor(NEI_05F,
levels = c("A big problem", "A moderate problem", "A small problem", "Not a problem"))
)
# Check Table
table(chs_cleaned$NEI_05F)
##
## A big problem A moderate problem A small problem Not a problem
## 2681 3477 6723 27868
Create Table for Gender and Feeling of Safety
# Calculate the count of each gender by feeling of safety (Safe/Unsafe)
gender_safety_percentages <- chs_cleaned %>%
mutate(
# Recategorize safety values
PNSC_15 = case_when(
PNSC_15 %in% c("Very Safe", "Reasonably Safe") ~ "Safe",
PNSC_15 %in% c("Unsafe", "Do not walk alone") ~ "Unsafe",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(PNSC_15)) %>%
group_by(PRSPGNDR, PNSC_15) %>%
summarise(n = n(), .groups = "drop") %>% # Drop the grouping structure
mutate(percentage = n / sum(n) * 100) %>% # Calculate percentage
select(PRSPGNDR, PNSC_15, n, percentage)
# Preview the data
head(gender_safety_percentages)
## # A tibble: 4 × 4
## PRSPGNDR PNSC_15 n percentage
## <fct> <chr> <int> <dbl>
## 1 Male Safe 15216 37.2
## 2 Male Unsafe 3014 7.36
## 3 Female Safe 14342 35.0
## 4 Female Unsafe 8379 20.5
# Load the gt package
library(gt)
# Create a table using the gender_safety_percentages data
gender_safety_percentages %>%
gt() %>%
# Label the columns
cols_label(
PRSPGNDR = "Gender",
PNSC_15 = "Feeling of Safety",
n = "Count",
percentage = "Percentage"
) %>%
# Format percentage column to display as percentages
fmt_percent(
columns = vars(percentage),
decimals = 1
) %>%
# Add a table header with a title
tab_header(
title = md("**Gender and Feeling of Safety (Safe vs. Unsafe)**"),
subtitle = "Percentage and Count of People Feeling Safe or Unsafe by Gender"
) %>%
# Custom styling for the header
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
# Table options for better visualization
tab_options(
table.border.top.width = 2,
table.border.bottom.width = 2,
column_labels.border.bottom.width = 2,
table.font.size = px(12),
heading.title.font.size = px(14),
heading.subtitle.font.size = px(12),
source_notes.font.size = px(10),
data_row.padding = px(4)
)
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
| Gender and Feeling of Safety (Safe vs. Unsafe) |
| Percentage and Count of People Feeling Safe or Unsafe by Gender |
| Gender |
Feeling of Safety |
Count |
Percentage |
| Male |
Safe |
15216 |
3,715.7% |
| Male |
Unsafe |
3014 |
736.0% |
| Female |
Safe |
14342 |
3,502.2% |
| Female |
Unsafe |
8379 |
2,046.1% |
Create a Visualization for Gender & Feeling of Safety
# Load libraries
library(ggplot2)
library(dplyr)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3
# Define colors for Safe vs. Unsafe
safety_colors <- c("Safe" = "#2E86AB", "Unsafe" = "#E74C3C")
# Create visualization
ggplot(gender_safety_percentages,
aes(x = percentage, y = reorder(PRSPGNDR, -percentage), fill = PNSC_15)) +
# Add bars
geom_col(width = 0.6, alpha = 0.9) +
# Add points at the end of bars
geom_point(aes(x = percentage), size = 3, color = "black") +
# Add value labels using ggrepel
geom_label_repel(
aes(label = sprintf("%.1f%%", percentage)),
nudge_x = 2, # Push labels slightly away from bars
direction = "y",
hjust = 0,
segment.size = 0.3,
segment.color = "grey70",
box.padding = 0.4,
point.padding = 0.4,
size = 3.5,
fontface = "bold",
label.size = 0.1,
label.r = unit(0.2, "lines"),
fill = alpha("white", 0.7)
) +
# Set color scale for bars
scale_fill_manual(values = safety_colors) +
# Adjust x-axis limits to 60%
scale_x_continuous(
limits = c(0, 60), # Max at 60%
breaks = seq(0, 60, by = 10),
labels = scales::label_percent(scale = 1)
) +
# Add labels
labs(
title = "Gender and Feeling of Safety",
subtitle = "Percentage of People Feeling Safe or Unsafe by Gender",
caption = "Data Source: CHS 2021",
x = "Percentage",
y = NULL,
fill = "Feeling of Safety" # Legend title
) +
# Custom theme
theme_minimal() +
theme(
# Title styling
plot.title = element_text(family = "sans", face = "bold", size = 16, margin = margin(b = 10)),
plot.subtitle = element_text(family = "sans", color = "#2E86AB", size = 12, margin = margin(b = 20)),
plot.caption = element_text(color = "grey40", margin = margin(t = 20)),
# Axis formatting
axis.text = element_text(family = "sans", size = 11, color = "grey20"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(margin = margin(t = 10), color = "grey20"),
# Grid lines
panel.grid.major.x = element_line(color = "grey95", size = 0.3),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
# Legend styling
legend.position = "top",
legend.title = element_text(face = "bold"),
# Margins
plot.margin = margin(30, 30, 30, 30)
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Create Table for Income and Feeling of Safety
# Remove negative values (replace with NA)
chs_cleaned <- chs_cleaned %>%
mutate(PHHTTINC = ifelse(PHHTTINC < 0 | PHHTTINC > 1500000, NA, PHHTTINC))
# Compute quartiles based on CHS 2021 dataset
income_quartiles <- quantile(chs_cleaned$PHHTTINC, probs = c(0.25, 0.50, 0.75), na.rm = TRUE)
# Assign categories
chs_cleaned <- chs_cleaned %>%
mutate(income_category = case_when(
PHHTTINC <= income_quartiles[1] ~ "Bottom 25%",
PHHTTINC > income_quartiles[1] & PHHTTINC <= income_quartiles[2] ~ "Lower Middle",
PHHTTINC > income_quartiles[2] & PHHTTINC <= income_quartiles[3] ~ "Upper Middle",
PHHTTINC > income_quartiles[3] ~ "Top 25%",
TRUE ~ NA_character_
))
# Check distribution of new categories
table(chs_cleaned$income_category, useNA = "ifany")
##
## Bottom 25% Lower Middle Top 25% Upper Middle <NA>
## 8490 8531 8060 8429 7478
# Calculate the count of each income category by feeling of safety (Safe/Unsafe)
income_safety_percentages <- chs_cleaned %>%
mutate(
# Recategorize safety values
PNSC_15 = case_when(
PNSC_15 %in% c("Very Safe", "Reasonably Safe") ~ "Safe",
PNSC_15 %in% c("Unsafe", "Do not walk alone") ~ "Unsafe",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(PNSC_15) & !is.na(income_category)) %>%
group_by(income_category, PNSC_15) %>%
summarise(n = n(), .groups = "drop") %>%
mutate(percentage = n / sum(n) * 100) %>%
select(income_category, PNSC_15, n, percentage)
# Preview the data
head(income_safety_percentages)
## # A tibble: 6 × 4
## income_category PNSC_15 n percentage
## <chr> <chr> <int> <dbl>
## 1 Bottom 25% Safe 4749 14.2
## 2 Bottom 25% Unsafe 3725 11.1
## 3 Lower Middle Safe 6067 18.1
## 4 Lower Middle Unsafe 2456 7.34
## 5 Top 25% Safe 6779 20.3
## 6 Top 25% Unsafe 1277 3.81
# Load the gt package
library(gt)
# Create a table using the income_safety_percentages data
income_safety_percentages %>%
gt() %>%
# Label the columns for better readability
cols_label(
income_category = "Income Category",
PNSC_15 = "Feeling of Safety",
n = "Count",
percentage = "Percentage"
) %>%
# Format the percentage column to display percentages with 1 decimal point
fmt_percent(
columns = c(percentage),
decimals = 1
) %>%
# Add a table header with a title
tab_header(
title = md("**Income and Feeling of Safety**"),
subtitle = "Percentage and Count of People Feeling Safe or Unsafe by Income Category"
) %>%
# Custom styling for the header
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
# Table options for better visualization
tab_options(
table.border.top.width = 2,
table.border.bottom.width = 2,
column_labels.border.bottom.width = 2,
table.font.size = px(12),
heading.title.font.size = px(14),
heading.subtitle.font.size = px(12),
source_notes.font.size = px(10),
data_row.padding = px(4)
)
| Income and Feeling of Safety |
| Percentage and Count of People Feeling Safe or Unsafe by Income Category |
| Income Category |
Feeling of Safety |
Count |
Percentage |
| Bottom 25% |
Safe |
4749 |
1,418.6% |
| Bottom 25% |
Unsafe |
3725 |
1,112.7% |
| Lower Middle |
Safe |
6067 |
1,812.3% |
| Lower Middle |
Unsafe |
2456 |
733.7% |
| Top 25% |
Safe |
6779 |
2,025.0% |
| Top 25% |
Unsafe |
1277 |
381.5% |
| Upper Middle |
Safe |
6551 |
1,956.9% |
| Upper Middle |
Unsafe |
1872 |
559.2% |
Create Visualization for relationship between Income & Feeling
of Safety
# Create the line plot
ggplot(income_safety_percentages, aes(x = income_category, y = percentage, color = PNSC_15, group = PNSC_15)) +
geom_line(size = 1.2) + # Add lines
geom_point(size = 3) + # Add points
# Add direct labels using ggrepel
geom_label_repel(
data = income_safety_percentages %>%
group_by(PNSC_15) %>%
slice_max(order_by = income_category, n = 1),
aes(label = PNSC_15),
nudge_x = 0.5,
direction = "y",
hjust = 0,
segment.size = 0.3,
segment.color = "grey70",
box.padding = 0.5,
point.padding = 0.5,
size = 4,
fontface = "bold",
label.size = 0.1,
label.r = unit(0.2, "lines"),
fill = alpha("white", 0.7)
) +
# Set the factor levels for 'income_category' to control the order
scale_x_discrete(
limits = c("Bottom 25%", "Lower Middle", "Upper Middle", "Top 25%")
) +
# Set color scale for lines
scale_color_manual(values = safety_colors) +
# Adjust y-axis limits and formatting
scale_y_continuous(
labels = scales::percent_format(scale = 1),
limits = c(0, 40), # Set the upper limit to 40%
breaks = seq(0, 30, by = 5) # Increments of 5%
) +
labs(
title = "Income and Feeling of Safety",
subtitle = "Percentage of People Feeling Safe or Unsafe by Income Category",
caption = "Data Source: CHS 2021",
x = "Income Category",
y = "Percentage",
color = "Feeling of Safety"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(color = "grey40", size = 12),
axis.text = element_text(size = 11),
axis.title.x = element_text(margin = margin(t = 10)),
panel.grid.major.y = element_line(color = "grey90", size = 0.3),
legend.position = "none"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Create a visualization for Drug Issues & Feeling of Safety
# Calculate the count of each recoded NEI_05F category
drugs_summary <- chs_cleaned %>%
mutate(
# Recategorize neighborhood drug issues
NEI_05F = case_when(
NEI_05F %in% c("A big problem", "A moderate problem") ~ "A Problem",
NEI_05F %in% c("A small problem", "Not a problem") ~ "Not a Problem",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(NEI_05F)) %>% # Remove missing values
group_by(NEI_05F) %>% # Group by recoded variable
summarise(n = n(), .groups = "drop") %>% # Count occurrences
mutate(percentage = n / sum(n) * 100) %>% # Calculate percentages
select(NEI_05F, n, percentage) # Select relevant columns
# Preview the data
head(drugs_summary)
## # A tibble: 2 × 3
## NEI_05F n percentage
## <chr> <int> <dbl>
## 1 A Problem 6158 15.1
## 2 Not a Problem 34591 84.9
# Neighborhood Drug Issues and Feeling of Safety
drugs_safety_summary <- chs_cleaned %>%
mutate(
# Recategorize neighborhood drug issues
NEI_05F = case_when(
NEI_05F %in% c("A big problem", "A moderate problem") ~ "A Problem",
NEI_05F %in% c("A small problem", "Not a problem") ~ "Not a Problem",
TRUE ~ NA_character_
),
# Recategorize safety values
PNSC_15 = case_when(
PNSC_15 %in% c("Very Safe", "Reasonably Safe") ~ "Safe",
PNSC_15 %in% c("Unsafe", "Do not walk alone") ~ "Unsafe",
TRUE ~ NA_character_
)
) %>%
filter(!is.na(NEI_05F) & !is.na(PNSC_15)) %>% # Remove missing values
group_by(NEI_05F, PNSC_15) %>%
summarise(n = n(), .groups = "drop") %>% # Count occurrences
group_by(NEI_05F) %>%
mutate(percentage = n / sum(n) * 100) %>% # Calculate percentages relative to each NEI_05F group
select(NEI_05F, PNSC_15, n, percentage) # Select relevant columns
# Preview the data
head(drugs_safety_summary)
## # A tibble: 4 × 4
## # Groups: NEI_05F [2]
## NEI_05F PNSC_15 n percentage
## <chr> <chr> <int> <dbl>
## 1 A Problem Safe 2691 43.7
## 2 A Problem Unsafe 3461 56.3
## 3 Not a Problem Safe 26737 77.4
## 4 Not a Problem Unsafe 7826 22.6
# Create the bar graph with smaller font for percentage labels and y-axis limit
ggplot(drugs_safety_summary, aes(x = NEI_05F, y = percentage, fill = PNSC_15)) +
# Create the bars
geom_bar(stat = "identity", position = "dodge", width = 0.7, alpha = 0.9) +
# Add smaller labels at the end of the bars
geom_text(aes(label = sprintf("%.1f%%", percentage)),
position = position_dodge(width = 0.7),
vjust = -0.5, size = 3.5) + # Set smaller font size (e.g., 3.5)
# Customize the color palette
scale_fill_manual(values = c("Safe" = "#2E86AB", "Unsafe" = "#E74C3C")) +
# Set y-axis limits up to 90% and define breaks
scale_y_continuous(
limits = c(0, 80), # Set the y-axis to go up to 90%
breaks = seq(0, 80, by = 10), # Define breaks at every 10%
labels = scales::percent_format(scale = 1) # Ensure labels are in percentage format
) +
# Set labels and title
labs(
title = "Neighborhood Drug Issues and Feeling of Safety",
subtitle = "Percentage of People Feeling Safe or Unsafe by Neighborhood Drug Issues",
caption = "Data Source: CHS 2021",
x = "Neighborhood Drug Issues",
y = "Percentage (%)",
fill = "Feeling of Safety"
) +
# Improve theme for readability
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
legend.position = "top",
panel.grid = element_blank()
)

Create a table showing the relationship between drug issues &
feeling of safety
# Load the gt package
library(gt)
# Create a table using the drugs_safety_summary data
drugs_safety_summary %>%
gt() %>%
# Label the columns for clarity
cols_label(
NEI_05F = "Neighborhood Drug Issues",
PNSC_15 = "Feeling of Safety",
n = "Count",
percentage = "Percentage"
) %>%
# Format percentage column to display as percentages
fmt_percent(
columns = c(percentage),
decimals = 1
) %>%
# Add a table header with a title
tab_header(
title = md("**Neighborhood Drug Issues and Feeling of Safety**"),
subtitle = "Percentage and Count of People Feeling Safe or Unsafe by Neighborhood Drug Issues"
) %>%
# Custom styling for the header
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels()
) %>%
# Table options for better visualization
tab_options(
table.border.top.width = 2,
table.border.bottom.width = 2,
column_labels.border.bottom.width = 2,
table.font.size = px(12),
heading.title.font.size = px(14),
heading.subtitle.font.size = px(12),
source_notes.font.size = px(10),
data_row.padding = px(4)
)
| Neighborhood Drug Issues and Feeling of Safety |
| Percentage and Count of People Feeling Safe or Unsafe by Neighborhood Drug Issues |
| Feeling of Safety |
Count |
Percentage |
| A Problem |
| Safe |
2691 |
4,374.2% |
| Unsafe |
3461 |
5,625.8% |
| Not a Problem |
| Safe |
26737 |
7,735.7% |
| Unsafe |
7826 |
2,264.3% |