# Packages this analysis depends on
packages <- c("tidyverse", "fst", "gt")
# Anything from the list that is not in the local library gets installed first
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}
# Attach each package by its name (character.only because the name is a string)
lapply(packages, function(pkg) library(pkg, character.only = TRUE))
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "gt" "fst" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
# Folder holding the CHS 2021 PUMF extracts; edit this single line if the
# data lives somewhere else on your machine
data_dir <- "C:/Users/kathy/Documents/SOC_3320"
# Read the CHS data csv directly (109 variables) -- read once only; the file
# was previously loaded twice in a row, doubling the load time for no benefit
chs_data <- read.csv(file.path(data_dir, "CHS2021ECL_PUMF.csv"))
# Read the 1001 variables version -- compressed
chs_data_full <- read.csv(file.path(data_dir, "CHS2021ECL_PUMF_BSW.csv"))
# Basic variable/row summary of the data (rows, columns)
dim(chs_data)
## [1] 40988 109
An example of exploring housing-related variables (check the data dictionary PDF!)
# Neighborhood Issues
# Names must match the CSV column names exactly: a stray leading space
# (" NEI_05F") would never match a column when used for selection.
nei.issues_vars <- c(
  "NEI_05A", "NEI_05B", "NEI_05C", "NEI_05D", "NEI_05E", "NEI_05F", "NEI_05G"
)
# Feeling of Safety in Neighborhood
safety_vars <- c("PNSC_15", "PDWS_10G")
# Level of community Satisfaction
satisfaction_vars <- c("COS_10")
Explore your variables of interest with your data dictionary handy. Look closely at what the values mean before summarizing them and doing some exploration.
library(gt)
library(dplyr)

# Variables to summarise: the seven neighborhood-issue items and the two
# safety items
selected_vars <- c(
  "NEI_05A", "NEI_05B", "NEI_05C", "NEI_05D", "NEI_05E", "NEI_05F", "NEI_05G",
  "PNSC_15", "PDWS_10G"
)

# Human-readable label for each variable code (used in the table's first column)
variable_labels <- c(
  "NEI_05A" = "Noise Pollution",
  "NEI_05B" = "Loitering",
  "NEI_05C" = "Littering",
  "NEI_05D" = "Vandalism",
  "NEI_05E" = "Discrimination",
  "NEI_05F" = "Drugs",
  "NEI_05G" = "Public Intoxication",
  "PNSC_15" = "Feeling of safety outside",
  "PDWS_10G" = "Feeling of safety within the home"
)

# Code 9 is the "Not stated" response on these items, not a point on the
# scale. Leaving it in inflates the mean and SD and makes Max report 9, so it
# is treated as missing before any statistic is computed.
# NOTE: output rendered before this change (Max = 9.00, N = 40,988.00) was
# computed with code 9 included; re-knit the document to refresh it.
not_stated_code <- 9

# Summary statistics for each variable: one five-row data frame per variable
# (Mean, SD, Min, Max, and N = number of stated, non-missing responses)
summary_list <- lapply(selected_vars, function(var) {
  values <- chs_data[[var]]
  # %in% never returns NA, so existing NAs are left untouched
  values[values %in% not_stated_code] <- NA
  data.frame(
    Variable = var,
    Statistic = c("Mean", "SD", "Min", "Max", "N"),
    Value = c(
      mean(values, na.rm = TRUE),
      sd(values, na.rm = TRUE),
      min(values, na.rm = TRUE),
      max(values, na.rm = TRUE),
      sum(!is.na(values))
    )
  )
})

# Combine all variable summaries into one dataframe and swap the variable
# codes for their readable labels
chs_summary <- bind_rows(summary_list) %>%
  mutate(Variable = recode(Variable, !!!variable_labels))

# Create gt table
chs_summary %>%
  gt() %>%
  tab_header(
    title = "Descriptive Summary of Neighborhood Issues & Safety Variables",
    subtitle = "Summary statistics based on CHS 2021 data"
  ) %>%
  # Two decimals for the statistics, none for N (it is a count of respondents)
  fmt_number(columns = "Value", rows = Statistic != "N", decimals = 2) %>%
  fmt_number(columns = "Value", rows = Statistic == "N", decimals = 0) %>%
  cols_label(
    Variable = "Variable",
    Statistic = "Statistic",
    Value = "Value"
  ) %>%
  tab_style(
    style = list(cell_text(weight = "bold")),
    locations = cells_column_labels(everything())
  )
| Descriptive Summary of Neighborhood Issues & Safety Variables | ||
| Summary statistics based on CHS 2021 data | ||
| Variable | Statistic | Value |
|---|---|---|
| Noise Pollution | Mean | 3.59 |
| Noise Pollution | SD | 0.79 |
| Noise Pollution | Min | 1.00 |
| Noise Pollution | Max | 9.00 |
| Noise Pollution | N | 40,988.00 |
| Loitering | Mean | 3.66 |
| Loitering | SD | 0.79 |
| Loitering | Min | 1.00 |
| Loitering | Max | 9.00 |
| Loitering | N | 40,988.00 |
| Littering | Mean | 3.52 |
| Littering | SD | 0.89 |
| Littering | Min | 1.00 |
| Littering | Max | 9.00 |
| Littering | N | 40,988.00 |
| Vandalism | Mean | 3.63 |
| Vandalism | SD | 0.82 |
| Vandalism | Min | 1.00 |
| Vandalism | Max | 9.00 |
| Vandalism | N | 40,988.00 |
| Discrimination | Mean | 3.83 |
| Discrimination | SD | 0.68 |
| Discrimination | Min | 1.00 |
| Discrimination | Max | 9.00 |
| Discrimination | N | 40,988.00 |
| Drugs | Mean | 3.50 |
| Drugs | SD | 0.99 |
| Drugs | Min | 1.00 |
| Drugs | Max | 9.00 |
| Drugs | N | 40,988.00 |
| Public Intoxication | Mean | 3.66 |
| Public Intoxication | SD | 0.83 |
| Public Intoxication | Min | 1.00 |
| Public Intoxication | Max | 9.00 |
| Public Intoxication | N | 40,988.00 |
| Feeling of safety outside | Mean | 2.12 |
| Feeling of safety outside | SD | 1.02 |
| Feeling of safety outside | Min | 1.00 |
| Feeling of safety outside | Max | 9.00 |
| Feeling of safety outside | N | 40,988.00 |
| Feeling of safety within the home | Mean | 1.81 |
| Feeling of safety within the home | SD | 0.86 |
| Feeling of safety within the home | Min | 1.00 |
| Feeling of safety within the home | Max | 9.00 |
| Feeling of safety within the home | N | 40,988.00 |
library(ggplot2)
library(dplyr)

# Readable column name (left) for each CHS neighborhood-issue code (right)
issue_names <- c(
  Noise_Pollution = "NEI_05A",
  Loitering = "NEI_05B",
  Littering = "NEI_05C",
  Vandalism = "NEI_05D",
  Discrimination = "NEI_05E",
  Drugs = "NEI_05F",
  Public_Intoxication = "NEI_05G"
)

# Rename variables. any_of() renames only the codes that are still present,
# so re-running this chunk after the columns were already renamed is a no-op
# instead of an error.
chs_data <- chs_data %>%
  rename(any_of(issue_names))

# Create a long-format data frame: one row per respondent-issue pair, with the
# severity code converted to a labelled factor in scale order.
# pivot_longer() replaces the superseded gather(); row order differs but the
# bar counts below do not depend on it.
long_data <- chs_data %>%
  select(all_of(names(issue_names))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Issue",
    values_to = "Severity"
  ) %>%
  mutate(
    Severity = factor(
      Severity,
      levels = c(1, 2, 3, 4, 9),
      labels = c("Big", "Moderate", "Small", "Not a Problem", "Not Stated")
    )
  )

# Create bar plot: one group of bars per severity level, one bar per issue
ggplot(long_data, aes(x = Severity, fill = Issue)) +
  geom_bar(position = "dodge", alpha = 0.8) + # counts rows per severity level
  labs(
    title = "Neighborhood Issues by Severity of Problem",
    x = "Severity of Issue",
    y = "Frequency Count",
    caption = "Data Source: CHS 2021"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 12),
    legend.title = element_blank()
  ) +
  scale_fill_brewer(palette = "Set3")
library(ggplot2)
library(dplyr)

# Recode PNSC_15 using CHS code categories. The new column stays a plain
# character vector so later chunks that count or combine it are unaffected.
chs_data <- chs_data %>%
  mutate(
    Safety_Outside_Category = case_when(
      PNSC_15 == 1 ~ "Very safe",
      PNSC_15 == 2 ~ "Reasonably safe",
      PNSC_15 == 3 ~ "Unsafe",
      PNSC_15 == 4 ~ "Do not walk alone",
      PNSC_15 == 9 ~ "Not stated",
      TRUE ~ "Other" # In case of unexpected (or missing) values
    )
  )

# Display order for the x-axis: code order rather than ggplot's default
# alphabetical order, with the non-substantive categories last.
safety_outside_order <- c(
  "Very safe", "Reasonably safe", "Unsafe", "Do not walk alone",
  "Not stated", "Other"
)

# Bar chart of the recoded responses. The factor() in aes() only controls bar
# order; levels with no observations are dropped from the axis by default.
ggplot(
  chs_data,
  aes(
    x = factor(Safety_Outside_Category, levels = safety_outside_order),
    fill = Safety_Outside_Category
  )
) +
  geom_bar() +
  labs(
    title = "Feeling of Safety When Walking Outside",
    x = "Level of Safety",
    y = "Frequency Count",
    fill = "Safety Level",
    caption = "Data Source: CHS 2021"
  ) +
  scale_fill_manual(values = c(
    "Very safe" = "#1b9e77",
    "Reasonably safe" = "#d95f02",
    "Unsafe" = "#e7298a",
    "Do not walk alone" = "#7570b3",
    "Not stated" = "#999999",
    "Other" = "#666666" # the fallback category previously had no colour
  )) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 12),
    axis.title = element_text(size = 12),
    legend.position = "none"
  )
# Create Visualization for feeling of safety within the home
# Recode PDWS_10G into labelled categories. The new column stays a plain
# character vector so later chunks that combine it are unaffected.
chs_data <- chs_data %>%
  mutate(
    Safety_Inside_Category = case_when(
      PDWS_10G == 1 ~ "Very satisfied",
      PDWS_10G == 2 ~ "Satisfied",
      PDWS_10G == 3 ~ "Neither",
      PDWS_10G == 4 ~ "Dissatisfied",
      PDWS_10G == 9 ~ "Not stated",
      TRUE ~ "Other" # unexpected (or missing) values
    )
  )

# Display order for the x-axis: code order rather than ggplot's default
# alphabetical order, with the non-substantive categories last.
safety_inside_order <- c(
  "Very satisfied", "Satisfied", "Neither", "Dissatisfied",
  "Not stated", "Other"
)

# Bar chart of the recoded responses. The factor() in aes() only controls bar
# order; levels with no observations are dropped from the axis by default.
ggplot(
  chs_data,
  aes(
    x = factor(Safety_Inside_Category, levels = safety_inside_order),
    fill = Safety_Inside_Category
  )
) +
  geom_bar() +
  labs(
    title = "Feeling of Safety Inside the Home",
    x = "Satisfaction Level",
    y = "Frequency Count",
    fill = "Satisfaction Level",
    caption = "Data Source: CHS 2021"
  ) +
  scale_fill_manual(values = c(
    "Very satisfied" = "#1b9e77",
    "Satisfied" = "#d95f02",
    "Neither" = "#e7298a",
    "Dissatisfied" = "#7570b3",
    "Not stated" = "#999999",
    "Other" = "#666666" # the fallback category previously had no colour
  )) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 12),
    axis.title = element_text(size = 12),
    legend.position = "none"
  )
library(ggplot2)
library(dplyr)
library(tidyr)

# Prepare Safety Outside data: one row per respondent, tagged with its category
safety_outside_data <- chs_data %>%
  mutate(Category = "Safety Outside", Response = Safety_Outside_Category) %>%
  select(Category, Response)

# Prepare Safety Inside data
safety_inside_data <- chs_data %>%
  mutate(Category = "Safety Inside", Response = Safety_Inside_Category) %>%
  select(Category, Response)

# Prepare Neighborhood Issues data: stack the seven issue columns into long
# format and convert the severity codes to labels.
# pivot_longer() replaces the superseded gather(); row order differs but the
# bar counts below do not depend on it.
neighborhood_issues_data <- chs_data %>%
  select(
    Noise_Pollution, Loitering, Littering, Vandalism,
    Discrimination, Drugs, Public_Intoxication
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Issue",
    values_to = "Response"
  ) %>%
  mutate(
    Response = factor(
      Response,
      levels = c(1, 2, 3, 4, 9),
      labels = c(
        "Big Problem", "Moderate Problem", "Small Problem",
        "Not a Problem", "Not Stated"
      )
    ),
    Category = "Neighborhood Issues"
  )

# Display order of every response label across the three facets (scale order
# within each group, non-substantive categories last) instead of alphabetical
response_order <- c(
  "Very safe", "Reasonably safe", "Unsafe", "Do not walk alone",
  "Very satisfied", "Satisfied", "Neither", "Dissatisfied",
  "Big Problem", "Moderate Problem", "Small Problem", "Not a Problem",
  "Not stated", "Not Stated", "Other"
)

# Combine all data (the Issue column is NA for the two safety data sets)
combined_data <- bind_rows(
  safety_outside_data, safety_inside_data, neighborhood_issues_data
) %>%
  mutate(Response = factor(as.character(Response), levels = response_order))

# Create Bar Chart
ggplot(combined_data, aes(x = Response, fill = Category)) +
  geom_bar(position = "dodge") +
  # One panel per category; free_x so each panel shows only its own responses
  facet_wrap(~Category, scales = "free_x") +
  labs(
    title = "Comparison of Safety & Neighborhood Issues",
    x = "Response",
    y = "Frequency Count",
    caption = "Data Source: CHS 2021"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c(
    "Safety Outside" = "#1b9e77",
    "Safety Inside" = "#d95f02",
    "Neighborhood Issues" = "#7570b3"
  )) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title = element_text(size = 12),
    legend.position = "top"
  )
Now that we have all of our data regarding neighborhood issues and feeling of safety inside and outside of the home, we can compare specific variables. We use an example of a line graph as well as a bar graph.
# Create a line graph
library(ggplot2)
library(dplyr)
library(tidyr)

# Shared, ordered set of substantive response labels for both data sets.
# "Not stated" (and any unexpected code) is deliberately absent.
common_levels <- c(
  "Very safe", "Reasonably safe", "Unsafe", "Do not walk alone",
  "Big Problem", "Moderate Problem", "Small Problem", "Not a Problem"
)
# Labels for Drugs codes 1-4, in code order
drug_labels <- c(
  "Big Problem", "Moderate Problem", "Small Problem", "Not a Problem"
)

# Select Data: frequency of each safety-outside response.
# Responses outside common_levels become NA in factor() and are dropped
# explicitly, so they do not show up as an "NA" category on the x-axis.
safety_outside_data <- chs_data %>%
  count(Safety_Outside_Category, name = "Count") %>%
  rename(Response = Safety_Outside_Category) %>%
  mutate(
    Response = factor(Response, levels = common_levels, ordered = TRUE),
    Category = "Safety Outside"
  ) %>%
  filter(!is.na(Response))

# Frequency of each Drugs severity code, with codes 1-4 mapped to labels;
# any other code (e.g. 9) has no label and is dropped the same way
drugs_data <- chs_data %>%
  count(Drugs, name = "Count") %>%
  rename(Response = Drugs) %>%
  mutate(
    Response = factor(
      drug_labels[match(Response, 1:4)],
      levels = common_levels,
      ordered = TRUE
    ),
    Category = "Drugs"
  ) %>%
  filter(!is.na(Response))

# Combine both datasets (identical factor levels, so the ordering is kept)
comparison_data <- bind_rows(safety_outside_data, drugs_data)

# Create the Line Graph
ggplot(
  comparison_data,
  aes(x = Response, y = Count, group = Category, color = Category)
) +
  # linewidth replaces the size aesthetic for lines (deprecated in ggplot2 3.4.0)
  geom_line(linewidth = 1) +
  geom_point(size = 3) + # Add points for clarity
  labs(
    title = "Comparison: Feeling of Safety Outside vs. Drugs in the Neighborhood",
    x = "Response Category",
    y = "Frequency Count",
    color = "Category",
    caption = "Data Source: CHS 2021"
  ) +
  scale_color_manual(
    values = c("Safety Outside" = "#1b9e77", "Drugs" = "#d95f02")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title = element_text(size = 12),
    legend.position = "top"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(ggplot2)
library(dplyr)
library(tidyr)

# Shared, ordered set of substantive response labels for both data sets.
# "Not stated" (and any unexpected code) is deliberately absent.
common_levels <- c(
  "Very safe", "Reasonably safe", "Unsafe", "Do not walk alone",
  "Big Problem", "Moderate Problem", "Small Problem", "Not a Problem"
)
# Labels for Drugs codes 1-4, in code order
drug_labels <- c(
  "Big Problem", "Moderate Problem", "Small Problem", "Not a Problem"
)

# Select and Clean Data: frequency of each safety-outside response.
# Responses outside common_levels become NA in factor() and are dropped
# explicitly, so they do not show up as an "NA" bar on the x-axis.
safety_outside_data <- chs_data %>%
  count(Safety_Outside_Category, name = "Count") %>%
  rename(Response = Safety_Outside_Category) %>%
  mutate(
    Response = factor(Response, levels = common_levels, ordered = TRUE),
    Category = "Safety Outside"
  ) %>%
  filter(!is.na(Response))

# Frequency of each Drugs severity code, with codes 1-4 mapped to labels;
# any other code (e.g. 9) has no label and is dropped the same way
drugs_data <- chs_data %>%
  count(Drugs, name = "Count") %>%
  rename(Response = Drugs) %>%
  mutate(
    Response = factor(
      drug_labels[match(Response, 1:4)],
      levels = common_levels,
      ordered = TRUE
    ),
    Category = "Drugs"
  ) %>%
  filter(!is.na(Response))

# Combine both datasets (identical factor levels, so the ordering is kept)
comparison_data <- bind_rows(safety_outside_data, drugs_data)

# Create Bar Graph: bar heights come from the pre-computed Count column
ggplot(comparison_data, aes(x = Response, y = Count, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Comparison: Feeling of Safety Outside vs. Drugs in the Neighborhood",
    x = "Response Category",
    y = "Frequency Count",
    fill = "Category",
    caption = "Data Source: CHS 2021"
  ) +
  scale_fill_manual(
    values = c("Safety Outside" = "#1b9e77", "Drugs" = "#d95f02")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title = element_text(size = 12),
    legend.position = "top"
  )
For the analysis section of the research project, we will refine the variables available to us in the CHS 2021 dataset. We will choose which specific aspects of neighborhood issues and neighborhood safety we want to focus on. We want to observe the patterns in the data and explore the relationship between these two sets of variables (safety and neighborhood issues).