# Clean coordinates ----
# Coerce both coordinate columns to numeric, then keep only the rows where
# neither conversion produced NA, so every remaining row has usable
# coordinates for st_as_sf().
df <- df %>%
  mutate(
    Longitude = as.numeric(Longitude),
    Latitude = as.numeric(Latitude)
  ) %>%
  filter(!is.na(Longitude), !is.na(Latitude))

# Point layer built from the cleaned coordinates, tagged as EPSG:4326.
df_sf <- df %>%
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326)

# Coerce Age to numeric and discard rows where the conversion yields NA.
# df_sf was created before this step, so it still holds those rows.
df <- df %>%
  mutate(Age = as.numeric(Age)) %>%
  filter(!is.na(Age))

# Collapse the Race column into a small set of categories.
# case_when() uses the first branch that matches, so a value mentioning
# several groups is assigned to whichever pattern appears earliest below.
# Anything matching none of the patterns (including an explicit
# "Race unspecified" and NA) falls through to the final catch-all; the
# separate "Race unspecified" branch duplicated that result and was removed.
df <- df %>%
  mutate(
    Race_Categorized = case_when(
      str_detect(Race, "Black") ~ "Black",
      str_detect(Race, "White") ~ "White",
      str_detect(Race, "Hispanic") ~ "Hispanic/Latino",
      str_detect(Race, "Middle Eastern") ~ "Middle Eastern",
      str_detect(Race, "Native American") ~ "Native American",
      str_detect(Race, "Asian") ~ "Asian",
      TRUE ~ "Race unspecified"
    )
  )

Plot 1: Variables used - Age, Gender

# Age density curves for records with Gender "Female" or "Male",
# one coloured line per gender.
df %>%
  filter(Gender %in% c("Female", "Male")) %>%
  ggplot(aes(x = Age, color = Gender)) +
  geom_density() +
  scale_color_manual(
    values = c("Female" = "orange", "Male" = "dodgerblue")
  ) +
  labs(title = "Age Distribution by Gender", x = "Age", y = "Density") +
  theme_minimal()

Younger individuals, particularly those in their 20s, are the most prevalent in the dataset, with males slightly overrepresented compared to females.

Plot 2: Variables used - Gender, Race_Categorized

# Stacked bar chart: one bar per gender ("Female"/"Male" only), with each
# bar split by the categorized race column.
df %>%
  filter(Gender %in% c("Female", "Male")) %>%
  ggplot(aes(x = Gender, fill = Race_Categorized)) +
  geom_bar(position = "stack") +
  labs(title = "Gender and Race Breakdown", x = "Gender", y = "Count") +
  theme_minimal()

This plot reveals that male victims overwhelmingly outnumber female victims. Among cases where race is specified, White victims make up the largest share for both genders, but Black and Hispanic/Latino men are also highly represented.

Plot 3: Variables used - Highest.level.of.force, State

# The five most frequent values of Highest.level.of.force.
# The geometry column is dropped first: the tally only needs the attribute,
# and summarising an sf object would also carry the point geometries through
# the aggregation.
# slice_max() replaces the superseded top_n(); with_ties = TRUE keeps the same
# tie handling, so more than five values can come back when counts tie.
# NOTE(review): df_sf was built before the Age filter was applied to df, so
# this ranking includes records that df no longer contains - confirm intended.
top_five_forces <- df_sf %>%
  st_drop_geometry() %>%
  count(Highest.level.of.force, sort = TRUE) %>%
  slice_max(n, n = 5, with_ties = TRUE) %>%
  pull(Highest.level.of.force)

# Per-state number of records whose force type is among the top five.
# n() is already the row count of each group, so the sum() wrapper that used
# to surround it added nothing and has been removed.
fe_data_state <- df %>%
  filter(Highest.level.of.force %in% top_five_forces) %>%
  group_by(State) %>%
  summarise(Total_Force_Count = n())

# State boundary polygons (presumably tigris::states(); verify which package
# provides states() in this file).
states <- states(cb = TRUE)

# Remove Alaska, Hawaii and the territories, then attach the per-state counts.
# Assumes df's State column holds the same two-letter codes as STUSPS.
# States with no matching row in fe_data_state end up with an NA count.
states_merged <- states %>%
  filter(!(STUSPS %in% c("AK", "HI", "AS", "MP", "VI", "PR", "GU"))) %>%
  left_join(fe_data_state, by = c("STUSPS" = "State"))

# Choropleth of the counts, drawn in tmap's "view" mode.
tmap_mode("view")
tm_shape(states_merged) +
  tm_polygons(
    "Total_Force_Count",
    title = "Total Force Count (Top 5)"
  ) +
  tm_layout(title = "Total Top 5 Forces Used by State")

California stands out with the highest number of top-5 force incidents. Texas follows closely, with a count of 2,442. Other states, such as Florida, Georgia, and some Northeastern states, show moderate counts of 1,001 to 2,000 incidents.

Plot 4: Variables used - Highest.level.of.force, Longitude, Latitude, county

# Fulton, DeKalb and Clayton counties in Georgia, used here as the Atlanta
# region and put into EPSG:4326 to match the incident points.
counties_atlanta <- counties(state = "GA", cb = TRUE) %>%
  filter(NAME %in% c("Fulton", "DeKalb", "Clayton")) %>%
  st_transform(crs = 4326)

# Make sure the incident points are in the same CRS before the spatial test.
df_sf <- df_sf %>%
  st_transform(crs = 4326)

# Incidents whose force type is one of the top five.
df_sf_top5 <- df_sf %>%
  filter(Highest.level.of.force %in% top_five_forces)

# For each point, st_within() lists the counties that contain it; a point is
# kept when that list is non-empty.
within_atlanta <- st_within(df_sf_top5, counties_atlanta)
df_sf_atlanta <- df_sf_top5[lengths(within_atlanta) > 0, ]

# County outlines with the retained incidents drawn as dots coloured by
# force type.
tm_shape(counties_atlanta) +
  tm_polygons() +
  tm_shape(df_sf_atlanta) +
  tm_dots(
    col = "Highest.level.of.force",
    palette = "Set1",
    title = "Top 5 Forces"
  ) +
  tm_layout(title = "Top 5 Forces in Atlanta Region")

Gunshot incidents are by far the most common, with a dense concentration across the region, especially in the central parts of these counties.