DATA 608 Story 3

{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)

1. Load the required libraries

library(httr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:httr':
## 
##     config
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(tools)  # for toTitleCase()

2. Load and preview the data

Data Source

url <- "https://data.cdc.gov/resource/489q-934x.json"
res <- GET(url)
content <- fromJSON(content(res, "text", encoding = "UTF-8"))
data <- as.data.frame(content)

str(data)

## 'data.frame':    792 obs. of  69 variables:
##  $ year_and_quarter         : chr  "2022 Q1" "2022 Q1" "2022 Q1" "2022 Q1" ...
##  $ time_period              : chr  "12 months ending with quarter" "12 months ending with quarter" "12 months ending with quarter" "12 months ending with quarter" ...
##  $ cause_of_death           : chr  "All causes" "Alzheimer disease" "COVID-19" "Cancer" ...
##  $ rate_type                : chr  "Age-adjusted" "Age-adjusted" "Age-adjusted" "Age-adjusted" ...
##  $ unit                     : chr  "Deaths per 100,000" "Deaths per 100,000" "Deaths per 100,000" "Deaths per 100,000" ...
##  $ rate_overall             : chr  "873.2" "30.6" "95" "145.9" ...
##  $ rate_sex_female          : chr  "729.4" "35" "75.2" "127.4" ...
##  $ rate_sex_male            : chr  "1038" "23.8" "119.1" "170.9" ...
##  $ rate_alaska              : chr  "944.5" "28.5" "121.3" "156" ...
##  $ rate_alabama             : chr  "1109.8" "45.5" "133.6" "159.9" ...
##  $ rate_arkansas            : chr  "1097.1" "43.2" "123.6" "167.9" ...
##  $ rate_arizona             : chr  "882.5" "29.6" "113.9" "134.5" ...
##  $ rate_california          : chr  "719.5" "38.4" "62.4" "131.4" ...
##  $ rate_colorado            : chr  "808.2" "32.1" "89.9" "125" ...
##  $ rate_connecticut         : chr  "725.9" "21.6" "50.9" "134.1" ...
##  $ rate_district_of_columbia: chr  "844.4" "10.7" "54.8" "143.8" ...
##  $ rate_delaware            : chr  "868.3" "30.1" "78.4" "155.4" ...
##  $ rate_florida             : chr  "828" "19.3" "106.6" "141.3" ...
##  $ rate_georgia             : chr  "973.6" "43.4" "116.4" "150.4" ...
##  $ rate_hawaii              : chr  "647.1" "23.6" "43.9" "126.7" ...
##  $ rate_iowa                : chr  "860.8" "30.9" "78.1" "152" ...
##  $ rate_idaho               : chr  "892.8" "41.5" "118.9" "139.6" ...
##  $ rate_illinois            : chr  "839.4" "26.6" "82.1" "149.3" ...
##  $ rate_indiana             : chr  "1011.9" "29.6" "112.1" "169" ...
##  $ rate_kansas              : chr  "938.4" "22.9" "109.7" "152.8" ...
##  $ rate_kentucky            : chr  "1153.2" "32.5" "146.7" "179.6" ...
##  $ rate_louisiana           : chr  "1084.4" "42.8" "108.8" "161.9" ...
##  $ rate_massachusetts       : chr  "717" "17.5" "46.7" "136.1" ...
##  $ rate_maryland            : chr  "800.3" "15.9" "66.9" "139.2" ...
##  $ rate_maine               : chr  "910.3" "28.1" "70.9" "161.2" ...
##  $ rate_michigan            : chr  "956.2" "34.1" "115" "159.6" ...
##  $ rate_minnesota           : chr  "771.6" "34.1" "68" "143.6" ...
##  $ rate_missouri            : chr  "986.6" "33.5" "110.5" "162.4" ...
##  $ rate_mississippi         : chr  "1193.4" "51.8" "140.8" "184" ...
##  $ rate_montana             : chr  "925.3" "24.4" "111.8" "142.9" ...
##  $ rate_north_carolina      : chr  "952.8" "36" "98.3" "152.4" ...
##  $ rate_north_dakota        : chr  "810.5" "32.5" "80.2" "134.2" ...
##  $ rate_nebraska            : chr  "839.4" "29.8" "73.8" "152.2" ...
##  $ rate_new_hampshire       : chr  "791.9" "23.3" "56.5" "145.3" ...
##  $ rate_new_jersey          : chr  "723.5" "20.5" "62.9" "130.3" ...
##  $ rate_new_mexico          : chr  "1007.9" "25.4" "138.3" "135.6" ...
##  $ rate_nevada              : chr  "933.3" "26.3" "134.8" "140.8" ...
##  $ rate_new_york            : chr  "694.4" "12.8" "63.4" "125.3" ...
##  $ rate_ohio                : chr  "1019.8" "34" "128.1" "161.5" ...
##  $ rate_oklahoma            : chr  "1126.4" "37.1" "150.3" "176.7" ...
##  $ rate_oregon              : chr  "875.6" "40" "77.5" "153.7" ...
##  $ rate_pennsylvania        : chr  "893.5" "22.5" "97.4" "151.8" ...
##  $ rate_rhode_island        : chr  "771" "28.3" "55.5" "139.1" ...
##  $ rate_south_carolina      : chr  "1022" "40.1" "122.9" "154.3" ...
##  $ rate_south_dakota        : chr  "871.4" "39.1" "76.3" "148.6" ...
##  $ rate_tennessee           : chr  "1122.6" "37.2" "140.5" "165.2" ...
##  $ rate_texas               : chr  "918.3" "41.2" "126.7" "143.5" ...
##  $ rate_utah                : chr  "822.2" "41" "78.4" "120.6" ...
##  $ rate_virginia            : chr  "860.1" "26.1" "80.8" "149.8" ...
##  $ rate_vermont             : chr  "800.5" "36.2" "32.9" "155" ...
##  $ rate_washington          : chr  "811.1" "46" "66.6" "148.5" ...
##  $ rate_wisconsin           : chr  "857.1" "33.6" "77.7" "146.7" ...
##  $ rate_west_virginia       : chr  "1239.9" "35.4" "154" "183.8" ...
##  $ rate_wyoming             : chr  "956.8" "34.2" "145.1" "153.1" ...
##  $ rate_age_1_4             : chr  NA NA NA NA ...
##  $ rate_age_5_14            : chr  NA NA NA NA ...
##  $ rate_age_15_24           : chr  NA NA NA NA ...
##  $ rate_age_25_34           : chr  NA NA NA NA ...
##  $ rate_age_35_44           : chr  NA NA NA NA ...
##  $ rate_age_45_54           : chr  NA NA NA NA ...
##  $ rate_age_55_64           : chr  NA NA NA NA ...
##  $ rate_65_74               : chr  NA NA NA NA ...
##  $ rate_age_75_84           : chr  NA NA NA NA ...
##  $ rate_age_85_plus         : chr  NA NA NA NA ...

cat("\n", "\n")

gun_law_data <- read.csv("https://raw.githubusercontent.com/hawa1983/DATA-608/refs/heads/main/Story%203/strictest-gun-laws-by-state-2024.csv")
str(gun_law_data)

## 'data.frame':    50 obs. of  5 variables:
##  $ state                     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ GunLawsStrengthRank       : int  38 41 42 50 1 14 3 13 23 34 ...
##  $ GunLawsGiffordGrade       : chr  "F" "F" "F" "F" ...
##  $ GunLawsGunDeathRatePer100k: num  26.4 25.2 18.3 23.3 9 17.8 6.7 16.6 14.1 20.3 ...
##  $ GunLawsGunDeathRateRank   : int  4 6 17 8 43 18 45 23 34 14 ...

cat("\n", "\n")

population <- read.csv("https://raw.githubusercontent.com/hawa1983/DATA-608/refs/heads/main/Story%203/population.csv")
str(population)

## 'data.frame':    50 obs. of  2 variables:
##  $ state     : chr  "california" "texas" "florida" "new york" ...
##  $ population: int  38965193 30503301 22610726 19571216 12961683 12549689 11785935 11029227 10835491 10037261 ...

cat("\n", "\n")

3. Prepare the mortality rate data

# Step 1: Remove 'rate_district_of_columbia' column and filter out '2024 Q1' rows
mort_rate <- data %>%
  select(1:4, -rate_district_of_columbia, rate_alaska:rate_wyoming) %>%  # Select all except the 'rate_district_of_columbia' column
  filter(cause_of_death == "Firearm-related injury" &
         time_period == "12 months ending with quarter" &
         rate_type == "Crude" &
         year_and_quarter != "2024 Q1")  # Filter out the '2024 Q1' rows

# Step 2: Convert all rate columns (state columns) to numeric
mort_rate <- mort_rate %>%
  mutate(across(starts_with("rate_"), as.numeric))  # Convert all 'rate_' columns to numeric

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(starts_with("rate_"), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion

# Step 3: Calculate the mean for each column and store it in a new dataset
mean_data <- mort_rate %>%
  select(-rate_type) %>%
  summarise(across(starts_with("rate_"), mean, na.rm = TRUE))  # Calculate mean for all columns starting with 'rate_'

## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(starts_with("rate_"), mean, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

# Step 4: Remove 'rate_' prefix from the state column names
mean_data <- mean_data %>%
  rename_with(~ str_replace(., "rate_", ""), starts_with("rate_"))  # Remove 'rate_' prefix

# Step 5: Pivot the data to longer format, remove the 'rate_' prefix from state names, and rename the column
data_long <- mort_rate %>%
  select(1, rate_alaska:rate_wyoming) %>%  # Select year_and_quarter and all rate columns for states
  pivot_longer(cols = rate_alaska:rate_wyoming, 
               names_to = "states", 
               values_to = "mortality_rate") %>%  # Rename the value column to 'mortality_rate'
  mutate(states = str_replace_all(states, "rate_", ""),  # Remove 'rate_' prefix from state names
         states = str_replace_all(states, "_", " "))     # Replace underscores with spaces

# Step 6: Pivot the mean data to longer format and rename the column to 'mortality_rate'
mean_data_long <- mean_data %>%
  pivot_longer(cols = everything(), 
               names_to = "states", 
               values_to = "mortality_rate") %>%  # Rename the value column to 'mortality_rate'
  mutate(states = str_replace_all(states, "_", " "))     # Replace underscores with spaces

# Step 7: Print the head of the long data to check
head(mean_data_long)

## # A tibble: 6 × 2
##   states     mortality_rate
##   <chr>               <dbl>
## 1 alaska              23.3 
## 2 alabama             25.5 
## 3 arkansas            22.2 
## 4 arizona             20.0 
## 5 california           8.76
## 6 colorado            17.6

4. Prepare the gun law rating data

# 1. First, mutate gun_law_data to include state abbreviations
gun_law_data <- gun_law_data %>%
  mutate(States = state.abb[match(state, state.name)])

# 2. Ensure that the states column in both data frames is properly formatted
mean_Mort_long_data <- mean_data_long %>%
  mutate(states = tolower(states))  

# 3. Ensure that the states column in both data frames is properly formatted
gun_law_data <- gun_law_data %>%
  mutate(state = tolower(state))

# 4. Now proceed with the merge (join) by the state abbreviation
merged_data <- merge(mean_Mort_long_data, gun_law_data, by.x = "states", by.y = "state")

# Merge map data with the population for dots
population <- population %>%
  mutate(state_name = tolower(state)) %>%
  select(-state)

merged_data <- merged_data %>%
  left_join(population, by = c("states" = "state_name"))

head(merged_data)

##       states mortality_rate GunLawsStrengthRank GunLawsGiffordGrade
## 1    alabama        25.4875                  38                   F
## 2     alaska        23.2750                  41                   F
## 3    arizona        20.0125                  42                   F
## 4   arkansas        22.1750                  50                   F
## 5 california         8.7625                   1                   A
## 6   colorado        17.6375                  14                   B
##   GunLawsGunDeathRatePer100k GunLawsGunDeathRateRank States population
## 1                       26.4                       4     AL    5108468
## 2                       25.2                       6     AK     733406
## 3                       18.3                      17     AZ    7431344
## 4                       23.3                       8     AR    3067732
## 5                        9.0                      43     CA   38965193
## 6                       17.8                      18     CO    5877610

5. Plot the Heatmap of US States Map

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(patchwork)  # For arranging plots side by side
library(maps)

## 
## Attaching package: 'maps'

## The following object is masked from 'package:purrr':
## 
##     map

library(scales)  # For rescaling

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Load US map data
states_map <- map_data("state") %>%
  filter(!region %in% c("district of columbia", "puerto rico", "guam"))

# Convert state names in the merged_data to lowercase to match with map data
merged_data$States <- tolower(merged_data$States)

# Filter out problematic names if necessary
merged_data <- merged_data %>%
  filter(!States %in% c("district of columbia", "puerto rico", "guam"))

# Step 1: Standardize GunLawsStrengthRank using min-max normalization
merged_data <- merged_data %>%
  mutate(GunLawsStrengthRank_std = (GunLawsStrengthRank - min(GunLawsStrengthRank, na.rm = TRUE)) /
                                   (max(GunLawsStrengthRank, na.rm = TRUE) - min(GunLawsStrengthRank, na.rm = TRUE)))

# Step 2: Standardize mortality_rate using min-max normalization
merged_data <- merged_data %>%
  mutate(mortality_rate_std = (mortality_rate - min(mortality_rate, na.rm = TRUE)) /
                              (max(mortality_rate, na.rm = TRUE) - min(mortality_rate, na.rm = TRUE)))

# Merge map data with the merged_data based on state names
map_data <- states_map %>%
  left_join(merged_data, by = c("region" = "states"))

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(maps)

# Prepare map data
states_map <- map_data("state")

# Calculate centroids of states for plotting points
state_centroids <- states_map %>%
  group_by(region) %>%
  summarize(long = mean(range(long)), lat = mean(range(lat)))

# Merge centroids with the merged data
merged_data_centroids <- state_centroids %>%
  left_join(merged_data, by = c("region" = "states"))

# Standardize GunLawsStrengthRank using min-max normalization

merged_data <- merged_data %>%
  mutate(GunLawsStrengthRank_std = (GunLawsStrengthRank - min(GunLawsStrengthRank, na.rm = TRUE)) /
                                   (max(GunLawsStrengthRank, na.rm = TRUE) - min(GunLawsStrengthRank, na.rm = TRUE)))

# Standardize mortality_rate using min-max normalization
merged_data <- merged_data %>%
  mutate(mortality_rate_std = (mortality_rate - min(mortality_rate, na.rm = TRUE)) /
                              (max(mortality_rate, na.rm = TRUE) - min(mortality_rate, na.rm = TRUE)))

# Merge map data with the gun law and mortality data for the map background
map_data <- states_map %>%
  left_join(merged_data, by = c("region" = "states"))

# Rescale legends for GunLawsStrengthRank and Mortality Rate without affecting the plot data
actual_gun_law_labels <- pretty(merged_data$GunLawsStrengthRank, n = 5)
actual_mortality_rate_labels <- pretty(merged_data$mortality_rate, n = 5)

# Accessible color palettes from RColorBrewer for the rescaled values
gun_law_colors <- scale_fill_distiller(palette = "Blues", direction = 1, name = "Gun Law Rank\n(Lower is Stricter)",
                                       labels = actual_gun_law_labels, 
                                       breaks = scales::rescale(actual_gun_law_labels, to = range(merged_data$GunLawsStrengthRank_std)))

mortality_colors <- scale_color_distiller(palette = "Oranges", direction = 1, name = "Mortality Rate",
                                          labels = actual_mortality_rate_labels, 
                                          breaks = scales::rescale(actual_mortality_rate_labels, to = range(merged_data$mortality_rate_std)))

# Create the combined map with accessible color palettes and actual legend values
combined_map <- ggplot() +
  # Gun law strength represented by fill color using accessible palette
  geom_polygon(data = map_data, aes(x = long, y = lat, group = group, fill = GunLawsStrengthRank_std), color = "white") +
  # Firearm mortality rate represented by dots at state centroids with another accessible palette
  geom_point(data = merged_data_centroids, aes(x = long, y = lat, color = mortality_rate_std), size = 3, alpha = 0.7) +
  coord_fixed(1.3) +  # Fix aspect ratio
  gun_law_colors +  # Accessible color scale for Gun Law Rank
  mortality_colors +  # Accessible color scale for Mortality Rate
  labs(title = "Gun Law Strength & Firearm Mortality Rate by State") +
  theme_minimal() +
  theme(legend.position = "right",
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())

# Show the plot
print(combined_map)

6. Plot the scatter plot of mortality rate versus gun control ranking

# Ensure that the states column in both data frames is properly formatted
merged_data <- merged_data %>%
  mutate(States = toupper(States))

# Round population to the nearest 10 million for legend labels
merged_data <- merged_data %>%
  mutate(population_rounded = round(population / 1e7) * 10)  # Round to nearest 10 million

# Scatter plot with regression line and dots sized by actual population
ggplot(merged_data, aes(x = GunLawsStrengthRank, y = mortality_rate)) +
  geom_point(aes(size = population), color = "#1a2309", alpha = 0.7) +  # Size of dot reflects actual population
  geom_smooth(method = "lm", se = TRUE, color = "red") +  # Add regression line with confidence interval
  geom_text(aes(label = States), vjust = -1, size = 3, color = "black") +  # Place state abbreviations at the centroid
  scale_size_continuous(name = "Population (in millions)", range = c(3, 8), 
                        breaks = seq(10, 40, by = 10),  # Use breaks to display 10 million increments
                        labels = function(x) paste0(x, "M")) +  # Format labels to show as '10M', '20M', etc.
  labs(#title = "Gun Law Strength vs Mortality Rate",
       x = "Gun Law Strength Rank",
       y = "Mortality Rate per 100k") +
  theme_minimal() +  # Minimal theme
  theme(
    panel.grid = element_blank()   # Remove gridlines
  )

## `geom_smooth()` using formula = 'y ~ x'

# Create a custom color scale for the Gun Laws Gifford Grades
grade_colors <- c("A" = "#1f77b4", "A-" = "#ff7f0e", "B+" = "#2ca02c", "B" = "#d62728", 
                  "B-" = "#9467bd", "C+" = "#8c564b", "C" = "#e377c2", 
                  "C-" = "#7f7f7f", "D+" = "#bcbd22", "F" = "#17becf")

# Scatter plot with regression line, annotations, and no legend
ggplot(merged_data, aes(x = GunLawsStrengthRank, y = mortality_rate, color = GunLawsGiffordGrade)) +
  geom_point(size = 2) +  # Plot the points with color based on grade
  geom_smooth(method = "lm", se = TRUE, color = "red") +  # Add regression line with confidence interval
  geom_text(aes(label = GunLawsGiffordGrade), vjust = -0.5, hjust = 0.5, size = 3, color = "black") +  # Annotate with grades
  labs(x = "Gun Law Strength Rank (Lower is Stricter)", y = "Mortality Rate per 100k") +
  scale_color_manual(values = grade_colors, guide = "none") +  # Remove the legend
  theme_minimal() +  # Minimal theme
  theme(
    panel.grid = element_blank()  # Remove gridlines
  )

## `geom_smooth()` using formula = 'y ~ x'

6. Plot the dot plot of mortality rate versus gun control ranking

# Reorder the GunLawsGiffordGrade factor levels
merged_data$GunLawsGiffordGrade <- factor(merged_data$GunLawsGiffordGrade, 
                                          levels = c("A", "A-", "B+", "B",  "B-",  "C+",  "C",  "C-",  "D+",  "F"))

# Create a consistent jitter effect for both points and labels
jitter_pos <- position_jitter(width = 0.3, height = 0)

# Swap the colors for "B" and "F" in the custom color set
custom_colors <- c("A" = "#2ca02c",      # Blue
                   "A-" = "#2ca02c",     # Orange
                   "B+" = "#7f7f7f",     # Green
                   "B" = "#7f7f7f",      # Swapped: Cyan (was originally red)
                   "B-" = "#7f7f7f",     # Purple
                   "C+" = "#7f7f7f",     # Brown
                   "C" = "#7f7f7f",      # Pink
                   "C-" = "#7f7f7f",     # Gray
                   "D+" = "#7f7f7f",     # Yellow-green
                   "F" = "#d62728")      # Swapped: Red (was originally cyan)

# Create a jittered dot plot (strip plot) with state labels and swapped custom colors
dot_plot <- ggplot(merged_data, aes(x = GunLawsGiffordGrade, y = mortality_rate, color = GunLawsGiffordGrade)) +
  geom_jitter(aes(x = GunLawsGiffordGrade, y = mortality_rate), position = jitter_pos, alpha = 0.7) +  # Jitter the dots to separate them
  geom_hline(yintercept = max(merged_data$mortality_rate[merged_data$GunLawsGiffordGrade == "A"]) + 0.5,  # Just above the highest A- dot
             linetype = "dashed", color = "lightgrey", linewidth = 1.0) +  # Dashed green line, thinner size
  scale_color_manual(values = custom_colors) +  # Use manually defined colors with swapped B and F
  labs(# title = "Firearm Control Laws vs Mortality Rates",
       x = "Giffords Gun Law Grade",
       y = "Firearm Mortality Rate per 100k",
       color = "Gun Law Grade") +
  theme_minimal() +  # Clean theme
  theme(legend.position = "none")  # Remove the legend as the color directly represents the grades

# Print the dot plot
print(dot_plot)

DATA 608 Story 3

Fomba Kassoh

2024-10-14

1. Load the required libraries

2. Load and preview the data

Data Source

3. Prepare the mortality rate data

4. Prepare the gun law rating data

5. Plot the Heatmap of US States Map

6. Plot the scatter plot of mortality rate versus gun control ranking

6. Plot the dot plot of mortality rate versus gun control ranking