NEISS (2016-2025)

Are We safe at Home

setwd("C:/Users/mvx13/OneDrive - Texas State University/Papers/2026/2027_TRBAM/Data/NEISS")
library(data.table)
dt= fread("NEISS_2016_2025_merged_decoded.csv")
dim(dt)
## [1] 3565774      38
names(dt)
##  [1] "CPSC_Case_Number"     "Treatment_Date"       "Age"                 
##  [4] "Sex"                  "Race"                 "Other_Race"          
##  [7] "Hispanic"             "Body_Part"            "Diagnosis"           
## [10] "Other_Diagnosis"      "Body_Part_2"          "Diagnosis_2"         
## [13] "Other_Diagnosis_2"    "Disposition"          "Location"            
## [16] "Fire_Involvement"     "Product_1"            "Product_2"           
## [19] "Product_3"            "Alcohol"              "Drug"                
## [22] "Narrative_1"          "Stratum"              "PSU"                 
## [25] "Weight"               "Year"                 "Body_Part_cat"       
## [28] "Body_Part_2_cat"      "Diagnosis_cat"        "Diagnosis_2_cat"     
## [31] "Disposition_cat"      "Fire_Involvement_cat" "Sex_cat"             
## [34] "Location_cat"         "Race_cat"             "Product_1_cat"       
## [37] "Product_2_cat"        "Product_3_cat"
head(dt)
##    CPSC_Case_Number Treatment_Date   Age   Sex  Race Other_Race Hispanic
##               <int>         <IDat> <int> <int> <int>     <char>    <num>
## 1:        160101845     2016-01-01    92     1     0                  NA
## 2:        160101847     2016-01-01    90     1     0                  NA
## 3:        160101848     2016-01-01    71     2     0                  NA
## 4:        160101852     2016-01-01    71     2     0                  NA
## 5:        160101857     2016-01-01    57     1     0                  NA
## 6:        160101860     2016-01-01    46     2     0                  NA
##    Body_Part Diagnosis Other_Diagnosis Body_Part_2 Diagnosis_2
##        <int>     <int>          <char>       <num>       <num>
## 1:        79        57                          NA          NA
## 2:        79        57                          NA          NA
## 3:        79        57                          NA          NA
## 4:        31        64                          NA          NA
## 5:        83        59                          NA          NA
## 6:        79        53                          NA          NA
##    Other_Diagnosis_2 Disposition Location Fire_Involvement Product_1 Product_2
##               <char>       <int>    <int>            <int>     <int>     <int>
## 1:                             4        1                0      1645      1807
## 2:                             4        1                0       670         0
## 3:                             4        1                0      1807         0
## 4:                             1        0                0      1144         0
## 5:                             1        1                0       464         0
## 6:                             1        1                0       611         0
##    Product_3 Alcohol  Drug
##        <int>   <num> <num>
## 1:         0      NA    NA
## 2:         0      NA    NA
## 3:         0      NA    NA
## 4:         0      NA    NA
## 5:         0      NA    NA
## 6:         0      NA    NA
##                                                                            Narrative_1
##                                                                                 <char>
## 1: 92YOM TRYINGO TO TAKE OFF PANTS AND LOST BALANCE AND FELL TO TILE FLOORHIP FRACTURE
## 2:             90YOM FELL GETTING OUT OF A RECLINER CHAIR AND SUSTAINED A HIP FRACTURE
## 3:                   71YOF SLIPPED AND FELL TO HER WET KITCHEN FLOOR AND FRACTURED HIP
## 4:                            71YOF CARRYING A 50 POUND BAG AND STRAINED CHEST MUSCLES
## 5:                             57YOM DROPPED A KNIFE ONTO LEFT FOOT LACERATION TO FOOT
## 6:              46YOF FELL GETTING OUT OF HER BATHTUB AND SUSTAINED A CONTUSION TO HIP
##    Stratum   PSU   Weight  Year Body_Part_cat Body_Part_2_cat    Diagnosis_cat
##     <char> <int>    <num> <int>        <char>          <char>           <char>
## 1:       M    63 103.2251  2016   Lower Trunk                         Fracture
## 2:       M    63 103.2251  2016   Lower Trunk                         Fracture
## 3:       M    63 103.2251  2016   Lower Trunk                         Fracture
## 4:       M    63 103.2251  2016   Upper Trunk                   Strain, Sprain
## 5:       M    63 103.2251  2016          Foot                       Laceration
## 6:       M    63 103.2251  2016   Lower Trunk                 Contusions, Abr.
##    Diagnosis_2_cat                                           Disposition_cat
##             <char>                                                    <char>
## 1:                      Treated & Admitted For Hospitalization, Hospitalized
## 2:                      Treated & Admitted For Hospitalization, Hospitalized
## 3:                      Treated & Admitted For Hospitalization, Hospitalized
## 4:                 Treated & Released, Or Examined & Released Without Trtmnt
## 5:                 Treated & Released, Or Examined & Released Without Trtmnt
## 6:                 Treated & Released, Or Examined & Released Without Trtmnt
##                Fire_Involvement_cat Sex_cat Location_cat Race_cat
##                              <char>  <char>       <char>   <char>
## 1: No Fire Or No Flame/Smoke Spread    Male         Home     N.S.
## 2: No Fire Or No Flame/Smoke Spread    Male         Home     N.S.
## 3: No Fire Or No Flame/Smoke Spread  Female         Home     N.S.
## 4: No Fire Or No Flame/Smoke Spread  Female          Unk     N.S.
## 5: No Fire Or No Flame/Smoke Spread    Male         Home     N.S.
## 6: No Fire Or No Flame/Smoke Spread  Female         Home     N.S.
##                       Product_1_cat                Product_2_cat Product_3_cat
##                              <char>                       <char>        <char>
## 1:                         Day Wear Floors Or Flooring Materials              
## 2:                   Recliner Chair                                           
## 3:     Floors Or Flooring Materials                                           
## 4:   Bags, Not Elsewhere Classified                                           
## 5: Knives, Not Elsewhere Classified                                           
## 6:              Bathtubs Or Showers
library(data.table)
library(ggplot2)
library(scales)
library(forcats)
library(ggrepel)
library(wesanderson)
library(stringr)

# Clean Location_cat
dt[, Location_cat_clean := fifelse(
  is.na(Location_cat) | trimws(Location_cat) == "",
  "Missing / Not Recorded",
  trimws(Location_cat)
)]

# Count by Location and Year
location_year_count <- dt[
  ,
  .N,
  by = .(Year, Location_cat_clean)
][order(Year, -N)]

# Overall count by location
location_total <- dt[
  ,
  .N,
  by = Location_cat_clean
][order(-N)]

location_total
##    Location_cat_clean       N
##                <char>   <int>
## 1:               Home 1457036
## 2:                Unk 1145232
## 3:             Sports  459754
## 4:             Public  262219
## 5:             School  157911
## 6:             Street   81184
## 7:               Farm    1366
## 8:             Mobile     743
## 9:             Indst.     329
ggplot(
  location_year_count,
  aes(
    x = factor(Year),
    y = reorder(Location_cat_clean, N, FUN = sum),
    fill = N
  )
) +
  geom_tile(color = "white", linewidth = 0.25) +
  scale_fill_gradientn(
    colors = wes_palette("Zissou1", 100, type = "continuous"),
    labels = comma
  ) +
  labs(
    title = "Annual Distribution of Injury Cases by Location",
    subtitle = "Counts by Location_cat and year",
    x = "Year",
    y = "Location Category",
    fill = "Cases"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  )

dt[, severe_disposition := grepl(
  "Admitted|Hospitalized|Transferred|Held|Observation",
  Disposition_cat,
  ignore.case = TRUE
)]

location_severity <- dt[
  ,
  .(
    total_cases = .N,
    severe_cases = sum(severe_disposition, na.rm = TRUE),
    severe_percent = 100 * mean(severe_disposition, na.rm = TRUE)
  ),
  by = Location_cat_clean
][order(-severe_cases)]

location_severity
##    Location_cat_clean total_cases severe_cases severe_percent
##                <char>       <int>        <int>          <num>
## 1:               Home     1457036       221204      15.181780
## 2:                Unk     1145232       103202       9.011449
## 3:             Public      262219        49735      18.966970
## 4:             Sports      459754        25664       5.582116
## 5:             Street       81184        13722      16.902345
## 6:             School      157911         4842       3.066284
## 7:               Farm        1366          259      18.960469
## 8:             Mobile         743          225      30.282638
## 9:             Indst.         329           48      14.589666
# Clean Location_cat and Disposition_cat
dt[, Location_cat_clean := fifelse(
  is.na(Location_cat) | trimws(Location_cat) == "",
  "Missing / Not Recorded",
  trimws(Location_cat)
)]

dt[, Disposition_cat_clean := fifelse(
  is.na(Disposition_cat) | trimws(Disposition_cat) == "",
  "Missing / Not Recorded",
  trimws(Disposition_cat)
)]

# Count by Location and Disposition
loc_disp_count <- dt[
  ,
  .N,
  by = .(Location_cat_clean, Disposition_cat_clean)
][order(-N)]

# Add percentage within location
loc_disp_count[
  ,
  pct_within_location := 100 * N / sum(N),
  by = Location_cat_clean
]



ggplot(
  loc_disp_count,
  aes(
    x = str_wrap(Disposition_cat_clean, width = 28),
    y = fct_reorder(Location_cat_clean, N, .fun = sum),
    fill = N
  )
) +
  geom_tile(color = "white", linewidth = 0.35) +
  scale_fill_gradientn(
    colors = wes_palette("Zissou1", 100, type = "continuous"),
    labels = comma
  ) +
  labs(
    title = "Distribution of Injury Cases by Location and Disposition",
    subtitle = "Counts by Location_cat and Disposition_cat",
    x = "Disposition Category",
    y = "Location Category",
    fill = "Cases"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    axis.text.x = element_text(angle = 35, hjust = 1),
    panel.grid = element_blank()
  )

ggplot(
  loc_disp_count,
  aes(
    x = str_wrap(Disposition_cat_clean, width = 28),
    y = fct_reorder(Location_cat_clean, pct_within_location, .fun = max),
    fill = pct_within_location
  )
) +
  geom_tile(color = "white", linewidth = 0.35) +
  scale_fill_gradientn(
    colors = wes_palette("Darjeeling1", 100, type = "continuous"),
    labels = function(x) paste0(round(x, 1), "%")
  ) +
  labs(
    title = "Disposition Profile by Injury Location",
    subtitle = "Percentage distribution within each Location_cat",
    x = "Disposition Category",
    y = "Location Category",
    fill = "Percent"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    axis.text.x = element_text(angle = 35, hjust = 1),
    panel.grid = element_blank()
  )

ggplot(
  loc_disp_count,
  aes(
    x = fct_reorder(Location_cat_clean, N, .fun = sum),
    y = N,
    fill = str_wrap(Disposition_cat_clean, width = 30)
  )
) +
  geom_col(color = "white", linewidth = 0.25) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(
    values = wes_palette(
      "Zissou1",
      n = length(unique(loc_disp_count$Disposition_cat_clean)),
      type = "continuous"
    )
  ) +
  labs(
    title = "Injury Disposition by Location",
    subtitle = "Stacked counts by Location_cat and Disposition_cat",
    x = "Location Category",
    y = "Number of Cases",
    fill = "Disposition"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    legend.position = "bottom",
    legend.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

dt[, severe_disposition := grepl(
  "Admitted|Hospitalized|Transferred|Held|Observation|Fatality",
  Disposition_cat_clean,
  ignore.case = TRUE
)]

loc_severity <- dt[
  ,
  .(
    total_cases = .N,
    severe_cases = sum(severe_disposition, na.rm = TRUE),
    severe_percent = 100 * mean(severe_disposition, na.rm = TRUE)
  ),
  by = Location_cat_clean
][order(-severe_cases)]

loc_severity
##    Location_cat_clean total_cases severe_cases severe_percent
##                <char>       <int>        <int>          <num>
## 1:               Home     1457036       221204      15.181780
## 2:                Unk     1145232       103202       9.011449
## 3:             Public      262219        49735      18.966970
## 4:             Sports      459754        25664       5.582116
## 5:             Street       81184        13722      16.902345
## 6:             School      157911         4842       3.066284
## 7:               Farm        1366          259      18.960469
## 8:             Mobile         743          225      30.282638
## 9:             Indst.         329           48      14.589666
ggplot(
  loc_severity,
  aes(
    x = fct_reorder(Location_cat_clean, severe_cases),
    y = severe_cases
  )
) +
  geom_col(
    fill = wes_palette("Zissou1", 5)[4],
    width = 0.75
  ) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Severe Injury Burden by Location",
    subtitle = "Disposition categories indicating hospitalization, transfer, observation, or fatality",
    x = "Location Category",
    y = "Number of Severe Cases"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    panel.grid.minor = element_blank()
  )

Events

Top 10 Event Injury

# assuming your data.table is named dt
# If not, replace dt with your dataset name

# Clean missing/blank Product_1_cat
dt[, Product_1_cat_clean := fifelse(
  is.na(Product_1_cat) | trimws(Product_1_cat) == "",
  "Missing / Not Recorded",
  trimws(Product_1_cat)
)]

# Count Product_1 by year
prod_year_count <- dt[
  ,
  .N,
  by = .(Year, Product_1_cat_clean)
][order(Year, -N)]

# View table
prod_year_count
##        Year                      Product_1_cat_clean     N
##       <int>                                   <char> <int>
##    1:  2016                          Stairs Or Steps 30152
##    2:  2016             Floors Or Flooring Materials 29029
##    3:  2016     Beds Or Bedframes, Other Or Not Spec 18685
##    4:  2016 Basketball (Activity, Apparel Or Equip.) 14555
##    5:  2016  Bicycles And Accessories (Excl Mountain 12597
##   ---                                                     
## 7511:  2025                     Electric Can Openers     1
## 7512:  2025                                Toboggans     1
## 7513:  2025                 Wringer Washing Machines     1
## 7514:  2025 Curling (Activity, Apparel Or Equipment)     1
## 7515:  2025             Electrical Testing Equipment     1
# Identify top 10 Product_1 categories overall
top_products <- dt[
  ,
  .N,
  by = Product_1_cat_clean
][order(-N)][1:10, Product_1_cat_clean]

# Filter yearly count for top products
plot_data <- prod_year_count[
  Product_1_cat_clean %in% top_products
]

# Get last year value for direct labels
label_data <- plot_data[
  Year == max(Year),
  .SD[which.max(N)],
  by = Product_1_cat_clean
]

ggplot(
  plot_data,
  aes(
    x = Year,
    y = N,
    color = Product_1_cat_clean,
    group = Product_1_cat_clean
  )
) +
  geom_line(linewidth = 1.1, alpha = 0.85) +
  geom_point(size = 2.2, alpha = 0.85) +
  geom_text_repel(
    data = label_data,
    aes(label = Product_1_cat_clean),
    size = 3,
    direction = "y",
    hjust = 0,
    nudge_x = 0.35,
    segment.size = 0.25,
    segment.alpha = 0.6,
    show.legend = FALSE,
    max.overlaps = Inf
  ) +
  scale_color_manual(
    values = wes_palette("Darjeeling1", n = length(unique(plot_data$Product_1_cat_clean)), type = "continuous")
  ) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(
    breaks = sort(unique(plot_data$Year)),
    expand = expansion(mult = c(0.02, 0.25))
  ) +
  labs(
    title = "Annual Counts of Top Event Category",
    subtitle = "Based on Product_1",
    x = "Year",
    y = "Number of Cases"
  ) +
  coord_cartesian(clip = "off") +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    legend.position = "none",
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 120, 10, 10)
  )

Top 100 Event Injury

library(wesanderson)

# Clean Product_1_cat
dt[, Product_1_cat_clean := fifelse(
  is.na(Product_1_cat) | trimws(Product_1_cat) == "",
  "Missing / Not Recorded",
  trimws(Product_1_cat)
)]

# Top 100 Product_1 categories overall
top100_products <- dt[
  ,
  .N,
  by = Product_1_cat_clean
][order(-N)][1:100, Product_1_cat_clean]

# Yearly count for top 100
prod_year_top100 <- dt[
  Product_1_cat_clean %in% top100_products,
  .N,
  by = .(Year, Product_1_cat_clean)
]

# Add total count for ordering
prod_total <- dt[
  Product_1_cat_clean %in% top100_products,
  .N,
  by = Product_1_cat_clean
]

prod_year_top100 <- merge(
  prod_year_top100,
  prod_total,
  by = "Product_1_cat_clean",
  suffixes = c("", "_total")
)

# Order products by total count
prod_year_top100[
  ,
  Product_1_cat_clean := fct_reorder(Product_1_cat_clean, N_total)
]


ggplot(
  prod_year_top100,
  aes(
    x = factor(Year),
    y = Product_1_cat_clean,
    fill = N
  )
) +
  geom_tile(color = "white", linewidth = 0.15) +
  scale_fill_gradientn(
    colors = wes_palette("Zissou1", 100, type = "continuous"),
    labels = comma
  ) +
  labs(
    title = "Annual Distribution of Top 100 Event Categories",
    subtitle = "Counts by Product_1 category and year",
    x = "Year",
    y = "Event Category",
    fill = "Cases"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    axis.text.y = element_text(size = 6.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right",
    panel.grid = element_blank()
  )

ggplot(
  prod_year_top100,
  aes(
    x = factor(Year),
    y = Product_1_cat_clean,
    fill = log1p(N)
  )
) +
  geom_tile(color = "white", linewidth = 0.15) +
  scale_fill_gradientn(
    colors = wes_palette("Darjeeling1", 100, type = "continuous"),
    labels = function(x) comma(expm1(x))
  ) +
  labs(
    title = "Annual Distribution of Top 100 Event Categories",
    subtitle = "Log-scaled counts improve visibility for lower-frequency events",
    x = "Year",
    y = "Event Category",
    fill = "Cases"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    axis.text.y = element_text(size = 6.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right",
    panel.grid = element_blank()
  )

Dispositions

Top 10 Dispositions

# Clean Disposition_cat
dt[, Disposition_cat_clean := fifelse(
  is.na(Disposition_cat) | trimws(Disposition_cat) == "",
  "Missing / Not Recorded",
  trimws(Disposition_cat)
)]

# Count Disposition by year
disp_year_count <- dt[
  ,
  .N,
  by = .(Year, Disposition_cat_clean)
][order(Year, -N)]

# Select top categories overall
top_disp <- dt[
  ,
  .N,
  by = Disposition_cat_clean
][order(-N)][1:min(30, .N), Disposition_cat_clean]

# Plot data
plot_data_disp <- disp_year_count[
  Disposition_cat_clean %in% top_disp
]

# Last-year values for direct labels
label_data_disp <- plot_data_disp[
  Year == max(Year),
  .SD[which.max(N)],
  by = Disposition_cat_clean
]



ggplot(
  plot_data_disp,
  aes(
    x = Year,
    y = N,
    color = Disposition_cat_clean,
    group = Disposition_cat_clean
  )
) +
  geom_line(linewidth = 1.1, alpha = 0.85) +
  geom_point(size = 2.3, alpha = 0.85) +
  geom_text_repel(
    data = label_data_disp,
    aes(label = Disposition_cat_clean),
    size = 3,
    direction = "y",
    hjust = 0,
    nudge_x = 0.35,
    segment.size = 0.25,
    segment.alpha = 0.6,
    show.legend = FALSE,
    max.overlaps = Inf
  ) +
  scale_color_manual(
    values = wes_palette(
      "Darjeeling1",
      n = length(unique(plot_data_disp$Disposition_cat_clean)),
      type = "continuous"
    )
  ) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(
    breaks = sort(unique(plot_data_disp$Year)),
    expand = expansion(mult = c(0.02, 0.35))
  ) +
  labs(
    title = "Annual Counts by Disposition Category",
    subtitle = "Disposition categories shown directly at the end of each trend line",
    x = "Year",
    y = "Number of Cases"
  ) +
  coord_cartesian(clip = "off") +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12),
    legend.position = "none",
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 180, 10, 10)
  )