NEISS (2016-2025)
Are We safe at Home
setwd("C:/Users/mvx13/OneDrive - Texas State University/Papers/2026/2027_TRBAM/Data/NEISS")
library(data.table)
dt= fread("NEISS_2016_2025_merged_decoded.csv")
dim(dt)## [1] 3565774 38
## [1] "CPSC_Case_Number" "Treatment_Date" "Age"
## [4] "Sex" "Race" "Other_Race"
## [7] "Hispanic" "Body_Part" "Diagnosis"
## [10] "Other_Diagnosis" "Body_Part_2" "Diagnosis_2"
## [13] "Other_Diagnosis_2" "Disposition" "Location"
## [16] "Fire_Involvement" "Product_1" "Product_2"
## [19] "Product_3" "Alcohol" "Drug"
## [22] "Narrative_1" "Stratum" "PSU"
## [25] "Weight" "Year" "Body_Part_cat"
## [28] "Body_Part_2_cat" "Diagnosis_cat" "Diagnosis_2_cat"
## [31] "Disposition_cat" "Fire_Involvement_cat" "Sex_cat"
## [34] "Location_cat" "Race_cat" "Product_1_cat"
## [37] "Product_2_cat" "Product_3_cat"
## CPSC_Case_Number Treatment_Date Age Sex Race Other_Race Hispanic
## <int> <IDat> <int> <int> <int> <char> <num>
## 1: 160101845 2016-01-01 92 1 0 NA
## 2: 160101847 2016-01-01 90 1 0 NA
## 3: 160101848 2016-01-01 71 2 0 NA
## 4: 160101852 2016-01-01 71 2 0 NA
## 5: 160101857 2016-01-01 57 1 0 NA
## 6: 160101860 2016-01-01 46 2 0 NA
## Body_Part Diagnosis Other_Diagnosis Body_Part_2 Diagnosis_2
## <int> <int> <char> <num> <num>
## 1: 79 57 NA NA
## 2: 79 57 NA NA
## 3: 79 57 NA NA
## 4: 31 64 NA NA
## 5: 83 59 NA NA
## 6: 79 53 NA NA
## Other_Diagnosis_2 Disposition Location Fire_Involvement Product_1 Product_2
## <char> <int> <int> <int> <int> <int>
## 1: 4 1 0 1645 1807
## 2: 4 1 0 670 0
## 3: 4 1 0 1807 0
## 4: 1 0 0 1144 0
## 5: 1 1 0 464 0
## 6: 1 1 0 611 0
## Product_3 Alcohol Drug
## <int> <num> <num>
## 1: 0 NA NA
## 2: 0 NA NA
## 3: 0 NA NA
## 4: 0 NA NA
## 5: 0 NA NA
## 6: 0 NA NA
## Narrative_1
## <char>
## 1: 92YOM TRYINGO TO TAKE OFF PANTS AND LOST BALANCE AND FELL TO TILE FLOORHIP FRACTURE
## 2: 90YOM FELL GETTING OUT OF A RECLINER CHAIR AND SUSTAINED A HIP FRACTURE
## 3: 71YOF SLIPPED AND FELL TO HER WET KITCHEN FLOOR AND FRACTURED HIP
## 4: 71YOF CARRYING A 50 POUND BAG AND STRAINED CHEST MUSCLES
## 5: 57YOM DROPPED A KNIFE ONTO LEFT FOOT LACERATION TO FOOT
## 6: 46YOF FELL GETTING OUT OF HER BATHTUB AND SUSTAINED A CONTUSION TO HIP
## Stratum PSU Weight Year Body_Part_cat Body_Part_2_cat Diagnosis_cat
## <char> <int> <num> <int> <char> <char> <char>
## 1: M 63 103.2251 2016 Lower Trunk Fracture
## 2: M 63 103.2251 2016 Lower Trunk Fracture
## 3: M 63 103.2251 2016 Lower Trunk Fracture
## 4: M 63 103.2251 2016 Upper Trunk Strain, Sprain
## 5: M 63 103.2251 2016 Foot Laceration
## 6: M 63 103.2251 2016 Lower Trunk Contusions, Abr.
## Diagnosis_2_cat Disposition_cat
## <char> <char>
## 1: Treated & Admitted For Hospitalization, Hospitalized
## 2: Treated & Admitted For Hospitalization, Hospitalized
## 3: Treated & Admitted For Hospitalization, Hospitalized
## 4: Treated & Released, Or Examined & Released Without Trtmnt
## 5: Treated & Released, Or Examined & Released Without Trtmnt
## 6: Treated & Released, Or Examined & Released Without Trtmnt
## Fire_Involvement_cat Sex_cat Location_cat Race_cat
## <char> <char> <char> <char>
## 1: No Fire Or No Flame/Smoke Spread Male Home N.S.
## 2: No Fire Or No Flame/Smoke Spread Male Home N.S.
## 3: No Fire Or No Flame/Smoke Spread Female Home N.S.
## 4: No Fire Or No Flame/Smoke Spread Female Unk N.S.
## 5: No Fire Or No Flame/Smoke Spread Male Home N.S.
## 6: No Fire Or No Flame/Smoke Spread Female Home N.S.
## Product_1_cat Product_2_cat Product_3_cat
## <char> <char> <char>
## 1: Day Wear Floors Or Flooring Materials
## 2: Recliner Chair
## 3: Floors Or Flooring Materials
## 4: Bags, Not Elsewhere Classified
## 5: Knives, Not Elsewhere Classified
## 6: Bathtubs Or Showers
library(data.table)
library(ggplot2)
library(scales)
library(forcats)
library(ggrepel)
library(wesanderson)
library(stringr)
# Clean Location_cat
dt[, Location_cat_clean := fifelse(
is.na(Location_cat) | trimws(Location_cat) == "",
"Missing / Not Recorded",
trimws(Location_cat)
)]
# Count by Location and Year
location_year_count <- dt[
,
.N,
by = .(Year, Location_cat_clean)
][order(Year, -N)]
# Overall count by location
location_total <- dt[
,
.N,
by = Location_cat_clean
][order(-N)]
location_total## Location_cat_clean N
## <char> <int>
## 1: Home 1457036
## 2: Unk 1145232
## 3: Sports 459754
## 4: Public 262219
## 5: School 157911
## 6: Street 81184
## 7: Farm 1366
## 8: Mobile 743
## 9: Indst. 329
ggplot(
location_year_count,
aes(
x = factor(Year),
y = reorder(Location_cat_clean, N, FUN = sum),
fill = N
)
) +
geom_tile(color = "white", linewidth = 0.25) +
scale_fill_gradientn(
colors = wes_palette("Zissou1", 100, type = "continuous"),
labels = comma
) +
labs(
title = "Annual Distribution of Injury Cases by Location",
subtitle = "Counts by Location_cat and year",
x = "Year",
y = "Location Category",
fill = "Cases"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank()
)dt[, severe_disposition := grepl(
"Admitted|Hospitalized|Transferred|Held|Observation",
Disposition_cat,
ignore.case = TRUE
)]
location_severity <- dt[
,
.(
total_cases = .N,
severe_cases = sum(severe_disposition, na.rm = TRUE),
severe_percent = 100 * mean(severe_disposition, na.rm = TRUE)
),
by = Location_cat_clean
][order(-severe_cases)]
location_severity## Location_cat_clean total_cases severe_cases severe_percent
## <char> <int> <int> <num>
## 1: Home 1457036 221204 15.181780
## 2: Unk 1145232 103202 9.011449
## 3: Public 262219 49735 18.966970
## 4: Sports 459754 25664 5.582116
## 5: Street 81184 13722 16.902345
## 6: School 157911 4842 3.066284
## 7: Farm 1366 259 18.960469
## 8: Mobile 743 225 30.282638
## 9: Indst. 329 48 14.589666
# Clean Location_cat and Disposition_cat
dt[, Location_cat_clean := fifelse(
is.na(Location_cat) | trimws(Location_cat) == "",
"Missing / Not Recorded",
trimws(Location_cat)
)]
dt[, Disposition_cat_clean := fifelse(
is.na(Disposition_cat) | trimws(Disposition_cat) == "",
"Missing / Not Recorded",
trimws(Disposition_cat)
)]
# Count by Location and Disposition
loc_disp_count <- dt[
,
.N,
by = .(Location_cat_clean, Disposition_cat_clean)
][order(-N)]
# Add percentage within location
loc_disp_count[
,
pct_within_location := 100 * N / sum(N),
by = Location_cat_clean
]
ggplot(
loc_disp_count,
aes(
x = str_wrap(Disposition_cat_clean, width = 28),
y = fct_reorder(Location_cat_clean, N, .fun = sum),
fill = N
)
) +
geom_tile(color = "white", linewidth = 0.35) +
scale_fill_gradientn(
colors = wes_palette("Zissou1", 100, type = "continuous"),
labels = comma
) +
labs(
title = "Distribution of Injury Cases by Location and Disposition",
subtitle = "Counts by Location_cat and Disposition_cat",
x = "Disposition Category",
y = "Location Category",
fill = "Cases"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
axis.text.x = element_text(angle = 35, hjust = 1),
panel.grid = element_blank()
)ggplot(
loc_disp_count,
aes(
x = str_wrap(Disposition_cat_clean, width = 28),
y = fct_reorder(Location_cat_clean, pct_within_location, .fun = max),
fill = pct_within_location
)
) +
geom_tile(color = "white", linewidth = 0.35) +
scale_fill_gradientn(
colors = wes_palette("Darjeeling1", 100, type = "continuous"),
labels = function(x) paste0(round(x, 1), "%")
) +
labs(
title = "Disposition Profile by Injury Location",
subtitle = "Percentage distribution within each Location_cat",
x = "Disposition Category",
y = "Location Category",
fill = "Percent"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
axis.text.x = element_text(angle = 35, hjust = 1),
panel.grid = element_blank()
)ggplot(
loc_disp_count,
aes(
x = fct_reorder(Location_cat_clean, N, .fun = sum),
y = N,
fill = str_wrap(Disposition_cat_clean, width = 30)
)
) +
geom_col(color = "white", linewidth = 0.25) +
coord_flip() +
scale_y_continuous(labels = comma) +
scale_fill_manual(
values = wes_palette(
"Zissou1",
n = length(unique(loc_disp_count$Disposition_cat_clean)),
type = "continuous"
)
) +
labs(
title = "Injury Disposition by Location",
subtitle = "Stacked counts by Location_cat and Disposition_cat",
x = "Location Category",
y = "Number of Cases",
fill = "Disposition"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
legend.position = "bottom",
legend.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)dt[, severe_disposition := grepl(
"Admitted|Hospitalized|Transferred|Held|Observation|Fatality",
Disposition_cat_clean,
ignore.case = TRUE
)]
loc_severity <- dt[
,
.(
total_cases = .N,
severe_cases = sum(severe_disposition, na.rm = TRUE),
severe_percent = 100 * mean(severe_disposition, na.rm = TRUE)
),
by = Location_cat_clean
][order(-severe_cases)]
loc_severity## Location_cat_clean total_cases severe_cases severe_percent
## <char> <int> <int> <num>
## 1: Home 1457036 221204 15.181780
## 2: Unk 1145232 103202 9.011449
## 3: Public 262219 49735 18.966970
## 4: Sports 459754 25664 5.582116
## 5: Street 81184 13722 16.902345
## 6: School 157911 4842 3.066284
## 7: Farm 1366 259 18.960469
## 8: Mobile 743 225 30.282638
## 9: Indst. 329 48 14.589666
ggplot(
loc_severity,
aes(
x = fct_reorder(Location_cat_clean, severe_cases),
y = severe_cases
)
) +
geom_col(
fill = wes_palette("Zissou1", 5)[4],
width = 0.75
) +
coord_flip() +
scale_y_continuous(labels = comma) +
labs(
title = "Severe Injury Burden by Location",
subtitle = "Disposition categories indicating hospitalization, transfer, observation, or fatality",
x = "Location Category",
y = "Number of Severe Cases"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
panel.grid.minor = element_blank()
)Events
Top 10 Event Injury
# assuming your data.table is named dt
# If not, replace dt with your dataset name
# Clean missing/blank Product_1_cat
dt[, Product_1_cat_clean := fifelse(
is.na(Product_1_cat) | trimws(Product_1_cat) == "",
"Missing / Not Recorded",
trimws(Product_1_cat)
)]
# Count Product_1 by year
prod_year_count <- dt[
,
.N,
by = .(Year, Product_1_cat_clean)
][order(Year, -N)]
# View table
prod_year_count## Year Product_1_cat_clean N
## <int> <char> <int>
## 1: 2016 Stairs Or Steps 30152
## 2: 2016 Floors Or Flooring Materials 29029
## 3: 2016 Beds Or Bedframes, Other Or Not Spec 18685
## 4: 2016 Basketball (Activity, Apparel Or Equip.) 14555
## 5: 2016 Bicycles And Accessories (Excl Mountain 12597
## ---
## 7511: 2025 Electric Can Openers 1
## 7512: 2025 Toboggans 1
## 7513: 2025 Wringer Washing Machines 1
## 7514: 2025 Curling (Activity, Apparel Or Equipment) 1
## 7515: 2025 Electrical Testing Equipment 1
# Identify top 10 Product_1 categories overall
top_products <- dt[
,
.N,
by = Product_1_cat_clean
][order(-N)][1:10, Product_1_cat_clean]
# Filter yearly count for top products
plot_data <- prod_year_count[
Product_1_cat_clean %in% top_products
]
# Get last year value for direct labels
label_data <- plot_data[
Year == max(Year),
.SD[which.max(N)],
by = Product_1_cat_clean
]
ggplot(
plot_data,
aes(
x = Year,
y = N,
color = Product_1_cat_clean,
group = Product_1_cat_clean
)
) +
geom_line(linewidth = 1.1, alpha = 0.85) +
geom_point(size = 2.2, alpha = 0.85) +
geom_text_repel(
data = label_data,
aes(label = Product_1_cat_clean),
size = 3,
direction = "y",
hjust = 0,
nudge_x = 0.35,
segment.size = 0.25,
segment.alpha = 0.6,
show.legend = FALSE,
max.overlaps = Inf
) +
scale_color_manual(
values = wes_palette("Darjeeling1", n = length(unique(plot_data$Product_1_cat_clean)), type = "continuous")
) +
scale_y_continuous(labels = comma) +
scale_x_continuous(
breaks = sort(unique(plot_data$Year)),
expand = expansion(mult = c(0.02, 0.25))
) +
labs(
title = "Annual Counts of Top Event Category",
subtitle = "Based on Product_1",
x = "Year",
y = "Number of Cases"
) +
coord_cartesian(clip = "off") +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
legend.position = "none",
panel.grid.minor = element_blank(),
plot.margin = margin(10, 120, 10, 10)
)Top 100 Event Injury
library(wesanderson)
# Clean Product_1_cat
dt[, Product_1_cat_clean := fifelse(
is.na(Product_1_cat) | trimws(Product_1_cat) == "",
"Missing / Not Recorded",
trimws(Product_1_cat)
)]
# Top 100 Product_1 categories overall
top100_products <- dt[
,
.N,
by = Product_1_cat_clean
][order(-N)][1:100, Product_1_cat_clean]
# Yearly count for top 100
prod_year_top100 <- dt[
Product_1_cat_clean %in% top100_products,
.N,
by = .(Year, Product_1_cat_clean)
]
# Add total count for ordering
prod_total <- dt[
Product_1_cat_clean %in% top100_products,
.N,
by = Product_1_cat_clean
]
prod_year_top100 <- merge(
prod_year_top100,
prod_total,
by = "Product_1_cat_clean",
suffixes = c("", "_total")
)
# Order products by total count
prod_year_top100[
,
Product_1_cat_clean := fct_reorder(Product_1_cat_clean, N_total)
]
ggplot(
prod_year_top100,
aes(
x = factor(Year),
y = Product_1_cat_clean,
fill = N
)
) +
geom_tile(color = "white", linewidth = 0.15) +
scale_fill_gradientn(
colors = wes_palette("Zissou1", 100, type = "continuous"),
labels = comma
) +
labs(
title = "Annual Distribution of Top 100 Event Categories",
subtitle = "Counts by Product_1 category and year",
x = "Year",
y = "Event Category",
fill = "Cases"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
axis.text.y = element_text(size = 6.5),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right",
panel.grid = element_blank()
)ggplot(
prod_year_top100,
aes(
x = factor(Year),
y = Product_1_cat_clean,
fill = log1p(N)
)
) +
geom_tile(color = "white", linewidth = 0.15) +
scale_fill_gradientn(
colors = wes_palette("Darjeeling1", 100, type = "continuous"),
labels = function(x) comma(expm1(x))
) +
labs(
title = "Annual Distribution of Top 100 Event Categories",
subtitle = "Log-scaled counts improve visibility for lower-frequency events",
x = "Year",
y = "Event Category",
fill = "Cases"
) +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
axis.text.y = element_text(size = 6.5),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right",
panel.grid = element_blank()
)Dispositions
Top 10 Dispositions
# Clean Disposition_cat
dt[, Disposition_cat_clean := fifelse(
is.na(Disposition_cat) | trimws(Disposition_cat) == "",
"Missing / Not Recorded",
trimws(Disposition_cat)
)]
# Count Disposition by year
disp_year_count <- dt[
,
.N,
by = .(Year, Disposition_cat_clean)
][order(Year, -N)]
# Select top categories overall
top_disp <- dt[
,
.N,
by = Disposition_cat_clean
][order(-N)][1:min(30, .N), Disposition_cat_clean]
# Plot data
plot_data_disp <- disp_year_count[
Disposition_cat_clean %in% top_disp
]
# Last-year values for direct labels
label_data_disp <- plot_data_disp[
Year == max(Year),
.SD[which.max(N)],
by = Disposition_cat_clean
]
ggplot(
plot_data_disp,
aes(
x = Year,
y = N,
color = Disposition_cat_clean,
group = Disposition_cat_clean
)
) +
geom_line(linewidth = 1.1, alpha = 0.85) +
geom_point(size = 2.3, alpha = 0.85) +
geom_text_repel(
data = label_data_disp,
aes(label = Disposition_cat_clean),
size = 3,
direction = "y",
hjust = 0,
nudge_x = 0.35,
segment.size = 0.25,
segment.alpha = 0.6,
show.legend = FALSE,
max.overlaps = Inf
) +
scale_color_manual(
values = wes_palette(
"Darjeeling1",
n = length(unique(plot_data_disp$Disposition_cat_clean)),
type = "continuous"
)
) +
scale_y_continuous(labels = comma) +
scale_x_continuous(
breaks = sort(unique(plot_data_disp$Year)),
expand = expansion(mult = c(0.02, 0.35))
) +
labs(
title = "Annual Counts by Disposition Category",
subtitle = "Disposition categories shown directly at the end of each trend line",
x = "Year",
y = "Number of Cases"
) +
coord_cartesian(clip = "off") +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 12),
legend.position = "none",
panel.grid.minor = element_blank(),
plot.margin = margin(10, 180, 10, 10)
)