library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.2
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# NCHS file
nchs <- read_csv("NCHS_cleaned.csv")
## Rows: 10868 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): X113.Cause.Name, Cause.Name, State
## dbl (3): Year, Deaths, Age.adjusted.Death.Rate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2000–2009 population
pop_00_09 <- read_excel("st-est00int-01.xls", skip = 3)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...13`
## • `` -> `...14`
# 2010–2017 population
pop_10_17 <- read_excel("nst-est2020int-pop.xlsx", skip = 3)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...13`
To begin, I am going to explain my reasoning and process for adding to the current dataset. Given the dataset is fairly black and white without a great degree of elements to question, given this week’s assignment for the data dive I wanted to add some more depth to the data.
I gathered two csv files from the United States Census Bureau website. One including population data from 2010-2029 and one 2000-2009. I excluded 1999 since the data was not properly available in a format that would allow extraction to occur (only PDFs existed). I later removed the 1999 data from the final dataset so everything would be in alignment so, now the dataset is 2000-2017.
I then had to clean the data and ensure the headers were properly named for data merging. There were some issues with how the headers were named so multiple iterations of cleaning occurred to decipher the issue which was additional punctuation in the header that prevented the data from merging properly.
Once all of the data was successfully merged with the original csv file I was working with (NCHS_cleaned.csv), I added an additional column for some insight and clarification which was Deaths per 100k. This was done using the newly added Total Population column with its information extracted from the aforementioned Census Bureau site. The Deaths was then calculated into the Total Population to create this new column Deaths per 100k.
Acquiring the necessary data to do a direct comparison to Age.adjusted.Death.Rate was becoming problematic which was why I decided to go the route of just Deaths per 100k. This will allow for a comparison to Age.adjusted.Death.Rate for insights, along with providing its own insights and visualizations.
After the below operations are performed, I will begin to analyze the data and proceed as usual for the week’s data dive.
# Cleaning the 2000-2009 data
pop_00_09_clean <- pop_00_09 |>
rename(State = `...1`) |>
mutate(
State = str_trim(State),
State = gsub("^\\.", "", State)
) |>
filter(!State %in% c("Northeast", "Midwest", "South", "West")) |>
select(State, `2000`:`2009`) |>
pivot_longer(
cols = `2000`:`2009`,
names_to = "Year",
values_to = "Total_Population"
) |>
mutate(Year = as.numeric(Year))
# Cleaning the 2010-2019 data
pop_10_17_clean <- pop_10_17 |>
rename(State = `...1`) |>
mutate(
State = str_trim(State),
State = gsub("^\\.", "", State)
) |>
filter(!State %in% c("Northeast", "Midwest", "South", "West")) |>
select(State, `2010`:`2017`) |>
pivot_longer(
cols = `2010`:`2017`,
names_to = "Year",
values_to = "Total_Population"
) |>
mutate(Year = as.numeric(Year))
# Combining the two population census files
population_all <- bind_rows(pop_00_09_clean, pop_10_17_clean)
# Ensuring the current nchs dataset is clean
nchs <- nchs |>
mutate(
State = str_trim(State),
Year = as.numeric(Year)
)
nchs <- nchs |>
filter(Year >= 2000)
# Ensuring it now starts at the year 2000
nchs |>
count(Year) |>
arrange(Year)
## # A tibble: 18 × 2
## Year n
## <dbl> <int>
## 1 2000 572
## 2 2001 572
## 3 2002 572
## 4 2003 572
## 5 2004 572
## 6 2005 572
## 7 2006 572
## 8 2007 572
## 9 2008 572
## 10 2009 572
## 11 2010 572
## 12 2011 572
## 13 2012 572
## 14 2013 572
## 15 2014 572
## 16 2015 572
## 17 2016 572
## 18 2017 572
# Merging the populations from other files into nchs
nchs_merged <- nchs |>
left_join(population_all, by = c("State", "Year"))
# Checking to see if the merge worked correctly
nchs_merged |>
summarize(missing_pop = sum(is.na(Total_Population)))
## # A tibble: 1 × 1
## missing_pop
## <int>
## 1 0
# Creating a Deaths per 100k column
nchs_merged <- nchs_merged |>
mutate(
Deaths_per_100k = round((Deaths / Total_Population) * 100000, 2)
)
# Doing a quick verification check
nchs_merged |>
filter(State == "Alabama", Year == 2010) |>
select(State, Year, Deaths, Total_Population, Deaths_per_100k)
## # A tibble: 11 × 5
## State Year Deaths Total_Population Deaths_per_100k
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 2010 679 4788095 14.2
## 2 Alabama 2010 942 4788095 19.7
## 3 Alabama 2010 1184 4788095 24.7
## 4 Alabama 2010 1302 4788095 27.2
## 5 Alabama 2010 1523 4788095 31.8
## 6 Alabama 2010 2394 4788095 50
## 7 Alabama 2010 2619 4788095 54.7
## 8 Alabama 2010 2866 4788095 59.9
## 9 Alabama 2010 10196 4788095 213.
## 10 Alabama 2010 12083 4788095 252.
## 11 Alabama 2010 48038 4788095 1003.
# Writing the updated dataset to a new csv
write_csv(nchs_merged, "NCHS_final_2000_2017_with_population.csv")
# Creating a variable for the dataset that is now updated
deaths <- read_csv("NCHS_final_2000_2017_with_population.csv")
## Rows: 10296 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): X113.Cause.Name, Cause.Name, State
## dbl (5): Year, Deaths, Age.adjusted.Death.Rate, Total_Population, Deaths_per...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The data’s sourcing was from the CDC’s official state that acquires the data through death certificates from each state for each year. This was done in collaboration with the National Center for Health Certificates.
The Cause.Name and Age.adjusted.Death.Rate columns I was curious in relation to what they represented and how the information was acquired.
This is an excerpt from where the data was collected (https://www.cdc.gov/nchs/data-visualization/mortality-leading-causes/#sources):
“Data are based on information from all resident death certificates filed in the 50 states and the District of Columbia. Age-adjusted death rates (per 100,000 population) are based on the 2000 U.S. standard population. Populations used for computing death rates for 2011-2017 are postcensal estimates based on the 2010 census, estimated as of July 1, 2010. Rates for census years are based on populations enumerated in the corresponding censuses. Rates for non-census years before 2010 are revised using updated intercensal population estimates and may differ from rates previously published. Causes of death classified by the International Classification of Diseases, Tenth Revision (ICD–10) are ranked according to the number of deaths assigned to rankable causes. The 10 leading causes shown are for the entire U.S. Cause-of-death rankings are dynamically updated in the bar chart according to the selected year. Cause-of-death statistics are based on the underlying cause of death.”
I now know the causes are based on the underlying cause of death which can yield some bias and ambiguity due to deaths being misclassified, whether intentionally for financial or other gain from the doctors and facilities, or unintentionally due to human error.
Furthermore, the Age.adjusted.Death.Rate was calculated based on data up to 2010, but estimates were used thereafter which created the desire to acquire the actual data and update the dataset to be more wide in its scope and depth.
Without looking more into the documentation, the acquisition of more data (i.e. the Total Population and Deaths per 100k) would likely not have occurred at this stage in the data analysis.
Based on the current, and new, data; the first visualization will compare the crude, being the non-age-adjusted-death-rate, and the existing Age.adjusted.Death.Rate for the United States for a specific condition Heart disease through the timeline of the data (2000-2017).
deaths |>
filter(State == "United States",
Cause.Name == "Heart disease") |>
ggplot(aes(x = Year)) +
geom_line(aes(y = Deaths_per_100k, color = "Crude")) +
geom_line(aes(y = Age.adjusted.Death.Rate, color = "Age-Adjusted")) +
labs(title = "Crude vs Age-Adjusted Death Rates",
y = "Rate per 100,000",
color = "Rate Type")
# Creating a visualization to illustrate the difference between death rate calculations
deaths |>
filter(State == "United States",
Cause.Name == "Heart disease") |>
mutate(Rate_Difference = Deaths_per_100k - Age.adjusted.Death.Rate) |>
ggplot(aes(x = Year, y = Rate_Difference)) +
geom_line(color = "darkred", size = 1) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Difference Between Crude and Age-Adjusted Death Rates",
subtitle = "United States — Heart Disease (2000–2017)",
caption = "Positive values indicate crude rate exceeds age-adjusted rate suggesting impact of population aging.",
y = "Crude Rate Minus Age-Adjusted Rate",
x = "Year"
) +
theme_minimal() +
theme(
plot.caption = element_text(hjust = 0.5)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The growing gap between crude and age-adjusted rates suggests that the U.S. population has aged relative to the year 2000 standard population, increasing the influence of older age groups on crude mortality rates.
While the increasing difference between crude and age-adjusted rates suggests demographic aging, the dataset does not provide age-specific mortality rates. Therefore, it is unclear whether heart disease risk is decreasing uniformly across all age groups or whether improvements are concentrated in certain populations. This lack of granularity may cloud important public health patterns. Additionally, reliance on crude rates alone could lead policymakers to overestimate worsening mortality risk, when in fact the trend may largely reflect demographic change rather than increasing disease severity.
Further testing on the data would be necessary to elaborate on the insights, such as comparing across causes of death and focusing down on a state-by-state comparison to ascertain which states may be affected by having an older population than others.
Additionally, because age-adjusted rates rely on a fixed year 2000 standard population, the interpretability of these comparisons may diminish as the real population structure continues to diverge from that baseline.
Even after reviewing the documentation, it remains unclear how age-adjustment interacts with state-level demographic shifts over time, particularly since age-specific mortality breakdowns are not provided in the dataset. Without knowing how mortality risk differs across age groups within each state, it is difficult to determine whether changes in age-adjusted rates reflect true improvements in cardiovascular health or simply demographic restructuring. This limitation introduces interpretive uncertainty that cannot be resolved using the available variables.
A quick check on missing values for the two main categorical columns is now being performed, Cause.Name and State since X113.Cause.Name is redundant. As can be seen, there are no missing values or groups in this dataset as it is rather clean currently.
sum(is.na(deaths$State))
## [1] 0
sum(is.na(deaths$Cause.Name))
## [1] 0
# Checking for implicitly missing state-year combinations
state_year_check <- deaths |>
count(State, Year)
state_year_check |>
summarise(min_count = min(n), max_count = max(n))
## # A tibble: 1 × 2
## min_count max_count
## <int> <int>
## 1 11 11
To assess implicitly missing data, I verified that each state appears consistently across all years and causes. The minimum and maximum observation counts per state-year combination are both 11, indicating that every state-year pair contains observations for all 11 causes of death. This confirms that there are no implicitly missing state-year records and that the dataset is structurally complete.
# Showing distribution of Heart Disease Deaths
# per 1000,000 by Year where the box plots are states
deaths |>
filter(Cause.Name == "Heart disease") |>
ggplot(aes(x = factor(Year), y = Deaths_per_100k)) +
geom_boxplot(outlier.color = "red") +
labs(
title = "Distribution of Heart Disease Deaths per 100,000 by Year",
x = "Year",
y = "Deaths per 100,000"
)
theme_minimal()
## <theme> List of 144
## $ line : <ggplot2::element_line>
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ lineend : chr "butt"
## ..@ linejoin : chr "round"
## ..@ arrow : logi FALSE
## ..@ arrow.fill : chr "black"
## ..@ inherit.blank: logi TRUE
## $ rect : <ggplot2::element_rect>
## ..@ fill : chr "white"
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ linejoin : chr "round"
## ..@ inherit.blank: logi TRUE
## $ text : <ggplot2::element_text>
## ..@ family : chr ""
## ..@ face : chr "plain"
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : chr "black"
## ..@ size : num 11
## ..@ hjust : num 0.5
## ..@ vjust : num 0.5
## ..@ angle : num 0
## ..@ lineheight : num 0.9
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 0
## ..@ debug : logi FALSE
## ..@ inherit.blank: logi TRUE
## $ title : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ point : <ggplot2::element_point>
## ..@ colour : chr "black"
## ..@ shape : num 19
## ..@ size : num 1.5
## ..@ fill : chr "white"
## ..@ stroke : num 0.5
## ..@ inherit.blank: logi TRUE
## $ polygon : <ggplot2::element_polygon>
## ..@ fill : chr "white"
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ linejoin : chr "round"
## ..@ inherit.blank: logi TRUE
## $ geom : <ggplot2::element_geom>
## ..@ ink : chr "black"
## ..@ paper : chr "white"
## ..@ accent : chr "#3366FF"
## ..@ linewidth : num 0.5
## ..@ borderwidth: num 0.5
## ..@ linetype : int 1
## ..@ bordertype : int 1
## ..@ family : chr ""
## ..@ fontsize : num 3.87
## ..@ pointsize : num 1.5
## ..@ pointshape : num 19
## ..@ colour : NULL
## ..@ fill : NULL
## $ spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ margins : <ggplot2::margin> num [1:4] 5.5 5.5 5.5 5.5
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 2.75 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.x.top : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 0
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 2.75 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.x.bottom : NULL
## $ axis.title.y : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : num 90
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.75 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.y.left : NULL
## $ axis.title.y.right : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : num -90
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 2.75
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : chr "#4D4D4DFF"
## ..@ size : 'rel' num 0.8
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 2.2 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x.top : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 4.95 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x.bottom : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 4.95 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 1
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.2 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y.left : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 4.95 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y.right : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 4.95
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.theta : NULL
## $ axis.text.r : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0.5
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.2 0 2.2
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.ticks : <ggplot2::element_blank>
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'rel' num 0.5
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : <ggplot2::element_blank>
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : <ggplot2::element_blank>
## $ legend.margin : NULL
## $ legend.spacing : 'rel' num 2
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : <ggplot2::element_blank>
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : NULL
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.key.justification : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : 'rel' num 0.8
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.text.position : NULL
## $ legend.title : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.title.position : NULL
## $ legend.position : chr "right"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## [list output truncated]
## @ complete: logi TRUE
## @ validate: logi TRUE
The visualization shows there are 3 outliers in terms of using the 1.5x IQR rule across the states where an outlier can be considered a value that falls outside of the 50% range of Heart Disease deaths across states by 1.5 times its width.
Now, the outlier will be identified and cross-checked.
# Filter to heart disease in 2002
data_2017 <- deaths |>
filter(Cause.Name == "Heart disease",
Year == 2002)
# Calculate IQR bounds
Q1 <- quantile(data_2017$Deaths_per_100k, 0.25, na.rm = TRUE)
Q3 <- quantile(data_2017$Deaths_per_100k, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outlier states
outliers_2017 <- data_2017 |>
filter(Deaths_per_100k < lower_bound |
Deaths_per_100k > upper_bound)
outliers_2017 |> select(State, Deaths_per_100k)
## # A tibble: 1 × 2
## State Deaths_per_100k
## <chr> <dbl>
## 1 Alaska 88.3
# Filter to heart disease in 2004
data_2017 <- deaths |>
filter(Cause.Name == "Heart disease",
Year == 2004)
# Calculate IQR bounds
Q1 <- quantile(data_2017$Deaths_per_100k, 0.25, na.rm = TRUE)
Q3 <- quantile(data_2017$Deaths_per_100k, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outlier states
outliers_2017 <- data_2017 |>
filter(Deaths_per_100k < lower_bound |
Deaths_per_100k > upper_bound)
outliers_2017 |> select(State, Deaths_per_100k)
## # A tibble: 1 × 2
## State Deaths_per_100k
## <chr> <dbl>
## 1 Alaska 89.3
# Filter to heart disease in 2007
data_2017 <- deaths |>
filter(Cause.Name == "Heart disease",
Year == 2007)
# Calculate IQR bounds
Q1 <- quantile(data_2017$Deaths_per_100k, 0.25, na.rm = TRUE)
Q3 <- quantile(data_2017$Deaths_per_100k, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outlier states
outliers_2017 <- data_2017 |>
filter(Deaths_per_100k < lower_bound |
Deaths_per_100k > upper_bound)
outliers_2017 |> select(State, Deaths_per_100k)
## # A tibble: 1 × 2
## State Deaths_per_100k
## <chr> <dbl>
## 1 Alaska 90.1
# Filter to heart disease and selected years
selected_data <- deaths |>
filter(Cause.Name == "Heart disease",
Year %in% c(2002, 2004, 2007))
# Identify outliers within each year using IQR rule
outliers_selected <- selected_data |>
group_by(Year) |>
mutate(
Q1 = quantile(Deaths_per_100k, 0.25, na.rm = TRUE),
Q3 = quantile(Deaths_per_100k, 0.75, na.rm = TRUE),
IQR_value = Q3 - Q1,
lower_bound = Q1 - 1.5 * IQR_value,
upper_bound = Q3 + 1.5 * IQR_value
) |>
filter(Deaths_per_100k < lower_bound |
Deaths_per_100k > upper_bound) |>
select(Year, State, Deaths_per_100k)
outliers_selected
## # A tibble: 3 × 3
## # Groups: Year [3]
## Year State Deaths_per_100k
## <dbl> <chr> <dbl>
## 1 2007 Alaska 90.1
## 2 2004 Alaska 89.3
## 3 2002 Alaska 88.3
# Creating a visualization to show population outliers, highlighting Alaska
deaths |>
filter(State != "United States") |>
group_by(Year, State) |>
summarise(Total_Population = mean(Total_Population), .groups = "drop") |>
ggplot(aes(x = factor(Year), y = Total_Population / 100000)) +
geom_boxplot(fill = "lightgray") +
geom_point(
data = deaths |>
filter(State == "Alaska") |>
group_by(Year) |>
summarise(Total_Population = mean(Total_Population)),
aes(x = factor(Year), y = Total_Population / 100000),
color = "red",
size = 2
) +
labs(
title = "State Population Distribution by Year (Alaska Highlighted)",
x = "Year",
y = "Population (Hundreds of Thousands)"
) +
theme_minimal()
As can be seen, Alaska (indicated by the red dots) consistently has one of the lowest total populations across time compared to other states. While death rates are standardized per 100,000 residents, smaller populations can produce greater variability in rate estimates from year to year. This increased variability may partially explain why Alaska appears as a statistical outlier in certain years, even if its underlying risk patterns are not fundamentally different from other states.
Therefore, although Alaska is mathematically classified as an outlier under the 1.5×IQR rule, this designation may reflect statistical variability rather than a meaningful structural difference in cardiovascular risk.
Additional visualizations and testing could further determine whether this pattern is isolated to heart disease or whether similar variability appears across other causes of death.