library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.2
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
df <- read_csv("NCHS_final_2000_2017_with_population.csv")
## Rows: 10296 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): X113.Cause.Name, Cause.Name, State
## dbl (5): Year, Deaths, Age.adjusted.Death.Rate, Total_Population, Deaths_per...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
New columns are being created to illustrate how many total deaths occurred over the time period of 2000-2017 by cause, the total population of each state over the course of the time period (calculated by taking the population of each year for each state and summing them together), and the cumulative death rate per 100k of the values of those two results. This will help determine and illustrate which causes of death were the most significant in the time period as a whole per state.
pop_exposure <- df |>
select(State, Year, Total_Population) |>
distinct() |>
group_by(State) |>
summarise(total_pop_exposure = sum(Total_Population),
.groups = "drop")
deaths_by_cause <- df |>
group_by(State, Cause.Name) |>
summarise(total_deaths_18yr = sum(Deaths),
.groups = "drop")
cumulative_df <- deaths_by_cause |>
left_join(pop_exposure, by = "State") |>
mutate(cum_rate_per_100k =
(total_deaths_18yr / total_pop_exposure) * 100000)
df <- df |>
left_join(cumulative_df,
by = c("State", "Cause.Name"))
# Checking to see if prior cells performed correctly
df |>
filter(State == "Vermont") |>
select(Year, Total_Population) |>
distinct() |>
arrange(Year)
## # A tibble: 18 × 2
## Year Total_Population
## <dbl> <dbl>
## 1 2000 609618
## 2 2001 612223
## 3 2002 615442
## 4 2003 617858
## 5 2004 619920
## 6 2005 621215
## 7 2006 622892
## 8 2007 623481
## 9 2008 624151
## 10 2009 624817
## 11 2010 626366
## 12 2011 629600
## 13 2012 630693
## 14 2013 632859
## 15 2014 633872
## 16 2015 635912
## 17 2016 636397
## 18 2017 639087
df |>
filter(State == "Vermont") |>
select(Year, Total_Population) |>
distinct() |>
summarise(sum_pop = sum(Total_Population))
## # A tibble: 1 × 1
## sum_pop
## <dbl>
## 1 11256403
pop_exposure |>
filter(State == "Vermont")
## # A tibble: 1 × 2
## State total_pop_exposure
## <chr> <dbl>
## 1 Vermont 11256403
df |>
filter(State == "Vermont",
Cause.Name == "Kidney disease") |>
summarise(sum_deaths = sum(Deaths))
## # A tibble: 1 × 1
## sum_deaths
## <dbl>
## 1 836
cumulative_df |>
filter(State == "Vermont",
Cause.Name == "Kidney disease")
## # A tibble: 1 × 5
## State Cause.Name total_deaths_18yr total_pop_exposure cum_rate_per_100k
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Vermont Kidney disease 836 11256403 7.43
test <- cumulative_df |>
filter(State == "Vermont",
Cause.Name == "Kidney disease")
test$total_deaths_18yr /
test$total_pop_exposure * 100000
## [1] 7.426884
test$cum_rate_per_100k
## [1] 7.426884
summary(cumulative_df$cum_rate_per_100k)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.998 20.765 39.243 136.012 165.054 1177.411
df |>
group_by(State, Year) |>
summarise(n_rows = n()) |>
summarise(min(n_rows), max(n_rows))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by State and Year.
## ℹ Output is grouped by State.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(State, Year))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
## # A tibble: 52 × 3
## State `min(n_rows)` `max(n_rows)`
## <chr> <int> <int>
## 1 Alabama 11 11
## 2 Alaska 11 11
## 3 Arizona 11 11
## 4 Arkansas 11 11
## 5 California 11 11
## 6 Colorado 11 11
## 7 Connecticut 11 11
## 8 Delaware 11 11
## 9 District of Columbia 11 11
## 10 Florida 11 11
## # ℹ 42 more rows
df |>
filter(State == "Vermont") |>
select(Year, Total_Population) |>
distinct() |>
summarise(mean_pop = mean(Total_Population),
exposure = sum(Total_Population))
## # A tibble: 1 × 2
## mean_pop exposure
## <dbl> <dbl>
## 1 625356. 11256403
heart_df <- df |>
filter(Cause.Name == "Heart disease")
heart_state <- heart_df |>
group_by(State) |>
summarise(
avg_population = mean(Total_Population),
avg_death_rate = mean(Deaths_per_100k),
.groups = "drop"
)
heart_state <- heart_df |>
filter(State != "United States") |>
group_by(State) |>
summarise(
avg_population = mean(Total_Population),
avg_death_rate = mean(Deaths_per_100k),
.groups = "drop"
) |>
mutate(pop_millions = avg_population / 1000000)
# Fit model
model <- lm(avg_death_rate ~ pop_millions, data = heart_state)
# Create sorted data for line
line_data <- heart_state |>
arrange(pop_millions)
# Add predictions explicitly
line_data$predicted <- predict(model, newdata = line_data)
# Create plot
p <- ggplot(heart_state,
aes(x = pop_millions,
y = avg_death_rate,
text = paste0("<b>", State, "</b>",
"<br>Population: ",
round(pop_millions, 2), " million",
"<br>Heart Disease Rate: ",
round(avg_death_rate, 1),
" per 100,000"))) +
geom_point(size = 2) +
geom_line(data = line_data,
aes(x = pop_millions, y = predicted),
color = "red",
linewidth = 1,
inherit.aes = FALSE) +
labs(title = "State Population vs Heart Disease Death Rate",
x = "Average State Population (millions)",
y = "Average Heart Disease Deaths per 100,000")
ggplotly(p, tooltip = "text")
Each point in the plot represents one state, using its average population (in millions) and its average heart disease death rate per 100,000 people from 2000–2017.
The scatterplot shows virtually no overall trend, and the regression line is nearly flat. The points are widely scattered across population sizes, indicating that there is no strong linear relationship between state population and heart disease mortality rate. There are no extreme outliers that substantially distort the pattern, and states with higher or lower mortality rates are spread across both large and small populations.
Overall, the visualization suggests that population size does not meaningfully predict heart disease mortality rates. Because death rates are standardized per 100,000 people, differences in mortality across states are likely influenced by other factors such as healthcare access, socioeconomic conditions, age distribution, or health behaviors rather than overall population size.
cor(heart_state$pop_millions,
heart_state$avg_death_rate)
## [1] -0.004324441
The Pearson correlation coefficient between average state population (in millions) and average heart disease death rate is r = -0.004.
This value is extremely close to zero, indicating virtually no linear relationship between population size and heart disease mortality rate.
This result aligns with the visualization. The regression line in the scatterplot appeared nearly flat, and the points were widely dispersed across population sizes. Although the value is slightly negative, its magnitude is so close to zero that it indicates no meaningful linear relationship.
This makes conceptual sense because heart disease mortality rates are already standardized per 100,000 people. Standardization removes the direct effect of population size, so we would not expect larger states to systematically have higher or lower mortality rates based on size alone.
Overall, population size does not appear to be a meaningful predictor of heart disease mortality rate.
t.test(heart_state$avg_death_rate)
##
## One Sample t-test
##
## data: heart_state$avg_death_rate
## t = 36.082, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 198.7344 222.1642
## sample estimates:
## mean of x
## 210.4493
A 95% confidence interval was constructed for the mean heart disease death rate across U.S. states from 2000–2017. We are 95% confident that the true mean state-level heart disease mortality rate lies between 198.73 and 222.16 deaths per 100,000 people, with a sample mean of 210.45 deaths per 100,000.
This interval suggests that the average burden of heart disease mortality across states during this period was centered around approximately 210 deaths per 100,000. The width of the interval indicates moderate variation between states, but the overall mean appears relatively stable within this range.
Combined with the near-zero correlation between population size and mortality rate, this reinforces the conclusion that differences in heart disease mortality across states are not explained by population size alone. Instead, other demographic, socioeconomic, or health-related factors likely contribute more meaningfully to variation in mortality rates.
heart_year <- df |>
filter(State != "United States",
Cause.Name == "Heart disease") |>
group_by(Year) |>
summarise(
avg_death_rate = mean(Deaths_per_100k),
.groups = "drop"
)
heart_year
## # A tibble: 18 × 2
## Year avg_death_rate
## <dbl> <dbl>
## 1 2000 248.
## 2 2001 243.
## 3 2002 239.
## 4 2003 234.
## 5 2004 221.
## 6 2005 219.
## 7 2006 211.
## 8 2007 204.
## 9 2008 203.
## 10 2009 196.
## 11 2010 194.
## 12 2011 192.
## 13 2012 193.
## 14 2013 195.
## 15 2014 195.
## 16 2015 200.
## 17 2016 200.
## 18 2017 202.
model_year <- lm(avg_death_rate ~ Year, data = heart_year)
line_year <- heart_year |>
arrange(Year)
line_year$predicted <- predict(model_year, newdata = line_year)
p <- ggplot(heart_year,
aes(x = Year,
y = avg_death_rate,
text = paste0("<b>Year:</b> ", Year,
"<br>Avg Heart Disease Rate: ",
round(avg_death_rate, 1),
" per 100,000"))) +
geom_point(size = 2) +
geom_line() +
geom_line(data = line_year,
aes(x = Year, y = predicted),
color = "red",
linewidth = 1,
inherit.aes = FALSE) +
labs(title = "Trend in Heart Disease Death Rate (2000–2017)",
x = "Year",
y = "Average Heart Disease Deaths per 100,000")
ggplotly(p, tooltip = "text")
Each point in the plot represents one year from 2000 to 2017, showing the average heart disease death rate per 100,000 across states.
The visualization shows a clear downward trend in heart disease mortality from 2000 through roughly 2011. After this period, the decline slows and rates begin to level off, with a slight increase toward the end of the time span. The red regression line confirms an overall negative linear trend across the full period.
There are no extreme outliers that substantially distort the pattern. The early years show consistent decline, while later years indicate stabilization.
Overall, the visualization suggests that heart disease mortality rates decreased significantly over the study period, though the pace of improvement slowed in the later years. This pattern raises important questions about what factors contributed to the early decline and why progress appears to have plateaued after approximately 2011–2013.
cor(heart_year$Year,
heart_year$avg_death_rate)
## [1] -0.855878
The Pearson correlation coefficient between Year and average heart disease death rate is r = -0.856.
This indicates a strong negative linear relationship between year and heart disease mortality. As the years increase from 2000 to 2017, the average heart disease death rate tends to decrease.
This result aligns with the visualization. The regression line slopes downward, and the plotted values show a clear overall decline over time, particularly in the early and mid-2000s. The magnitude of the correlation confirms that the downward trend is substantial rather than minor.
Overall, the correlation supports the conclusion that heart disease mortality rates declined significantly over the study period.
t.test(heart_year$avg_death_rate)
##
## One Sample t-test
##
## data: heart_year$avg_death_rate
## t = 47.24, df = 17, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 201.0503 219.8483
## sample estimates:
## mean of x
## 210.4493
A 95% confidence interval was constructed for the mean heart disease death rate from 2000 to 2017.
Since this interval is based on year-level averages, it represents the average annual heart disease mortality rate across the 18-year period.
The results show that we are 95% confident that the true mean heart disease mortality rate during this period lies between 201.05 and 219.85 deaths per 100,000 people. The sample mean was 210.45 deaths per 100,000, which falls within this interval as expected.
This interval provides an estimate of the overall average heart disease mortality rate across the study period. The relatively moderate width of the interval suggests some variation across years, but the mean appears to be centered around approximately 210 deaths per 100,000.
Overall, this supports the conclusion that while mortality rates declined over time, the average rate across the full 18-year period remained around this level.
This data dive examined two relationships involving heart disease mortality rates: differences across states and changes over time. In the first analysis, state population size was not meaningfully associated with heart disease mortality rates. The visualization, near-zero correlation, and confidence interval all supported the conclusion that population size alone does not explain variation in mortality across states. Instead, other demographic and socioeconomic factors are likely more influential.
In contrast, the second analysis revealed a strong negative relationship between year and heart disease mortality rate. Both the visualization and correlation demonstrated a substantial decline in mortality from 2000 through the early 2010s, followed by a period of slower improvement. This suggests that while significant progress was made over time, recent trends may indicate a plateau.
Together, these analyses demonstrate how visualization, correlation, and confidence intervals can be used to evaluate relationships and quantify uncertainty. The findings highlight that temporal trends played a much larger role in explaining variation in heart disease mortality than population size during this period.