project_milestone 4

Final datasets for creation of visualization

# Join all datasests together 
combined_with_pop <- combined_dataset %>%
  left_join(
    ca_pop_2023, 
    by = c(
      "county",
      "age_category",
      "sex",
      "race_ethnicity"
    )
  )

# Calculate infection rates per 100,000 population
combined_with_pop1 <- combined_with_pop %>%
  mutate(infection_rate = (new_infections / pop) * 100000)

str(combined_with_pop1)
tibble [100,688 × 12] (S3: tbl_df/tbl/data.frame)
 $ county             : chr [1:100688] "alameda" "alameda" "alameda" "alameda" ...
 $ age_category       : chr [1:100688] "0-17" "0-17" "0-17" "0-17" ...
 $ sex                : chr [1:100688] "female" "female" "female" "female" ...
 $ race_ethnicity     : chr [1:100688] "white, non-hispanic" "white, non-hispanic" "white, non-hispanic" "white, non-hispanic" ...
 $ date_diagnosis     : Date[1:100688], format: "2023-05-29" "2023-06-05" ...
 $ date_report        : Date[1:100688], format: "2023-06-03" "2023-06-10" ...
 $ new_infections     : num [1:100688] 6 1 2 10 19 25 23 18 22 35 ...
 $ cumulative_infected: num [1:100688] 6 7 9 19 38 63 86 104 126 161 ...
 $ new_severe         : num [1:100688] 0 0 1 0 0 0 0 0 0 0 ...
 $ cumulative_severe  : num [1:100688] 0 0 1 1 1 1 1 1 1 1 ...
 $ pop                : num [1:100688] 34155 34155 34155 34155 34155 ...
 $ infection_rate     : num [1:100688] 17.57 2.93 5.86 29.28 55.63 ...
# Display the aggregated data in a clear format
library(knitr)
library(kableExtra)

Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':

    group_rows
# Create a summary table of the aggregated data
summary_table <- aggregated_data %>%
  arrange(desc(total_cases)) %>% # Sort by total cases in descending order
  head(10) # Show the top 10 rows

# Render the table with formatting
summary_table %>%
  kable(
    col.names = c("Year", "Month", "County", "Race/Ethnicity", "Total Cases", "Avg Infection Rate", "Population"),
    caption = "Top 10 Rows of Aggregated Data by Year, Month, County, and Race/Ethnicity",
    align = "c",
    format = "html"
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  add_footnote("Data aggregated by year, month, county, and race/ethnicity. Infection rates are per 100,000 population.",
               notation = "none")
Top 10 Rows of Aggregated Data by Year, Month, County, and Race/Ethnicity
Year Month County Race/Ethnicity Total Cases Avg Infection Rate Population
2023 Sep los angeles hispanic 136295 879.6760 545522
2023 Oct los angeles hispanic 135444 700.6342 545522
2023 Aug los angeles hispanic 86148 559.5841 545522
2023 Sep san bernardino hispanic 78181 1746.3274 177555
2023 Oct los angeles white, non-hispanic 76769 498.4006 181757
2023 Sep los angeles white, non-hispanic 76632 622.4045 181757
2023 Oct san bernardino hispanic 72702 1295.7942 177555
2023 Aug san bernardino hispanic 51891 1171.1056 177555
2023 Sep san bernardino white, non-hispanic 50059 2125.5053 49276
2023 Aug los angeles white, non-hispanic 48612 393.0919 181757
Data aggregated by year, month, county, and race/ethnicity. Infection rates are per 100,000 population.

Table Interpretation

The table highlights the top 10 county and race/ethnicity combinations with the highest total cases, along with their average infection rates per 100,000 population and population size. This provides insight into which groups experienced the most significant disease burden during the observed time period.

# Load visualization library
library(ggplot2)

# Select top 3 counties by total cases for visualization
top_counties <- aggregated_data %>%
  group_by(county) %>%
  summarize(total_cases = sum(total_cases)) %>%
  top_n(3, total_cases) %>%
  pull(county)

# Filter data for top counties
visual_data <- aggregated_data %>%
  filter(county %in% top_counties)

# Create the plot
plot <- ggplot(visual_data, aes(x = month_diagnosed, y = total_cases, color = county, group = county)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  labs(
    title = "Monthly Trends in Total Cases for Top 3 Counties",
    x = "Month",
    y = "Total Cases",
    color = "County",
    caption = "Data Source: Aggregated dataset combining morbidity and population data."
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal(base_size = 14)

# Display the plot
print(plot)

Plot Interpretation

The line plot shows monthly trends in total cases for the top 3 counties by total cases, revealing temporal patterns in disease spread. It helps identify periods of peak infection and differences in the magnitude of cases between counties, offering a foundation for understanding localized outbreak dynamics.

combined_with_pop2 <- combined_with_pop %>%
  filter(age_category == "65+") %>%
 group_by(county, age_category,sex) %>% #group by age_group, county and sex
  summarise(total_cumulative_case= sum (cumulative_infected), #calculate total number of cumulative infected
        total_pop= sum (pop), #calculate total number of population
      cumulative_rate= round ((total_cumulative_case/total_pop)*100000, 0))%>% #calculate total case rate
  arrange(desc(cumulative_rate)) %>% ungroup() # Sort by cumulative rate in descending order
`summarise()` has grouped output by 'county', 'age_category'. You can override
using the `.groups` argument.
library(DT)

# Render the datatable with title, footnote, and conditional formatting
datatable(combined_with_pop2,
          options = list(
            pageLength=10,
            lengthMenu=c(10,20,60),
            columnDefs=list(
              list(className='dt-center',targets=1:3)),  # list(visible=FALSE,targets=3)
            dom = 'ltp'
          ),
          rownames=FALSE,
         colnames = c("County", "Age Group", "Total Case", "Sex", "Total Population", "Total Case Rate"),
  caption = htmltools::tags$caption(
    style = "caption-side: top; text-align: center; font-size: 16px; font-weight: bold;",
    "Title: COVID-19 Cumulative Rates by County, Sex and Age Group 65 and Above, 2023" #title name 
  )
)%>%
formatStyle(
    'cumulative_rate', # Apply formatting to the 'cumulative_rate' column
    fontWeight = styleInterval(8489, c("normal", "bold"))
)%>%
  htmlwidgets::prependContent(
    htmltools::tags$div(
      style = "text-align: right; font-size: 12px; font-style: italic; margin-top: 10px;",
      "Footnote: Case rates are per 100,000 population age group 65 and above stratified by sex in 2023" #Footnote
    )
  )
Footnote: Case rates are per 100,000 population age group 65 and above stratified by sex in 2023

Table Interpretation

This table presents the total case rate among residents aged 65 and older, categorized by county and gender, for the year 2023. The data reveals a higher prevalence of cases among individuals in this age group residing in Imperial, Tulare, Kings, and Kern counties. These findings highlight the need for further investigation into vaccination uptake and accessibility for residents aged 65 and older in these counties.

library(pacman)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(gt)
library(htmlwidgets)

top_counties_65<- combined_with_pop2 %>%
  slice_head(n = 20) %>% mutate(county = str_to_title(county), 
                                sex= str_to_title(sex))  

plot_ly(
  top_counties_65,
  x = ~county,
  y = ~cumulative_rate,
  color = ~sex,
  type = "bar",
  text = ~cumulative_rate, # Add data labels
  textposition = "auto",  # Automatically position data labels
  colors = c("darkgreen", "darkorange") # Customize colors for sex
) %>%
  layout(
    barmode = "stack", 
    title = "COVID-19 Cumulative Rates Among Age Group 65 and Above by Top 10 Counties and Sex, 2023",
    yaxis = list(title = "Case Rate per 100,000"),
    xaxis = list(title = "County")
  )
Warning: `arrange_()` was deprecated in dplyr 0.7.0.
ℹ Please use `arrange()` instead.
ℹ See vignette('programming') for more help
ℹ The deprecated feature was likely used in the plotly package.
  Please report the issue at <https://github.com/ropensci/plotly/issues>.

Plot Interpretation

The bar plot highlights the top 10 counties in California with the highest COVID-19 case rates per 100,000 population among residents aged 65 and older in 2023. Imperial County stands out with the highest case rate in this age group, highlighting the highest cases rate in the region.