Project Milestone 4

Author

Usah Dutson and Jillian Kadota Tomlinson

Visualizations

1. Final data sets for creation of visualization

1) Join all data sets together

2) Calculate any remaining data elements needed for analysis

3) Show code used to create joined data set, but please do not print full data frame output (showing data structure with str() is okay)

#Age + Vaccination data
age_joined <- 
  inner_join(ca_vax_quarter_3_pivot, flu_age, 
              by = c("county",  "quarter", "age_cat")) %>%
  select(county, quarter, estimated_pop, fully_vaccinated, vax_rate, age_cat, pop, new_infections, flu_rate) %>%
  mutate(flu_risk = round(new_infections/pop * 100000,0)) %>%
  group_by(age_cat) %>%
  mutate(avg_inf_rate_age = round(mean(flu_rate),2)) %>%
ungroup()

str(age_joined)
tibble [693 × 11] (S3: tbl_df/tbl/data.frame)
 $ county          : chr [1:693] "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ quarter         : Date[1:693], format: "2023-04-01" "2023-01-01" ...
 $ estimated_pop   : num [1:693] 340355 340355 340355 340355 754988 ...
 $ fully_vaccinated: num [1:693] 214952 213845 210053 202047 669899 ...
 $ vax_rate        : num [1:693] 0.632 0.628 0.617 0.594 0.887 ...
 $ age_cat         : chr [1:693] "0-17" "0-17" "0-17" "0-17" ...
 $ pop             : num [1:693] 338015 338015 338015 1335 752145 ...
 $ new_infections  : num [1:693] 8102 115912 21974 3 18883 ...
 $ flu_rate        : num [1:693] 0.02397 0.34292 0.06501 0.00225 0.02511 ...
 $ flu_risk        : num [1:693] 2397 34292 6501 225 2511 ...
 $ avg_inf_rate_age: num [1:693] 0.12 0.12 0.12 0.12 0.11 0.11 0.11 0.11 0.1 0.1 ...
#Sex + Vaccination data
sex_joined <- 
  inner_join(ca_vax_quarter_3_pivot, flu_sex, 
              by = c("county",  "quarter", "sex")) %>%
  select(county, quarter, estimated_pop, fully_vaccinated, vax_rate, sex, pop, new_infections, flu_rate) %>%
  mutate(flu_risk = round(new_infections/pop * 100000,0)) %>%
  group_by(sex) %>%
  mutate(avg_inf_rate_sex = round(mean(flu_rate),2)) %>%
ungroup()

str(sex_joined)
tibble [427 × 11] (S3: tbl_df/tbl/data.frame)
 $ county          : chr [1:427] "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ quarter         : Date[1:427], format: "2023-04-01" "2023-01-01" ...
 $ estimated_pop   : num [1:427] 852167 852167 852167 852167 826167 ...
 $ fully_vaccinated: num [1:427] 727728 727140 724823 720211 685653 ...
 $ vax_rate        : num [1:427] 0.854 0.853 0.851 0.845 0.83 ...
 $ sex             : chr [1:427] "Female" "Female" "Female" "Female" ...
 $ pop             : num [1:427] 856569 856569 856569 192117 832969 ...
 $ new_infections  : num [1:427] 20887 295520 53445 417 20483 ...
 $ flu_rate        : num [1:427] 0.02438 0.345 0.06239 0.00217 0.02459 ...
 $ flu_risk        : num [1:427] 2438 34500 6239 217 2459 ...
 $ avg_inf_rate_sex: num [1:427] 0.11 0.11 0.11 0.11 0.1 0.1 0.1 0.1 0.11 0.1 ...
#Race/Ethnicity + Vaccination data
re_joined <- 
  inner_join(ca_vax_quarter_3_pivot, flu_race_ethnicity, 
              by = c("county",  "quarter", "race_ethnicity")) %>%
  select(county, quarter, estimated_pop, fully_vaccinated, vax_rate, race_ethnicity, pop, new_infections, flu_rate) %>%
  mutate(flu_risk = round(new_infections/pop * 100000,0)) %>%
  group_by(race_ethnicity) %>%
  mutate(avg_inf_risk_re= round(mean(flu_risk),2), 
         above_avg_inf_risk = ifelse(flu_risk > avg_inf_risk_re, "Above Average", "Below Average"),
         avg_vax_rate_re = round(mean(vax_rate),2),
         above_avg_vax_rate = ifelse(vax_rate > avg_vax_rate_re, "Above Average", "Below Average")) %>%
  ungroup()

str(re_joined)
tibble [1,052 × 14] (S3: tbl_df/tbl/data.frame)
 $ county            : chr [1:1052] "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ quarter           : Date[1:1052], format: "2023-04-01" "2023-01-01" ...
 $ estimated_pop     : num [1:1052] 4113 4113 4113 539237 539237 ...
 $ fully_vaccinated  : num [1:1052] 3525 3522 3512 472751 472305 ...
 $ vax_rate          : num [1:1052] 0.857 0.856 0.854 0.877 0.876 ...
 $ race_ethnicity    : chr [1:1052] "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "American Indian or Alaska Native, Non-Hispanic" "Asian, Non-Hispanic" ...
 $ pop               : num [1:1052] 4986 4986 4986 444318 444318 ...
 $ new_infections    : num [1:1052] 145 1704 301 11146 153428 ...
 $ flu_rate          : num [1:1052] 0.0291 0.3418 0.0604 0.0251 0.3453 ...
 $ flu_risk          : num [1:1052] 2908 34176 6037 2509 34531 ...
 $ avg_inf_risk_re   : num [1:1052] 8428 8428 8428 10711 10711 ...
 $ above_avg_inf_risk: chr [1:1052] "Below Average" "Above Average" "Below Average" "Below Average" ...
 $ avg_vax_rate_re   : num [1:1052] 0.53 0.53 0.53 0.69 0.69 0.69 0.69 0.56 0.56 0.56 ...
 $ above_avg_vax_rate: chr [1:1052] "Above Average" "Above Average" "Above Average" "Above Average" ...

2. Visualizations (at least one per group member)

1) Table 1. Interactive Table displaying flu and vaccination data across all counties in California for different levels of race/ethnicity by quarter.

Interpretation: We have chosen to display the flu and vaccination data across all counties in California for the different levels of race/ethnicity by quarter. We have flagged the groups of people that less than 70% of the population has been vaccinated against COVID 19 in yellow, and sorted by the risk of flu per 100,000 population. We have also included a column specifying if the infection risk for that race/ethnicity category in that given quarter is above the average infection risk for that race/ethnicity across all quarters of data. We note that Hispanic (any race) and Multiracial categories consistently have lower than 70% of the population vaccinated and have among the top flu risks per 100,000 in any quarter of data.

table1 <- re_joined %>% 
  select(county,quarter, race_ethnicity, vax_rate, flu_risk, above_avg_inf_risk) %>% 
  group_by(race_ethnicity, quarter) %>% 
  reframe(
    mean_vax_rate = mean(unique(vax_rate)),
    mean_flu_rate = mean(unique(flu_risk)),
    above_avg_inf_risk = unique(above_avg_inf_risk)) %>% 
  ungroup() 

datatable(table1,
          options = list(
            pageLength=8,
            lengthMenu=c(8,16,24,32),
            order=list(3,'desc'),
            columnDefs=list(
              list(className='dt-center',targets=0:4)
            ),
            dom = 'ltip'
          ),
          rownames=FALSE,
          colnames=c("Race and Ethnicity", "Quarter", "COVID Vaccination Rate", "Flu Risk per 100,000", "Infection Risk compared to Overall Race/Ethnicity average"),
          filter="top",
          editable=F,
          ) %>%
          formatRound(3,3) %>%
          formatRound(4,0) %>%
          formatStyle(3,
                      backgroundColor = styleInterval(
                                        c(0.700),c('yellow', 'white'))) 

2) Plot 1. Flu infection rate per 100,000 across all counties grouped by race and quarter.

Interpretation: This bar graph demonstrates that while some race/ethnicities including Alaska Native, White (Non-Hispanic) and Hispanic (any race) have proportionally higher infection rates across all quarters of data, it is clear that the quarter starting on 1/1/2023 (demonstrated by the medium blue part of each bar) consists of the highest number of infections across all races/ethnicities.

re_joined_bar <- re_joined %>%
  group_by(race_ethnicity, quarter) %>%
  summarize(total_infections=sum(new_infections),
            total_population=sum(unique(estimated_pop)),
            infection_rate=round(total_infections/total_population,2)) %>%
ungroup()
`summarise()` has grouped output by 'race_ethnicity'. You can override using
the `.groups` argument.
ggplot(re_joined_bar,
       aes(x=race_ethnicity, y=infection_rate, fill=quarter)) + 
  geom_bar(stat = "identity") +
  labs(x = "Race/Ethnicity", y = "% infected",
   title = "Percent of population infected by ehtnicty and quarter") +
theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1, vjust = 0.9))

3) Plot 2. Flu rate by county for Hispanic (any race) in the quarter starting on 1/1/2023.

Interpretation: Given some of the patterns we have observed in Table 1 and that flu rate seems to be consistent across race but highest in the quarter starting in January 2023 from plot 1, we have chosen to focus on the Hispanic (any race) population in additional visualizations to determine if there is any correlation between COVID vaccination rates and flu within a given race in the quarter with the highest infection rates. From this plot, we see that the proportion of the population with flu ranges from a low of 32.37K per 100,000 in San Mateo county to a high of 34.92K per 100,000 in Calaveras county for Hispanic (any race) populations.

re_plot_hisp <- re_joined %>%
  filter(quarter=="2023-01-01",
         race_ethnicity == "Hispanic (any race)") %>%
  select(county, quarter, flu_rate, vax_rate, race_ethnicity, pop, estimated_pop, fully_vaccinated, new_infections, flu_risk) 

plot_ly(
  re_plot_hisp,
  x=~county,
  y=~flu_risk,
  name="Flu Rate per 100,000",
  type="scatter",
  mode="markers") %>%
layout(xaxis=list(title="Flu Rate by county for Hispanic (any race) in the quarter starting 2023-01-01"),
       yaxis=list(title="Flu Rate per 100,000"))

4) Plot 3. Flu rate by county for Hispanic (any race) in the quarter starting on 1/1/2023 vs. proportion of the population that is fully vaccinated for COVID 19.

Interpretation: In order to see if there is any correlation between the rate of flu and the proportion of the population that is vaccinated, we plot the proportion of the population that is vaccinated vs. the rate of flu. We see that within Hispanic populations in the quarter starting on 1/1/23, there may be pattern demonstrating potential lower rates of flu infection per 100,000 among populations with higher proportions who are fully vaccinated against COVID-19, as demonstrated by the blue descending line of best fit.

ggplot(re_plot_hisp, aes(x = vax_rate, y = flu_risk)) +
  geom_point() +
geom_label_repel(aes(label=county, size = NULL), nudge_y = 0.5) +
  geom_smooth(fullrange=TRUE, method = "lm", se = FALSE) +
  labs(x = "Proportion of Population that is Vaccinated, Hispanic (any race)", y = "Flu rate per 100,000 population",
                  title = "Flu Rate Compared to % of Vaccinated Hispanic Population",
                  subtitle = "Data by county in quarter starting 1/1/2023")
`geom_smooth()` using formula = 'y ~ x'