Summary

I used the US Contagious Diseases data set to look at the rate of Hepatitis A in the South, West, Midwest, and Northeast. I created an interactive line chart using Plotly. I color-coded the different regions and removed the grid lines and legend title. I also made the background gray.

Load Packages

library("dslabs")
data(package="dslabs")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(RColorBrewer)
library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Load Dataset

# Load the US contagious diseases data from a local CSV file.
# The folder is held in a variable and joined to the file name with
# file.path() instead of being switched to with setwd(), so reading the file
# does not change the working directory for the rest of the session.
# Edit `data_dir` to point at the folder that holds the CSV on your machine.
data_dir <- "/Users/smhenderson/Desktop/DATA110/R/Datasets"
diseases <- read_csv(file.path(data_dir, "us_contagious_diseases.csv"))
## Rows: 16065 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): disease, state
## dbl (4): year, weeks_reporting, count, population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#str(diseases)

Create Dataset to work with (disease = Hep A)

# Keep only the Hepatitis A records from the full disease table.
diseases2 <- filter(diseases, disease == "Hepatitis A")

Calculate the rate per 10,000 and remove NAs

# Rate per 10,000 people for each state and year, scaled to a 52-week year
# by the number of weeks that actually reported.
# Alaska and Hawaii are excluded first; rows whose disease or computed rate
# is NA are dropped afterwards.
diseases3 <- diseases2 %>%
  filter(!(state %in% c("Hawaii", "Alaska"))) %>%
  mutate(rate = count / population * 10000 * 52 / weeks_reporting) %>%
  filter(!is.na(disease), !is.na(rate))
#unique(diseases3$state)

Assign Regions (West, South, Midwest, Northeast)

# Tag every row with its region. Conditions are checked top to bottom and the
# first match wins; a state that appears in none of the lists gets NA.
diseases_regions <- diseases3 %>%
  mutate(
    region = case_when(
      state %in% c(
        "Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
        "Montana", "Nevada", "Oregon", "Utah", "Washington", "Wyoming",
        "New Mexico"
      ) ~ "West",
      state %in% c(
        "Alabama", "Arkansas", "Delaware", "District Of Columbia", "Florida",
        "Georgia", "Kentucky", "Louisiana", "Maryland", "Mississippi",
        "North Carolina", "Oklahoma", "South Carolina", "Tennessee", "Texas",
        "Virginia", "West Virginia"
      ) ~ "South",
      state %in% c(
        "Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
        "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota",
        "Wisconsin"
      ) ~ "Midwest",
      state %in% c(
        "Connecticut", "Maine", "Massachusetts", "New Hampshire",
        "New Jersey", "New York", "Pennsylvania", "Rhode Island", "Vermont"
      ) ~ "Northeast",
      TRUE ~ NA_character_
    )
  )
#any_na <- any(is.na(diseases_regions$region))
#na_rows <- diseases_regions[is.na(diseases_regions$region), ]

Subset the data, aggregate it by region and year, and calculate the average rate for each region in each year

# Keep only the three columns the regional trend needs.
subset_data <- diseases_regions[c("region", "rate", "year")]

# Average the state-level rates within each region/year combination.
aggregate_data <- aggregate(
  rate ~ region + year,
  data = subset_data,
  FUN = mean
)

# Same grouping again via dplyr: aggregate_data already holds one row per
# region/year, so each mean here is taken over a single value.
aggregate_data2 <- summarize(
  group_by(aggregate_data, region, year),
  rate = mean(rate),
  .groups = "drop"
)

Create interactive line chart using Plotly

# Interactive line-and-marker chart of the average yearly rate, one coloured
# trace per region, built from aggregate_data.
p <- plot_ly(
  data = aggregate_data,
  x = ~year,
  y = ~rate,
  color = ~region,
  colors = c("green", "blue", "red", "orange"),
  type = "scatter",
  mode = "markers+lines",
  marker = list(size = 8, line = list(color = "black", width = 0.5)),
  line = list(width = 1)
)

# Chart and axis titles, grid lines switched off, legend moved right of the
# plot area, gray paper/plot background and white text.
# NOTE(review): the legend title is passed as a plain string; Plotly's layout
# reference describes legend.title as an object with a `text` field, so
# confirm whether the title is meant to be displayed.
p <- layout(
  p,
  title = "Trends in Hepatitis A By US Regions",
  xaxis = list(title = "Year", showgrid = FALSE),
  yaxis = list(title = "Rate", showgrid = FALSE),
  legend = list(title = "Regions", x = 1.1),
  paper_bgcolor = "gray",
  plot_bgcolor = "gray",
  font = list(color = "white")
)

# Print the widget.
p