I used the US Contagious Diseases data set to look at the rate of Hepatitis A in the South, West, Midwest, and Northeast. I created an interactive line chart using Plotly. I color-coded the regions and removed the grid lines and the legend title. I also made the background gray.
# Attach dslabs, then list the data sets that package provides
# (data(package = ...) only prints a listing; it loads nothing).
library("dslabs")
data(package="dslabs")
# Attach the core tidyverse packages (dplyr, readr, ggplot2, tidyr, ...).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(RColorBrewer)
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Read the contagious-diseases CSV by its full path rather than calling
# setwd(): changing the working directory is a session-wide side effect and
# makes every later relative path depend on this line having been run.
# NOTE(review): the directory below is machine-specific; point it at a
# project-relative location if this script needs to run elsewhere.
diseases <- read_csv(
  file.path(
    "/Users/smhenderson/Desktop/DATA110/R/Datasets",
    "us_contagious_diseases.csv"
  )
)
## Rows: 16065 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): disease, state
## dbl (4): year, weeks_reporting, count, population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#str(diseases)
# Keep only the Hepatitis A records.
diseases2 <- diseases %>%
  filter(disease == "Hepatitis A")

# Drop Hawaii and Alaska, then compute a yearly rate per 10,000 people.
# The 52 / weeks_reporting factor scales the reported count up to a full
# 52-week year when only part of the year was reported.
diseases3 <- diseases2 %>%
  filter(!state %in% c("Hawaii", "Alaska")) %>%
  mutate(rate = count / population * 10000 * 52 / weeks_reporting) %>%
  # is.finite() is FALSE for NA, NaN and +/-Inf, so this removes missing
  # rates AND the infinite rates produced when weeks_reporting is 0 but
  # count is positive; a plain !is.na() check would let Inf through and
  # make any regional mean containing it infinite.
  filter(!is.na(disease), is.finite(rate))
#unique(diseases3$state)
# Label every row with the US region its state belongs to.
# Conditions are checked top to bottom and the first match wins; a state that
# appears in none of the four lists gets NA as its region.
# (Alaska and Hawaii are listed under "West" even though they were filtered
# out when diseases3 was built.)
diseases_regions <- diseases3 %>%
  mutate(
    region = case_when(
      state %in% c(
        "Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
        "Montana", "Nevada", "Oregon", "Utah", "Washington", "Wyoming",
        "New Mexico"
      ) ~ "West",
      state %in% c(
        "Alabama", "Arkansas", "Delaware", "District Of Columbia", "Florida",
        "Georgia", "Kentucky", "Louisiana", "Maryland", "Mississippi",
        "North Carolina", "Oklahoma", "South Carolina", "Tennessee", "Texas",
        "Virginia", "West Virginia"
      ) ~ "South",
      state %in% c(
        "Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
        "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota",
        "Wisconsin"
      ) ~ "Midwest",
      state %in% c(
        "Connecticut", "Maine", "Massachusetts", "New Hampshire",
        "New Jersey", "New York", "Pennsylvania", "Rhode Island", "Vermont"
      ) ~ "Northeast"
    )
  )
#any_na <- any(is.na(diseases_regions$region))
#na_rows <- diseases_regions[is.na(diseases_regions$region), ]
# Keep only the columns needed for the regional summary.
subset_data <- diseases_regions[c("region", "rate", "year")]

# Average the per-state rates within each region/year combination.
# This is an unweighted mean of state rates (not population-weighted), and
# the formula interface drops rows whose region or rate is NA.
aggregate_data <- aggregate(
  rate ~ region + year,
  data = subset_data,
  FUN = mean
)

# Same grouping and mean applied to the already-aggregated table. Each
# region/year pair has exactly one row at this point, so the rates are
# unchanged; the chart below is drawn from aggregate_data.
aggregate_data2 <- summarise(
  group_by(aggregate_data, region, year),
  rate = mean(rate),
  .groups = "drop"
)
# Interactive chart of the mean Hepatitis A rate per year, one trace per
# region, drawn as thin lines with black-outlined markers.
# The four colours are handed to plotly unnamed; presumably they are assigned
# to the regions in sorted order — confirm against the rendered legend.
p <- plot_ly(
  data = aggregate_data,
  x = ~year,
  y = ~rate,
  color = ~region,
  colors = c("green", "blue", "red", "orange"),
  type = "scatter",
  mode = "markers+lines",
  marker = list(size = 8, line = list(color = "black", width = 0.5)),
  line = list(width = 1)
)

# Titles, hidden grid lines, legend position, gray background, white text.
# NOTE(review): the legend title is given as a bare string; plotly may expect
# list(text = ...) here, in which case no legend title is drawn — confirm
# which outcome is intended before changing it.
p <- plotly::layout(
  p,
  title = "Trends in Hepatitis A By US Regions",
  xaxis = list(title = "Year", showgrid = FALSE),
  yaxis = list(title = "Rate", showgrid = FALSE),
  legend = list(title = "Regions", x = 1.1),
  paper_bgcolor = "gray",
  plot_bgcolor = "gray",
  font = list(color = "white")
)

# Print the widget so it renders in the output.
p