Section 11.15
1. Reproduce the image plot we previously made but for smallpox. For this plot, do not include years in which cases were not reported in 10 or more weeks.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RColorBrewer)
library(dslabs)
# Load the disease table into the workspace and list its column names.
data("us_contagious_diseases")
colnames(us_contagious_diseases)
## [1] "disease" "state" "year" "weeks_reporting"
## [5] "count" "population"
# Exercise 1: tile plot of yearly smallpox rates by state.
the_disease <- "Smallpox"

# Drop Hawaii and Alaska, keep only the chosen disease and the state-years
# with at least 10 reporting weeks, scale the counts to a 52-week rate per
# 10,000 people, and order the states by their median rate in years <= 1963.
dat <- us_contagious_diseases |>
  filter(
    !state %in% c("Hawaii", "Alaska"),
    disease == the_disease,
    weeks_reporting >= 10
  ) |>
  mutate(
    rate = count / population * 10000 * 52 / weeks_reporting,
    state = reorder(state, ifelse(year <= 1963, rate, NA), median, na.rm = TRUE)
  )

# One tile per state-year, filled on a square-root colour scale.
dat |>
  ggplot(aes(x = year, y = state, fill = rate)) +
  geom_tile(colour = "grey50") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_fill_gradientn(colours = brewer.pal(9, "Reds"), trans = "sqrt") +
  geom_vline(xintercept = 1963, colour = "blue") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    legend.position = "bottom",
    text = element_text(size = 8)
  ) +
  labs(title = the_disease, x = "", y = "")
2. Now reproduce the time series plot we previously made, but this time following the instructions of the previous question for smallpox.
# Load the disease table again for this exercise and list its column names.
data("us_contagious_diseases")
colnames(us_contagious_diseases)
## [1] "disease" "state" "year" "weeks_reporting"
## [5] "count" "population"
# Exercise 2: time series of smallpox rates.
the_disease <- "Smallpox"

# Drop Hawaii and Alaska, keep only the chosen disease and the state-years
# with at least 10 reporting weeks, and express counts per 10,000 people.
dat <- us_contagious_diseases |>
  filter(
    !state %in% c("Hawaii", "Alaska"),
    disease == the_disease,
    weeks_reporting >= 10
  ) |>
  mutate(
    rate = count / population * 10000,
    state = reorder(state, rate)
  )

# California only, skipping rows whose rate is missing.
dat |>
  filter(state == "California", !is.na(rate)) |>
  ggplot(aes(x = year, y = rate)) +
  geom_line() +
  geom_vline(xintercept = 1963, colour = "blue") +
  ylab("Cases per 10,000")
# Yearly US-wide rate per 10,000: total cases over total population, using
# only the state-years with at least 10 reporting weeks.
avg <- us_contagious_diseases |>
  filter(disease == the_disease, weeks_reporting >= 10) |>
  group_by(year) |>
  summarize(
    us_rate = sum(count, na.rm = TRUE) / sum(population, na.rm = TRUE) * 10000
  )

# One faint grey line per state with the US average drawn on top.
# `linewidth` replaces the `size` aesthetic, which is deprecated for lines
# since ggplot2 3.4.0 and triggered a deprecation warning.
# NOTE(review): the axis breaks, the label coordinates (1955, 50) and the
# 1963 reference line are hard-coded; confirm they suit the smallpox data.
dat |>
  ggplot() +
  geom_line(
    aes(year, rate, group = state),
    color = "grey50", alpha = 0.2, linewidth = 1, show.legend = FALSE
  ) +
  geom_line(aes(year, us_rate), data = avg, linewidth = 1) +
  scale_y_continuous(trans = "sqrt", breaks = c(5, 25, 125, 300)) +
  ggtitle("Cases per 10,000 by state") +
  xlab("") +
  ylab("") +
  geom_text(
    data = data.frame(x = 1955, y = 50),
    mapping = aes(x, y, label = "US average"),
    color = "black"
  ) +
  geom_vline(xintercept = 1963, col = "blue")
3. For the state of California, make a time series plot showing rates for all diseases. Include only years with 10 or more weeks reporting. Use a different color for each disease.
# Exercise 3: California rate per 10,000 for every disease, restricted to
# years with at least 10 reporting weeks. The finished plot is stored in
# `dat2` and printed afterwards.
dat2 <- us_contagious_diseases |>
  filter(state == "California", weeks_reporting >= 10) |>
  group_by(year, disease) |>
  summarize(rate = sum(count) / sum(population) * 10000) |>
  ggplot(aes(x = year, y = rate, colour = disease)) +
  geom_line()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
dat2
4. Now do the same for the rates for the US. Hint: compute the US rate by using summarize: the total divided by total population.
# Exercise 4: US-wide rate per 10,000 for every disease, one coloured line
# per disease. Rows without a population figure are dropped so numerator and
# denominator cover the same state-years. As in the California plot, only
# state-years with at least 10 reporting weeks are used and the rate is
# expressed per 10,000 people.
us_contagious_diseases |>
  filter(!is.na(population), weeks_reporting >= 10) |>
  group_by(year, disease) |>
  # Total cases over total population; drop the grouping once summarised.
  summarize(rate = sum(count) / sum(population) * 10000, .groups = "drop") |>
  ggplot(aes(year, rate, color = disease)) +
  geom_line()