Week 8 Assignment

Author

Qian He

Load the data

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("dslabs")
# List every dataset that ships with the dslabs package
data(package = "dslabs")
# Load the contagious-disease dataset into the workspace
data("us_contagious_diseases")
# Preview the first rows. Pass the object itself, not its name in quotes:
# head() on a string just echoes that string back.
head(us_contagious_diseases)

[1] "us_contagious_diseases"

# Drop every row that has a missing value in any of the five fields used
# below. `population` is intentionally not part of this check.
disease_nona <- drop_na(
  us_contagious_diseases,
  disease, state, year, weeks_reporting, count
)

# Preview the cleaned dataset
head(disease_nona)

      disease   state year weeks_reporting count population
1 Hepatitis A Alabama 1966              50   321    3345787
2 Hepatitis A Alabama 1967              49   291    3364130
3 Hepatitis A Alabama 1968              52   314    3386068
4 Hepatitis A Alabama 1969              49   380    3412450
5 Hepatitis A Alabama 1970              51   413    3444165
6 Hepatitis A Alabama 1971              51   378    3481798

Group all states into 4 regions: Northeast, South, North Central, and West

# Add a `region` column that assigns each state to one of four regions
# (Northeast, South, North Central, West). Any state name that matches none
# of the lists below is labelled "Others".
disease_nona1 <- disease_nona |>
  mutate(region = case_when(
    state %in% c(
      "Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey",
      "New York", "Pennsylvania", "Rhode Island", "Vermont"
    ) ~ "Northeast",
    # Both capitalizations of the District of Columbia are listed so that
    # either spelling in the data is matched.
    state %in% c(
      "Alabama", "Arkansas", "Delaware", "District of Columbia",
      "District Of Columbia", "Florida", "Georgia", "Kentucky", "Louisiana",
      "Maryland", "Mississippi", "North Carolina", "Oklahoma",
      "South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia"
    ) ~ "South",
    state %in% c(
      "Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
      "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota",
      "Wisconsin"
    ) ~ "North Central",
    state %in% c(
      "Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
      "Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington",
      "Wyoming"
    ) ~ "West",
    .default = "Others"
  ))

Replace each state name with its abbreviation

# Replace each full state name with its two-letter postal abbreviation.
# Base R already provides the 50 state names (`state.name`) and their
# abbreviations (`state.abb`) in matching order; the District of Columbia is
# not among them, so it is appended using the spelling found in the data.
state_lookup <- c(
  setNames(state.abb, state.name),
  "District Of Columbia" = "DC"
)

disease_nona1 <- disease_nona1 |>
  mutate(
    # Convert to character first so the values can be looked up by name
    state = as.character(state),
    # Names missing from the lookup come back as NA; coalesce() keeps the
    # original value in that case, so unknown names are left unchanged.
    state = coalesce(unname(state_lookup[state]), state)
  )

Summarize data by state

# Build one summary row per state (region is carried along as a grouping key).
state1 <- disease_nona1 |>
  group_by(state, region) |>
  summarize(
    # Total number of reported cases over all years and diseases
    total_count = sum(count),
    # Average population stands in for each state's size. `population` is not
    # covered by the earlier NA filter, so missing values are skipped here;
    # otherwise a single NA would make the whole state's average NA.
    avg_population = mean(population, na.rm = TRUE),
    # Return an ungrouped result so later steps are not silently grouped
    .groups = "drop"
  )

`summarise()` has regrouped the output.
ℹ Summaries were computed grouped by state and region.
ℹ Output is grouped by state.
ℹ Use `summarise(.groups = "drop_last")` to silence this message.
ℹ Use `summarise(.by = c(state, region))` for per-operation grouping
  (`?dplyr::dplyr_by`) instead.

Calculate the overall disease rate for the country

# Overall disease rate for the whole dataset, in cases per million people.
r <- disease_nona |>
  # `population` is not covered by the earlier NA filter. Drop rows without a
  # population so the numerator and denominator use the same records and the
  # sum does not collapse to NA.
  filter(!is.na(population)) |>
  # disease rate = count / population, scaled to "per million"
  summarize(rate = sum(count) / sum(population) * 10^6) |>
  pull(rate)

Create a scatterplot

# Extra themes for better visualization
library(ggthemes)
# Label placement that avoids overlapping text
library(ggrepel)

# Set the plotting theme for the figure below
ds_theme_set()

# Scatterplot of total disease count (x) against average state population in
# millions (y), both on log10 axes. Points are coloured by region.
ggplot(state1, aes(x = total_count, y = avg_population / 10^6)) +
  # Manual reference line (disabled):
  # geom_abline(slope = 1, intercept = -log10(r), lty = 2, col = "blue") +
  geom_point(aes(color = region), size = 3, alpha = 0.9) +
  # The label aesthetic is mapped only in this layer so that geom_smooth()
  # does not receive (and warn about) an aesthetic it cannot use.
  # nudge_x: horizontal shift of the labels; segment.colour: colour of the
  # line from label to point; box.padding: space reserved around each label.
  geom_text_repel(
    aes(label = state),
    nudge_x = 0.005,
    segment.colour = "orange",
    size = 3,
    box.padding = 0.3
  ) +
  scale_x_log10("Contagious Diseases Count(log scale)") +
  scale_y_log10("State Population(million,log scale)") +
  ggtitle("        U.S. Contagious Diseases vs Population by State ") +
  # A single colour scale: the legend describes regions, not states
  scale_color_brewer(name = "Region", palette = "Set2") +
  # theme_minimal(base_size = 14, base_family = "serif") +
  # Automatic reference line: a dashed linear fit drawn on top of the points.
  # `linetype` is the correct argument name; the original `Ity` was ignored.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    linetype = 2,
    color = "lightgreen",
    linewidth = 1
  )

Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Warning in geom_smooth(method = lm, se = FALSE, Ity = 2, color = "lightgreen",
: Ignoring unknown parameters: `Ity`

`geom_smooth()` using formula = 'y ~ x'

Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

Summary

I chose the U.S. Contagious Diseases dataset to explore the relationship between disease and population across U.S. states. Initially, I loaded the dataset and filtered out NA values. I also grouped all states into 4 regions: Northeast, South, North Central, and West. Then I replaced each state’s name with its abbreviation. I grouped the data by state and region, then calculated the total number of disease cases across all years and the average population for each state. Since the population does not change significantly over time, I used the average population as a representation of each state’s size.

For the visualization part, I created a scatterplot. The x-axis is the total disease count, and the y-axis is the average population, both on a log scale, given the large sizes of both variables. Each point represents a state, and each color represents a region. I added state abbreviations as labels using geom_text_repel to avoid overlapping text and improve readability. Additionally, I added a reference line using geom_smooth, which fits a linear trend through the points on the log-log scale (the manual line based on the country’s overall disease rate is left commented out in the code). Unfortunately, it was too complicated to change the x-axis values into simpler numbers without using more complex code on scale_x_log10. The graph shows a clear positive relationship between population size and total disease cases. The North Central and Northeast regions appear to have higher disease counts, while the West region has the lowest. The South region has a relatively moderate disease count.

Citation

Data 110 class notes

https://ggrepel.slowkow.com/articles/examples.html (geom_text_repel code)

https://www.cdc.gov/nchs/hus/sources-definitions/geographic-region.htm (U.S. states)