Throughout the exam, you will be analyzing the data set named us_contagious_diseases in the R package dslabs. Please submit both the .Rmd and knitted .html files.

1.

#install.packages("dslabs")
library(dslabs)
# Short alias for the dslabs data set used in every question below,
# followed by a quick look at its columns and types.
d <- dslabs::us_contagious_diseases
str(d)
## 'data.frame':    16065 obs. of  6 variables:
##  $ disease        : Factor w/ 7 levels "Hepatitis A",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ state          : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year           : num  1966 1967 1968 1969 1970 ...
##  $ weeks_reporting: num  50 49 52 49 51 51 45 45 45 46 ...
##  $ count          : num  321 291 314 380 413 378 342 467 244 286 ...
##  $ population     : num  3345787 3364130 3386068 3412450 3444165 ...
# Number of records per disease.
table(d[["disease"]])
## 
## Hepatitis A     Measles       Mumps   Pertussis       Polio     Rubella 
##        2346        3825        1785        2856        2091        1887 
##    Smallpox 
##        1275
# 15th percentile of population, ignoring missing values.
quantile(d[["population"]], probs = 0.15, na.rm = TRUE)
##    15% 
## 670515

2.

Find the top 5 states with the most “Measles” cases over the 10 years from 1991 to 2000 (both years inclusive).

# Total the Measles counts per state over 1991-2000 before ranking. Sorting
# the raw rows would rank single state-year records, so one state could
# occupy several of the five slots.
d %>%
  filter(disease == "Measles", year >= 1991, year <= 2000) %>%
  group_by(state) %>%
  summarize(total_count = sum(count, na.rm = TRUE)) %>%
  arrange(desc(total_count)) %>%
  head(5)

3.

For the state of Texas,

  (a) Add a variable ave_count, representing the average count per week of reporting (count divided by weeks_reporting).
# Add ave_count to the Texas records. All original columns are kept
# (disease and year in particular) so each value can be traced back to the
# record it was computed from.
# Rows with weeks_reporting == 0 give a non-finite result from the division.
d %>%
  filter(state == "Texas") %>%
  mutate(ave_count = count / weeks_reporting)
  (b) Create a scatterplot with a smooth-line fit on the same plot of ave_count (y-axis) against year (x-axis), using different colors for different diseases.
# Texas only: average count per reporting week against year, one colour per
# disease, with a smoother drawn over the points.
d %>%
  filter(state == "Texas") %>%
  mutate(ave_count = count / weeks_reporting) %>%
  ggplot(aes(x = year, y = ave_count, colour = disease)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

(c) Remove all the observations for disease “Measles” and redo the plot in (b).

# Drop Measles with a logical condition instead of d[-which(...), ]:
# negative indexing with an empty which() result selects zero rows, and the
# index vector was stored under the name `c`, shadowing base::c.
without_m <- d %>%
  filter(disease != "Measles")

# Same plot as the previous part, on the data without Measles.
without_m %>%
  filter(state == "Texas") %>%
  mutate(ave_count = count / weeks_reporting) %>%
  ggplot(aes(x = year, y = ave_count, color = disease)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

4.

  (a) For each state and year combination, find the total count of all diseases.
# Total reported cases across all diseases for every (year, state) pair.
d %>%
  group_by(year, state) %>%
  summarise(total_count = sum(count))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
  (b) For each state and year, create a variable named count_density, defined as the total count of all diseases divided by the population.
  # Density = total cases in a state-year divided by that state-year's
  # population. population is repeated on every disease row of the group
  # (state population per the dslabs docs), so summing it would inflate the
  # denominator by the number of diseases reported; take it once instead.
  d %>%
    group_by(state, year) %>%
    summarize(count_density = sum(count) / first(population))
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
  (c) Find the 5 state and year pairs that have the largest count_density.
 # Five (state, year) pairs with the highest count_density.
 # The population is taken once per group (it is repeated on every disease
 # row), the sort is descending so the largest values come first, and only
 # the top 5 rows are kept.
 d %>%
   group_by(state, year) %>%
   summarize(count_density = sum(count) / first(population)) %>%
   ungroup() %>%
   arrange(desc(count_density)) %>%
   head(5)
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.