In your RMD file / Rpubs document, be sure you describe in a paragraph what dataset you have used and document how you have created your graph.

Load Libraries

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.1
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dslabs)
library(wesanderson)
library(ggthemes)

Import US Contagious Disease Dataset and Determine Disease Count per 1,000,000 people

# Take a working copy of the US contagious diseases data set shipped with dslabs.
cont1 <- dslabs::us_contagious_diseases

# Inspect column names and types before transforming anything.
str(cont1)

## 'data.frame':    16065 obs. of  6 variables:
##  $ disease        : Factor w/ 7 levels "Hepatitis A",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ state          : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year           : num  1966 1967 1968 1969 1970 ...
##  $ weeks_reporting: num  50 49 52 49 51 51 45 45 45 46 ...
##  $ count          : num  321 291 314 380 413 378 342 467 244 286 ...
##  $ population     : num  3345787 3364130 3386068 3412450 3444165 ...

# List the diseases recorded in the data (the levels of the disease factor).
levels(cont1[["disease"]])

## [1] "Hepatitis A" "Measles"     "Mumps"       "Pertussis"   "Polio"      
## [6] "Rubella"     "Smallpox"

# Number of rows available for each disease; the result stays grouped by disease.
count(group_by(cont1, disease), disease)

## # A tibble: 7 x 2
## # Groups:   disease [7]
##   disease         n
##   <fct>       <int>
## 1 Hepatitis A  2346
## 2 Measles      3825
## 3 Mumps        1785
## 4 Pertussis    2856
## 5 Polio        2091
## 6 Rubella      1887
## 7 Smallpox     1275

# Strongly penalise scientific notation so large numbers print in fixed form.
options(scipen = 999)

# Keep every disease except Measles and drop rows with zero reporting weeks
# (they would divide by zero below). The rate scales the reported count up to
# a full 52-week year, expresses it per 1,000,000 people, and truncates it to
# an integer. The result stays grouped by state and year.
cont2 <- cont1 %>%
  filter(disease != "Measles", weeks_reporting != 0) %>%
  group_by(state, year) %>%
  mutate(
    cases_per_mill = as.integer(
      ((count / population) * 1000000) * 52 / weeks_reporting
    )
  )

Group States by Region

# Label every row with the U.S. region its state belongs to.
#
# The region column is created directly by mutate(); no placeholder column has
# to be bound on first, so nothing depends on an auto-generated column name
# such as `...8`. The four lists cover all 51 state levels; a state missing
# from every list would get NA. `region` is a character column.
# ungroup() drops the state/year grouping inherited from cont2, which no later
# step relies on.
cont3 <- cont2 %>%
  ungroup() %>%
  mutate(
    region = case_when(
      state %in% c(
        "Alabama", "Arkansas", "Kentucky", "Louisiana", "Virginia",
        "Tennessee", "Florida", "Georgia", "Mississippi", "North Carolina",
        "South Carolina", "West Virginia", "Texas", "Maryland", "Delaware",
        "District Of Columbia", "Oklahoma"
      ) ~ "South",
      state %in% c(
        "Connecticut", "Maine", "Massachusetts", "New Hampshire",
        "New Jersey", "New York", "Pennsylvania", "Rhode Island", "Vermont"
      ) ~ "Northeast",
      state %in% c(
        "Iowa", "Kansas", "Missouri", "Nebraska", "North Dakota",
        "South Dakota", "Illinois", "Indiana", "Michigan", "Minnesota",
        "Ohio", "Wisconsin"
      ) ~ "Midwest",
      state %in% c(
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico",
        "Utah", "Wyoming", "Alaska", "California", "Hawaii", "Oregon",
        "Washington"
      ) ~ "West"
    )
  )

Plot 1: Did certain regions in the US do a worse job of containing contagious diseases than others?

# Breaks shown on the year axis of the faceted plot.
ticks <- c(1940, 1970, 2000)

# One point per disease/state/year: x = year, y = annualised cases per million,
# colour = region, point size = state population; one facet per disease.
# NOTE(review): the title says "1930 to 1970" but the axis breaks run to 2000;
# confirm the intended year range.
ggplot(
  cont3,
  mapping = aes(year, cases_per_mill, color = region, size = population)
) +
  geom_point() +
  facet_grid(~disease) +
  scale_color_manual(values = wes_palette("Moonrise3", n = 4)) +
  scale_x_continuous(breaks = ticks) +
  # ylim() discards points above 7500 (with a warning) rather than zooming in.
  ylim(c(0, 7500)) +
  xlab("Year") +
  ylab("Cases Per Million") +
  ggtitle("Annual Cases of Disease Per Million People By Region from 1930 to 1970") +
  # theme_solarized() is a complete theme: adding it replaces every theme
  # setting made before it. It therefore has to come first; placed after the
  # title tweak it silently undid the centring.
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "top",
    legend.title = element_blank()
  )

## Warning: Removed 25 rows containing missing values (geom_point).

Plot 2: Did certain regions in the US do a worse job of containing Polio than others?

# Restrict the state-level data to Polio.
cont4 <- cont3 %>%
  filter(disease == "Polio")

# One point per state/year: x = year, y = annualised Polio cases per million,
# colour = region, point size = state population.
# NOTE(review): the title says "1930 to 1970", but the Polio rows start in
# 1928; confirm the intended wording.
ggplot(
  cont4,
  mapping = aes(year, cases_per_mill, color = region, size = population)
) +
  geom_point() +
  scale_color_manual(values = wes_palette("Moonrise3", n = 4)) +
  xlab("Year") +
  ylab("Cases Per Million") +
  ggtitle("Annual Cases of Polio Per Million People By Region from 1930 to 1970") +
  # theme_solarized() is a complete theme: adding it replaces every theme
  # setting made before it, so the title centring must be applied afterwards.
  theme_solarized() +
  theme(plot.title = element_text(hjust = 0.5))

## Warning: Removed 16 rows containing missing values (geom_point).

Plot 3: Did certain regions in the US do a worse job of containing Polio than others? (Regional averages)

# Average the state-level Polio figures within each region and year.
#
# na.rm = TRUE: without it, one state with a missing value turns the whole
# region-year mean into NA, and that region then vanishes from the plot for
# that year. With it, the mean is taken over the states that do have data.
# .groups = "drop_last" spells out summarise()'s default regrouping (the
# result stays grouped by region) instead of relying on the implicit default.
cont5 <- cont4 %>%
  group_by(region, year) %>%
  summarise(
    cases_per_mill_region = mean(cases_per_mill, na.rm = TRUE),
    pop_per_region = mean(population, na.rm = TRUE),
    .groups = "drop_last"
  )

cont5

## # A tibble: 164 x 4
## # Groups:   region [4]
##    region   year cases_per_mill_region pop_per_region
##    <chr>   <dbl>                 <dbl>          <dbl>
##  1 Midwest  1928                  43.5       3157608.
##  2 Midwest  1929                  14.8       3189133.
##  3 Midwest  1930                 130         3216175 
##  4 Midwest  1931                  93.8       3238191.
##  5 Midwest  1932                  22         3255736.
##  6 Midwest  1933                  41.6       3269682.
##  7 Midwest  1934                  30.5       3280943.
##  8 Midwest  1935                  27.6       3290452.
##  9 Midwest  1936                  29.1       3299160.
## 10 Midwest  1937                  91.2       3308026.
## # … with 154 more rows

# One point per region/year: x = year, y = mean cases per million across the
# region's states, colour = region, point size = mean state population.
# NOTE(review): the title says "1930 to 1970", but the summarised data start
# in 1928; confirm the intended wording.
ggplot(
  cont5,
  mapping = aes(
    year,
    cases_per_mill_region,
    color = region,
    size = pop_per_region
  )
) +
  geom_point() +
  scale_color_manual(values = wes_palette("Moonrise3", n = 4)) +
  xlab("Year") +
  ylab("Cases Per Million") +
  ggtitle("Annual Cases of Polio Per Million People By Region from 1930 to 1970") +
  # theme_solarized() is a complete theme: adding it replaces every theme
  # setting made before it, so the title centring must be applied afterwards.
  theme_solarized() +
  theme(plot.title = element_text(hjust = 0.5))

## Warning: Removed 8 rows containing missing values (geom_point).

The above graphs were developed using the dslabs package and its US contagious diseases dataset (us_contagious_diseases). They are variations of the Measles graph shown in class last Wednesday. The key differences are that I used a scatterplot rather than a heatmap, and that I looked at annual cases per one million people rather than per 10,000 people. I also decided to look at cases by region instead of by state. The first graph looks at every disease except Measles, while the second graph focuses on Polio; both graphs preserve the state-level data (population by state and cases per one million people) but color each point by region. The third graph is slightly different in that it omits the state-level data: for each region and year it plots the average, across that region's states, of annual cases per one million people and of state population. I thought it would be interesting to use a scatterplot because it lets me show more than three variables at once (rate, year, region, and population), which I would be unable to do with a heatmap.

Week 8 HW

Hudson Finch-Batista

10/25/2020