In your RMD file / Rpubs document, be sure you describe in a paragraph what dataset you have used and document how you have created your graph.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dslabs)
library(wesanderson)
library(ggthemes)
# Take the contagious-disease table exported by the dslabs package and look
# at its structure before transforming anything.
cont1 <- dslabs::us_contagious_diseases
str(object = cont1)
## 'data.frame': 16065 obs. of 6 variables:
## $ disease : Factor w/ 7 levels "Hepatitis A",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 1966 1967 1968 1969 1970 ...
## $ weeks_reporting: num 50 49 52 49 51 51 45 45 45 46 ...
## $ count : num 321 291 314 380 413 378 342 467 244 286 ...
## $ population : num 3345787 3364130 3386068 3412450 3444165 ...
# Which diseases does the `disease` factor contain?
levels(cont1[["disease"]])
## [1] "Hepatitis A" "Measles" "Mumps" "Pertussis" "Polio"
## [6] "Rubella" "Smallpox"
# How many rows (state-year records) are there for each disease?
cont1 %>%
  group_by(disease) %>%
  count(disease)
## # A tibble: 7 x 2
## # Groups: disease [7]
## disease n
## <fct> <int>
## 1 Hepatitis A 2346
## 2 Measles 3825
## 3 Mumps 1785
## 4 Pertussis 2856
## 5 Polio 2091
## 6 Rubella 1887
## 7 Smallpox 1275
# Strongly prefer fixed over scientific notation whenever numbers are printed
# or turned into labels.
options(scipen = 999)

# cont2: every disease except measles, restricted to rows with at least one
# reporting week (a zero would divide by zero below).
# cases_per_mill is the yearly rate per one million residents; multiplying by
# 52 / weeks_reporting extrapolates partial-year reporting to a full year.
# as.integer() truncates the rate toward zero.
# The calculation is row-wise, so no group_by() is needed; as_tibble() keeps
# the compact tibble printing without leaving a grouping attached.
cont2 <- cont1 %>%
  as_tibble() %>%
  filter(disease != "Measles", weeks_reporting != 0) %>%
  mutate(
    cases_per_mill = as.integer(
      ((count / population) * 1000000) * 52 / weeks_reporting
    )
  )

# cont3: cont2 plus a character `region` column derived from `state`.
# A single case_when() assigns the region directly; no placeholder column has
# to be bound on and renamed first. A state that appears in none of the four
# lists would get NA.
cont3 <- cont2 %>%
  mutate(
    region = case_when(
      state %in% c(
        "Alabama", "Arkansas", "Kentucky", "Louisiana", "Virginia",
        "Tennessee", "Florida", "Georgia", "Mississippi", "North Carolina",
        "South Carolina", "West Virginia", "Texas", "Maryland", "Delaware",
        "District Of Columbia", "Oklahoma"
      ) ~ "South",
      state %in% c(
        "Connecticut", "Maine", "Massachusetts", "New Hampshire",
        "New Jersey", "New York", "Pennsylvania", "Rhode Island", "Vermont"
      ) ~ "Northeast",
      state %in% c(
        "Iowa", "Kansas", "Missouri", "Nebraska", "North Dakota",
        "South Dakota", "Illinois", "Indiana", "Michigan", "Minnesota",
        "Ohio", "Wisconsin"
      ) ~ "Midwest",
      state %in% c(
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico",
        "Utah", "Wyoming", "Alaska", "California", "Hawaii", "Oregon",
        "Washington"
      ) ~ "West"
    )
  )
# Years labelled on the x axis of every facet.
ticks <- c(1940, 1970, 2000)

# Scatterplot of the annualised rate per million against year, one facet per
# disease, coloured by region and sized by state population.
# NOTE(review): the axis breaks run to 2000 while the title says
# "1930 to 1970" -- confirm the title matches the years actually plotted.
ggplot(
  cont3,
  mapping = aes(year, cases_per_mill, color = region, size = population)
) +
  geom_point() +
  facet_grid(~disease) +
  scale_color_manual(values = wes_palette("Moonrise3", n = 4)) +
  scale_x_continuous(breaks = ticks) +
  # ylim() discards (with a warning) any point whose rate is above 7500.
  ylim(c(0, 7500)) +
  xlab("Year") +
  ylab("Cases Per Million") +
  ggtitle("Annual Cases of Disease Per Million People By Region from 1930 to 1970") +
  # theme_solarized() is a complete theme and replaces any theme settings
  # added before it, so every theme() tweak has to come afterwards.
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "top",
    legend.title = element_blank()
  )
## Warning: Removed 25 rows containing missing values (geom_point).
# cont4: the polio rows of cont3 only.
cont4 <- cont3 %>%
  filter(disease == "Polio")

# Same scatterplot as for all diseases, restricted to polio: annualised rate
# per million against year, coloured by region and sized by state population.
ggplot(
  cont4,
  mapping = aes(year, cases_per_mill, color = region, size = population)
) +
  geom_point() +
  scale_color_manual(values = wes_palette("Moonrise3", n = 4)) +
  xlab("Year") +
  ylab("Cases Per Million") +
  ggtitle("Annual Cases of Polio Per Million People By Region from 1930 to 1970") +
  # theme_solarized() is a complete theme and replaces any theme settings
  # added before it, so the title centering has to come afterwards.
  theme_solarized() +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 16 rows containing missing values (geom_point).
# cont5: one row per region and year, holding the unweighted mean of the
# states' polio rates and the mean of the states' populations.
# na.rm = TRUE keeps a single state with a missing rate or population from
# turning the whole region-year into NA (ggplot2 would then drop that point).
# .groups = "drop_last" spells out the default regrouping (result grouped by
# region), which also silences the summarise() regrouping message.
cont5 <- cont4 %>%
  group_by(region, year) %>%
  summarise(
    cases_per_mill_region = mean(cases_per_mill, na.rm = TRUE),
    pop_per_region = mean(population, na.rm = TRUE),
    .groups = "drop_last"
  )
cont5
## # A tibble: 164 x 4
## # Groups: region [4]
## region year cases_per_mill_region pop_per_region
## <chr> <dbl> <dbl> <dbl>
## 1 Midwest 1928 43.5 3157608.
## 2 Midwest 1929 14.8 3189133.
## 3 Midwest 1930 130 3216175
## 4 Midwest 1931 93.8 3238191.
## 5 Midwest 1932 22 3255736.
## 6 Midwest 1933 41.6 3269682.
## 7 Midwest 1934 30.5 3280943.
## 8 Midwest 1935 27.6 3290452.
## 9 Midwest 1936 29.1 3299160.
## 10 Midwest 1937 91.2 3308026.
## # … with 154 more rows

# Regional polio scatterplot: mean rate per million against year, coloured by
# region and sized by the mean state population of that region.
ggplot(
  cont5,
  mapping = aes(
    year,
    cases_per_mill_region,
    color = region,
    size = pop_per_region
  )
) +
  geom_point() +
  scale_color_manual(values = wes_palette("Moonrise3", n = 4)) +
  xlab("Year") +
  ylab("Cases Per Million") +
  ggtitle("Annual Cases of Polio Per Million People By Region from 1930 to 1970") +
  # theme_solarized() is a complete theme and replaces any theme settings
  # added before it, so the title centering has to come afterwards.
  theme_solarized() +
  theme(plot.title = element_text(hjust = 0.5))
The graphs above were built with the US contagious diseases dataset (`us_contagious_diseases`) from the dslabs package. They are variations of the measles graph shown in class last Wednesday. The key differences are that I used a scatterplot rather than a heatmap, looked at annual cases per one million people rather than per 10,000 people, and looked at cases by region instead of by state. The first graph covers every disease except measles, while the second graph focuses on polio; both graphs keep the state-level data (i.e., each state's population and cases per one million people) but color each point by the state's region. The third graph is slightly different in that it drops the state-level detail: for each region and year, it plots the average of the states' annual cases per one million people and the average of the states' populations. I thought a scatterplot would be interesting because it lets me look at more than three variables at once (i.e., rate, year, region, and population), which I would be unable to do with a heatmap.