Week 10 DS Labs HW

Pull in packages

#install.packages("dslabs")
#install.packages("tidyverse")
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.3.3
## v tidyr   1.0.0     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.1.3     v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.5.3
library(dslabs)
## Warning: package 'dslabs' was built under R version 3.5.3
data(package="dslabs")
list.files(system.file("script", package="dslabs"))
##  [1] "make-admissions.R"                   
##  [2] "make-brca.R"                         
##  [3] "make-brexit_polls.R"                 
##  [4] "make-death_prob.R"                   
##  [5] "make-divorce_margarine.R"            
##  [6] "make-gapminder-rdas.R"               
##  [7] "make-greenhouse_gases.R"             
##  [8] "make-historic_co2.R"                 
##  [9] "make-mnist_27.R"                     
## [10] "make-movielens.R"                    
## [11] "make-murders-rda.R"                  
## [12] "make-na_example-rda.R"               
## [13] "make-nyc_regents_scores.R"           
## [14] "make-olive.R"                        
## [15] "make-outlier_example.R"              
## [16] "make-polls_2008.R"                   
## [17] "make-polls_us_election_2016.R"       
## [18] "make-reported_heights-rda.R"         
## [19] "make-research_funding_rates.R"       
## [20] "make-stars.R"                        
## [21] "make-temp_carbon.R"                  
## [22] "make-tissue-gene-expression.R"       
## [23] "make-trump_tweets.R"                 
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"

Pull in US Murder dataset.Pull in packages.

data("murders")
str(murders)
## 'data.frame':    51 obs. of  5 variables:
##  $ state     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ abb       : chr  "AL" "AK" "AZ" "AR" ...
##  $ region    : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
##  $ population: num  4779736 710231 6392017 2915918 37253956 ...
##  $ total     : num  135 19 232 93 1257 ...

Save the murders dataset using write_csv command.

write_csv(murders, "murders.csv", na="")

Use dplyr and the pull command to select column in a dataframe and transform it into a vector.

r <- murders %>%
  summarize(rate=sum(total)/sum(population)*10^6) %>%
  pull(rate)

Create a static graph with labels for each point.

ds_theme_set()
murders %>%
  ggplot(aes(x=population/10^6, y=total, label=abb)) +
  geom_abline(intercept=log10(r), lty=2, col="darkgrey") +
  geom_point(aes(color=region), size=3) +
  geom_text_repel(nudge_x = 0.005) +
  scale_x_log10("Populations in millions (log scale)") +
  scale_y_log10("Total number of murders (log scale)") +
  ggtitle("US Gun Murders in 2010") +
  scale_color_discrete(name="Region") +
  # Remove legend title
  theme(legend.title=element_blank())

Gapminder Dataset.

Pull in the dataset and name the regions.

data("gapminder")
west <- c("Western Europe", "Northern Europe", "Southern Europe", "Northern America", "Australia and New Zealand")
gapminder <- gapminder %>%
  mutate(group=case_when(
    region %in% west ~ "The West",
    region %in% c("Eastern Asia", "South-Eastern Asia") ~ "East Asia",
    region %in% c("Caribbean", "Central America", "South America") ~ "Latin America",
    continent =="Africa" & region != "Northern Africa" ~ "Sub-Saharan Africa",
    TRUE ~ "Others"))
gapminder <- gapminder %>%
  mutate(group = factor(group, levels = rev(c("Others", "Latin America", "East Asia", "Sub-Saharan Africa", "The West"))))

Clean up the na values. Mutate population to be a value per million. Change the plot theme. Use the geom_text command to label plots inside. Shift the legend to go across the top.

gapminder %>%
  filter(year %in% c(1962, 2013) & !is.na(group) &
           !is.na(fertility) & !is.na(life_expectancy)) %>%
  mutate(population_in_millions = population/10^6) %>%
  ggplot(aes(fertility, y=life_expectancy, col=group, size=population_in_millions)) +
  geom_point(alpha = 0.8) +
  guides(size=FALSE) +
  theme(plot.title = element_blank(), legend.title = element_blank()) +
  coord_cartesian(ylim=c(30,85)) +
  xlab("Fertility rate (births per woman)") +
  ylab("Life Expectancy") +
  geom_text(aes(x=7,y=82,label=year), cex=12, color="grey") +
  facet_grid(.~year) +
  theme(strip.background = element_blank(),
        strip.text.x=element_blank(),
        strip.text.y=element_blank(),
        legend.position="top")

Contagious disease data for US.

Focus on measles. Filter out Alaska & Hawaii. Mutate the rate. Draw a line at 1963.

library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 3.5.2
data("us_contagious_diseases")
the_disease <- "Measles"
us_contagious_diseases %>%
  filter(!state%in%c("Hawaii","Alaska") & disease == the_disease) %>%
  mutate(rate=count/population*10000*52/weeks_reporting) %>%
  mutate(state=reorder(state,rate)) %>%
  ggplot(aes(year,state,fill=rate)) +
  geom_tile(color="grey50") +
  scale_x_continuous(expand=c(0,0)) +
  scale_fill_gradientn(colors=brewer.pal(9,"Reds"),trans="sqrt") +
               geom_vline(xintercept=1963,col="blue") +
               theme_minimal() + theme(panel.grid = element_blank()) +
               ggtitle(the_disease) +
               ylab("") +
               xlab("")

2016 Poll Data

Focus on polls for Clinton and Trump after July 2016. Plot a scatterplot of the enddate to the percentage in the polls. Include a loess smoother regression.

data(polls_us_election_2016)
polls_us_election_2016 %>%
  filter(state == "U.S." & enddate >= "2016-07-01") %>%
  select(enddate, pollster, rawpoll_clinton, rawpoll_trump) %>%
  rename(Clinton = rawpoll_clinton, Trump = rawpoll_trump) %>%
  gather(candidate, percentage, -enddate, -pollster) %>%
  mutate(candidate = factor(candidate, levels = c("Trump", "Clinton"))) %>%
  group_by(pollster) %>%
  filter(n() >=10) %>%
  ungroup() %>%
  ggplot(aes(enddate, percentage, color = candidate)) +
  geom_point(show.legend = FALSE, alpha = 0.4) +
  geom_smooth(method = "loess", span = 0.15) +
  scale_y_continuous(limits = c(30,50))
## Warning: Removed 22 rows containing non-finite values (stat_smooth).
## Warning: Removed 22 rows containing missing values (geom_point).

Working with HTML widgets and Highcharter.

Load packages.

library(readr)
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 3.5.3
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(dplyr)

Install and load highcharter.

#install.packages("highcharter")
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.5.3
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
## 
## Attaching package: 'highcharter'
## The following object is masked from 'package:dslabs':
## 
##     stars

Load the nations dataset and add a column showing GDP in trillions of dollars.

nations <- read_csv("nations.csv") %>%
  mutate(gdp_tn = gdp_percap*population/1000000000000)
## Parsed with column specification:
## cols(
##   iso2c = col_character(),
##   iso3c = col_character(),
##   country = col_character(),
##   year = col_double(),
##   gdp_percap = col_double(),
##   population = col_double(),
##   birth_rate = col_double(),
##   neonat_mortal_rate = col_double(),
##   region = col_character(),
##   income = col_character()
## )

Prepare the data from the China’s rise chart we did before. Arrange to put the data in year order.

big4 <- nations %>%
  filter(iso3c =="CHN" | iso3c == "DEU" | iso3c == "JPN" | iso3c == "USA") %>%
  arrange(year)

Draw a basic chart.

highchart() %>%
  hc_add_series(data = big4, type = "line", hcaes(x=year, y=gdp_tn, group=country))
## Warning: `parse_quosure()` is deprecated as of rlang 0.2.0.
## Please use `parse_quo()` instead.
## This warning is displayed once per session.

Define the color palette.

cols <- brewer.pal(4, "Set1")
highchart() %>%
  hc_add_series(data = big4, type = "line", hcaes(x=year, y= gdp_tn, group = country)) %>%
  hc_colors(cols)

Add axis label.

highchart() %>%
  hc_add_series(data = big4, type = "line", hcaes(x=year, y=gdp_tn, group=country)) %>%
  hc_colors(cols) %>%
  hc_xAxis(title=list(text="Year")) %>%
  hc_yAxis(title = list(text = "GDP ($trillion)"))

Change the legend position.

highchart() %>%
  hc_add_series(data = big4, type = "line", hcaes(x=year, y=gdp_tn, group=country)) %>%
  hc_colors(cols) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "GDP ($ trillion)")) %>%
  hc_plotOptions(series = list(marker = list(symbol = "circle"))) %>%
  hc_legend(align = "right", verticalAlign = "top")

Customize the tooltips.

big4_chart <- highchart() %>%
  hc_add_series(data = big4, type = "line", hcaes(x=year, y=gdp_tn, group=country)) %>%
  hc_colors(cols) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "GDP ($ trillion)")) %>%
  hc_plotOptions(series = list(marker = list(symbol = "circle"))) %>%
  hc_legend(align = "right", verticalAlign = "top") %>%
  hc_tooltip(shared = TRUE, borderColor = "black", pointFormat = "{point.country}: {point.gdp_tn:.2f}<br>")
big4_chart

Prepare the stacked chart.

regions <- nations %>%
  group_by(year,region) %>%
  summarize(gdp_tn = sum(gdp_tn, na.rm = TRUE)) %>%
  arrange(year, region)

Make an area chart using the default options.

highchart() %>%
   hc_add_series(data = regions, type = "area", hcaes(x=year, y=gdp_tn, group=region))

Change up the colors.

cols <- brewer.pal(7, "Set2")
highchart () %>%
  hc_add_series(data = regions, type = "area", hcaes(x = year, y = gdp_tn, group = region)) %>%
  hc_colors(cols) %>%
  hc_chart(style = list(fontFamily = "Georgia", fontWeight = "bold")) %>%
  hc_plotOptions(series = list(stacking = "normal", marker = list(enabled = FALSE, states = list(hover = list(enabled = FALSE))), lineWidth = 0.5, lineColor = "white")) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "GDP ($ trillion)")) %>%
  hc_legend(align = "right", verticalAlign = "top", layout = "vertical") %>%
  hc_tooltip(enabled = FALSE)

Change up the colors. PLUS changing chart from normal stacking to percent.

cols <- brewer.pal(7, "Set2")
highchart () %>%
  hc_add_series(data = regions, type = "area", hcaes(x = year, y = gdp_tn, group = region)) %>%
  hc_colors(cols) %>%
  hc_chart(style = list(fontFamily = "Georgia", fontWeight = "bold")) %>%
  hc_plotOptions(series = list(stacking = "percent", marker = list(enabled = FALSE, states = list(hover = list(enabled = FALSE))), lineWidth = 0.5, lineColor = "white")) %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "GDP ($ trillion)")) %>%
  hc_legend(align = "right", verticalAlign = "top", layout = "vertical") %>%
  hc_tooltip(enabled = FALSE)

Food Stamps Data - Combining two types.

Pull in the dataset.

cols <-  c("red", "black")
food_stamps <-read_csv("food_stamps.csv")
## Parsed with column specification:
## cols(
##   year = col_double(),
##   participants = col_double(),
##   costs = col_double()
## )
highchart() %>%
  hc_yAxis_multiples(list(title = list(text = "Participants (millions)")),list(title = list(text = "Costs ($ billions)"), opposite = TRUE)) %>%
  hc_add_series(data = food_stamps$participants, name = "Participants (millions)", type = "column", yAxis = 0) %>%
  hc_add_series(data = food_stamps$costs, name = "Costs ($ billions)", type = "line", yAxis = 1) %>%
  hc_xAxis(categories = food_stamps$year, tickInterval = 5) %>%
  hc_colors(cols) %>%
  hc_chart(style = list(fontFamily = "Georgia", fontWeight = "bold"))

Week 10 DSLabs HW.

Use the contagious diseases dataset but chart it using highcharter.

Prepare the dataset.

I filtered out Hawaii and Alaska and grouped by disease.

I did not understand the calculation which multiplied by 10,000 and 52 while dividing by the # of weeks reported so I removed parts of the calculation to get to a rate.

library(dplyr)
library(highcharter)
sickly <- us_contagious_diseases %>%
  filter(!state%in%c("Hawaii","Alaska")) %>%
  filter(year>=1965 & year<=1975) %>%
  group_by(disease,year) %>% 
  summarise(diseasetotal=sum(count))
head(sickly)
## # A tibble: 6 x 3
## # Groups:   disease [1]
##   disease      year diseasetotal
##   <fct>       <dbl>        <dbl>
## 1 Hepatitis A  1966        32270
## 2 Hepatitis A  1967        38196
## 3 Hepatitis A  1968        44998
## 4 Hepatitis A  1969        45834
## 5 Hepatitis A  1970        54838
## 6 Hepatitis A  1971        58527

Chart it.

I tried to chart the 6 disease rates, by year with the highcharter package, trying to get a different line for each disease, with a dot for each year. However, I had trouble because my charts kept plotting each state separately for each disease - a real mess. I tried summing the counts or selecting only 1 state but could not get the thing to chart anything meaningful until our professor helped me with the summarise on the counts of disease.

I also selected only a 10-year period to help with the scaling of the chart.

sicklychart <- highchart() %>%
  hc_add_series(data = sickly, type = "line", hcaes(x=year, y=diseasetotal, group = disease)) %>%
  hc_xAxis(title = list(text="Year")) %>%
  hc_yAxis(title = list(text="Disease Count")) %>%
  hc_plotOptions(series = list(marker = list(symbol = "circle"))) %>%
  hc_legend(align = "right", verticalAlign = "top") %>%
  hc_tooltip(shared = TRUE, borderColor = "black", pointFormat = "{point.disease}: {point.diseasetotal:.2f}<br>")
sicklychart