library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dslabs)
library(ggrepel)
Import the US election results data and the US murders data.
# Show each chunk's source code alongside its output in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# Load the murders data set and preview its first six rows.
data("murders")
head(murders, n = 6L)
## state abb region population total
## 1 Alabama AL South 4779736 135
## 2 Alaska AK West 710231 19
## 3 Arizona AZ West 6392017 232
## 4 Arkansas AR South 2915918 93
## 5 California CA West 37253956 1257
## 6 Colorado CO West 5029196 65
# Load the 2016 polls; the same call also makes results_us_election_2016
# available, which is used further down.
data("polls_us_election_2016")
head(polls_us_election_2016, n = 6L)
## state startdate enddate
## 1 U.S. 2016-11-03 2016-11-06
## 2 U.S. 2016-11-01 2016-11-07
## 3 U.S. 2016-11-02 2016-11-06
## 4 U.S. 2016-11-04 2016-11-07
## 5 U.S. 2016-11-03 2016-11-06
## 6 U.S. 2016-11-03 2016-11-06
## pollster grade
## 1 ABC News/Washington Post A+
## 2 Google Consumer Surveys B
## 3 Ipsos A-
## 4 YouGov B
## 5 Gravis Marketing B-
## 6 Fox News/Anderson Robbins Research/Shaw & Company Research A
## samplesize population rawpoll_clinton rawpoll_trump rawpoll_johnson
## 1 2220 lv 47.00 43.00 4.00
## 2 26574 lv 38.03 35.69 5.46
## 3 2195 lv 42.00 39.00 6.00
## 4 3677 lv 45.00 41.00 5.00
## 5 16639 rv 47.00 43.00 3.00
## 6 1295 lv 48.00 44.00 3.00
## rawpoll_mcmullin adjpoll_clinton adjpoll_trump adjpoll_johnson
## 1 NA 45.20163 41.72430 4.626221
## 2 NA 43.34557 41.21439 5.175792
## 3 NA 42.02638 38.81620 6.844734
## 4 NA 45.65676 40.92004 6.069454
## 5 NA 46.84089 42.33184 3.726098
## 6 NA 49.02208 43.95631 3.057876
## adjpoll_mcmullin
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
# Preview the first six rows of the per-state election results.
head(results_us_election_2016, n = 6L)
## state electoral_votes clinton trump others
## 1 California 55 61.7 31.6 6.7
## 2 Texas 38 43.2 52.2 4.5
## 3 Florida 29 47.8 49.0 3.2
## 4 New York 29 59.0 36.5 4.5
## 5 Illinois 20 55.8 38.8 5.4
## 6 Pennsylvania 20 47.9 48.6 3.6
We will use both tables, so let's check whether the state columns are in the same order in both datasets.
# Compare the state column of the two tables that are combined below.
# The per-state results table is the one joined to murders, so that is the
# table whose row order matters here (not the per-poll table, which has many
# rows per state). FALSE means the rows cannot simply be bound side by side
# and a keyed join on `state` is required.
identical(murders$state, results_us_election_2016$state)
## [1] FALSE
We want to plot electoral votes against population.
However, the election dataset has no population column, so we need to join the two tables first.
# Keep every row of murders and attach the matching election results,
# pairing rows by state name rather than by position.
tab <- murders %>%
  left_join(results_us_election_2016, by = "state")
head(tab, n = 6L)
## state abb region population total electoral_votes clinton trump
## 1 Alabama AL South 4779736 135 9 34.4 62.1
## 2 Alaska AK West 710231 19 3 36.6 51.3
## 3 Arizona AZ West 6392017 232 11 45.1 48.7
## 4 Arkansas AR South 2915918 93 6 33.7 60.6
## 5 California CA West 37253956 1257 55 61.7 31.6
## 6 Colorado CO West 5029196 65 9 48.2 43.3
## others
## 1 3.6
## 2 12.2
## 3 6.2
## 4 5.8
## 5 6.7
## 6 8.6
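Plot electoral votes against population (in millions) on log2 axes, labelling each state and adding a linear fit.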
# Electoral votes versus population (in millions), one labelled point per
# state. Both axes use a log2 scale, and a straight-line fit is drawn
# without its confidence band.
ggplot(tab, aes(x = population/10^6, y = electoral_votes, label = abb)) +
  scale_x_continuous(trans = "log2") +
  scale_y_continuous(trans = "log2") +
  geom_point() +
  geom_text_repel() + # ggrepel keeps the state labels from overlapping
  geom_smooth(method = "lm", se = FALSE)
