check working directory
# Point the session at the course folder, then print the directory in use.
# NOTE(review): a hard-coded setwd() only works where ~/DATA110 exists —
# consider a project-relative setup instead.
setwd("~/DATA110")
getwd()
## [1] "C:/Users/libcl/OneDrive/Documents/DATA110"
Get DSLabs
library("dslabs")
## Warning: package 'dslabs' was built under R version 4.0.3
# List the datasets bundled with dslabs.
data(package = "dslabs")
# List the files in the package's installed "script" folder.
list.files(
  path = system.file("script", package = "dslabs")
)
## [1] "make-admissions.R"
## [2] "make-brca.R"
## [3] "make-brexit_polls.R"
## [4] "make-death_prob.R"
## [5] "make-divorce_margarine.R"
## [6] "make-gapminder-rdas.R"
## [7] "make-greenhouse_gases.R"
## [8] "make-historic_co2.R"
## [9] "make-mnist_27.R"
## [10] "make-movielens.R"
## [11] "make-murders-rda.R"
## [12] "make-na_example-rda.R"
## [13] "make-nyc_regents_scores.R"
## [14] "make-olive.R"
## [15] "make-outlier_example.R"
## [16] "make-polls_2008.R"
## [17] "make-polls_us_election_2016.R"
## [18] "make-reported_heights-rda.R"
## [19] "make-research_funding_rates.R"
## [20] "make-stars.R"
## [21] "make-temp_carbon.R"
## [22] "make-tissue-gene-expression.R"
## [23] "make-trump_tweets.R"
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"
load libraries
library(tidyverse)
## -- Attaching packages -------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(RColorBrewer)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
load data set and examine the data
# Copy the dslabs Brexit polling data into `polls` for the analysis below.
polls <- dslabs::brexit_polls
# Per-column overview of the data.
summary(object = polls)
## startdate enddate pollster poll_type
## Min. :2016-01-08 Min. :2016-01-10 ICM :28 Online :85
## 1st Qu.:2016-03-04 1st Qu.:2016-03-08 YouGov :26 Telephone:42
## Median :2016-04-22 Median :2016-04-26 ORB :14
## Mean :2016-04-16 Mean :2016-04-18 ComRes :10
## 3rd Qu.:2016-05-31 3rd Qu.:2016-06-01 Opinium: 9
## Max. :2016-06-23 Max. :2016-06-23 TNS : 9
## (Other):31
## samplesize remain leave undecided
## Min. : 497 Min. :0.3500 Min. :0.3200 Min. :0.0000
## 1st Qu.:1010 1st Qu.:0.4100 1st Qu.:0.3900 1st Qu.:0.0900
## Median :1693 Median :0.4400 Median :0.4200 Median :0.1300
## Mean :1694 Mean :0.4424 Mean :0.4223 Mean :0.1265
## 3rd Qu.:2010 3rd Qu.:0.4800 3rd Qu.:0.4500 3rd Qu.:0.1700
## Max. :4772 Max. :0.5500 Max. :0.5500 Max. :0.3000
##
## spread
## Min. :-0.10000
## 1st Qu.:-0.02000
## Median : 0.01000
## Mean : 0.02008
## 3rd Qu.: 0.05000
## Max. : 0.19000
##
# Show the first six rows.
head(polls, n = 6L)
## startdate enddate pollster poll_type samplesize remain leave undecided
## 1 2016-06-23 2016-06-23 YouGov Online 4772 0.52 0.48 0.00
## 2 2016-06-22 2016-06-22 Populus Online 4700 0.55 0.45 0.00
## 3 2016-06-20 2016-06-22 YouGov Online 3766 0.51 0.49 0.00
## 4 2016-06-20 2016-06-22 Ipsos MORI Telephone 1592 0.49 0.46 0.01
## 5 2016-06-20 2016-06-22 Opinium Online 3011 0.44 0.45 0.09
## 6 2016-06-17 2016-06-22 ComRes Telephone 1032 0.54 0.46 0.00
## spread
## 1 0.04
## 2 0.10
## 3 0.02
## 4 0.03
## 5 -0.01
## 6 0.08
# Open the full table in the data viewer for manual inspection.
# NOTE(review): presumably tibble::view() via tidyverse, which is only useful
# in an interactive session — confirm it is wanted in a knitted report.
view(polls)
I noticed that two collection methods were used for these polls: online and telephone. I know there are some concerns with telephone polls, such as landlines skewing toward older people, but there are more serious issues with online polls. I also noted that a positive spread supports “remain”, whereas a negative spread supports “leave”.
Just to get a general idea of the data:
group and arrange by enddate to plot the spread in order by date
color differentiates online from telephone polls
added a regression line to note that telephone tends to fall above the line (remain) while online tends to fall below (leave)
# Scatter of spread against poll end date, coloured by collection method,
# with a single linear trend line fitted over all polls.
# NOTE(review): the group_by() step looks unnecessary for plotting — confirm
# before removing; arrange() fixes the row (and so the drawing) order.
polls %>%
  group_by(enddate) %>%
  arrange(enddate, spread, poll_type) %>%
  ggplot(mapping = aes(x = enddate, y = spread)) +
  # x and y are inherited from ggplot(); only colour is specific to the points
  geom_point(mapping = aes(color = poll_type)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

I would like to view the proportional sample size by pollster for online versus telephone type polls
First I sum the sample size column into a variable called “totalss”
I then mutate a new variable sampleprop using the totalss in the denominator for each sample size to get a relative proportion to the whole
# Total number of respondents across all polls; the denominator for each
# poll's share. Pass only the vector: sum() adds every argument other than
# `na.rm` into the total, so an extra argument such as `dims = 1` would
# inflate the result by 1.
totalss <- sum(polls$samplesize)
# sampleprop = this poll's sample size as a fraction of all respondents.
# The division is row-wise against a scalar, so the grouping does not change
# the values; it is kept so `polls` remains a grouped tibble for later steps.
# NOTE(review): consider dropping group_by() or adding ungroup() once the
# downstream chunks no longer rely on the grouped structure.
polls <- polls %>%
  group_by(pollster, samplesize) %>%
  mutate(sampleprop = samplesize / totalss)
polls
## # A tibble: 127 x 10
## # Groups: pollster, samplesize [109]
## startdate enddate pollster poll_type samplesize remain leave undecided
## <date> <date> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2016-06-23 2016-06-23 YouGov Online 4772 0.52 0.48 0
## 2 2016-06-22 2016-06-22 Populus Online 4700 0.55 0.45 0
## 3 2016-06-20 2016-06-22 YouGov Online 3766 0.51 0.49 0
## 4 2016-06-20 2016-06-22 Ipsos M~ Telephone 1592 0.49 0.46 0.01
## 5 2016-06-20 2016-06-22 Opinium Online 3011 0.44 0.45 0.09
## 6 2016-06-17 2016-06-22 ComRes Telephone 1032 0.54 0.46 0
## 7 2016-06-17 2016-06-22 ComRes Telephone 1032 0.48 0.42 0.11
## 8 2016-06-16 2016-06-22 TNS Online 2320 0.41 0.43 0.16
## 9 2016-06-20 2016-06-20 Survati~ Telephone 1003 0.45 0.44 0.11
## 10 2016-06-18 2016-06-19 YouGov Online 1652 0.42 0.44 0.13
## # ... with 117 more rows, and 2 more variables: spread <dbl>, sampleprop <dbl>
So that the graph is representative but not cluttered, I chose to view 6 pollsters and placed them into a data set called top 6
# choose 6 pollsters
# Keep only the rows for the six selected pollsters and the columns the chart
# uses, ordered by poll end date.
top6 <- polls %>%
  filter(
    pollster %in% c(
      "ICM", "YouGov", "ORB", "ComRes", "Ipsos MORI", "BMG Research"
    )
  ) %>%
  select(enddate, pollster, spread, sampleprop, poll_type) %>%
  arrange(enddate) # to plot in order
## Adding missing grouping variables: `samplesize`
top6
## # A tibble: 92 x 6
## # Groups: pollster, samplesize [76]
## samplesize enddate pollster spread sampleprop poll_type
## <dbl> <date> <fct> <dbl> <dbl> <fct>
## 1 2055 2016-01-10 ICM 0.06 0.00955 Online
## 2 2023 2016-01-17 ICM 0.0200 0.00940 Online
## 3 2015 2016-01-21 ORB 0.04 0.00936 Online
## 4 1006 2016-01-24 ComRes 0.18 0.00467 Telephone
## 5 2010 2016-01-24 ICM 0 0.00934 Online
## 6 513 2016-01-25 Ipsos MORI 0.19 0.00238 Telephone
## 7 1511 2016-01-25 BMG Research 0.02 0.00702 Online
## 8 1735 2016-01-28 YouGov -0.0400 0.00806 Online
## 9 2002 2016-01-31 ICM 0.0300 0.00930 Online
## 10 2018 2016-02-07 ICM -0.01 0.00938 Online
## # ... with 82 more rows
Now I use plotly to create a chart showing possible difference between online and telephone polls.
Convert enddate to a character column, char_date, with mutate. Using character class instead of date class makes it possible to give the x and y coordinates at which the panel labels are placed on the graph.
Set sampleprop as the size element and color to identify pollster shown in the legend.
# Faceted scatter of spread by poll end date for the six selected pollsters:
# colour = pollster, point size = share of all respondents, one panel per
# collection method.
top6 %>%
  # geom_text() below is positioned with a character x value, so the date is
  # converted to character; call the generic rather than the S3 method
  mutate(char_date = as.character(enddate)) %>%
  # make scatterplot
  ggplot(aes(x = char_date, y = spread, col = pollster, size = sampleprop)) +
  # alpha is transparency; x and y are inherited from ggplot()
  geom_point(alpha = 0.8) +
  # add color brewer
  scale_color_brewer(palette = "Paired") +
  # hide the size legend ("none" replaces the deprecated FALSE)
  guides(size = "none") +
  # set y values on coordinate
  coord_cartesian(ylim = c(-0.20, 0.20)) +
  xlab("Poll End Date") +
  ylab("Spread (remain minus leave)") +
  facet_grid(. ~ poll_type) +
  # add text to each facet pane
  geom_text(
    aes(x = "2016-02-15", y = -0.15, label = poll_type),
    size = 3,
    color = "blue"
  ) +
  # no plot title, legend title or facet strips; the in-panel text labels
  # identify each facet instead
  theme(
    plot.title = element_blank(),
    legend.title = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    strip.text.y = element_blank(),
    legend.position = "top"
  )

# add ggplotly to get tooltip
ggplotly()
Pattern is a little less clear, but telephone gathering seems to favor remain more than online polls.