check working directory
# Report the current working directory without changing it. Nothing in the
# visible analysis reads or writes files (the data come from the dslabs
# package), so a hard-coded setwd() would only make the script fail on
# machines that have no "~/DATA110" folder.
getwd()
## [1] "C:/Users/libcl/OneDrive/Documents/DATA110"
Get DSLabs
library("dslabs")
## Warning: package 'dslabs' was built under R version 4.0.3
# Show the catalogue of data sets shipped with dslabs, then list the files
# in the package's installed "script" directory.
data(package = "dslabs")
list.files(path = system.file("script", package = "dslabs"))
##  [1] "make-admissions.R"                   
##  [2] "make-brca.R"                         
##  [3] "make-brexit_polls.R"                 
##  [4] "make-death_prob.R"                   
##  [5] "make-divorce_margarine.R"            
##  [6] "make-gapminder-rdas.R"               
##  [7] "make-greenhouse_gases.R"             
##  [8] "make-historic_co2.R"                 
##  [9] "make-mnist_27.R"                     
## [10] "make-movielens.R"                    
## [11] "make-murders-rda.R"                  
## [12] "make-na_example-rda.R"               
## [13] "make-nyc_regents_scores.R"           
## [14] "make-olive.R"                        
## [15] "make-outlier_example.R"              
## [16] "make-polls_2008.R"                   
## [17] "make-polls_us_election_2016.R"       
## [18] "make-reported_heights-rda.R"         
## [19] "make-research_funding_rates.R"       
## [20] "make-stars.R"                        
## [21] "make-temp_carbon.R"                  
## [22] "make-tissue-gene-expression.R"       
## [23] "make-trump_tweets.R"                 
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"
load libraries
library(tidyverse)
## -- Attaching packages -------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(RColorBrewer)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
load data set and examine the data
# Work on a copy of the Brexit polling data under the shorter name `polls`,
# then print summary statistics for every column.
polls <- dslabs::brexit_polls
summary(object = polls)
##    startdate             enddate              pollster      poll_type 
##  Min.   :2016-01-08   Min.   :2016-01-10   ICM    :28   Online   :85  
##  1st Qu.:2016-03-04   1st Qu.:2016-03-08   YouGov :26   Telephone:42  
##  Median :2016-04-22   Median :2016-04-26   ORB    :14                 
##  Mean   :2016-04-16   Mean   :2016-04-18   ComRes :10                 
##  3rd Qu.:2016-05-31   3rd Qu.:2016-06-01   Opinium: 9                 
##  Max.   :2016-06-23   Max.   :2016-06-23   TNS    : 9                 
##                                            (Other):31                 
##    samplesize       remain           leave          undecided     
##  Min.   : 497   Min.   :0.3500   Min.   :0.3200   Min.   :0.0000  
##  1st Qu.:1010   1st Qu.:0.4100   1st Qu.:0.3900   1st Qu.:0.0900  
##  Median :1693   Median :0.4400   Median :0.4200   Median :0.1300  
##  Mean   :1694   Mean   :0.4424   Mean   :0.4223   Mean   :0.1265  
##  3rd Qu.:2010   3rd Qu.:0.4800   3rd Qu.:0.4500   3rd Qu.:0.1700  
##  Max.   :4772   Max.   :0.5500   Max.   :0.5500   Max.   :0.3000  
##                                                                   
##      spread        
##  Min.   :-0.10000  
##  1st Qu.:-0.02000  
##  Median : 0.01000  
##  Mean   : 0.02008  
##  3rd Qu.: 0.05000  
##  Max.   : 0.19000  
## 
# Peek at the first six rows to see what a single poll record looks like.
head(polls, n = 6L)
##    startdate    enddate   pollster poll_type samplesize remain leave undecided
## 1 2016-06-23 2016-06-23     YouGov    Online       4772   0.52  0.48      0.00
## 2 2016-06-22 2016-06-22    Populus    Online       4700   0.55  0.45      0.00
## 3 2016-06-20 2016-06-22     YouGov    Online       3766   0.51  0.49      0.00
## 4 2016-06-20 2016-06-22 Ipsos MORI Telephone       1592   0.49  0.46      0.01
## 5 2016-06-20 2016-06-22    Opinium    Online       3011   0.44  0.45      0.09
## 6 2016-06-17 2016-06-22     ComRes Telephone       1032   0.54  0.46      0.00
##   spread
## 1   0.04
## 2   0.10
## 3   0.02
## 4   0.03
## 5  -0.01
## 6   0.08
# Open the full table in the data viewer for manual inspection.
tibble::view(x = polls)
I noticed that the two types of collection used for these poll percentages were online and telephone. I know there are some concerns with telephone polls, such as landlines skewing toward older people, but there are more serious issues with online polls. I also noted that a positive spread favors “remain”, whereas a negative spread favors “leave”.
Just to get a general idea of the data:
group and arrange by enddate to plot the spread in order by date
color differentiates online from telephone gathering for the polls
added a regression line to note that telephone tends to fall above the line (remain) while online tends to fall below (leave)
# Overview scatter: spread against poll end date, coloured by how the poll
# was collected, with one linear fit drawn through all of the polls.
polls %>%
  group_by(enddate) %>%
  arrange(enddate, spread, poll_type) %>%
  ggplot(mapping = aes(x = enddate, y = spread)) +
  # x and y are inherited from the plot-level mapping; only colour is added.
  geom_point(mapping = aes(colour = poll_type)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

I would like to view the proportional sample size by pollster for online versus telephone type polls
First I sum the sample size column into a variable called “totalss”
I then mutate a new variable sampleprop using the totalss in the denominator for each sample size to get a relative proportion to the whole
# Total number of respondents across all polls.
# `dims` is not an argument of sum(): anything passed by an unknown name is
# swallowed by `...` and added to the result, so the former `dims = 1`
# silently inflated the total by one.
totalss <- sum(polls$samplesize)

# Each poll's share of the overall sample. The division is row-wise, so no
# grouping is needed; leaving `polls` grouped would make later select()
# calls silently re-add the grouping column. as_tibble() keeps the compact
# tibble print-out.
polls <- polls %>%
  as_tibble() %>%
  mutate(sampleprop = samplesize / totalss)
polls
## # A tibble: 127 x 10
## # Groups:   pollster, samplesize [109]
##    startdate  enddate    pollster poll_type samplesize remain leave undecided
##    <date>     <date>     <fct>    <fct>          <dbl>  <dbl> <dbl>     <dbl>
##  1 2016-06-23 2016-06-23 YouGov   Online          4772   0.52  0.48      0   
##  2 2016-06-22 2016-06-22 Populus  Online          4700   0.55  0.45      0   
##  3 2016-06-20 2016-06-22 YouGov   Online          3766   0.51  0.49      0   
##  4 2016-06-20 2016-06-22 Ipsos M~ Telephone       1592   0.49  0.46      0.01
##  5 2016-06-20 2016-06-22 Opinium  Online          3011   0.44  0.45      0.09
##  6 2016-06-17 2016-06-22 ComRes   Telephone       1032   0.54  0.46      0   
##  7 2016-06-17 2016-06-22 ComRes   Telephone       1032   0.48  0.42      0.11
##  8 2016-06-16 2016-06-22 TNS      Online          2320   0.41  0.43      0.16
##  9 2016-06-20 2016-06-20 Survati~ Telephone       1003   0.45  0.44      0.11
## 10 2016-06-18 2016-06-19 YouGov   Online          1652   0.42  0.44      0.13
## # ... with 117 more rows, and 2 more variables: spread <dbl>, sampleprop <dbl>

So that the graph is representative but not cluttered, I chose to view 6 pollsters and placed them into a data set called top6.

# Keep six pollsters so the chart stays readable.
top6 <- polls %>%
  # Drop any grouping still attached to `polls`; otherwise select() silently
  # re-adds the grouping column (`samplesize`) and `top6` stays grouped.
  ungroup() %>%
  # %in% states the set membership once instead of six chained `==` tests.
  filter(
    pollster %in% c(
      "ICM", "YouGov", "ORB", "ComRes", "Ipsos MORI", "BMG Research"
    )
  ) %>%
  select(enddate, pollster, spread, sampleprop, poll_type) %>%
  arrange(enddate) # to plot in order
## Adding missing grouping variables: `samplesize`
# Display the six-pollster subset to confirm the filter and column selection.
top6
## # A tibble: 92 x 6
## # Groups:   pollster, samplesize [76]
##    samplesize enddate    pollster      spread sampleprop poll_type
##         <dbl> <date>     <fct>          <dbl>      <dbl> <fct>    
##  1       2055 2016-01-10 ICM           0.06      0.00955 Online   
##  2       2023 2016-01-17 ICM           0.0200    0.00940 Online   
##  3       2015 2016-01-21 ORB           0.04      0.00936 Online   
##  4       1006 2016-01-24 ComRes        0.18      0.00467 Telephone
##  5       2010 2016-01-24 ICM           0         0.00934 Online   
##  6        513 2016-01-25 Ipsos MORI    0.19      0.00238 Telephone
##  7       1511 2016-01-25 BMG Research  0.02      0.00702 Online   
##  8       1735 2016-01-28 YouGov       -0.0400    0.00806 Online   
##  9       2002 2016-01-31 ICM           0.0300    0.00930 Online   
## 10       2018 2016-02-07 ICM          -0.01      0.00938 Online   
## # ... with 82 more rows

Now I use plotly to create a chart showing possible difference between online and telephone polls.

Change enddate by mutating it to char_date, so that a character value rather than a date can be used to set the x, y position of the text label on the graph.

Set sampleprop as the size element and color to identify pollster shown in the legend.

# Faceted scatter of spread over time for the six chosen pollsters:
# one panel per poll type, point colour = pollster, point size = share of
# the total sample.
top6 %>%
  # The x axis is built from date strings (a discrete axis) so that the
  # panel caption below can be positioned with a plain string. Call the
  # as.character() generic rather than the Date method directly.
  mutate(char_date = as.character(enddate)) %>%
  ggplot(aes(x = char_date, y = spread, col = pollster, size = sampleprop)) +
  # x, y, colour and size are inherited from the plot-level mapping.
  geom_point(alpha = 0.8) + # alpha is transparency
  scale_color_brewer(palette = "Paired") +
  # guides(size = FALSE) is deprecated in current ggplot2; "none" is the
  # supported way to drop a legend.
  guides(size = "none") +
  # Zoom the y axis without discarding any points.
  coord_cartesian(ylim = c(-0.20, 0.20)) +
  labs(title = "", x = "Poll End Date", y = "Spread (remain minus leave)") +
  facet_grid(. ~ poll_type) +
  # Caption each panel with its poll type. The layer is given one row per
  # poll type, so the label is drawn once per panel instead of once per poll
  # (which overplotted the text and cluttered the plotly tooltips).
  geom_text(
    data = function(d) distinct(ungroup(d), poll_type),
    mapping = aes(x = "2016-02-15", y = -0.15, label = poll_type),
    inherit.aes = FALSE,
    size = 3,
    color = "blue"
  ) +
  # The in-panel captions replace the facet strips, so hide the strips.
  theme(
    plot.title = element_blank(),
    legend.title = element_blank(),
    legend.position = "top",
    strip.background = element_blank(),
    strip.text.x = element_blank(),
    strip.text.y = element_blank()
  )

# Convert the most recently built ggplot into an interactive plotly chart so
# that hovering over a point shows a tooltip.
ggplotly(p = ggplot2::last_plot())

Pattern is a little less clear, but telephone gathering seems to favor remain more than online polls.