For this week’s class discussion, I’ll use an autoregressive neural network to forecast the percentage of stories published by POLITICO that mention the word “poll” anywhere in them. This data set was compiled as a response to a POLITICO story decrying how reporters are over-reliant on polling in their political coverage, which is a less-than-credible claim coming from a new outlet that mentions polls in up to 40% of their stories!

POLITICO’s search function appears to include all articles, so searching for a period returns their total publication volume and searching for “poll” returns all poll-mentioning articles. From that point, it’s a simple scraping job (warning: if you’re playing along at home, this part takes a while…). The resulting data set has columns for date, count of stories mentioning “poll,” and the total number of stories published that day. Finally, the data is grouped by week and converted to percentages then cast to a tm() object.

library(rvest)
library(dplyr)
# dates <- list()
# for(i in seq(1,2174)){
#   url = paste0("http://www.politico.com/search/",i,"?q=poll")
#   articles <- read_html(url)%>%
#     html_nodes('p time')%>%
#     html_attr('datetime')
#   dates[[i]] <- articles
# }
# 
# all_articles <- list()
# for(i in seq(1,15501)){
#   url = paste0("http://www.politico.com/search/",i,"?q=.")
#   articles <- read_html(url)%>%
#     html_nodes('p time')%>%
#     html_attr('datetime')
#   all_articles[[i]] <- articles
# }
# 
# library(lubridate)
# 
# condense_to_days <- function(l){
#   date_dfs <- list()
#   for(i in seq(1,length(l))){
#     date_dfs[[i]] <- data.frame(date = l[[i]])
#   }
# 
#   article_dates <- bind_rows(date_dfs)
#   
#   article_dates <- article_dates%>%
#     mutate(date_day = round_date(as.Date(substr(date, 1,10), format = '%Y-%m-%d'), unit = 'day'))%>%
#     group_by(date_day)%>%
#     summarise(n = n())
# 
#   return(article_dates)  
# } 
# 
# poll_dates <- condense_to_days(dates)
# all_dates <- condense_to_days(all_articles)
# 
# article_mentions <- full_join(
#   poll_dates%>%dplyr::select(date_day, poll = n),
#   all_dates%>%dplyr::select(date_day, all = n)
# )
# Since I did this already, I'll just read in the csv saved from when I ran this...
politico_raw <- readr::read_csv('C:/Users/joshy/Desktop/politico_data.csv')%>%
  filter(date_day > '2007-01-21')%>%
  group_by(week = floor_date(date_day, 'weeks'))%>%
  summarise(poll = sum(poll, na.rm = T), 
            all = sum(all, na.rm = T))%>%
  mutate(percentage = poll/all)
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  X1 = col_integer(),
  date_day = col_date(format = ""),
  poll = col_integer(),
  all = col_integer()
)
politico_ts <- ts(politico_raw, frequency = 52, start = c(2007, 1, 14))
politico_raw%>%head(100)

TheThe data has a clear seasonal pattern, but no real trend. From visual inspection, it looks like the spikes in polling-related stories happen in October-November of even numbered years and the beginnings of presidential years.

library(forecast)
library(ggplot2)
library(yaztheme)
autoplot(decompose(politico_ts[,'percentage']))

First, we’ll develop and test a neural net model. The forecast from this model looks like a naiive projection with incredibly wide confidence intervals. Perhaps an ARIMA or STL model will fit the data more effectively.

nn.fit <- nnetar(politico_ts[,'percentage'])
autoplot(forecast(nn.fit, h = 50, PI = TRUE))+
  theme_yaz()+
  labs(y = 'Percentage of Stories Mentioning "Poll"')

The auto ARIMA model performed similarly to the NNAR model. The STL method more appropriately fit the seasonality of the data, with a spike projected for November 2018 when the next major election is scheduled. None of the models reach the extremes of the past peaks in poll-related publications.

stl.fit <- stlf(politico_ts[,'percentage'])
arima.fit <- auto.arima(politico_ts[,'percentage'])
autoplot(politico_ts[,'percentage'])+
  autolayer(forecast(stl.fit, h = 60), series = 'STL', colour = yaz_cols[3], PI = F)+
  autolayer(forecast(arima.fit, h = 60), series = 'ARIMA', colour = yaz_cols[4], PI = F)+
  autolayer(forecast(nn.fit, h = 60), series = 'NNAR', colour = yaz_cols[6], PI = F)+
  theme_yaz()+
  labs(y = 'Percentage of Stories Mentioning "Poll"',
       title = 'Projected Incidence of Polling Related Stories',
       subtitle = 'Green = STL | Blue = ARIMA | Pink = NNAR')

The STL model performs the best of all, confirming the suggestion from the chart of projected values.

acc.measures <- bind_rows(
  data.frame(accuracy(stl.fit))%>%mutate(method = 'STL'),
  data.frame(accuracy(arima.fit))%>%mutate(method = 'ARIMA'),
  data.frame(accuracy(nn.fit))%>%mutate(method = 'NNAR')
)
acc.measures%>%
  reshape2::melt(id.vars= 'method')%>%
  ggplot(aes(x = method, y = value))+
  geom_col(fill = yaz_cols[1])+
  facet_wrap(~variable, scales = 'free', nrow = 2)+
  theme_yaz()

LS0tDQp0aXRsZTogIlBSRUQgNDEzIC0gRGlzY3Vzc2lvbiBXZWVrIDk6IE5ldXJhbCBOZXR3b3JrcyBmb3IgRm9yZWNhc3RpbmciDQphdXRob3I6ICdKb3NoIFlhem1hbicNCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCkZvciB0aGlzIHdlZWsncyBjbGFzcyBkaXNjdXNzaW9uLCBJJ2xsIHVzZSBhbiBhdXRvcmVncmVzc2l2ZSBuZXVyYWwgbmV0d29yayB0byBmb3JlY2FzdCB0aGUgcGVyY2VudGFnZSBvZiBzdG9yaWVzIHB1Ymxpc2hlZCBieSBQT0xJVElDTyB0aGF0IG1lbnRpb24gdGhlIHdvcmQgInBvbGwiIGFueXdoZXJlIGluIHRoZW0uIFRoaXMgZGF0YSBzZXQgd2FzIGNvbXBpbGVkIGFzIGEgcmVzcG9uc2UgdG8gYSBQT0xJVElDTyBzdG9yeSBkZWNyeWluZyBob3cgcmVwb3J0ZXJzIGFyZSBvdmVyLXJlbGlhbnQgb24gcG9sbGluZyBpbiB0aGVpciBwb2xpdGljYWwgY292ZXJhZ2UsIHdoaWNoIGlzIGEgbGVzcy10aGFuLWNyZWRpYmxlIGNsYWltIGNvbWluZyBmcm9tIGEgbmV3IG91dGxldCB0aGF0IG1lbnRpb25zIHBvbGxzIGluIHVwIHRvIDQwJSBvZiB0aGVpciBzdG9yaWVzIQ0KDQpQT0xJVElDTydzIHNlYXJjaCBmdW5jdGlvbiBhcHBlYXJzIHRvIGluY2x1ZGUgYWxsIGFydGljbGVzLCBzbyBzZWFyY2hpbmcgZm9yIGEgcGVyaW9kIHJldHVybnMgdGhlaXIgdG90YWwgcHVibGljYXRpb24gdm9sdW1lIGFuZCBzZWFyY2hpbmcgZm9yICJwb2xsIiByZXR1cm5zIGFsbCBwb2xsLW1lbnRpb25pbmcgYXJ0aWNsZXMuIEZyb20gdGhhdCBwb2ludCwgaXQncyBhIHNpbXBsZSBzY3JhcGluZyBqb2IgKHdhcm5pbmc6IGlmIHlvdSdyZSBwbGF5aW5nIGFsb25nIGF0IGhvbWUsIHRoaXMgcGFydCB0YWtlcyBhIHdoaWxlLi4uKS4gVGhlIHJlc3VsdGluZyBkYXRhIHNldCBoYXMgY29sdW1ucyBmb3IgZGF0ZSwgY291bnQgb2Ygc3RvcmllcyBtZW50aW9uaW5nICJwb2xsLCIgYW5kIHRoZSB0b3RhbCBudW1iZXIgb2Ygc3RvcmllcyBwdWJsaXNoZWQgdGhhdCBkYXkuIEZpbmFsbHksIHRoZSBkYXRhIGlzIGdyb3VwZWQgYnkgd2VlayBhbmQgY29udmVydGVkIHRvIHBlcmNlbnRhZ2VzIHRoZW4gY2FzdCB0byBhIGB0bSgpYCBvYmplY3QuDQpgYGB7ciwgZWNobyA9IFRSVUV9DQpsaWJyYXJ5KHJ2ZXN0KQ0KbGlicmFyeShkcGx5cikNCg0KIyBkYXRlcyA8LSBsaXN0KCkNCiMgZm9yKGkgaW4gc2VxKDEsMjE3NCkpew0KIyAgIHVybCA9IHBhc3RlMCgiaHR0cDovL3d3dy5wb2xpdGljby5jb20vc2VhcmNoLyIsaSwiP3E9cG9sbCIpDQojICAgYXJ0aWNsZXMgPC0gcmVhZF9odG1sKHVybCklPiUNCiMgICAgIGh0bWxfbm9kZXMoJ3AgdGltZScpJT4lDQojICAgICBodG1sX2F0dHIoJ2RhdGV0aW1lJykNCiMgICBkYXRlc1tbaV1dIDwtIGFydGljbGVzDQojIH0NCiMgDQojIGFsbF9hcnRpY2xlcyA8LSBsaXN0KCkNCiMgZm9yKGkgaW4gc2VxKDEsMTU1MDEpKXsNCiMgICB1cmwgPSBwYXN0ZTAoImh0dHA6Ly93d3cucG9saXRpY28uY29tL3NlYXJjaC8iLGksIj9xPS4iKQ0KIyAgIGFydGljbGVzIDwtIHJlYWRfaHRtbCh1cmwpJT4lDQojICAgICBodG1sX25vZGVzKCdwIHRpbWUnKSU+JQ0KIyAgICAgaHRtbF9hdHRyKCdkYXRldGltZScpDQojICAgYWxsX2FydGljbGVzW1tpXV0gPC0gYXJ0aWNsZXMNCiMgfQ0KIyANCiMgbGlicmFyeShsdWJyaWRhdGUpDQojIA0KIyBjb25kZW5zZV90b19kYXlzIDwtIGZ1bmN0aW9uKGwpew0KIyAgIGRhdGVfZGZzIDwtIGxpc3QoKQ0KIyAgIGZvcihpIGluIHNlcSgxLGxlbmd0aChsKSkpew0KIyAgICAgZGF0ZV9kZnNbW2ldXSA8LSBkYXRhLmZyYW1lKGRhdGUgPSBsW1tpXV0pDQojICAgfQ0KIyANCiMgICBhcnRpY2xlX2RhdGVzIDwtIGJpbmRfcm93cyhkYXRlX2RmcykNCiMgICANCiMgICBhcnRpY2xlX2RhdGVzIDwtIGFydGljbGVfZGF0ZXMlPiUNCiMgICAgIG11dGF0ZShkYXRlX2RheSA9IHJvdW5kX2RhdGUoYXMuRGF0ZShzdWJzdHIoZGF0ZSwgMSwxMCksIGZvcm1hdCA9ICclWS0lbS0lZCcpLCB1bml0ID0gJ2RheScpKSU+JQ0KIyAgICAgZ3JvdXBfYnkoZGF0ZV9kYXkpJT4lDQojICAgICBzdW1tYXJpc2UobiA9IG4oKSkNCiMgDQojICAgcmV0dXJuKGFydGljbGVfZGF0ZXMpICANCiMgfSANCiMgDQojIHBvbGxfZGF0ZXMgPC0gY29uZGVuc2VfdG9fZGF5cyhkYXRlcykNCiMgYWxsX2RhdGVzIDwtIGNvbmRlbnNlX3RvX2RheXMoYWxsX2FydGljbGVzKQ0KIyANCiMgYXJ0aWNsZV9tZW50aW9ucyA8LSBmdWxsX2pvaW4oDQojICAgcG9sbF9kYXRlcyU+JWRwbHlyOjpzZWxlY3QoZGF0ZV9kYXksIHBvbGwgPSBuKSwNCiMgICBhbGxfZGF0ZXMlPiVkcGx5cjo6c2VsZWN0KGRhdGVfZGF5LCBhbGwgPSBuKQ0KIyApDQojIFNpbmNlIEkgZGlkIHRoaXMgYWxyZWFkeSwgSSdsbCBqdXN0IHJlYWQgaW4gdGhlIGNzdiBzYXZlZCBmcm9tIHdoZW4gSSByYW4gdGhpcy4uLg0KcG9saXRpY29fcmF3IDwtIHJlYWRyOjpyZWFkX2NzdignQzovVXNlcnMvam9zaHkvRGVza3RvcC9wb2xpdGljb19kYXRhLmNzdicpJT4lDQogIGZpbHRlcihkYXRlX2RheSA+ICcyMDA3LTAxLTIxJyklPiUNCiAgZ3JvdXBfYnkod2VlayA9IGZsb29yX2RhdGUoZGF0ZV9kYXksICd3ZWVrcycpKSU+JQ0KICBzdW1tYXJpc2UocG9sbCA9IHN1bShwb2xsLCBuYS5ybSA9IFQpLCANCiAgICAgICAgICAgIGFsbCA9IHN1bShhbGwsIG5hLnJtID0gVCkpJT4lDQogIG11dGF0ZShwZXJjZW50YWdlID0gcG9sbC9hbGwpDQpwb2xpdGljb190cyA8LSB0cyhwb2xpdGljb19yYXcsIGZyZXF1ZW5jeSA9IDUyLCBzdGFydCA9IGMoMjAwNywgMSwgMTQpKQ0KcG9saXRpY29fcmF3JT4laGVhZCgxMDApDQpgYGANCg0KVGhlVGhlIGRhdGEgaGFzIGEgY2xlYXIgc2Vhc29uYWwgcGF0dGVybiwgYnV0IG5vIHJlYWwgdHJlbmQuIEZyb20gdmlzdWFsIGluc3BlY3Rpb24sIGl0IGxvb2tzIGxpa2UgdGhlIHNwaWtlcyBpbiBwb2xsaW5nLXJlbGF0ZWQgc3RvcmllcyBoYXBwZW4gaW4gT2N0b2Jlci1Ob3ZlbWJlciBvZiBldmVuIG51bWJlcmVkIHllYXJzIGFuZCB0aGUgYmVnaW5uaW5ncyBvZiBwcmVzaWRlbnRpYWwgeWVhcnMuIA0KDQpgYGB7ciwgZmlnLndpZHRoPTEwLCBmaWcuaGVpZ2h0PTUsIGVjaG8gPSBUUlVFfQ0KbGlicmFyeShmb3JlY2FzdCkNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoeWF6dGhlbWUpDQphdXRvcGxvdChkZWNvbXBvc2UocG9saXRpY29fdHNbLCdwZXJjZW50YWdlJ10pKQ0KYGBgDQoNCkZpcnN0LCB3ZSdsbCBkZXZlbG9wIGFuZCB0ZXN0IGEgbmV1cmFsIG5ldCBtb2RlbC4gVGhlIGZvcmVjYXN0IGZyb20gdGhpcyBtb2RlbCBsb29rcyBsaWtlIGEgbmFpaXZlIHByb2plY3Rpb24gd2l0aCBpbmNyZWRpYmx5IHdpZGUgY29uZmlkZW5jZSBpbnRlcnZhbHMuIFBlcmhhcHMgYW4gQVJJTUEgb3IgU1RMIG1vZGVsIHdpbGwgZml0IHRoZSBkYXRhIG1vcmUgZWZmZWN0aXZlbHkuDQoNCmBgYHtyLCBmaWcud2lkdGg9MTAsIGZpZy5oZWlnaHQ9NSwgZWNobyA9IFRSVUV9DQpubi5maXQgPC0gbm5ldGFyKHBvbGl0aWNvX3RzWywncGVyY2VudGFnZSddKQ0KYXV0b3Bsb3QoZm9yZWNhc3Qobm4uZml0LCBoID0gNTAsIFBJID0gVFJVRSkpKw0KICB0aGVtZV95YXooKSsNCiAgbGFicyh5ID0gJ1BlcmNlbnRhZ2Ugb2YgU3RvcmllcyBNZW50aW9uaW5nICJQb2xsIicpDQpgYGANCg0KVGhlIGF1dG8gQVJJTUEgbW9kZWwgcGVyZm9ybWVkIHNpbWlsYXJseSB0byB0aGUgTk5BUiBtb2RlbC4gVGhlIFNUTCBtZXRob2QgbW9yZSBhcHByb3ByaWF0ZWx5IGZpdCB0aGUgc2Vhc29uYWxpdHkgb2YgdGhlIGRhdGEsIHdpdGggYSBzcGlrZSBwcm9qZWN0ZWQgZm9yIE5vdmVtYmVyIDIwMTggd2hlbiB0aGUgbmV4dCBtYWpvciBlbGVjdGlvbiBpcyBzY2hlZHVsZWQuIE5vbmUgb2YgdGhlIG1vZGVscyByZWFjaCB0aGUgZXh0cmVtZXMgb2YgdGhlIHBhc3QgcGVha3MgaW4gcG9sbC1yZWxhdGVkIHB1YmxpY2F0aW9ucy4gDQpgYGB7ciwgZmlnLndpZHRoPTEwLCBmaWcuaGVpZ2h0PTUsIGVjaG8gPSBUUlVFfQ0Kc3RsLmZpdCA8LSBzdGxmKHBvbGl0aWNvX3RzWywncGVyY2VudGFnZSddKQ0KYXJpbWEuZml0IDwtIGF1dG8uYXJpbWEocG9saXRpY29fdHNbLCdwZXJjZW50YWdlJ10pDQphdXRvcGxvdChwb2xpdGljb190c1ssJ3BlcmNlbnRhZ2UnXSkrDQogIGF1dG9sYXllcihmb3JlY2FzdChzdGwuZml0LCBoID0gNjApLCBzZXJpZXMgPSAnU1RMJywgY29sb3VyID0geWF6X2NvbHNbM10sIFBJID0gRikrDQogIGF1dG9sYXllcihmb3JlY2FzdChhcmltYS5maXQsIGggPSA2MCksIHNlcmllcyA9ICdBUklNQScsIGNvbG91ciA9IHlhel9jb2xzWzRdLCBQSSA9IEYpKw0KICBhdXRvbGF5ZXIoZm9yZWNhc3Qobm4uZml0LCBoID0gNjApLCBzZXJpZXMgPSAnTk5BUicsIGNvbG91ciA9IHlhel9jb2xzWzZdLCBQSSA9IEYpKw0KICB0aGVtZV95YXooKSsNCiAgbGFicyh5ID0gJ1BlcmNlbnRhZ2Ugb2YgU3RvcmllcyBNZW50aW9uaW5nICJQb2xsIicsDQogICAgICAgdGl0bGUgPSAnUHJvamVjdGVkIEluY2lkZW5jZSBvZiBQb2xsaW5nIFJlbGF0ZWQgU3RvcmllcycsDQogICAgICAgc3VidGl0bGUgPSAnR3JlZW4gPSBTVEwgfCBCbHVlID0gQVJJTUEgfCBQaW5rID0gTk5BUicpDQpgYGANCg0KVGhlIFNUTCBtb2RlbCBwZXJmb3JtcyB0aGUgYmVzdCBvZiBhbGwsIGNvbmZpcm1pbmcgdGhlIHN1Z2dlc3Rpb24gZnJvbSB0aGUgY2hhcnQgb2YgcHJvamVjdGVkIHZhbHVlcy4NCmBgYHtyLCBmaWcud2lkdGg9MTAsIGZpZy5oZWlnaHQ9NSwgZWNobyA9IFRSVUV9DQphY2MubWVhc3VyZXMgPC0gYmluZF9yb3dzKA0KICBkYXRhLmZyYW1lKGFjY3VyYWN5KHN0bC5maXQpKSU+JW11dGF0ZShtZXRob2QgPSAnU1RMJyksDQogIGRhdGEuZnJhbWUoYWNjdXJhY3koYXJpbWEuZml0KSklPiVtdXRhdGUobWV0aG9kID0gJ0FSSU1BJyksDQogIGRhdGEuZnJhbWUoYWNjdXJhY3kobm4uZml0KSklPiVtdXRhdGUobWV0aG9kID0gJ05OQVInKQ0KKQ0KDQphY2MubWVhc3VyZXMlPiUNCiAgcmVzaGFwZTI6Om1lbHQoaWQudmFycz0gJ21ldGhvZCcpJT4lDQogIGdncGxvdChhZXMoeCA9IG1ldGhvZCwgeSA9IHZhbHVlKSkrDQogIGdlb21fY29sKGZpbGwgPSB5YXpfY29sc1sxXSkrDQogIGZhY2V0X3dyYXAofnZhcmlhYmxlLCBzY2FsZXMgPSAnZnJlZScsIG5yb3cgPSAyKSsNCiAgdGhlbWVfeWF6KCkNCmBgYA0KDQo=