Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: assertthat
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych':
## 
##     rescale
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## The following objects are masked from 'package:purrr':
## 
##     is_logical, is_numeric
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  googlesheets4,
  rms,
  lubridate,
  bootstrap #CV LOESS
)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
theme_set(theme_bw())

Functions

fix_list_col = function(x) {
  #already vector, use it
  if (!is.list(x)) return(x)
  
  #if list, then loop
  map_chr(x, function(y) {
    if (is.null(y)) return(NA_character_)
    unlist(as.character(y))
  })
}


#https://rpubs.com/mengxu/loess_cv
#----------------------------------------------------------------------#
# fit data points with LOESS + cross validation
#----------------------------------------------------------------------#
# Quantify error for each span, using CV
loess.model <- function(x, y, span){
  loess(y ~ x, span = span, control=loess.control(surface="direct"))
}

loess.predict <- function(fit, newdata) {
  predict(fit, newdata = newdata)
  }

loess_wrapper_extrapolate <- function (x, y, span.vals = seq(0.25, 1, by = 0.05), folds = 5){
  # Do model selection using mean absolute error, which is more robust than squared error.
  mean.abs.error <- numeric(length(span.vals))
  
  span.index <- 0
  for (each.span in span.vals) {
    span.index <- span.index + 1
    y.hat.cv <- crossval(x, y, theta.fit = loess.model, theta.predict = loess.predict, span = each.span, ngroup = folds)$cv.fit
    non.empty.indices <- !is.na(y.hat.cv)
    mean.abs.error[span.index] <- mean(abs(y[non.empty.indices] - y.hat.cv[non.empty.indices]))
  }
  
  # find the span which minimizes error
  best.span <- span.vals[which.min(mean.abs.error)]
  
  # fit and return the best model
  best.model <- loess(y ~ x, span = best.span, control=loess.control(surface="direct"))
  return(best.model)
}

#repeated cross-cv to get more stable results
loess_wrapper_extrapolate_repeated = function(x, y, span.vals = seq(0.25, 1, by = 0.05), folds = 5, repeats = 10) {
  #repeatly fit the CV
  #use the mean of the best spans
  fits = map(1:repeats,
             .f = function(i) {
               loess_wrapper_extrapolate(x = x, y = y, span.vals = span.vals, folds = folds)
             })
  
  mean_best_span = map_dbl(fits, ~.$pars$span) %>% mean()
  
  #final fit using the mean span
  loess(y ~ x, span = mean_best_span, control = loess.control(surface="direct"))
}

Data

Full data

#get data
d_raw = googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1vAvPZQn6a24JHXgqKFMg2ZOcb7y_IfKSqVKfMZcuIB0/edit#gid=0")
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for 'the.dfx@gmail.com'.
## Auto-refreshing stale OAuth token.
## ✔ Reading from "French election data".
## ✔ Range 'full'.
#tidy it up
d = d_raw %>% 
  #fix names
  df_legalize_names(func = function(x) str_legalize(x, ascii = F)) %>% 
  #filter bad rows
  filter(!is.na(Firm))

#fix non-vector columns
for (v in names(d)) d[[v]] = d[[v]] %>% fix_list_col()

#interpret dates
#these are a mess!
d$date_clean = d$Date %>% str_replace(".+–", "")
d$Date1 = readr::parse_date(d$date_clean, format = "%d %b %Y")

#set nicer names
nice_names = c("Arthaud", "Poutou", "Roussel", 
"Mélenchon", "Hidalgo", "Jadot", "Macron", "Pécresse", 
"Lassalle", "Dupont_Aignan", "Le_Pen", "Zemmour")
names(d)[4:15] = nice_names

#fix values, missing = NA, inequalitties = NA
for (v in nice_names) d[[v]] = d[[v]] %>% as.numeric()
## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion

## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion
#sort by date
d %<>% arrange(Date1)

Run-off

d2_raw = googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1vAvPZQn6a24JHXgqKFMg2ZOcb7y_IfKSqVKfMZcuIB0/edit#gid=1420594908", sheet = 2)
## ✔ Reading from "French election data".
## ✔ Range ''lepen_macron''.
#fix it up
d2 = d2_raw %>% 
  df_legalize_names()

#list col fix
for (v in names(d2)) d2[[v]] = d2[[v]] %>% fix_list_col()

#date
d2$date_clean = d2$Date %>% str_replace(".+–", "")
d2$Date1 = readr::parse_date(d2$date_clean, format = "%d %b %Y")
## Warning: 8 parsing failures.
## row col           expected         actual
##   1  -- date like %d %b %Y 2022-04-22    
##   8  -- date like %d %b %Y 21-21 Apr 2022
##  49  -- date like %d %b %Y 2022-04-10    
##  50  -- date like %d %b %Y 2022-04-10    
##  51  -- date like %d %b %Y 2022-04-10    
## ... ... .................. ..............
## See problems(...) for more details.
#move the post-pred data to another frame
d2b = d2
d2 = d2 %>% filter(post_pred == 0)

Analysis

Basic plots

#full plot
d %>% 
  pivot_longer(
    cols = !!nice_names
  ) %>% 
  ggplot(aes(Date1, value, color = name)) +
  geom_point(size = .2) +
  geom_smooth() +
  scale_y_continuous("Vote in poll", labels = scales::percent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 202 rows containing non-finite values (stat_smooth).
## Warning: Removed 202 rows containing missing values (geom_point).

GG_save("figs/french_full.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 202 rows containing non-finite values (stat_smooth).
## Removed 202 rows containing missing values (geom_point).
#run off
d2 %>% 
  pivot_longer(
    cols = c(Macron, Le_Pen)
  ) %>% 
  ggplot(aes(Date1, value, color = name)) +
  geom_point(size = .2) +
  geom_smooth() +
  scale_y_continuous("Vote in poll", labels = scales::percent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_save("figs/french_runoff.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Full data predictions

#fit a LOESS model
#cross-validate the optimal span value
#do loess fit for each candidate
full_loess_fits = replicate(12, NULL) %>% set_names(nice_names)

set.seed(1)
for (v in nice_names) {
  #subset to no missing data for this candiate
  v_sym = rlang::sym(v)
  local_d = d %>% filter(!is.na(!!v_sym))
  
  full_loess_fits[[v]] = loess_wrapper_extrapolate_repeated(
    x = as.numeric(local_d$Date1),
    y = local_d[[v]]
  )
}

#get predictions for dataset with the included forward date range
d_pred = map_df(nice_names, function(v) {
  y = tibble(
    name = v,
    x = seq(as.Date("2021-01-01"), as.Date("2022-04-10"), by = "day") %>% as.numeric()
  )
  
  #add predicted values
  preds =     predict(
      full_loess_fits[[v]],
      newdata = y,
      se = T
      )
  
  y$pred = preds$fit
  y$pred_se = preds$se.fit
  
  y
})

#more columns
#numerical predictions
d_pred %<>% mutate(
  pred_upper = pred + 2*pred_se,
  pred_lower = pred - 2*pred_se,
  Date1 = as.Date(x, "1970-01-01")
)

#plot again with predictions
d %>% 
  pivot_longer(
    cols = !!nice_names
  ) %>% 
  ggplot(aes(Date1, value, color = name)) +
  geom_point(size = .2) +
  geom_ribbon(data = d_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
  geom_line(data = d_pred, aes(as.Date(x, "1970-01-01"), pred)) +
  scale_y_continuous("Vote in poll", labels = scales::percent)
## Warning: Removed 202 rows containing missing values (geom_point).

GG_save("figs/french_full_preds.png")
## Warning: Removed 202 rows containing missing values (geom_point).
#numbers
d_pred %>% 
  filter(Date1 == as.Date("2022-04-10")) %>% 
  arrange(-pred)

Run-off predictions

#fit a LOESS model
#cross-validate the optimal span value
#do loess fit for each candidate
runoff_names = c("Macron", "Le_Pen")
runoff_loess_fits = replicate(2, NULL) %>% set_names(runoff_names)

set.seed(1)
for (v in runoff_names) {
  #subset to no missing data for this candiate
  v_sym = rlang::sym(v)
  local_d = d2%>% filter(!is.na(!!v_sym))
  
  runoff_loess_fits[[v]] = loess_wrapper_extrapolate_repeated(
    x = as.numeric(local_d$Date1),
    y = local_d[[v]],
    folds = 10
  )
}

#get predictions for dataset with the included forward date range
d2_pred = map_df(runoff_names, function(v) {
  y = tibble(
    name = v,
    x = seq(min(d2$Date1, na.rm=T), as.Date("2022-04-24"), by = "day") %>% as.numeric()
  )
  
  #add predicted values
  preds = predict(
      runoff_loess_fits[[v]],
      newdata = y,
      se = T
      )
  
  y$pred = preds$fit
  y$pred_se = preds$se.fit
  
  y
})

#more columns
#numerical predictions
d2_pred %<>% mutate(
  pred_upper = pred + 2*pred_se,
  pred_lower = pred - 2*pred_se,
  Date1 = as.Date(x, "1970-01-01")
)

#plot again with predictions
d2 %>% 
  pivot_longer(
    cols = !!runoff_names
  ) %>% 
  ggplot(aes(Date1, value, color = name)) +
  geom_point(size = .2) +
  geom_ribbon(data = d2_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
  geom_line(data = d2_pred, aes(as.Date(x, "1970-01-01"), pred)) +
  scale_y_continuous("Vote in poll", labels = scales::percent)

GG_save("figs/french_runoff_preds.png")

#numbers
d2_pred %>% 
  filter(Date1 == as.Date("2022-04-24")) %>% 
  arrange(-pred) ->
  d2_pred_election

d2_pred_election
#uncertainty graphically
d2_pred_density = tibble(
  density = c(
    dnorm(seq(0, 1, length.out = 1000), mean = d2_pred_election[2, ]$pred, sd = d2_pred_election[2, ]$pred_se),
    dnorm(seq(0, 1, length.out = 1000), mean = d2_pred_election[1, ]$pred, sd = d2_pred_election[1, ]$pred_se)
  ),
  vote = rep(seq(0, 1, length.out = 1000), 2),
  name = rep(c("Macron", "Le Pen"), each = 1000)
)

d2_pred_density %>% 
  ggplot(aes(vote, density, color = name)) +
  geom_line() +
  scale_x_continuous(labels = scales::percent)

GG_save("figs/runoff_uncertainty_dist.png")

#sample from the posteriors to determine winning chances
set.seed(1)
runoff_battle_results = map_df(1:10000, function(i) {
  tibble(
    Macron = rnorm(1, mean = d2_pred_election[2, ]$pred, sd = d2_pred_election[2, ]$pred_se),
    Le_Pen = rnorm(1, mean = d2_pred_election[1, ]$pred, sd = d2_pred_election[1, ]$pred_se),
    Macron_margin = Macron - Le_Pen,
    Macron_victory = Macron_margin > 0,
    vote_total = Macron+Le_Pen,
    Macron_adj = Macron/vote_total,
    Le_Pen_adj = Le_Pen/vote_total,
    Macron_margin_adj = Macron_adj - Le_Pen_adj
  )
})

runoff_battle_results %>% describe2()
#plot density again
runoff_battle_results %>% 
  pivot_longer(
    cols = c(Macron_adj, Le_Pen_adj)
  ) %>% 
  ggplot(aes(value, fill = name)) +
  geom_density(alpha = .3) +
  scale_x_continuous("Election forecast", labels = scales::percent)

GG_save("figs/final_posteriors.png")

Follow-up

#zoom plot of old data
d2 %>% 
  pivot_longer(
    cols = !!runoff_names
  ) %>% 
  ggplot(aes(Date1, value, color = name)) +
  geom_point(size = .2) +
  geom_ribbon(data = d2_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
  geom_line(data = d2_pred, aes(as.Date(x, "1970-01-01"), pred)) +
  scale_y_continuous("Vote in poll", labels = scales::percent) +
  coord_cartesian(xlim = c(as.Date("2022-01-01"), NA))

GG_save("figs/french_runoff_preds_2022.png")

#with new data
d2b %>% 
  pivot_longer(
    cols = !!runoff_names
  ) %>% 
  ggplot(aes(Date1, value, color = name)) +
  geom_point(size = .2) +
  geom_ribbon(data = d2_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
  geom_line(data = d2_pred, aes(as.Date(x, "1970-01-01"), pred)) +
  scale_y_continuous("Vote in poll", labels = scales::percent) +
  coord_cartesian(xlim = c(as.Date("2022-01-01"), NA)) +
  geom_vline(xintercept = as.Date("2022-04-05"), linetype = "dashed") +
  geom_point(data = tibble(Date1 = as.Date(c("2022-04-24", "2022-04-24")), value = c(.5855, .4145), name = c("Macron", "Le_Pen")), shape = 4, size = 4)
## Warning: Removed 16 rows containing missing values (geom_point).

GG_save("figs/french_runoff_preds_2022_post.png")
## Warning: Removed 16 rows containing missing values (geom_point).

Meta

write_sessioninfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bootstrap_2019.6      lubridate_1.8.0       rms_6.3-0            
##  [4] SparseM_1.81          googlesheets4_1.0.0   kirkegaard_2022-06-23
##  [7] psych_2.2.3           assertthat_0.2.1      weights_1.0.4        
## [10] Hmisc_4.7-0           Formula_1.2-4         survival_3.2-13      
## [13] lattice_0.20-45       magrittr_2.0.3        forcats_0.5.1        
## [16] stringr_1.4.0         dplyr_1.0.9           purrr_0.3.4          
## [19] readr_2.1.2           tidyr_1.2.0           tibble_3.1.7         
## [22] ggplot2_3.3.6         tidyverse_1.3.1      
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.1-1       googledrive_2.0.0   minqa_1.2.4        
##  [4] colorspace_2.0-3    ellipsis_0.3.2      htmlTable_2.4.0    
##  [7] base64enc_0.1-3     fs_1.5.2            rstudioapi_0.13    
## [10] mice_3.14.0         farver_2.1.0        MatrixModels_0.5-0 
## [13] fansi_1.0.3         mvtnorm_1.1-3       xml2_1.3.3         
## [16] codetools_0.2-18    splines_4.2.0       mnormt_2.0.2       
## [19] knitr_1.39          jsonlite_1.8.0      nloptr_2.0.1       
## [22] broom_0.8.0         cluster_2.1.3       dbplyr_2.1.1       
## [25] png_0.1-7           compiler_4.2.0      httr_1.4.3         
## [28] backports_1.4.1     Matrix_1.4-1        fastmap_1.1.0      
## [31] gargle_1.2.0        cli_3.3.0           htmltools_0.5.2    
## [34] quantreg_5.93       tools_4.2.0         gtable_0.3.0       
## [37] glue_1.6.2          rappdirs_0.3.3      Rcpp_1.0.8.3       
## [40] cellranger_1.1.0    jquerylib_0.1.4     vctrs_0.4.1        
## [43] gdata_2.18.0        nlme_3.1-157        xfun_0.30          
## [46] lme4_1.1-29         rvest_1.0.2         lifecycle_1.0.1    
## [49] gtools_3.9.2        polspline_1.1.20    MASS_7.3-57        
## [52] zoo_1.8-10          scales_1.2.0        hms_1.1.1          
## [55] parallel_4.2.0      sandwich_3.0-1      RColorBrewer_1.1-3 
## [58] curl_4.3.2          yaml_2.3.5          gridExtra_2.3      
## [61] sass_0.4.1          rpart_4.1.16        latticeExtra_0.6-29
## [64] stringi_1.7.6       highr_0.9           checkmate_2.1.0    
## [67] boot_1.3-28         rlang_1.0.2         pkgconfig_2.0.3    
## [70] evaluate_0.15       labeling_0.4.2      htmlwidgets_1.5.4  
## [73] tidyselect_1.1.2    R6_2.5.1            generics_0.1.2     
## [76] multcomp_1.4-19     DBI_1.1.2           mgcv_1.8-40        
## [79] pillar_1.7.0        haven_2.5.0         foreign_0.8-82     
## [82] withr_2.5.0         nnet_7.3-17         modelr_0.1.8       
## [85] crayon_1.5.1        utf8_1.2.2          tmvnsim_1.0-2      
## [88] tzdb_0.3.0          rmarkdown_2.14      jpeg_0.1-9         
## [91] grid_4.2.0          readxl_1.4.0        data.table_1.14.2  
## [94] reprex_2.0.1        digest_0.6.29       openssl_2.0.0      
## [97] munsell_0.5.0       bslib_0.3.1         askpass_1.1
#save clean data to disk
list(
  d, d_pred, d2, d2_pred
) %>% write_rds("data_out.rds", compress = "xz")

#OSF
if (F) {
  library(osfr)
  
  osf_auth(read_lines("~/.config/osf_token"))
  osf_proj = osf_retrieve_node("https://osf.io/5wvun/")
  osf_upload(osf_proj, path = list.files(), conflicts = "overwrite")
}