library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: assertthat
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych':
##
## rescale
## The following object is masked from 'package:assertthat':
##
## are_equal
## The following objects are masked from 'package:purrr':
##
## is_logical, is_numeric
## The following object is masked from 'package:base':
##
## +
load_packages(
googlesheets4,
rms,
lubridate,
  bootstrap #provides crossval() for the LOESS cross-validation below
)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
theme_set(theme_bw())
fix_list_col = function(x) {
#already a plain vector, return it unchanged
if (!is.list(x)) return(x)
#list column: convert each element to a single character value, NULL becomes NA
map_chr(x, function(y) {
if (is.null(y)) return(NA_character_)
unlist(as.character(y))
})
}
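#illustrative sketch, not part of the original analysis: what fix_list_col() does to a
#list column of the kind read_sheet() returns when a column mixes cell types (made-up values)
if (F) {
  example_col = list(54.5, "NSP", NULL, 23)
  fix_list_col(example_col)
  #> "54.5" "NSP"  NA     "23"
}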
#https://rpubs.com/mengxu/loess_cv
#----------------------------------------------------------------------#
# fit data points with LOESS + cross validation
#----------------------------------------------------------------------#
# Quantify error for each span, using CV
loess.model <- function(x, y, span){
loess(y ~ x, span = span, control=loess.control(surface="direct"))
}
loess.predict <- function(fit, newdata) {
predict(fit, newdata = newdata)
}
loess_wrapper_extrapolate <- function (x, y, span.vals = seq(0.25, 1, by = 0.05), folds = 5){
# Do model selection using mean absolute error, which is more robust than squared error.
mean.abs.error <- numeric(length(span.vals))
span.index <- 0
for (each.span in span.vals) {
span.index <- span.index + 1
y.hat.cv <- crossval(x, y, theta.fit = loess.model, theta.predict = loess.predict, span = each.span, ngroup = folds)$cv.fit
non.empty.indices <- !is.na(y.hat.cv)
mean.abs.error[span.index] <- mean(abs(y[non.empty.indices] - y.hat.cv[non.empty.indices]))
}
# find the span which minimizes error
best.span <- span.vals[which.min(mean.abs.error)]
# fit and return the best model
best.model <- loess(y ~ x, span = best.span, control=loess.control(surface="direct"))
return(best.model)
}
#repeated cross-validation to get more stable results
loess_wrapper_extrapolate_repeated = function(x, y, span.vals = seq(0.25, 1, by = 0.05), folds = 5, repeats = 10) {
#repeatedly run the CV-based span selection
#then refit using the mean of the best spans
fits = map(1:repeats,
.f = function(i) {
loess_wrapper_extrapolate(x = x, y = y, span.vals = span.vals, folds = folds)
})
mean_best_span = map_dbl(fits, ~.$pars$span) %>% mean()
#final fit using the mean span
loess(y ~ x, span = mean_best_span, control = loess.control(surface="direct"))
}
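#illustrative sketch, not part of the original analysis: the CV wrappers applied to simulated
#data, showing how the chosen span is stored in the fitted object and how extrapolation works
if (F) {
  set.seed(42)
  sim_x = 1:200
  sim_y = sin(sim_x / 20) + rnorm(200, sd = 0.3)
  sim_fit = loess_wrapper_extrapolate_repeated(x = sim_x, y = sim_y, repeats = 3)
  sim_fit$pars$span                                    #mean best span used for the final fit
  predict(sim_fit, newdata = data.frame(x = 201:210))  #extrapolation beyond the observed range
}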
#get data
d_raw = googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1vAvPZQn6a24JHXgqKFMg2ZOcb7y_IfKSqVKfMZcuIB0/edit#gid=0")
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for 'the.dfx@gmail.com'.
## Auto-refreshing stale OAuth token.
## ✔ Reading from "French election data".
## ✔ Range 'full'.
#tidy it up
d = d_raw %>%
#fix names
df_legalize_names(func = function(x) str_legalize(x, ascii = F)) %>%
#filter bad rows
filter(!is.na(Firm))
#fix non-vector columns
for (v in names(d)) d[[v]] = d[[v]] %>% fix_list_col()
#interpret dates
#these are a mess!
d$date_clean = d$Date %>% str_replace(".+–", "")
d$Date1 = readr::parse_date(d$date_clean, format = "%d %b %Y")
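#illustrative sketch, not part of the original analysis: the cleanup keeps only the last day of
#a fieldwork range before parsing (example strings are made up to show the two formats involved)
if (F) {
  example_dates = c("29 Mar–1 Apr 2022", "7 Apr 2022")
  str_replace(example_dates, ".+–", "")
  #> "1 Apr 2022" "7 Apr 2022"
  readr::parse_date(str_replace(example_dates, ".+–", ""), format = "%d %b %Y")
  #> "2022-04-01" "2022-04-07"
}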
#set nicer names
nice_names = c("Arthaud", "Poutou", "Roussel",
"Mélenchon", "Hidalgo", "Jadot", "Macron", "Pécresse",
"Lassalle", "Dupont_Aignan", "Le_Pen", "Zemmour")
names(d)[4:15] = nice_names
#fix values, missing = NA, inequalities = NA
for (v in nice_names) d[[v]] = d[[v]] %>% as.numeric()
## Warning in d[[v]] %>% as.numeric(): NAs introduced by coercion
## (this warning repeats 10 times, once per candidate column with non-numeric entries)
#sort by date
d %<>% arrange(Date1)
d2_raw = googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1vAvPZQn6a24JHXgqKFMg2ZOcb7y_IfKSqVKfMZcuIB0/edit#gid=1420594908", sheet = 2)
## ✔ Reading from "French election data".
## ✔ Range ''lepen_macron''.
#fix it up
d2 = d2_raw %>%
df_legalize_names()
#list col fix
for (v in names(d2)) d2[[v]] = d2[[v]] %>% fix_list_col()
#date
d2$date_clean = d2$Date %>% str_replace(".+–", "")
d2$Date1 = readr::parse_date(d2$date_clean, format = "%d %b %Y")
## Warning: 8 parsing failures.
## row col expected actual
## 1 -- date like %d %b %Y 2022-04-22
## 8 -- date like %d %b %Y 21-21 Apr 2022
## 49 -- date like %d %b %Y 2022-04-10
## 50 -- date like %d %b %Y 2022-04-10
## 51 -- date like %d %b %Y 2022-04-10
## ... ... .................. ..............
## See problems(...) for more details.
#keep the full data (incl. post-prediction polls) in d2b; restrict d2 to pre-prediction polls
d2b = d2
d2 = d2 %>% filter(post_pred == 0)
#full plot
d %>%
pivot_longer(
cols = !!nice_names
) %>%
ggplot(aes(Date1, value, color = name)) +
geom_point(size = .2) +
geom_smooth() +
scale_y_continuous("Vote in poll", labels = scales::percent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 202 rows containing non-finite values (stat_smooth).
## Warning: Removed 202 rows containing missing values (geom_point).
GG_save("figs/french_full.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 202 rows containing non-finite values (stat_smooth).
## Removed 202 rows containing missing values (geom_point).
#runoff polls
d2 %>%
pivot_longer(
cols = c(Macron, Le_Pen)
) %>%
ggplot(aes(Date1, value, color = name)) +
geom_point(size = .2) +
geom_smooth() +
scale_y_continuous("Vote in poll", labels = scales::percent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
GG_save("figs/french_runoff.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#fit a LOESS model for each candidate,
#cross-validating the optimal span value
full_loess_fits = replicate(12, NULL) %>% set_names(nice_names)
set.seed(1)
for (v in nice_names) {
#subset to no missing data for this candidate
v_sym = rlang::sym(v)
local_d = d %>% filter(!is.na(!!v_sym))
full_loess_fits[[v]] = loess_wrapper_extrapolate_repeated(
x = as.numeric(local_d$Date1),
y = local_d[[v]]
)
}
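#illustrative sketch, not part of the original analysis: the !!rlang::sym() pattern above turns a
#column name stored as a string into a filter; the .data pronoun is an equivalent modern spelling
if (F) {
  v = "Macron"
  identical(
    d %>% filter(!is.na(!!rlang::sym(v))),
    d %>% filter(!is.na(.data[[v]]))
  )
  #> TRUE, both drop the rows where the Macron column is missing
}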
#get predictions across the observed dates plus the forward range up to the election
d_pred = map_df(nice_names, function(v) {
y = tibble(
name = v,
x = seq(as.Date("2021-01-01"), as.Date("2022-04-10"), by = "day") %>% as.numeric()
)
#add predicted values
preds = predict(
full_loess_fits[[v]],
newdata = y,
se = T
)
y$pred = preds$fit
y$pred_se = preds$se.fit
y
})
#add prediction interval columns (±2 SE) and convert the numeric date back to Date
d_pred %<>% mutate(
pred_upper = pred + 2*pred_se,
pred_lower = pred - 2*pred_se,
Date1 = as.Date(x, "1970-01-01")
)
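#illustrative sketch, not part of the original analysis: pred ± 2*pred_se is an approximate 95%
#pointwise normal interval; the exact multiplier would be qnorm(0.975), about 1.96
if (F) {
  d_pred %>%
    mutate(
      pred_lower_exact = pred - qnorm(0.975) * pred_se,
      pred_upper_exact = pred + qnorm(0.975) * pred_se
    ) %>%
    select(name, Date1, pred, pred_lower, pred_lower_exact, pred_upper, pred_upper_exact) %>%
    tail()
}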
#plot again with predictions
d %>%
pivot_longer(
cols = !!nice_names
) %>%
ggplot(aes(Date1, value, color = name)) +
geom_point(size = .2) +
geom_ribbon(data = d_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
geom_line(data = d_pred, aes(as.Date(x, "1970-01-01"), pred)) +
scale_y_continuous("Vote in poll", labels = scales::percent)
## Warning: Removed 202 rows containing missing values (geom_point).
GG_save("figs/french_full_preds.png")
## Warning: Removed 202 rows containing missing values (geom_point).
#numbers
d_pred %>%
filter(Date1 == as.Date("2022-04-10")) %>%
arrange(-pred)
#fit a LOESS model for each runoff candidate,
#cross-validating the optimal span value
runoff_names = c("Macron", "Le_Pen")
runoff_loess_fits = replicate(2, NULL) %>% set_names(runoff_names)
set.seed(1)
for (v in runoff_names) {
#subset to no missing data for this candidate
v_sym = rlang::sym(v)
local_d = d2 %>% filter(!is.na(!!v_sym))
runoff_loess_fits[[v]] = loess_wrapper_extrapolate_repeated(
x = as.numeric(local_d$Date1),
y = local_d[[v]],
folds = 10
)
}
#get predictions across the observed dates plus the forward range up to the runoff
d2_pred = map_df(runoff_names, function(v) {
y = tibble(
name = v,
x = seq(min(d2$Date1, na.rm=T), as.Date("2022-04-24"), by = "day") %>% as.numeric()
)
#add predicted values
preds = predict(
runoff_loess_fits[[v]],
newdata = y,
se = T
)
y$pred = preds$fit
y$pred_se = preds$se.fit
y
})
#add prediction interval columns (±2 SE) and convert the numeric date back to Date
d2_pred %<>% mutate(
pred_upper = pred + 2*pred_se,
pred_lower = pred - 2*pred_se,
Date1 = as.Date(x, "1970-01-01")
)
#plot again with predictions
d2 %>%
pivot_longer(
cols = !!runoff_names
) %>%
ggplot(aes(Date1, value, color = name)) +
geom_point(size = .2) +
geom_ribbon(data = d2_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
geom_line(data = d2_pred, aes(as.Date(x, "1970-01-01"), pred)) +
scale_y_continuous("Vote in poll", labels = scales::percent)
GG_save("figs/french_runoff_preds.png")
#numbers
d2_pred %>%
filter(Date1 == as.Date("2022-04-24")) %>%
arrange(-pred) ->
d2_pred_election
d2_pred_election
#show the forecast uncertainty graphically as normal densities
d2_pred_density = tibble(
density = c(
dnorm(seq(0, 1, length.out = 1000), mean = d2_pred_election[2, ]$pred, sd = d2_pred_election[2, ]$pred_se),
dnorm(seq(0, 1, length.out = 1000), mean = d2_pred_election[1, ]$pred, sd = d2_pred_election[1, ]$pred_se)
),
vote = rep(seq(0, 1, length.out = 1000), 2),
name = rep(c("Macron", "Le Pen"), each = 1000)
)
d2_pred_density %>%
ggplot(aes(vote, density, color = name)) +
geom_line() +
scale_x_continuous(labels = scales::percent)
GG_save("figs/runoff_uncertainty_dist.png")
#sample from the posteriors to determine winning chances
set.seed(1)
runoff_battle_results = map_df(1:10000, function(i) {
tibble(
Macron = rnorm(1, mean = d2_pred_election[2, ]$pred, sd = d2_pred_election[2, ]$pred_se),
Le_Pen = rnorm(1, mean = d2_pred_election[1, ]$pred, sd = d2_pred_election[1, ]$pred_se),
Macron_margin = Macron - Le_Pen,
Macron_victory = Macron_margin > 0,
vote_total = Macron+Le_Pen,
Macron_adj = Macron/vote_total,
Le_Pen_adj = Le_Pen/vote_total,
Macron_margin_adj = Macron_adj - Le_Pen_adj
)
})
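#illustrative sketch, not part of the original analysis: the implied win probability is just the
#share of simulated runoffs Macron wins, with a binomial-style Monte Carlo error on that share
if (F) {
  p_macron = mean(runoff_battle_results$Macron_victory)
  p_macron
  sqrt(p_macron * (1 - p_macron) / nrow(runoff_battle_results))  #Monte Carlo SE with 10,000 draws
  median(runoff_battle_results$Macron_margin_adj)                #median renormalized margin
}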
runoff_battle_results %>% describe2()
#plot the densities again, now for the renormalized two-candidate shares
runoff_battle_results %>%
pivot_longer(
cols = c(Macron_adj, Le_Pen_adj)
) %>%
ggplot(aes(value, fill = name)) +
geom_density(alpha = .3) +
scale_x_continuous("Election forecast", labels = scales::percent)
GG_save("figs/final_posteriors.png")
#zoomed plot of the pre-election data
d2 %>%
pivot_longer(
cols = !!runoff_names
) %>%
ggplot(aes(Date1, value, color = name)) +
geom_point(size = .2) +
geom_ribbon(data = d2_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
geom_line(data = d2_pred, aes(as.Date(x, "1970-01-01"), pred)) +
scale_y_continuous("Vote in poll", labels = scales::percent) +
coord_cartesian(xlim = c(as.Date("2022-01-01"), NA))
GG_save("figs/french_runoff_preds_2022.png")
#same plot with the post-prediction polls and the official result (X marks) added
d2b %>%
pivot_longer(
cols = !!runoff_names
) %>%
ggplot(aes(Date1, value, color = name)) +
geom_point(size = .2) +
geom_ribbon(data = d2_pred, aes(y = pred, ymin = pred_lower, ymax = pred_upper), alpha = .1) +
geom_line(data = d2_pred, aes(as.Date(x, "1970-01-01"), pred)) +
scale_y_continuous("Vote in poll", labels = scales::percent) +
coord_cartesian(xlim = c(as.Date("2022-01-01"), NA)) +
geom_vline(xintercept = as.Date("2022-04-05"), linetype = "dashed") +
geom_point(data = tibble(Date1 = as.Date(c("2022-04-24", "2022-04-24")), value = c(.5855, .4145), name = c("Macron", "Le_Pen")), shape = 4, size = 4)
## Warning: Removed 16 rows containing missing values (geom_point).
GG_save("figs/french_runoff_preds_2022_post.png")
## Warning: Removed 16 rows containing missing values (geom_point).
write_sessioninfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bootstrap_2019.6 lubridate_1.8.0 rms_6.3-0
## [4] SparseM_1.81 googlesheets4_1.0.0 kirkegaard_2022-06-23
## [7] psych_2.2.3 assertthat_0.2.1 weights_1.0.4
## [10] Hmisc_4.7-0 Formula_1.2-4 survival_3.2-13
## [13] lattice_0.20-45 magrittr_2.0.3 forcats_0.5.1
## [16] stringr_1.4.0 dplyr_1.0.9 purrr_0.3.4
## [19] readr_2.1.2 tidyr_1.2.0 tibble_3.1.7
## [22] ggplot2_3.3.6 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.1-1 googledrive_2.0.0 minqa_1.2.4
## [4] colorspace_2.0-3 ellipsis_0.3.2 htmlTable_2.4.0
## [7] base64enc_0.1-3 fs_1.5.2 rstudioapi_0.13
## [10] mice_3.14.0 farver_2.1.0 MatrixModels_0.5-0
## [13] fansi_1.0.3 mvtnorm_1.1-3 xml2_1.3.3
## [16] codetools_0.2-18 splines_4.2.0 mnormt_2.0.2
## [19] knitr_1.39 jsonlite_1.8.0 nloptr_2.0.1
## [22] broom_0.8.0 cluster_2.1.3 dbplyr_2.1.1
## [25] png_0.1-7 compiler_4.2.0 httr_1.4.3
## [28] backports_1.4.1 Matrix_1.4-1 fastmap_1.1.0
## [31] gargle_1.2.0 cli_3.3.0 htmltools_0.5.2
## [34] quantreg_5.93 tools_4.2.0 gtable_0.3.0
## [37] glue_1.6.2 rappdirs_0.3.3 Rcpp_1.0.8.3
## [40] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.4.1
## [43] gdata_2.18.0 nlme_3.1-157 xfun_0.30
## [46] lme4_1.1-29 rvest_1.0.2 lifecycle_1.0.1
## [49] gtools_3.9.2 polspline_1.1.20 MASS_7.3-57
## [52] zoo_1.8-10 scales_1.2.0 hms_1.1.1
## [55] parallel_4.2.0 sandwich_3.0-1 RColorBrewer_1.1-3
## [58] curl_4.3.2 yaml_2.3.5 gridExtra_2.3
## [61] sass_0.4.1 rpart_4.1.16 latticeExtra_0.6-29
## [64] stringi_1.7.6 highr_0.9 checkmate_2.1.0
## [67] boot_1.3-28 rlang_1.0.2 pkgconfig_2.0.3
## [70] evaluate_0.15 labeling_0.4.2 htmlwidgets_1.5.4
## [73] tidyselect_1.1.2 R6_2.5.1 generics_0.1.2
## [76] multcomp_1.4-19 DBI_1.1.2 mgcv_1.8-40
## [79] pillar_1.7.0 haven_2.5.0 foreign_0.8-82
## [82] withr_2.5.0 nnet_7.3-17 modelr_0.1.8
## [85] crayon_1.5.1 utf8_1.2.2 tmvnsim_1.0-2
## [88] tzdb_0.3.0 rmarkdown_2.14 jpeg_0.1-9
## [91] grid_4.2.0 readxl_1.4.0 data.table_1.14.2
## [94] reprex_2.0.1 digest_0.6.29 openssl_2.0.0
## [97] munsell_0.5.0 bslib_0.3.1 askpass_1.1
#save clean data to disk
list(
d, d_pred, d2, d2_pred
) %>% write_rds("data_out.rds", compress = "xz")
#OSF
if (F) {
library(osfr)
osf_auth(read_lines("~/.config/osf_token"))
osf_proj = osf_retrieve_node("https://osf.io/5wvun/")
osf_upload(osf_proj, path = list.files(), conflicts = "overwrite")
}