Follow up on https://emilkirkegaard.dk/en/2020/04/goals-for-2020/
The links to Metaculus are private (so they won’t work for YOU!)
options(
digits = 3
)
library(pacman)
p_load(
kirkegaard,
readxl,
rms,
lubridate,
rvest
)
theme_set(theme_bw())
Predict pages read and books read. Q https://www.metaculus.com/questions/6215/books-read-in-2021/
goodreads = read_xlsx("data/data.xlsx", skip = 1)
#correlations
GG_scatter(goodreads, "books", "pages", case_names = "year")
## `geom_smooth()` using formula 'y ~ x'
#path
goodreads %>%
ggplot(aes(books, pages, color = year)) +
geom_path() +
geom_text(aes(label = year), color = "black")
#each year
goodreads %>%
pivot_longer(cols = books:pages) %>%
ggplot(aes(year, value)) +
geom_line() +
facet_wrap("name", scales = c("free_y"))
GG_save("figs/goodreads_year.png")
#forecast with linear model
books_model = ols(books ~ year, data = goodreads)
books_model
## Linear Regression Model
##
## ols(formula = books ~ year, data = goodreads)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 16 LR chi2 5.72 R2 0.301
## sigma17.6517 d.f. 1 R2 adj 0.251
## d.f. 14 Pr(> chi2) 0.0168 g 13.308
##
## Residuals
##
## Min 1Q Median 3Q Max
## -18.5779 -8.2669 -4.4640 0.2103 48.1647
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -4696.1029 1926.5629 -2.44 0.0287
## year 2.3485 0.9573 2.45 0.0279
##
predict(books_model, newdata = data.frame(year = 2022))
## 1
## 52.6
pages_model = ols(pages ~ year, data = goodreads)
pages_model
## Linear Regression Model
##
## ols(formula = pages ~ year, data = goodreads)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 16 LR chi2 5.42 R2 0.287
## sigma5937.4721 d.f. 1 R2 adj 0.236
## d.f. 14 Pr(> chi2) 0.0200 g 4333.142
##
## Residuals
##
## Min 1Q Median 3Q Max
## -5680.431 -2572.288 -1851.038 7.765 16243.929
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -1529376.4559 648036.0273 -2.36 0.0333
## year 764.6721 322.0046 2.37 0.0324
##
predict(pages_model, newdata = data.frame(year = 2022))
## 1
## 16790
Stats https://scholar.google.dk/citations?user=VKUbfSIAAAAJ&hl=en Q https://www.metaculus.com/questions/6216/gscholar-citations-for-2021/
citations = read_xlsx("data/data.xlsx", skip = 1, sheet = 2)
#plot
citations %>%
pivot_longer(cols = citations:cumulative) %>%
ggplot(aes(year, value, color = name)) +
geom_line()
GG_save("figs/GS citations_year.png")
#forecast with linear model
cumulative_model = ols(cumulative ~ year, data = citations)
cumulative_model
## Linear Regression Model
##
## ols(formula = cumulative ~ year, data = citations)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7 LR chi2 26.90 R2 0.979
## sigma35.2670 d.f. 1 R2 adj 0.974
## d.f. 5 Pr(> chi2) 0.0000 g 268.476
##
## Residuals
##
## 1 2 3 4 5 6 7
## -3.464 -24.786 -12.107 30.571 42.250 16.929 -49.393
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -202632.2500 13442.9981 -15.07 <0.0001
## year 100.6786 6.6648 15.11 <0.0001
##
predict(cumulative_model, newdata = data.frame(year = 2022))
## 1
## 940
citations_model = ols(citations ~ year, data = citations)
citations_model
## Linear Regression Model
##
## ols(formula = citations ~ year, data = citations)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7 LR chi2 0.44 R2 0.061
## sigma38.0938 d.f. 1 R2 adj -0.127
## d.f. 5 Pr(> chi2) 0.5064 g 10.952
##
## Residuals
##
## 1 2 3 4 5 6 7
## 29.321 -8.786 -42.893 -16.000 16.893 53.786 -32.321
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 8389.1071 14520.4846 0.58 0.5885
## year -4.1071 7.1990 -0.57 0.5930
##
predict(citations_model, newdata = data.frame(year = 2022))
## 1
## 84.5
Ignoring the tail risk of getting banned, project the growth. Q https://www.metaculus.com/questions/5778/twitter-followers-at-end-of-2021/
#manual counts
# library(googlesheets4)
# googlesheets4::gs4_auth("the.dfx@gmail.com")
#
# twitter_manual = read_sheet("https://docs.google.com/spreadsheets/d/19B_2EupQZxqnti8qI0Zg0kihL2vz23w2Ywz2BavmIKk/edit#gid=1438224973", sheet = "twitter")
twitter = read_xlsx("data/data.xlsx", skip = 0, sheet = 3)
#add the exact dates
twitter %<>% mutate(
date = str_match(url, "\\d{8}")[, 1] %>% as_date(),
date_num = as.numeric(date)
)
#fit a model
#have to set surface
twitter_fit = loess(followers ~ date_num, data = twitter, control = loess.control(surface = "direct"))
twitter_fit
## Call:
## loess(formula = followers ~ date_num, data = twitter, control = loess.control(surface = "direct"))
##
## Number of Observations: 10
## Equivalent Number of Parameters: 4.75
## Residual Standard Error: 123
#other model
twitter_fit2 = ols(followers ~ rcs(date_num), data = twitter)
twitter_fit2
## Linear Regression Model
##
## ols(formula = followers ~ rcs(date_num), data = twitter)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 10 LR chi2 74.93 R2 0.999
## sigma141.6979 d.f. 4 R2 adj 0.999
## d.f. 5 Pr(> chi2) 0.0000 g 4995.618
##
## Residuals
##
## 1 2 3 4 5 6 7
## 78.74406 -35.28063 -161.62921 41.65668 143.78201 -121.42535 -19.74596
## 8 9 10
## 152.13239 -78.32790 0.09392
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -4857.8117 2271.4403 -2.14 0.0855
## date_num 0.3115 0.1420 2.19 0.0797
## date_num' 4.3500 2.9853 1.46 0.2049
## date_num'' 38.2860 43.1353 0.89 0.4154
## date_num''' -92.8437 77.6748 -1.20 0.2856
##
#predict
time_span = seq(as.Date("2012-01-01"), as.Date("2022-01-01"), by = "month")
twitter_predict = tibble(
date = time_span,
loess = predict(twitter_fit, newdata = tibble(date_num = time_span %>% as.numeric())),
rcs = predict(twitter_fit2, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>%
gather(key = model, value = prediction, loess:rcs)
#print 2022 values
twitter_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(twitter_predict, aes(date, prediction, color = model)) +
geom_line() +
geom_point(data = twitter, aes(date, followers, color = NULL)) +
scale_x_date(date_breaks = "1 year", labels = year) +
ggtitle("Historical and predicted Twitter followers")
GG_save("figs/twitter_followers.png")
Forecast number of blogposts in 2021. Q https://www.metaculus.com/questions/6139/emil-blogposts-in-2021/
#scrape the counts
archive_page = read_html("https://emilkirkegaard.dk/en/archive/")
#counts
posts = archive_page %>%
html_node("#sya_container")
#df
blogposts = tibble(
year = 2007:2021,
posts = posts %>% html_nodes(".sya_yearcount") %>% html_text() %>% str_extract("\\d+") %>% as.numeric() %>% rev()
)
#counts
blogposts %>%
filter(year != 2021) %>%
ggplot(aes(year, posts)) +
geom_line() +
geom_label(aes(label = posts))
GG_save("figs/blogposts_year.png")
#linear model
posts_model = ols(posts ~ year, data = blogposts)
posts_model
## Linear Regression Model
##
## ols(formula = posts ~ year, data = blogposts)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 15 LR chi2 0.08 R2 0.005
## sigma52.8095 d.f. 1 R2 adj -0.072
## d.f. 13 Pr(> chi2) 0.7838 g 4.305
##
## Residuals
##
## Min 1Q Median 3Q Max
## -82.983 -19.237 6.052 22.535 123.281
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -1545.2524 6356.1369 -0.24 0.8117
## year 0.8071 3.1560 0.26 0.8021
##
predict(posts_model, newdata = data.frame(year = 2022))
## 1
## 86.8
Stats https://socialblade.com/youtube/channel/UCbT_SxIN5oMSb7bjYkE8FpQ/monthly Q https://www.metaculus.com/questions/6050/youtube-subs-will-emil-have-end-2021/
youtube = read_xlsx("data/data.xlsx", skip = 1, sheet = 4) %>%
mutate(
date = as.Date(date),
date_num = as.numeric(date)
)
#counts
youtube %>%
ggplot(aes(date, subscribers)) +
geom_line() +
geom_label(aes(label = subscribers)) +
scale_x_date(date_breaks = "2 months")
GG_save("figs/youtube_year.png")
#have to set surface
youtube_loess = loess(subscribers ~ date_num, data = youtube, control = loess.control(surface = "direct"))
youtube_loess
## Call:
## loess(formula = subscribers ~ date_num, data = youtube, control = loess.control(surface = "direct"))
##
## Number of Observations: 18
## Equivalent Number of Parameters: 4.63
## Residual Standard Error: 23.6
#other model
youtube_rcs = ols(subscribers ~ rcs(date_num), data = youtube)
youtube_rcs
## Linear Regression Model
##
## ols(formula = subscribers ~ rcs(date_num), data = youtube)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 18 LR chi2 93.56 R2 0.994
## sigma32.8769 d.f. 4 R2 adj 0.993
## d.f. 13 Pr(> chi2) 0.0000 g 454.793
##
## Residuals
##
## Min 1Q Median 3Q Max
## -36.57 -19.25 -10.88 22.12 67.12
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -22289.0971 4295.2723 -5.19 0.0002
## date_num 1.2417 0.2365 5.25 0.0002
## date_num' 23.3212 8.1001 2.88 0.0129
## date_num'' -28.6576 10.6984 -2.68 0.0189
## date_num''' 6.2923 5.1457 1.22 0.2431
##
#predict
time_span = seq(as.Date("2019-01-01"), as.Date("2022-01-01"), by = "month")
youtube_predict = tibble(
date = time_span,
loess = predict(youtube_loess, newdata = tibble(date_num = time_span %>% as.numeric())),
rcs = predict(youtube_rcs, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>%
gather(key = model, value = prediction, loess:rcs)
#print 2022 values
youtube_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(youtube_predict, aes(date, prediction, color = model)) +
geom_line() +
geom_point(data = youtube, aes(date, subscribers, color = NULL)) +
scale_x_date(date_breaks = "1 year", labels = year) +
ggtitle("Historical and predicted Youtube followers")
GG_save("figs/youtube_subs.png")
Stats https://web.archive.org/web/20181001000000*/https://www.researchgate.net/profile/Emil_O_W_Kirkegaard Q1 https://www.metaculus.com/questions/6223/ek-researchgate-reads-end-of-2021/ Q2 https://www.metaculus.com/questions/6224/ek-researchgate-citations-end-of-2021/
rg = read_xlsx("data/data.xlsx", skip = 1, sheet = 5) %>%
df_legalize_names()
#add the exact dates
rg %<>% mutate(
date = str_match(url, "\\d{8}")[, 1] %>% as_date(),
date_num = as.numeric(date)
)
#fit models
#have to set surface
rg_fit = loess(reads ~ date_num, data = rg, control = loess.control(surface = "direct"))
rg_fit
## Call:
## loess(formula = reads ~ date_num, data = rg, control = loess.control(surface = "direct"))
##
## Number of Observations: 11
## Equivalent Number of Parameters: 4.36
## Residual Standard Error: 8640
#other model
rg_fit2 = ols(reads ~ rcs(date_num, 4), data = rg)
rg_fit2
## Linear Regression Model
##
## ols(formula = reads ~ rcs(date_num, 4), data = rg)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 11 LR chi2 35.82 R2 0.961
## sigma17877.7262 d.f. 3 R2 adj 0.945
## d.f. 7 Pr(> chi2) 0.0000 g 82497.224
##
## Residuals
##
## Min 1Q Median 3Q Max
## -24963 -8950 -1152 9529 26225
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -574069.7517 554236.0847 -1.04 0.3348
## date_num 34.1385 32.5721 1.05 0.3294
## date_num' -318.6108 335.9341 -0.95 0.3745
## date_num'' 555.5691 463.2443 1.20 0.2694
##
#squared term
rg_fit3 = ols(reads ~ pol(date_num, 2), data = rg)
rg_fit3
## Linear Regression Model
##
## ols(formula = reads ~ pol(date_num, 2), data = rg)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 11 LR chi2 35.28 R2 0.960
## sigma17133.9930 d.f. 2 R2 adj 0.949
## d.f. 8 Pr(> chi2) 0.0000 g 84001.969
##
## Residuals
##
## Min 1Q Median 3Q Max
## -23878 -11080 -1333 10346 28340
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 2.83e+07 5.03e+06 5.62 0.0005
## date_num -3.30e+03 5.70e+02 -5.78 0.0004
## date_num^2 9.62e-02 1.61e-02 5.96 0.0003
##
#predict
time_span = seq(as.Date("2012-01-01"), as.Date("2022-01-01"), by = "month")
rg_predict = tibble(
date = time_span,
loess = predict(rg_fit, newdata = tibble(date_num = time_span %>% as.numeric())),
rcs = predict(rg_fit2, newdata = tibble(date_num = time_span %>% as.numeric())),
quadratic = predict(rg_fit3, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>%
gather(key = model, value = prediction, loess:quadratic)
#print 2022 values
rg_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(rg_predict, aes(date, prediction, color = model)) +
geom_line() +
geom_point(data = rg, aes(date, reads, color = NULL)) +
scale_x_date(date_breaks = "1 year", labels = year) +
ggtitle("Historical and predicted ResearchGate reads")
GG_save("figs/researchgate_reads.png")
#fit models
#have to set surface
rg_fit = loess(citations ~ date_num, data = rg, control = loess.control(surface = "direct"))
rg_fit
## Call:
## loess(formula = citations ~ date_num, data = rg, control = loess.control(surface = "direct"))
##
## Number of Observations: 11
## Equivalent Number of Parameters: 4.36
## Residual Standard Error: 14.9
#other model
rg_fit2 = ols(citations ~ rcs(date_num, 4), data = rg)
rg_fit2
## Linear Regression Model
##
## ols(formula = citations ~ rcs(date_num, 4), data = rg)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 11 LR chi2 53.06 R2 0.992
## sigma19.1625 d.f. 3 R2 adj 0.989
## d.f. 7 Pr(> chi2) 0.0000 g 210.903
##
## Residuals
##
## Min 1Q Median 3Q Max
## -31.769 -6.418 1.875 6.897 29.695
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -2435.0788 594.0662 -4.10 0.0046
## date_num 0.1468 0.0349 4.20 0.0040
## date_num' 0.2324 0.3601 0.65 0.5392
## date_num'' -0.2057 0.4965 -0.41 0.6911
##
#squared term
rg_fit3 = ols(citations ~ pol(date_num, 2), data = rg)
rg_fit3
## Linear Regression Model
##
## ols(formula = citations ~ pol(date_num, 2), data = rg)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 11 LR chi2 56.22 R2 0.994
## sigma15.5212 d.f. 2 R2 adj 0.992
## d.f. 8 Pr(> chi2) 0.0000 g 210.547
##
## Residuals
##
## Min 1Q Median 3Q Max
## -25.9823 -5.8287 -0.4587 8.9859 22.4020
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 26037.6757 4555.9948 5.72 0.0004
## date_num -3.1820 0.5166 -6.16 0.0003
## date_num^2 0.0001 0.0000 6.65 0.0002
##
#predict
time_span = seq(as.Date("2012-01-01"), as.Date("2022-01-01"), by = "month")
rg_predict = tibble(
date = time_span,
loess = predict(rg_fit, newdata = tibble(date_num = time_span %>% as.numeric())),
rcs = predict(rg_fit2, newdata = tibble(date_num = time_span %>% as.numeric())),
quadratic = predict(rg_fit3, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>%
gather(key = model, value = prediction, loess:quadratic)
#print 2022 values
rg_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(rg_predict, aes(date, prediction, color = model)) +
geom_line() +
geom_point(data = rg, aes(date, citations, color = NULL)) +
scale_x_date(date_breaks = "1 year", labels = year) +
ggtitle("Historical and predicted ResearchGate citations")
GG_save("figs/researchgate_citations.png")