About

Follow up on https://emilkirkegaard.dk/en/2020/04/goals-for-2020/

The links to Metaculus are private (so they won’t work for YOU!)

Init

options(
  digits = 3
)

library(pacman)
p_load(
  kirkegaard,
  readxl,
  rms,
  lubridate,
  rvest
)

theme_set(theme_bw())

Reading

Predict pages read and books read. Q https://www.metaculus.com/questions/6215/books-read-in-2021/

goodreads = read_xlsx("data/data.xlsx", skip = 1)

#correlations
GG_scatter(goodreads, "books", "pages", case_names = "year")
## `geom_smooth()` using formula 'y ~ x'

#path
goodreads %>% 
  ggplot(aes(books, pages, color = year)) +
  geom_path() +
  geom_text(aes(label = year), color = "black")

#each year
goodreads %>% 
  pivot_longer(cols = books:pages) %>% 
  ggplot(aes(year, value)) +
  geom_line() +
  facet_wrap("name", scales = c("free_y"))

GG_save("figs/goodreads_year.png")

#forecast with linear model
books_model = ols(books ~ year, data = goodreads)
books_model
## Linear Regression Model
##  
##  ols(formula = books ~ year, data = goodreads)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       16    LR chi2      5.72    R2       0.301    
##  sigma17.6517    d.f.            1    R2 adj   0.251    
##  d.f.      14    Pr(> chi2) 0.0168    g       13.308    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -18.5779  -8.2669  -4.4640   0.2103  48.1647 
##  
##  
##            Coef       S.E.      t     Pr(>|t|)
##  Intercept -4696.1029 1926.5629 -2.44 0.0287  
##  year          2.3485    0.9573  2.45 0.0279  
## 
predict(books_model, newdata = data.frame(year = 2022))
##    1 
## 52.6
pages_model = ols(pages ~ year, data = goodreads)
pages_model
## Linear Regression Model
##  
##  ols(formula = pages ~ year, data = goodreads)
##  
##                     Model Likelihood    Discrimination    
##                           Ratio Test           Indexes    
##  Obs         16    LR chi2      5.42    R2       0.287    
##  sigma5937.4721    d.f.            1    R2 adj   0.236    
##  d.f.        14    Pr(> chi2) 0.0200    g     4333.142    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -5680.431 -2572.288 -1851.038     7.765 16243.929 
##  
##  
##            Coef          S.E.        t     Pr(>|t|)
##  Intercept -1529376.4559 648036.0273 -2.36 0.0333  
##  year           764.6721    322.0046  2.37 0.0324  
## 
predict(pages_model, newdata = data.frame(year = 2022))
##     1 
## 16790

Scholar citations

Stats https://scholar.google.dk/citations?user=VKUbfSIAAAAJ&hl=en Q https://www.metaculus.com/questions/6216/gscholar-citations-for-2021/

citations = read_xlsx("data/data.xlsx", skip = 1, sheet = 2)

#plot
citations %>% 
  pivot_longer(cols = citations:cumulative) %>% 
  ggplot(aes(year, value, color = name)) +
  geom_line()

GG_save("figs/GS citations_year.png")

#forecast with linear model
cumulative_model = ols(cumulative ~ year, data = citations)
cumulative_model
## Linear Regression Model
##  
##  ols(formula = cumulative ~ year, data = citations)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs        7    LR chi2     26.90    R2       0.979    
##  sigma35.2670    d.f.            1    R2 adj   0.974    
##  d.f.       5    Pr(> chi2) 0.0000    g      268.476    
##  
##  Residuals
##  
##        1       2       3       4       5       6       7 
##   -3.464 -24.786 -12.107  30.571  42.250  16.929 -49.393 
##  
##  
##            Coef         S.E.       t      Pr(>|t|)
##  Intercept -202632.2500 13442.9981 -15.07 <0.0001 
##  year          100.6786     6.6648  15.11 <0.0001 
## 
predict(cumulative_model, newdata = data.frame(year = 2022))
##   1 
## 940
citations_model = ols(citations ~ year, data = citations)
citations_model
## Linear Regression Model
##  
##  ols(formula = citations ~ year, data = citations)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs        7    LR chi2      0.44    R2       0.061    
##  sigma38.0938    d.f.            1    R2 adj  -0.127    
##  d.f.       5    Pr(> chi2) 0.5064    g       10.952    
##  
##  Residuals
##  
##        1       2       3       4       5       6       7 
##   29.321  -8.786 -42.893 -16.000  16.893  53.786 -32.321 
##  
##  
##            Coef      S.E.       t     Pr(>|t|)
##  Intercept 8389.1071 14520.4846  0.58 0.5885  
##  year        -4.1071     7.1990 -0.57 0.5930  
## 
predict(citations_model, newdata = data.frame(year = 2022))
##    1 
## 84.5

Twitter followers

Ignoring the tail risk of getting banned, project the growth. Q https://www.metaculus.com/questions/5778/twitter-followers-at-end-of-2021/

#manual counts
# library(googlesheets4)
# googlesheets4::gs4_auth("the.dfx@gmail.com")
# 
# twitter_manual = read_sheet("https://docs.google.com/spreadsheets/d/19B_2EupQZxqnti8qI0Zg0kihL2vz23w2Ywz2BavmIKk/edit#gid=1438224973", sheet = "twitter")
twitter = read_xlsx("data/data.xlsx", skip = 0, sheet = 3)

#add the exact dates
twitter %<>% mutate(
  date = str_match(url, "\\d{8}")[, 1] %>% as_date(),
  date_num = as.numeric(date)
)

#fit a model
#have to set surface
twitter_fit = loess(followers ~ date_num, data = twitter, control = loess.control(surface = "direct"))
twitter_fit
## Call:
## loess(formula = followers ~ date_num, data = twitter, control = loess.control(surface = "direct"))
## 
## Number of Observations: 10 
## Equivalent Number of Parameters: 4.75 
## Residual Standard Error: 123
#other model
twitter_fit2 = ols(followers ~ rcs(date_num), data = twitter)
twitter_fit2
## Linear Regression Model
##  
##  ols(formula = followers ~ rcs(date_num), data = twitter)
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs        10    LR chi2     74.93    R2       0.999    
##  sigma141.6979    d.f.            4    R2 adj   0.999    
##  d.f.        5    Pr(> chi2) 0.0000    g     4995.618    
##  
##  Residuals
##  
##           1          2          3          4          5          6          7 
##    78.74406  -35.28063 -161.62921   41.65668  143.78201 -121.42535  -19.74596 
##           8          9         10 
##   152.13239  -78.32790    0.09392 
##  
##  
##              Coef       S.E.      t     Pr(>|t|)
##  Intercept   -4857.8117 2271.4403 -2.14 0.0855  
##  date_num        0.3115    0.1420  2.19 0.0797  
##  date_num'       4.3500    2.9853  1.46 0.2049  
##  date_num''     38.2860   43.1353  0.89 0.4154  
##  date_num'''   -92.8437   77.6748 -1.20 0.2856  
## 
#predict
time_span = seq(as.Date("2012-01-01"), as.Date("2022-01-01"), by = "month")

twitter_predict = tibble(
  date = time_span,
  loess = predict(twitter_fit, newdata = tibble(date_num = time_span %>% as.numeric())),
  rcs = predict(twitter_fit2, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>% 
  gather(key = model, value = prediction, loess:rcs)

#print 2022 values
twitter_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(twitter_predict, aes(date, prediction, color = model)) +
  geom_line() +
  geom_point(data = twitter, aes(date, followers, color = NULL)) +
  scale_x_date(date_breaks = "1 year", labels = year) +
  ggtitle("Historical and predicted Twitter followers")

GG_save("figs/twitter_followers.png")

Blogposts

Forecast number of blogposts in 2021. Q https://www.metaculus.com/questions/6139/emil-blogposts-in-2021/

#scrape the counts
archive_page = read_html("https://emilkirkegaard.dk/en/archive/")

#counts
posts = archive_page %>% 
  html_node("#sya_container")

#df
blogposts = tibble(
  year = 2007:2021,
  posts = posts %>% html_nodes(".sya_yearcount") %>% html_text() %>% str_extract("\\d+") %>% as.numeric() %>% rev()
)

#counts
blogposts %>% 
  filter(year != 2021) %>% 
  ggplot(aes(year, posts)) + 
  geom_line() +
  geom_label(aes(label = posts))

GG_save("figs/blogposts_year.png")

#linear model
posts_model = ols(posts ~ year, data = blogposts)
posts_model
## Linear Regression Model
##  
##  ols(formula = posts ~ year, data = blogposts)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       15    LR chi2      0.08    R2       0.005    
##  sigma52.8095    d.f.            1    R2 adj  -0.072    
##  d.f.      13    Pr(> chi2) 0.7838    g        4.305    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -82.983 -19.237   6.052  22.535 123.281 
##  
##  
##            Coef       S.E.      t     Pr(>|t|)
##  Intercept -1545.2524 6356.1369 -0.24 0.8117  
##  year          0.8071    3.1560  0.26 0.8021  
## 
predict(posts_model, newdata = data.frame(year = 2022))
##    1 
## 86.8

Youtube subs

Stats https://socialblade.com/youtube/channel/UCbT_SxIN5oMSb7bjYkE8FpQ/monthly Q https://www.metaculus.com/questions/6050/youtube-subs-will-emil-have-end-2021/

youtube = read_xlsx("data/data.xlsx", skip = 1, sheet = 4) %>% 
  mutate(
    date = as.Date(date),
    date_num = as.numeric(date)
  )

#counts
youtube %>% 
  ggplot(aes(date, subscribers)) + 
  geom_line() +
  geom_label(aes(label = subscribers)) +
  scale_x_date(date_breaks = "2 months")

GG_save("figs/youtube_year.png")

#have to set surface
youtube_loess = loess(subscribers ~ date_num, data = youtube, control = loess.control(surface = "direct"))
youtube_loess
## Call:
## loess(formula = subscribers ~ date_num, data = youtube, control = loess.control(surface = "direct"))
## 
## Number of Observations: 18 
## Equivalent Number of Parameters: 4.63 
## Residual Standard Error: 23.6
#other model
youtube_rcs = ols(subscribers ~ rcs(date_num), data = youtube)
youtube_rcs
## Linear Regression Model
##  
##  ols(formula = subscribers ~ rcs(date_num), data = youtube)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       18    LR chi2     93.56    R2       0.994    
##  sigma32.8769    d.f.            4    R2 adj   0.993    
##  d.f.      13    Pr(> chi2) 0.0000    g      454.793    
##  
##  Residuals
##  
##     Min     1Q Median     3Q    Max 
##  -36.57 -19.25 -10.88  22.12  67.12 
##  
##  
##              Coef        S.E.      t     Pr(>|t|)
##  Intercept   -22289.0971 4295.2723 -5.19 0.0002  
##  date_num         1.2417    0.2365  5.25 0.0002  
##  date_num'       23.3212    8.1001  2.88 0.0129  
##  date_num''     -28.6576   10.6984 -2.68 0.0189  
##  date_num'''      6.2923    5.1457  1.22 0.2431  
## 
#predict
time_span = seq(as.Date("2019-01-01"), as.Date("2022-01-01"), by = "month")

youtube_predict = tibble(
  date = time_span,
  loess = predict(youtube_loess, newdata = tibble(date_num = time_span %>% as.numeric())),
  rcs = predict(youtube_rcs, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>% 
  gather(key = model, value = prediction, loess:rcs)

#print 2022 values
youtube_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(youtube_predict, aes(date, prediction, color = model)) +
  geom_line() +
  geom_point(data = youtube, aes(date, subscribers, color = NULL)) +
  scale_x_date(date_breaks = "1 year", labels = year) +
  ggtitle("Historical and predicted Youtube followers")

GG_save("figs/youtube_subs.png")

ResearchGate

Stats https://web.archive.org/web/20181001000000*/https://www.researchgate.net/profile/Emil_O_W_Kirkegaard Q1 https://www.metaculus.com/questions/6223/ek-researchgate-reads-end-of-2021/ Q2 https://www.metaculus.com/questions/6224/ek-researchgate-citations-end-of-2021/

rg = read_xlsx("data/data.xlsx", skip = 1, sheet = 5) %>% 
  df_legalize_names()

#add the exact dates
rg %<>% mutate(
  date = str_match(url, "\\d{8}")[, 1] %>% as_date(),
  date_num = as.numeric(date)
)

Reads

#fit models
#have to set surface
rg_fit = loess(reads ~ date_num, data = rg, control = loess.control(surface = "direct"))
rg_fit
## Call:
## loess(formula = reads ~ date_num, data = rg, control = loess.control(surface = "direct"))
## 
## Number of Observations: 11 
## Equivalent Number of Parameters: 4.36 
## Residual Standard Error: 8640
#other model
rg_fit2 = ols(reads ~ rcs(date_num, 4), data = rg)
rg_fit2
## Linear Regression Model
##  
##  ols(formula = reads ~ rcs(date_num, 4), data = rg)
##  
##                      Model Likelihood    Discrimination    
##                            Ratio Test           Indexes    
##  Obs          11    LR chi2     35.82    R2       0.961    
##  sigma17877.7262    d.f.            3    R2 adj   0.945    
##  d.f.          7    Pr(> chi2) 0.0000    g    82497.224    
##  
##  Residuals
##  
##     Min     1Q Median     3Q    Max 
##  -24963  -8950  -1152   9529  26225 
##  
##  
##             Coef         S.E.        t     Pr(>|t|)
##  Intercept  -574069.7517 554236.0847 -1.04 0.3348  
##  date_num        34.1385     32.5721  1.05 0.3294  
##  date_num'     -318.6108    335.9341 -0.95 0.3745  
##  date_num''     555.5691    463.2443  1.20 0.2694  
## 
#squared term
rg_fit3 = ols(reads ~ pol(date_num, 2), data = rg)
rg_fit3
## Linear Regression Model
##  
##  ols(formula = reads ~ pol(date_num, 2), data = rg)
##  
##                      Model Likelihood    Discrimination    
##                            Ratio Test           Indexes    
##  Obs          11    LR chi2     35.28    R2       0.960    
##  sigma17133.9930    d.f.            2    R2 adj   0.949    
##  d.f.          8    Pr(> chi2) 0.0000    g    84001.969    
##  
##  Residuals
##  
##     Min     1Q Median     3Q    Max 
##  -23878 -11080  -1333  10346  28340 
##  
##  
##             Coef      S.E.     t     Pr(>|t|)
##  Intercept   2.83e+07 5.03e+06  5.62 0.0005  
##  date_num   -3.30e+03 5.70e+02 -5.78 0.0004  
##  date_num^2  9.62e-02 1.61e-02  5.96 0.0003  
## 
#predict
time_span = seq(as.Date("2012-01-01"), as.Date("2022-01-01"), by = "month")

rg_predict = tibble(
  date = time_span,
  loess = predict(rg_fit, newdata = tibble(date_num = time_span %>% as.numeric())),
  rcs = predict(rg_fit2, newdata = tibble(date_num = time_span %>% as.numeric())),
  quadratic = predict(rg_fit3, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>% 
  gather(key = model, value = prediction, loess:quadratic)

#print 2022 values
rg_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(rg_predict, aes(date, prediction, color = model)) +
  geom_line() +
  geom_point(data = rg, aes(date, reads, color = NULL)) +
  scale_x_date(date_breaks = "1 year", labels = year) +
  ggtitle("Historical and predicted ResearchGate reads")

GG_save("figs/researchgate_reads.png")

Citations

#fit models
#have to set surface
rg_fit = loess(citations ~ date_num, data = rg, control = loess.control(surface = "direct"))
rg_fit
## Call:
## loess(formula = citations ~ date_num, data = rg, control = loess.control(surface = "direct"))
## 
## Number of Observations: 11 
## Equivalent Number of Parameters: 4.36 
## Residual Standard Error: 14.9
#other model
rg_fit2 = ols(citations ~ rcs(date_num, 4), data = rg)
rg_fit2
## Linear Regression Model
##  
##  ols(formula = citations ~ rcs(date_num, 4), data = rg)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       11    LR chi2     53.06    R2       0.992    
##  sigma19.1625    d.f.            3    R2 adj   0.989    
##  d.f.       7    Pr(> chi2) 0.0000    g      210.903    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -31.769  -6.418   1.875   6.897  29.695 
##  
##  
##             Coef       S.E.     t     Pr(>|t|)
##  Intercept  -2435.0788 594.0662 -4.10 0.0046  
##  date_num       0.1468   0.0349  4.20 0.0040  
##  date_num'      0.2324   0.3601  0.65 0.5392  
##  date_num''    -0.2057   0.4965 -0.41 0.6911  
## 
#squared term
rg_fit3 = ols(citations ~ pol(date_num, 2), data = rg)
rg_fit3
## Linear Regression Model
##  
##  ols(formula = citations ~ pol(date_num, 2), data = rg)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       11    LR chi2     56.22    R2       0.994    
##  sigma15.5212    d.f.            2    R2 adj   0.992    
##  d.f.       8    Pr(> chi2) 0.0000    g      210.547    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -25.9823  -5.8287  -0.4587   8.9859  22.4020 
##  
##  
##             Coef       S.E.      t     Pr(>|t|)
##  Intercept  26037.6757 4555.9948  5.72 0.0004  
##  date_num      -3.1820    0.5166 -6.16 0.0003  
##  date_num^2     0.0001    0.0000  6.65 0.0002  
## 
#predict
time_span = seq(as.Date("2012-01-01"), as.Date("2022-01-01"), by = "month")

rg_predict = tibble(
  date = time_span,
  loess = predict(rg_fit, newdata = tibble(date_num = time_span %>% as.numeric())),
  rcs = predict(rg_fit2, newdata = tibble(date_num = time_span %>% as.numeric())),
  quadratic = predict(rg_fit3, newdata = tibble(date_num = time_span %>% as.numeric()))
) %>% 
  gather(key = model, value = prediction, loess:quadratic)

#print 2022 values
rg_predict %>% filter(str_detect(date, "2022"))
#plot with model
ggplot(rg_predict, aes(date, prediction, color = model)) +
  geom_line() +
  geom_point(data = rg, aes(date, citations, color = NULL)) +
  scale_x_date(date_breaks = "1 year", labels = year) +
  ggtitle("Historical and predicted ResearchGate citations")

GG_save("figs/researchgate_citations.png")