library(ggbeeswarm)
Loading required package: ggplot2
library(gt)
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
source(here::here("src/common_basis.R"))
here() starts at /Users/jiemakel/tyo/disc-analysis
── Attaching core tidyverse packages ──── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.2.1
✔ purrr     1.0.1     ✔ tidyr     1.3.0── Conflicts ────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errorsRegistered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      

Post count distribution through time

active_posters <- incel_posts_c %>%
  count(year=year(time_posted),month=month(time_posted), poster_id) %>%
  filter(n>3) %>%
  count(year, month, name="value") %>%
  mutate(name="active posters") %>%
  collect()
incel_posts_c %>%
  group_by(year=year(time_posted),month=month(time_posted)) %>%
  summarise(posts=n(),users=n_distinct(poster_id), .groups="drop") %>%
  pivot_longer(posts:users) %>%
  collect() %>%
  union_all(active_posters) %>%
  mutate(month=as.Date(str_c(year,'-',month,'-01'))) %>%
  filter(month<"2023-03-01") %>%
  ggplot(aes(x=month,y=value)) +
  geom_line() +
  scale_y_continuous(labels=scales::number) + 
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Month") +
  ylab("N") +
  theme_hsci_discrete() +
  facet_wrap(~name,scales="free_y",ncol=1)

Post count distribution (overall/lounge)

quantiles <- seq(0,1,by=0.05)
incel_users_c %>%
  select(user_total_posts) %>%
  collect() %>%
  reframe(
    quantile=quantiles,
    user_total_posts=quantile(user_total_posts,quantiles)
  ) %>%
  inner_join(
    incel_posts_c %>%
      count(poster_id) %>%
      select(n) %>%
      collect() %>%
      reframe(
        quantile=quantiles,
        user_lounge_posts=quantile(n,quantiles)
    ),
    join_by(quantile)
  ) %>%
  gt(rowname_col = "quantile") %>%
  fmt_percent(quantile, drop_trailing_zeros = TRUE) %>%
  fmt_number(columns = c(user_total_posts,user_lounge_posts), drop_trailing_zeros = TRUE)
user_total_posts user_lounge_posts
0% 0 1
5% 2 1
10% 5 1
15% 8 1
20% 12 2
25% 18 3
30% 26 4
35% 36 5
40% 50 6
45% 69 8
50% 95.5 12
55% 126 16
60% 177 22
65% 247 31
70% 353 47
75% 536 74
80% 802.4 122
85% 1,337.65 228
90% 2,373.1 460
95% 5,339 1,223.65
100% 319,186 30,485

Interaction between join date and total / lounge posts

incel_users_c %>%
  filter(user_joined>"1970-01-01") %>%
  ggplot(aes(x=user_joined,y=user_total_posts)) +
  geom_point(size=0.5) +
  geom_smooth(method="lm", formula="y~x") +
  theme_hsci_discrete() +
  scale_y_continuous(labels=scales::number) +
  xlab("user join date") +
  ylab("Total posts") +
  ggtitle("Total posts")

incel_posts_c %>%
  count(poster_id) %>%
  inner_join(incel_users_c, join_by(poster_id==user_id)) %>%
  filter(user_joined>"1970-01-01") %>%
  ggplot(aes(x=user_joined,y=n)) +
  geom_point(size=0.5) +
  geom_smooth(method="lm", formula="y~x") +
  theme_hsci_discrete() +
  scale_y_continuous(labels=scales::number) +
  xlab("user join date") +
  ylab("Lounge posts") +
  ggtitle("Lounge posts")

Board rhythm

incel_posts_c %>% 
  mutate(hour=hour(time_posted),weekday=weekday(time_posted)) %>%
  count(weekday,hour) %>%
  ggplot(aes(x=hour,y=n,color=as_factor(weekday))) +
  geom_line() +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(1, 0), 
    legend.position = c(0.98, 0.02), 
    legend.background = element_blank(), 
    legend.key = element_blank()) +
  xlab("Hour (UTC)") +
  ylab("Total number of posts") +
  labs(color="Day of the week") +
  ggtitle("Posts by the time of day")

weekdays <- tribble(~index,~weekday,
                    0,"Mon",
                    1,"Tue",
                    2,"Wed",
                    3,"Thu",
                    4,"Fri",
                    5,"Sat",
                    6,"Sun")
incel_posts_c %>% 
  mutate(weekday=weekday(time_posted)) %>%
  count(weekday) %>%
  ggplot(aes(x=weekday,y=n)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_x_continuous(breaks=weekdays$index, labels=weekdays$weekday) +
  scale_y_continuous(labels=scales::number) +
  xlab("Day of the week") +
  ylab("Total number of posts") +
  ggtitle("Posts by day of the week")

user_type <- incel_posts_c %>%
  count(poster_id) %>%
  mutate(user_type=if_else(n>=100,"top","other")) %>%
  select(poster_id, user_type)

incel_posts_c %>% 
  mutate(hour=hour(time_posted)) %>%
  inner_join(user_type, join_by(poster_id)) %>%
  count(user_type,hour) %>%
  group_by(user_type) %>%
  mutate(proportion=n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=hour,y=proportion,color=user_type)) +
  geom_line() +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(1, 0), 
    legend.position = c(0.98, 0.02), 
    legend.background = element_blank(), 
    legend.box.just = "bottom", 
    legend.key = element_blank(), 
    legend.box = "horizontal") +
  scale_y_continuous(labels=scales::percent_format(accuracy=1)) +
  xlab("Hour (UTC)") +
  ylab("Proportion of posts") +
  labs(color="user type") +
  ggtitle("Proportion of posts by the time of day of top/other users")
Warning: Missing values are always removed in SQL aggregation
functions.
Use `na.rm = TRUE` to silence this warning

Are there distinct subpopulations?

incel_posts_c %>% 
  mutate(hour=hour(time_posted)) %>%
  count(poster_id,hour) %>%
  group_by(poster_id) %>%
  filter(sum(n)>=100) %>% # limit to users with enough data to get any pattern
  mutate(proportion=n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=hour,y=proportion)) +
  geom_quasirandom(size=0.25) +
  coord_cartesian(ylim=c(0,0.25)) +
  theme_hsci_discrete() +
  scale_y_continuous(labels=scales::percent_format(accuracy=1)) +
  xlab("Hour (UTC)") +
  ylab("Proportion of posts") +
  labs(color="user type") +
  ggtitle("Proportion of posts by the time of day for each individual user")

  • There do not seem to be clearly distinct time profiles with large groups of users. There may be some variation in UTC night time posting behaviour (3-12 UTC)

How long are people active by year joined

incel_posts_c %>%
    group_by(poster_id) %>%
    summarise(earliest_post=min(time_posted),latest_post=max(time_posted), .groups="drop") %>%
    mutate(earliest_post_year=year(earliest_post), active_period_days=sql("timestampdiff(day,earliest_post,latest_post)")) %>%
  ggplot(aes(x=earliest_post_year,y=active_period_days)) + 
  geom_quasirandom(size=0.25) +
  theme_hsci_discrete() +
  xlab("Year of earliest post") +
  ylab("Time between earliest and latest post (days)") +
  ggtitle("Time between earliest and latest post by year joined")

Different -cels

cel_post_contents <- incel_posts_c %>% 
  filter(str_detect(post_content,"cels?\\b")) %>% 
  select(post_id, post_content) %>% 
  collect() 

cels_by_post <- cel_post_contents %>%
  mutate(cel=str_extract_all(post_content, "(?<! expand...)[^ \\n]+ cels?\\b|[^ \\n]*cels?\\b(?! said)")) %>%
  unnest(cel) %>% 
  filter(!str_detect(cel, "^@")) %>%
  select(post_id, cel) %>% 
  mutate(cel=cel %>% 
           str_to_lower() %>% 
           str_replace_all("\\W","") %>% 
           str_replace_all("s$",""))
cels <- cels_by_post %>%
  count(cel) %>% 
  arrange(desc(n))
cels %>% 
  write_tsv(here("data/output/jiemakel/cels.tsv"),na="",quote="needed")
cels %>%
  head(n=100) %>%
  gt(rowname_col="cel") %>%
  fmt_integer(n)
n
incel 116,353
fakecel 14,234
truecel 10,269
volcel 7,820
greycel 7,145
brocel 6,364
httpsincel 5,779
graycel 5,446
trucel 4,641
ricecel 4,330
currycel 3,160
mentalcel 2,309
oldcel 2,243
gymcel 2,185
youngcel 2,030
blackcel 1,977
lostcel 1,977
braincel 1,628
femcel 1,626
rincel 1,588
ritalincel 1,397
escortcel 1,329
whitecel 1,267
fatcel 1,261
excel 994
maycel 983
poorcel 921
blackops2cel 830
newcel 823
fbicel 788
weebcel 758
iqcel 745
richcel 723
cancel 661
framecel 604
bluecel 570
fellowcel 561
sandcel 521
gaycel 508
ethnicel 488
stormfrontcel 459
animecel 443
2019cel 439
2017cel 436
itcel 420
slavcel 399
tallcel 397
shortcel 373
rbraincel 367
chadcel 366
stemcel 345
2020cel 336
neetcel 316
diocel 313
pedocel 313
teencel 313
ukcel 313
2018cel 304
cel 299
kikecel 297
ethniccel 282
baldcel 277
muslimcel 271
ogrecel 267
standardcel 267
junecel 262
burgercel 259
wristcel 252
ksgcel 251
modcel 241
locationcel 233
mayocel 231
lowiqcel 229
ppecel 229
sergeantincel 228
sfcel 223
2022cel 220
cuckcel 213
voicecel 208
nonincel 205
ogcel 202
jewcel 201
dickcel 200
studycel 198
novembercel 197
antiincel 196
nazicel 194
eskimocel 185
finncel 183
rightfulcel 182
stormcel 177
augustcel 174
octobercel 174
usacel 174
femdomcel 173
bagelcel 170
nearcel 168
arabcel 161
autistcel 160
chatcel 159
cels2 <- cels_by_post %>%
  filter(cel!="incel") %>%
  distinct() %>%
  group_by(post_id) %>%
  filter(n()>1) %>%
  arrange(cel) %>%
  summarise(cel=str_flatten(cel, collapse=", "), .groups="drop") %>%
  count(cel) %>% 
  arrange(desc(n))
cels2 %>% 
  write_tsv(here("data/output/jiemakel/cels2.tsv"),na="",quote="needed")
cels2 %>%
  head(n=100) %>%
gt(rowname_col="cel") %>%
  fmt_integer(n)
n
fakecel, truecel 504
fatcel, volcel 246
fakecel, trucel 148
graycel, greycel 143
fakecel, volcel 126
trucel, truecel 121
fakecel, mentalcel 106
currycel, ricecel 103
bluecel, greycel 94
currycel, trucel 90
fakecel, youngcel 90
oldcel, youngcel 81
mentalcel, truecel 76
fakecel, greycel 66
femcel, volcel 65
mentalcel, volcel 62
truecel, volcel 61
fakecel, graycel 58
fakecel, whitecel 55
rbraincel, rincel 49
aaaaaaaaaaacel, cheesecel, daydreamincel, diocel, itsover4cel, manicel, rightfulcel, singleplayercel 47
lostcel, trucel 46
gaycel, maycel 43
brocel, fakecel 42
escortcel, fakecel 42
junecel, maycel 41
braincel, rincel 40
fakecel, httpsincel 40
standardcel, volcel 38
gaycel, greycel 35
fakecel, fatcel 32
bluecel, graycel 31
greycel, truecel 31
brocel, truecel 30
oldcel, truecel 30
brocel, trucel 29
fakecel, ricecel 28
currycel, truecel 27
brocel, graycel 26
ethnicel, whitecel 26
chadcel, mentalcel 25
fakecel, rincel 25
fakecel, tallcel 25
httpsincel, truecel 25
blackcel, ricecel 24
ricecel, truecel 24
volcel, whitecel 24
blackcel, whitecel 23
fakecel, oldcel 23
poorcel, richcel 22
braincel, rbraincel 21
fakecel, richcel 21
ethniccel, whitecel 20
rincel, rtruecel 20
truecel, youngcel 20
2019cel, 2020cel 19
blackcel, fakecel 18
currycel, fakecel 18
escortcel, volcel 18
fakecel, maycel 18
fakecel, newcel 18
ricecel, whitecel 18
aaaaaaaaaaacel, cheesecel, itsover4cel, singleplayercel 17
brocel, greycel 17
fatcel, truecel 17
graycel, truecel 17
greycel, httpsincel 17
greycel, newcel 17
ricecel, trucel 17
rincel, truecel 17
2018cel, 2019cel 16
braincel, fakecel 16
brocel, httpsincel 16
fakecel, femcel 16
fakecel, rightfulcel 16
fakecel, truestcel 16
graycel, gymcel 16
greycel, trucel 16
2022cel, 2023cel 15
alcoholiccel, homelesscel 15
currycel, volcel 15
escortcel, fakecel, httpsincel, mentacel 15
fakecel, locationcel 15
gaycel, graycel 15
gaycel, volcel 15
mentalcel, trucel 15
tallcel, volcel 15
trucel, volcel 15
truecel, whitecel 15
braincel, femcel 14
framecel, gymcel 14
graycel, trucel 14
graycel, volcel 14
httpsincel, volcel 14
oldcel, trucel 14
2017cel, fakecel 13
blackcel, volcel 13
escortcel, fakecel, mentacel 13
fakecel, mentalcel, truecel 13
graycel, httpsincel 13
trucel_posts <- cels_by_post %>% 
  filter(str_detect(cel,"tru")) %>%
  distinct(post_id)

fakecel_posts <- cels_by_post %>% 
  filter(str_detect(cel,"fake")) %>%
  distinct(post_id)
cels_by_post %>% inner_join(trucel_posts, join_by(post_id)) %>%
  filter(!cel %in% c("trucel","truecel", "incel", "httpsincel")) %>%
  count(cel) %>%
  arrange(desc(n))

cels_by_post %>% inner_join(fakecel_posts, join_by(post_id)) %>%
  filter(!cel %in% c("fakecel","incel", "httpsincel")) %>%
  count(cel) %>%
  arrange(desc(n))

You/we/they are

are_post_contents <- incel_posts_c %>% 
  filter(str_detect(post_content,"(you|they|we)('re| are) ")) %>% 
  select(post_id, post_content) %>% 
  collect() %>%
  mutate(post_content = post_content %>% str_replace_all("Click to expand...",".") %>% str_replace_all("\\s+"," "))
ares_by_post <- c(1:4) %>% 
  map_dfr(~are_post_contents %>%
    mutate(length= .x, are=str_extract_all(post_content, str_c("(you|they|we)('re| are)('nt| not)?( a| an| the)?", strrep(" \\w+",.x))))
  ) %>%  
  unnest(are) %>%
  select(post_id, are, length) %>%
  mutate(are=are %>% str_replace("'re"," are") %>% str_replace("'nt", " not")) %>%
  mutate(
    who=str_replace(are," .*",""), 
    are=str_replace(are, ".*? ",""),
    stem=str_replace(are, " [^ ]*$", "")
  ) %>%
  relocate(post_id, length, who, are)
ares_by_post
ares_by_post_count <- ares_by_post %>%
  count(length, who, are, stem)
ares_by_post_count
top_ares <- ares_by_post_count %>% 
  group_by(length, who) %>%
  slice_max(n,n=20) %>%
  ungroup()
ares_by_post_count %>% 
  anti_join(top_ares, join_by(who, are==stem)) %>%
  anti_join(ares_by_post_count %>% mutate(length=length+1), join_by(who,are,length)) %>%
  select(-stem) %>%
  group_by(length, who) %>%
  slice_max(n,n=20) %>%
  mutate(order=row_number()) %>%
  ungroup() %>%
  filter(order<=20) %>%
  pivot_wider(id_cols=c("length","order"), names_from="who", values_from=c("are","n")) %>%
  relocate(are_they,n_they,are_we,n_we,are_you,n_you) %>%
  arrange(desc(length)) %>%
  gt(groupname_col = "length", rowname_col="order") %>%
  cols_label(
    are_they="They",
    n_they="N",
    are_we="We",
    n_we="N",
    are_you="You",
    n_you="N") %>%
  tab_style(
    style = list(
      cell_borders(
        sides = c("right"),
        style = "solid"
      )
    ),
    locations = cells_body(
      columns = c(n_we,n_you,n_they)
    )
  )
They N We N You N
4
1 are mistaken and should relieve 51 are neurologically constrained from simultaneously 30 are for replying on a 105
2 are just a bunch of 40 are all in this together 26 are a disgrace to your family 80
3 are all a bunch of 25 are actively trying to improve 23 are going to have to 74
4 are being raped all over 22 are all in the same 22 are going to have a 30
5 are victimized every single time 22 are just a bunch of 19 are pretending to be incels 29
6 are weird and sad lunatics 17 are the ones that need to 18 are nothing but LARPers who 28
7 are more likely to be 16 are weird creepy cult they 18 are too low IQ to 26
8 are just pretending to love 15 are on the same page 16 are too much of a 26
9 are not thinking of the father 15 are in the same boat 15 are LITERALLY dedicating at least 24
10 are the scum of the earth 15 are all gonna make it 14 are a man of culture as 24
11 are all the same thing 14 are concerned specifically with the 14 are incel of whether you 24
12 are going to have to 14 are all going to be 11 are going to be a 23
13 are an agreed upon social harm 12 are back to the bragging 11 are one of my favorite 21
14 are drawn to the edgier 12 are supposed to be the 11 are one of the few 21
15 are in the same boat 12 are all a bunch of 10 are a nincompoop will still be 20
16 are one of the most 12 are all going to die 10 are trying to tell me 19
17 are some of the most 12 are going to talk about 10 are never going to get 18
18 are extremely fragile and insecure 11 are talking about the same 10 are one of the most 18
19 are much more likely to 11 are at the bottom of 9 are just throwing posts into 17
20 are very easy on the 11 are going to have a 9 are rating is so far 17
3
1 are going to be 81 are going to be 35 are going to get 116
2 are supposed to be 61 are all the same 29 are one of those 86
3 are part of the 53 are living in the 25 are trying to say 80
4 are less likely to 35 are going to get 20 are paying the price 78
5 are trying to make 35 are radical and misogynistic 18 are just going to 63
6 are going to do 32 are here to suffer 17 are going to do 57
7 are just trying to 29 are missing out on 17 are missing out on 55
8 are better than you 28 are gonna have to 14 are part of the 54
9 are the only ones who 28 are talking about here 14 are good to go 53
10 are going to get 27 are the only ones who 13 are more likely to 50
11 are too stupid to 27 are going to do 12 are on this forum 49
12 are better than us 26 are part of the 12 are one of them 46
13 are nothing more than 26 are going to say 11 are supposed to be 46
14 are genuinely fucking retarded 25 are not entitled to sex 11 are no better than 45
15 are trying to get 24 are talking about a 11 are talking about the 45
16 are going to make 23 are in a simulation 10 are gonna have to 44
17 are full of shit 22 are in this together 10 are still an incel 44
18 are some kind of 22 are led to believe 10 are still a virgin 39
19 are just going to 21 are never going to 10 are nothing but a 36
20 are made out of 21 are stuck in a 10 are in the US 35
2
1 are talking about 184 are in a 87 are talking about 1013
2 are in the 160 are forced to 57 are just a 386
3 are able to 121 are at the 50 are in the 376
4 are attracted to 121 are here to 47 are in a 351
5 are willing to 119 are able to 45 are looking for 311
6 are forced to 111 are trying to 45 are able to 245
7 are just as 100 are dealing with 38 are interested in 225
8 are in a 95 are gonna have 38 are still a 224
9 are doing it 90 are at it 37 are willing to 196
10 are entitled to 84 are all here 36 are not entitled to 176
11 are full of 81 are in this 34 are not going to 157
12 are on the 81 are all gonna 33 are such a 157
13 are in their 80 are used to 33 are forced to 153
14 are a bunch of 76 are part of 32 are doing it 149
15 are no longer 76 are gonna be 31 are referring to 147
16 are not going to 72 are stuck in 31 are good at 144
17 are allowed to 71 are the bad guys 31 are good looking 144
18 are the ones who 71 are all just 30 are on the 141
19 are afraid of 67 are on a 30 are reading this 140
20 are too busy 66 are not going to 29 are ugly and 134
1
1 are so 1046 are just 404 are right 1471
2 are still 879 are still 217 are doing 1208
3 are too 735 are so 215 are ugly 1093
4 are more 667 are both 179 are so 1004
5 are also 427 are ugly 163 are saying 914
6 are good 408 are getting 149 are gonna 878
7 are very 402 are too 136 are incel 850
8 are both 382 are being 135 are on 831
9 are being 374 are incels 128 are an incel 723
10 are gonna 356 are already 120 are too 660
11 are really 348 are doing 112 are good 594
12 are probably 346 are now 89 are white 573
13 are like 341 are to 88 are probably 559
14 are fucking 338 are fucked 85 are at 541
15 are pretty 332 are more 85 are being 496
16 are getting 328 are like 83 are getting 489
17 are actually 317 are stuck 81 are already 477
18 are always 316 are not 76 are really 470
19 are not 312 are the ones 76 are actually 404
20 are ugly 309 are born 75 are here 400
NA
---
title: "Incel analysis"
date: "`r Sys.Date()`"
output: 
  html_notebook:
    toc: yes
    code_folding: hide
  md_document:
    variant: gfm 
    toc: yes
---

```{r setup}
library(ggbeeswarm)
library(gt)
source(here::here("src/common_basis.R"))
```

# Post count distribution through time

```{r}
active_posters <- incel_posts_c %>%
  count(year=year(time_posted),month=month(time_posted), poster_id) %>%
  filter(n>3) %>%
  count(year, month, name="value") %>%
  mutate(name="active posters") %>%
  collect()
```

```{r}
incel_posts_c %>%
  group_by(year=year(time_posted),month=month(time_posted)) %>%
  summarise(posts=n(),users=n_distinct(poster_id), .groups="drop") %>%
  pivot_longer(posts:users) %>%
  collect() %>%
  union_all(active_posters) %>%
  mutate(month=as.Date(str_c(year,'-',month,'-01'))) %>%
  filter(month<"2023-03-01") %>%
  ggplot(aes(x=month,y=value)) +
  geom_line() +
  scale_y_continuous(labels=scales::number) + 
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Month") +
  ylab("N") +
  theme_hsci_discrete() +
  facet_wrap(~name,scales="free_y",ncol=1)
```

# Post count distribution (overall/lounge)

```{r}
quantiles <- seq(0,1,by=0.05)
incel_users_c %>%
  select(user_total_posts) %>%
  collect() %>%
  reframe(
    quantile=quantiles,
    user_total_posts=quantile(user_total_posts,quantiles)
  ) %>%
  inner_join(
    incel_posts_c %>%
      count(poster_id) %>%
      select(n) %>%
      collect() %>%
      reframe(
        quantile=quantiles,
        user_lounge_posts=quantile(n,quantiles)
    ),
    join_by(quantile)
  ) %>%
  gt(rowname_col = "quantile") %>%
  fmt_percent(quantile, drop_trailing_zeros = TRUE) %>%
  fmt_number(columns = c(user_total_posts,user_lounge_posts), drop_trailing_zeros = TRUE)
```

 * As expected, the distribution is very skewed. Half the users have less than 100 posts, while the top 25% have more than 500. 

# Interaction between join date and total / lounge posts

```{r}
incel_users_c %>%
  filter(user_joined>"1970-01-01") %>%
  ggplot(aes(x=user_joined,y=user_total_posts)) +
  geom_point(size=0.5) +
  geom_smooth(method="lm", formula="y~x") +
  theme_hsci_discrete() +
  scale_y_continuous(labels=scales::number) +
  xlab("user join date") +
  ylab("Total posts") +
  ggtitle("Total posts")
```

```{r}
incel_posts_c %>%
  count(poster_id) %>%
  inner_join(incel_users_c, join_by(poster_id==user_id)) %>%
  filter(user_joined>"1970-01-01") %>%
  ggplot(aes(x=user_joined,y=n)) +
  geom_point(size=0.5) +
  geom_smooth(method="lm", formula="y~x") +
  theme_hsci_discrete() +
  scale_y_continuous(labels=scales::number) +
  xlab("user join date") +
  ylab("Lounge posts") +
  ggtitle("Lounge posts")
```

 * The earlier you join, the more likely you are to have more posts, but there doesn't seem to be a discernible pattern for when the real "heavy hitters" have joined.

# Board rhythm

```{r}
incel_posts_c %>% 
  mutate(hour=hour(time_posted),weekday=weekday(time_posted)) %>%
  count(weekday,hour) %>%
  ggplot(aes(x=hour,y=n,color=as_factor(weekday))) +
  geom_line() +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(1, 0), 
    legend.position = c(0.98, 0.02), 
    legend.background = element_blank(), 
    legend.key = element_blank()) +
  xlab("Hour (UTC)") +
  ylab("Total number of posts") +
  labs(color="Day of the week") +
  ggtitle("Posts by the time of day")
```

```{r}
weekdays <- tribble(~index,~weekday,
                    0,"Mon",
                    1,"Tue",
                    2,"Wed",
                    3,"Thu",
                    4,"Fri",
                    5,"Sat",
                    6,"Sun")
incel_posts_c %>% 
  mutate(weekday=weekday(time_posted)) %>%
  count(weekday) %>%
  ggplot(aes(x=weekday,y=n)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_x_continuous(breaks=weekdays$index, labels=weekdays$weekday) +
  scale_y_continuous(labels=scales::number) +
  xlab("Day of the week") +
  ylab("Total number of posts") +
  ggtitle("Posts by day of the week")
```


```{r}
user_type <- incel_posts_c %>%
  count(poster_id) %>%
  mutate(user_type=if_else(n>=100,"top","other")) %>%
  select(poster_id, user_type)

incel_posts_c %>% 
  mutate(hour=hour(time_posted)) %>%
  inner_join(user_type, join_by(poster_id)) %>%
  count(user_type,hour) %>%
  group_by(user_type) %>%
  mutate(proportion=n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=hour,y=proportion,color=user_type)) +
  geom_line() +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(1, 0), 
    legend.position = c(0.98, 0.02), 
    legend.background = element_blank(), 
    legend.box.just = "bottom", 
    legend.key = element_blank(), 
    legend.box = "horizontal") +
  scale_y_continuous(labels=scales::percent_format(accuracy=1)) +
  xlab("Hour (UTC)") +
  ylab("Proportion of posts") +
  labs(color="user type") +
  ggtitle("Proportion of posts by the time of day of top/other users")
```

 * There doesn't seem to be a difference in daily rhythms between top users and others. 
 * Interestingly, no big differences by day of week
 * How international is the forum? 
 
## Are there distinct subpopulations?
 
```{r}
incel_posts_c %>% 
  mutate(hour=hour(time_posted)) %>%
  count(poster_id,hour) %>%
  group_by(poster_id) %>%
  filter(sum(n)>=100) %>% # limit to users with enough data to get any pattern
  mutate(proportion=n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=hour,y=proportion)) +
  geom_quasirandom(size=0.25) +
  coord_cartesian(ylim=c(0,0.25)) +
  theme_hsci_discrete() +
  scale_y_continuous(labels=scales::percent_format(accuracy=1)) +
  xlab("Hour (UTC)") +
  ylab("Proportion of posts") +
  labs(color="user type") +
  ggtitle("Proportion of posts by the time of day for each individual user")
```
 
  * There do not seem to be clearly distinct time profiles with large groups of users. There may be some variation in UTC night time posting behaviour (3-12 UTC)
  
# How long are people active by year joined

```{r}
incel_posts_c %>%
    group_by(poster_id) %>%
    summarise(earliest_post=min(time_posted),latest_post=max(time_posted), .groups="drop") %>%
    mutate(earliest_post_year=year(earliest_post), active_period_days=sql("timestampdiff(day,earliest_post,latest_post)")) %>%
  ggplot(aes(x=earliest_post_year,y=active_period_days)) + 
  geom_quasirandom(size=0.25) +
  theme_hsci_discrete() +
  xlab("Year of earliest post") +
  ylab("Time between earliest and latest post (days)") +
  ggtitle("Time between earliest and latest post by year joined")
```

 * In 2019, there seem to have been more people joining who stayed on longer.

# Different -cels
 
```{r}
cel_post_contents <- incel_posts_c %>% 
  filter(str_detect(post_content,"cels?\\b")) %>% 
  select(post_id, post_content) %>% 
  collect() 

cels_by_post <- cel_post_contents %>%
  mutate(cel=str_extract_all(post_content, "(?<! expand...)[^ \\n]+ cels?\\b|[^ \\n]*cels?\\b(?! said)")) %>%
  unnest(cel) %>% 
  filter(!str_detect(cel, "^@")) %>%
  select(post_id, cel) %>% 
  mutate(cel=cel %>% 
           str_to_lower() %>% 
           str_replace_all("\\W","") %>% 
           str_replace_all("s$",""))
```
 
 
```{r}
cels <- cels_by_post %>%
  count(cel) %>% 
  arrange(desc(n))
```
 
```{r}
cels %>% 
  write_tsv(here("data/output/jiemakel/cels.tsv"),na="",quote="needed")
```

```{r}
cels %>%
  head(n=100) %>%
  gt(rowname_col="cel") %>%
  fmt_integer(n)
```
 
```{r}
cels2 <- cels_by_post %>%
  filter(cel!="incel") %>%
  distinct() %>%
  group_by(post_id) %>%
  filter(n()>1) %>%
  arrange(cel) %>%
  summarise(cel=str_flatten(cel, collapse=", "), .groups="drop") %>%
  count(cel) %>% 
  arrange(desc(n))
```
 
```{r}
cels2 %>% 
  write_tsv(here("data/output/jiemakel/cels2.tsv"),na="",quote="needed")
```
 
```{r}
cels2 %>%
  head(n=100) %>%
gt(rowname_col="cel") %>%
  fmt_integer(n)
```
 
```{r}
trucel_posts <- cels_by_post %>% 
  filter(str_detect(cel,"tru")) %>%
  distinct(post_id)

fakecel_posts <- cels_by_post %>% 
  filter(str_detect(cel,"fake")) %>%
  distinct(post_id)
```
 
```{r}
cels_by_post %>% inner_join(trucel_posts, join_by(post_id)) %>%
  filter(!cel %in% c("trucel","truecel", "incel", "httpsincel")) %>%
  count(cel) %>%
  arrange(desc(n))

cels_by_post %>% inner_join(fakecel_posts, join_by(post_id)) %>%
  filter(!cel %in% c("fakecel","incel", "httpsincel")) %>%
  count(cel) %>%
  arrange(desc(n))
```

# You/we/they are
```{r}
are_post_contents <- incel_posts_c %>% 
  filter(str_detect(post_content,"(you|they|we)('re| are) ")) %>% 
  select(post_id, post_content) %>% 
  collect() %>%
  mutate(post_content = post_content %>% str_replace_all("Click to expand...",".") %>% str_replace_all("\\s+"," "))
```


```{r}
ares_by_post <- c(1:4) %>% 
  map_dfr(~are_post_contents %>%
    mutate(length= .x, are=str_extract_all(post_content, str_c("(you|they|we)('re| are)('nt| not)?( a| an| the)?", strrep(" \\w+",.x))))
  ) %>%  
  unnest(are) %>%
  select(post_id, are, length) %>%
  mutate(are=are %>% str_replace("'re"," are") %>% str_replace("'nt", " not")) %>%
  mutate(
    who=str_replace(are," .*",""), 
    are=str_replace(are, ".*? ",""),
    stem=str_replace(are, " [^ ]*$", "")
  ) %>%
  relocate(post_id, length, who, are)
ares_by_post
```


```{r}
ares_by_post_count <- ares_by_post %>%
  count(length, who, are, stem)
ares_by_post_count
```

```{r}
top_ares <- ares_by_post_count %>% 
  group_by(length, who) %>%
  slice_max(n,n=20) %>%
  ungroup()
```


```{r}
ares_by_post_count %>% 
  anti_join(top_ares, join_by(who, are==stem)) %>%
  anti_join(ares_by_post_count %>% mutate(length=length+1), join_by(who,are,length)) %>%
  select(-stem) %>%
  group_by(length, who) %>%
  slice_max(n,n=20) %>%
  mutate(order=row_number()) %>%
  ungroup() %>%
  filter(order<=20) %>%
  pivot_wider(id_cols=c("length","order"), names_from="who", values_from=c("are","n")) %>%
  relocate(are_they,n_they,are_we,n_we,are_you,n_you) %>%
  arrange(desc(length)) %>%
  gt(groupname_col = "length", rowname_col="order") %>%
  cols_label(
    are_they="They",
    n_they="N",
    are_we="We",
    n_we="N",
    are_you="You",
    n_you="N") %>%
  tab_style(
    style = list(
      cell_borders(
        sides = c("right"),
        style = "solid"
      )
    ),
    locations = cells_body(
      columns = c(n_we,n_you,n_they)
    )
  )
  
```

