library(ggbeeswarm)
Loading required package: ggplot2
library(gt)
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
library(tictoc)
source(here::here("src/common_basis.R"))
here() starts at /Users/jiemakel/tyo/disc-analysis
── Attaching core tidyverse packages ────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.2.1
✔ purrr     1.0.1     ✔ tidyr     1.3.0
── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
# Evenly spaced grids on [0, 1]: `centiles` in steps of 0.1 and `percentiles`
# in steps of 0.01. Neither is referenced in the visible part of this notebook.
centiles <- seq(from = 0, to = 1, by = 0.1)
percentiles <- seq(from = 0, to = 1, by = 0.01)

Fetal personhood

# Three log-log frequency plots over abortion_tweets_c, one per engagement
# metric: x is the metric value, y is the number of rows having that value.
# NOTE(review): a value of 0 has no finite position on a log10 x axis, so the
# zero-count rows are presumably left out of each plot -- confirm intended.

# Replies.
abortion_tweets_c %>%
  count(reply_count) %>%
  ggplot(mapping = aes(x = reply_count, y = n)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Reply count distribution")

# Retweets.
abortion_tweets_c %>%
  count(retweet_count) %>%
  ggplot(mapping = aes(x = retweet_count, y = n)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Retweet count distribution")

# Likes.
abortion_tweets_c %>%
  count(like_count) %>%
  ggplot(mapping = aes(x = like_count, y = n)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Like count distribution")

Incels

# Thread-length distribution per thread label: within each label, the share of
# rows that have a given value of `posts`, drawn as one line per label on
# log-log axes.
incel_threads_c %>%
  count(thread_label, posts) %>%
  group_by(thread_label) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = posts, y = prop, colour = thread_label)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Thread length distribution by label")

# How often posts get quoted: first count the quotes each post received (`n`),
# then count how many posts share each value of `n` (`nn`).
# Quotes whose quoted_post_id is 0 and self-quotes are excluded.
incel_quotes_c %>%
  filter(quoted_post_id != 0, quoted_post_id != quoting_post_id) %>%
  count(quoted_post_id) %>%
  # Name the second-level count explicitly; otherwise count() has to pick
  # `nn` on its own and reports that with a message on every run.
  count(n, name = "nn") %>%
  ggplot(aes(x = n, y = nn)) +
  geom_point() +
  theme_hsci_discrete() +
  scale_y_log10() +
  ggtitle("Distribution of the number of times a message has been quoted")
Storing counts in nn, as n already present in input
ℹ Use `name = "new_name"` to pick a new name.

# Build the quote graph with one row per distinct (quoting post, quoted post)
# edge, annotated with the poster of each end. Edges whose quoted_post_id is 0
# and self-quotes are removed.
# compute_a() is a project helper defined outside this file; its arguments
# suggest it stores the result as a non-temporary, indexed table and replaces
# an existing one -- confirm against its definition.
incel_quote_tree_a <- incel_quotes_a %>%
  # Join keys are spelled out so the joins cannot silently pick up additional
  # shared columns and do not emit a "Joining with `by = ...`" message.
  inner_join(
    incel_posts_a %>%
      select(quoting_post_id = post_id, quoting_post_poster_id = poster_id),
    by = join_by(quoting_post_id)
  ) %>%
  inner_join(
    incel_posts_a %>%
      select(quoted_post_id = post_id, quoted_post_poster_id = poster_id),
    by = join_by(quoted_post_id)
  ) %>%
  distinct(
    quoting_post_id, quoted_post_id,
    quoting_post_poster_id, quoted_post_poster_id
  ) %>%
  filter(quoted_post_id != 0, quoted_post_id != quoting_post_id) %>%
  compute_a(
    name = "incel_quote_tree_a",
    unique_indexes = list(
      c("quoting_post_id", "quoted_post_id"),
      c("quoted_post_id", "quoting_post_id")
    ),
    temporary = FALSE,
    overwrite = TRUE
  )
# Query over the incel_quote_tree_a table that expands quote edges into chains.
# Each result row is (descendant_post_id, ancestor_post_id, length), where
# length is the number of quote edges on one path from the descendant to the
# ancestor. The recursion starts from single edges and extends the ancestor
# end by one further quote per step.
# NOTE(review): UNION de-duplicates on the whole row including length, so it
# would not stop the recursion if the quote graph contained a cycle -- confirm
# the data is acyclic.
# NOTE(review): ancestors_q is not referenced in the visible part of this file.
ancestors_q <- tbl(con,sql('
WITH RECURSIVE ancestors AS ( 
  SELECT quoting_post_id AS descendant_post_id, quoted_post_id AS ancestor_post_id, 1 AS length
  FROM incel_quote_tree_a
  UNION 
  SELECT a.descendant_post_id, qt.quoted_post_id AS ancestor_post_id, length+1 AS length
  FROM incel_quote_tree_a qt, ancestors a
  WHERE qt.quoting_post_id = a.ancestor_post_id
) 
SELECT * FROM ancestors'))
# Same kind of (descendant_post_id, ancestor_post_id, length) rows, grown from
# the other end: each step attaches a post that quotes the current descendant
# while the ancestor stays fixed.
descendants_q <- tbl(con,sql('
WITH RECURSIVE descendants AS ( 
  SELECT quoting_post_id AS descendant_post_id, quoted_post_id AS ancestor_post_id, 1 AS length
  FROM incel_quote_tree_a
  UNION 
  SELECT qt.quoting_post_id AS descendant_post_id, ancestor_post_id, length+1 AS length
  FROM incel_quote_tree_a qt, descendants d
  WHERE qt.quoted_post_id = d.descendant_post_id
) 
SELECT * FROM descendants'))
# For every descendant post keep the row(s) of descendants_q with the greatest
# chain length, i.e. its most distant ancestor(s), and store them through the
# project helper compute_a(). tic()/toc() time the computation.
tic()
reply_depth_a <- descendants_q %>%
  group_by(descendant_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  compute_a(
    name = "reply_depth_a",
    unique_indexes = list(
      c("descendant_post_id", "ancestor_post_id"),
      c("ancestor_post_id", "descendant_post_id")
    ),
    temporary = FALSE,
    overwrite = TRUE
  )
toc()

# Distribution of quote-chain lengths: per ancestor post keep only the rows
# with its greatest length, then count rows per length.
# NOTE(review): count() counts rows, so an ancestor with several equally long
# chains contributes more than once -- confirm this is acceptable.
reply_depth_a %>%
  group_by(ancestor_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  count(length) %>%
  ggplot(mapping = aes(x = length, y = n)) +
  geom_point(size = 0.5) +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Quote chain (~discussion) length distribution")

# Collect every chain of three posts (child quotes parent, parent quotes
# grandparent) together with the poster of each post, and store the result
# through the project helper compute_a().
# Quotes of post id 0 and self-quotes are skipped. The self-quote test is
# applied to both quote edges: without the check on q1, a post that quotes
# itself would be reported as its own parent.
grandparent_q <- tbl(con, sql("
WITH three_posts AS (
  SELECT DISTINCT q1.quoting_post_id AS child_post_id, p1.poster_id AS child_user_id, q2.quoting_post_id AS parent_post_id, p2.poster_id AS parent_user_id, q2.quoted_post_id AS grandparent_post_id, p3.poster_id AS grandparent_user_id
  FROM incel_quotes_a q1
  INNER JOIN incel_quotes_a q2 ON (q1.quoted_post_id=q2.quoting_post_id)
  INNER JOIN incel_posts_a p1 ON (q1.quoting_post_id=p1.post_id)
  INNER JOIN incel_posts_a p2 ON (q1.quoted_post_id=p2.post_id)
  INNER JOIN incel_posts_a p3 ON (q2.quoted_post_id=p3.post_id)
  WHERE
   q1.quoting_post_id!=q1.quoted_post_id AND
   q2.quoted_post_id!=0 AND
   q2.quoting_post_id!=q2.quoted_post_id
)
SELECT * FROM three_posts"))
incel_quote_triplets_a <- grandparent_q %>%
  compute_a("incel_quote_triplets_a", temporary = FALSE, overwrite = TRUE)
# Keep only the triplets whose child and grandparent posts were written by the
# same user, and store them as ifqtriplets_a.
# NOTE(review): parent_user_id is not compared here, so chains in which one
# user quotes themselves throughout are kept as well -- confirm intended.
ifqtriplets_a <- incel_quote_triplets_a %>%
  filter(child_user_id == grandparent_user_id) %>%
  compute_a(
    "ifqtriplets_a",
    temporary = FALSE,
    overwrite = TRUE,
    unique_indexes = list(
      c("child_post_id", "grandparent_post_id", "parent_post_id")
    )
  )
# Recursive query over ifqtriplets_a: starting from each child post and its
# parent, keep following child -> parent links through further triplets,
# recording the number of steps taken as length.
paired_tree_q <- tbl(con, sql("
WITH RECURSIVE ancestors AS ( 
  SELECT child_post_id, child_user_id, parent_post_id AS ancestor_post_id, parent_user_id AS ancestor_user_id, 1 AS length
  FROM ifqtriplets_a
  UNION 
  SELECT a.child_post_id, a.child_user_id, t.parent_post_id AS ancestor_post_id, t.parent_user_id AS ancestor_user_id, length + 1 AS length
  FROM ancestors a, ifqtriplets_a t
  WHERE a.ancestor_post_id = t.child_post_id
)
SELECT * FROM ancestors
"))
# For every child post keep the row(s) with the greatest length and store
# them; tic()/toc() time the computation.
tic()
paired_depth_a <- paired_tree_q %>%
  group_by(child_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  # Stored under its own table name. Writing it as "reply_depth_a" with
  # overwrite = TRUE would replace the table built from descendants_q, which
  # has different columns (descendant_post_id instead of child_post_id).
  compute_a(
    name = "paired_depth_a",
    unique_indexes = list(
      c("child_post_id", "ancestor_post_id"),
      c("ancestor_post_id", "child_post_id")
    ),
    temporary = FALSE,
    overwrite = TRUE
  )
toc()
# Length distribution of the chains in paired_depth_a: per ancestor post keep
# only the rows with its greatest length, then count rows per length.
paired_depth_a %>%
  group_by(ancestor_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  count(length) %>%
  ggplot(mapping = aes(x = length, y = n)) +
  geom_point(size = 0.5) +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(
    title = "Distribution of the length of discussion between two participants"
  )

Sample threads by their lengths

# Sample up to five chains per length bucket (bucket = length rounded down to
# a multiple of 10) and list a link to the child post of each, grouped by
# bucket. The link id is post_id_str without its first five characters.
# NOTE(review): no seed is set in the visible code, so slice_sample() will
# presumably pick different rows on every run.
paired_depth_a %>%
  group_by(ancestor_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  mutate(bucket = floor(length / 10) * 10) %>%
  group_by(bucket) %>%
  slice_sample(n = 5) %>%
  ungroup() %>%
  select(bucket, post_id = child_post_id, length) %>%
  inner_join(incel_posts_a, by = join_by(post_id)) %>%
  mutate(
    url = str_c("https://incels.is/goto/post?id=", str_sub(post_id_str, 6))
  ) %>%
  select(bucket, length, url) %>%
  arrange(bucket, length) %>%
  gt(groupname_col = "bucket", rowname_col = "length") %>%
  fmt_url(url)
url
0
1 https://incels.is/goto/post?id=3369722
1 https://incels.is/goto/post?id=10417884
1 https://incels.is/goto/post?id=7698942
2 https://incels.is/goto/post?id=3980004
3 https://incels.is/goto/post?id=3491876
10
10 https://incels.is/goto/post?id=4207410
10 https://incels.is/goto/post?id=9106503
10 https://incels.is/goto/post?id=7922193
12 https://incels.is/goto/post?id=8998392
15 https://incels.is/goto/post?id=8234518
20
20 https://incels.is/goto/post?id=9439584
21 https://incels.is/goto/post?id=4025155
22 https://incels.is/goto/post?id=2945100
26 https://incels.is/goto/post?id=5481855
29 https://incels.is/goto/post?id=5987153
30
32 https://incels.is/goto/post?id=7638989
32 https://incels.is/goto/post?id=2991510
33 https://incels.is/goto/post?id=10716178
35 https://incels.is/goto/post?id=4228065
39 https://incels.is/goto/post?id=8141248
40
41 https://incels.is/goto/post?id=4482117
42 https://incels.is/goto/post?id=2589974
42 https://incels.is/goto/post?id=4646038
46 https://incels.is/goto/post?id=4568851
46 https://incels.is/goto/post?id=8094813
50
50 https://incels.is/goto/post?id=4297704
52 https://incels.is/goto/post?id=4314325
56 https://incels.is/goto/post?id=4492521
58 https://incels.is/goto/post?id=5204098
59 https://incels.is/goto/post?id=3741803
60
60 https://incels.is/goto/post?id=3835178
62 https://incels.is/goto/post?id=3835539
64 https://incels.is/goto/post?id=3836220
70
76 https://incels.is/goto/post?id=4409632
77 https://incels.is/goto/post?id=8057762
77 https://incels.is/goto/post?id=4761723
80
82 https://incels.is/goto/post?id=8171674
90
94 https://incels.is/goto/post?id=8057817
130
132 https://incels.is/goto/post?id=6344676
260
260 https://incels.is/goto/post?id=7145998
---
title: "Discussion structure analysis"
date: "`r Sys.Date()`"
output: 
  html_notebook:
    toc: yes
    code_folding: hide
---

```{r setup}
library(ggbeeswarm)
library(gt)
library(tictoc)
source(here::here("src/common_basis.R"))
```

```{r}
# Evenly spaced grids on [0, 1]: `centiles` in steps of 0.1 and `percentiles`
# in steps of 0.01. Neither is referenced in the visible part of this notebook.
centiles <- seq(from = 0, to = 1, by = 0.1)
percentiles <- seq(from = 0, to = 1, by = 0.01)
```

# Fetal personhood

```{r}
# Frequency of each distinct reply count on log-log axes: x is the reply
# count, y is the number of rows of abortion_tweets_c having that count.
# NOTE(review): reply_count == 0 has no finite position on a log10 x axis, so
# that row is presumably left out of the plot -- confirm intended.
abortion_tweets_c %>%
  count(reply_count) %>%
  ggplot(mapping = aes(x = reply_count, y = n)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Reply count distribution")
```

```{r}
# Frequency of each distinct retweet count on log-log axes: x is the retweet
# count, y is the number of rows of abortion_tweets_c having that count.
# NOTE(review): retweet_count == 0 has no finite position on a log10 x axis,
# so that row is presumably left out of the plot -- confirm intended.
abortion_tweets_c %>%
  count(retweet_count) %>%
  ggplot(mapping = aes(x = retweet_count, y = n)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Retweet count distribution")
```

```{r}
# Frequency of each distinct like count on log-log axes: x is the like count,
# y is the number of rows of abortion_tweets_c having that count.
# NOTE(review): like_count == 0 has no finite position on a log10 x axis, so
# that row is presumably left out of the plot -- confirm intended.
abortion_tweets_c %>%
  count(like_count) %>%
  ggplot(mapping = aes(x = like_count, y = n)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Like count distribution")
```

# Incels

```{r}
# Thread-length distribution per thread label: within each label, the share of
# rows that have a given value of `posts`, drawn as one line per label on
# log-log axes.
incel_threads_c %>%
  count(thread_label, posts) %>%
  group_by(thread_label) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  ggplot(mapping = aes(x = posts, y = prop, colour = thread_label)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Thread length distribution by label")
```

```{r}
# How often posts get quoted: first count the quotes each post received (`n`),
# then count how many posts share each value of `n` (`nn`).
# Quotes whose quoted_post_id is 0 and self-quotes are excluded.
incel_quotes_c %>%
  filter(quoted_post_id != 0, quoted_post_id != quoting_post_id) %>%
  count(quoted_post_id) %>%
  # Name the second-level count explicitly; otherwise count() has to pick
  # `nn` on its own and reports that with a message on every run.
  count(n, name = "nn") %>%
  ggplot(aes(x = n, y = nn)) +
  geom_point() +
  theme_hsci_discrete() +
  scale_y_log10() +
  ggtitle("Distribution of the number of times a message has been quoted")
```

```{r, eval=FALSE}
# Build the quote graph with one row per distinct (quoting post, quoted post)
# edge, annotated with the poster of each end. Edges whose quoted_post_id is 0
# and self-quotes are removed.
# compute_a() is a project helper defined outside this file; its arguments
# suggest it stores the result as a non-temporary, indexed table and replaces
# an existing one -- confirm against its definition.
incel_quote_tree_a <- incel_quotes_a %>%
  # Join keys are spelled out so the joins cannot silently pick up additional
  # shared columns and do not emit a "Joining with `by = ...`" message.
  inner_join(
    incel_posts_a %>%
      select(quoting_post_id = post_id, quoting_post_poster_id = poster_id),
    by = join_by(quoting_post_id)
  ) %>%
  inner_join(
    incel_posts_a %>%
      select(quoted_post_id = post_id, quoted_post_poster_id = poster_id),
    by = join_by(quoted_post_id)
  ) %>%
  distinct(
    quoting_post_id, quoted_post_id,
    quoting_post_poster_id, quoted_post_poster_id
  ) %>%
  filter(quoted_post_id != 0, quoted_post_id != quoting_post_id) %>%
  compute_a(
    name = "incel_quote_tree_a",
    unique_indexes = list(
      c("quoting_post_id", "quoted_post_id"),
      c("quoted_post_id", "quoting_post_id")
    ),
    temporary = FALSE,
    overwrite = TRUE
  )
```

```{r}
# Query over the incel_quote_tree_a table that expands quote edges into chains.
# Each result row is (descendant_post_id, ancestor_post_id, length), where
# length is the number of quote edges on one path from the descendant to the
# ancestor. The recursion starts from single edges and extends the ancestor
# end by one further quote per step.
# NOTE(review): UNION de-duplicates on the whole row including length, so it
# would not stop the recursion if the quote graph contained a cycle -- confirm
# the data is acyclic.
# NOTE(review): ancestors_q is not referenced in the visible part of this file.
ancestors_q <- tbl(con,sql('
WITH RECURSIVE ancestors AS ( 
  SELECT quoting_post_id AS descendant_post_id, quoted_post_id AS ancestor_post_id, 1 AS length
  FROM incel_quote_tree_a
  UNION 
  SELECT a.descendant_post_id, qt.quoted_post_id AS ancestor_post_id, length+1 AS length
  FROM incel_quote_tree_a qt, ancestors a
  WHERE qt.quoting_post_id = a.ancestor_post_id
) 
SELECT * FROM ancestors'))
# Same kind of (descendant_post_id, ancestor_post_id, length) rows, grown from
# the other end: each step attaches a post that quotes the current descendant
# while the ancestor stays fixed.
descendants_q <- tbl(con,sql('
WITH RECURSIVE descendants AS ( 
  SELECT quoting_post_id AS descendant_post_id, quoted_post_id AS ancestor_post_id, 1 AS length
  FROM incel_quote_tree_a
  UNION 
  SELECT qt.quoting_post_id AS descendant_post_id, ancestor_post_id, length+1 AS length
  FROM incel_quote_tree_a qt, descendants d
  WHERE qt.quoted_post_id = d.descendant_post_id
) 
SELECT * FROM descendants'))
```

```{r, eval=FALSE}
# For every descendant post keep the row(s) of descendants_q with the greatest
# chain length, i.e. its most distant ancestor(s), and store them through the
# project helper compute_a(). tic()/toc() time the computation.
tic()
reply_depth_a <- descendants_q %>%
  group_by(descendant_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  compute_a(
    name = "reply_depth_a",
    unique_indexes = list(
      c("descendant_post_id", "ancestor_post_id"),
      c("ancestor_post_id", "descendant_post_id")
    ),
    temporary = FALSE,
    overwrite = TRUE
  )
toc()
```

```{r}
# Distribution of quote-chain lengths: per ancestor post keep only the rows
# with its greatest length, then count rows per length.
# NOTE(review): count() counts rows, so an ancestor with several equally long
# chains contributes more than once -- confirm this is acceptable.
reply_depth_a %>%
  group_by(ancestor_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  count(length) %>%
  ggplot(mapping = aes(x = length, y = n)) +
  geom_point(size = 0.5) +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(title = "Quote chain (~discussion) length distribution")
```

```{r, eval=FALSE}
# Collect every chain of three posts (child quotes parent, parent quotes
# grandparent) together with the poster of each post, and store the result
# through the project helper compute_a().
# Quotes of post id 0 and self-quotes are skipped. The self-quote test is
# applied to both quote edges: without the check on q1, a post that quotes
# itself would be reported as its own parent.
grandparent_q <- tbl(con, sql("
WITH three_posts AS (
  SELECT DISTINCT q1.quoting_post_id AS child_post_id, p1.poster_id AS child_user_id, q2.quoting_post_id AS parent_post_id, p2.poster_id AS parent_user_id, q2.quoted_post_id AS grandparent_post_id, p3.poster_id AS grandparent_user_id
  FROM incel_quotes_a q1
  INNER JOIN incel_quotes_a q2 ON (q1.quoted_post_id=q2.quoting_post_id)
  INNER JOIN incel_posts_a p1 ON (q1.quoting_post_id=p1.post_id)
  INNER JOIN incel_posts_a p2 ON (q1.quoted_post_id=p2.post_id)
  INNER JOIN incel_posts_a p3 ON (q2.quoted_post_id=p3.post_id)
  WHERE
   q1.quoting_post_id!=q1.quoted_post_id AND
   q2.quoted_post_id!=0 AND
   q2.quoting_post_id!=q2.quoted_post_id
)
SELECT * FROM three_posts"))
incel_quote_triplets_a <- grandparent_q %>%
  compute_a("incel_quote_triplets_a", temporary = FALSE, overwrite = TRUE)
```

```{r, eval=FALSE}
# Keep only the triplets whose child and grandparent posts were written by the
# same user, and store them as ifqtriplets_a.
# NOTE(review): parent_user_id is not compared here, so chains in which one
# user quotes themselves throughout are kept as well -- confirm intended.
ifqtriplets_a <- incel_quote_triplets_a %>%
  filter(child_user_id == grandparent_user_id) %>%
  compute_a(
    "ifqtriplets_a",
    temporary = FALSE,
    overwrite = TRUE,
    unique_indexes = list(
      c("child_post_id", "grandparent_post_id", "parent_post_id")
    )
  )
```


```{r, eval=FALSE}
# Recursive query over ifqtriplets_a: starting from each child post and its
# parent, keep following child -> parent links through further triplets,
# recording the number of steps taken as length.
paired_tree_q <- tbl(con, sql("
WITH RECURSIVE ancestors AS ( 
  SELECT child_post_id, child_user_id, parent_post_id AS ancestor_post_id, parent_user_id AS ancestor_user_id, 1 AS length
  FROM ifqtriplets_a
  UNION 
  SELECT a.child_post_id, a.child_user_id, t.parent_post_id AS ancestor_post_id, t.parent_user_id AS ancestor_user_id, length + 1 AS length
  FROM ancestors a, ifqtriplets_a t
  WHERE a.ancestor_post_id = t.child_post_id
)
SELECT * FROM ancestors
"))
# For every child post keep the row(s) with the greatest length and store
# them; tic()/toc() time the computation.
tic()
paired_depth_a <- paired_tree_q %>%
  group_by(child_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  # Stored under its own table name. Writing it as "reply_depth_a" with
  # overwrite = TRUE would replace the table built from descendants_q, which
  # has different columns (descendant_post_id instead of child_post_id).
  compute_a(
    name = "paired_depth_a",
    unique_indexes = list(
      c("child_post_id", "ancestor_post_id"),
      c("ancestor_post_id", "child_post_id")
    ),
    temporary = FALSE,
    overwrite = TRUE
  )
toc()
```

```{r}
# Length distribution of the chains in paired_depth_a: per ancestor post keep
# only the rows with its greatest length, then count rows per length.
paired_depth_a %>%
  group_by(ancestor_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  count(length) %>%
  ggplot(mapping = aes(x = length, y = n)) +
  geom_point(size = 0.5) +
  scale_y_log10() +
  theme_hsci_discrete() +
  labs(
    title = "Distribution of the length of discussion between two participants"
  )
```

## Sample threads by their lengths

```{r}
# Sample up to five chains per length bucket (bucket = length rounded down to
# a multiple of 10) and list a link to the child post of each, grouped by
# bucket. The link id is post_id_str without its first five characters.
# NOTE(review): no seed is set in the visible code, so slice_sample() will
# presumably pick different rows on every run.
paired_depth_a %>%
  group_by(ancestor_post_id) %>%
  filter(length == max(length)) %>%
  ungroup() %>%
  mutate(bucket = floor(length / 10) * 10) %>%
  group_by(bucket) %>%
  slice_sample(n = 5) %>%
  ungroup() %>%
  select(bucket, post_id = child_post_id, length) %>%
  inner_join(incel_posts_a, by = join_by(post_id)) %>%
  mutate(
    url = str_c("https://incels.is/goto/post?id=", str_sub(post_id_str, 6))
  ) %>%
  select(bucket, length, url) %>%
  arrange(bucket, length) %>%
  gt(groupname_col = "bucket", rowname_col = "length") %>%
  fmt_url(url)
```

