Causal Impact
Number of total users:
neda_liwc %>%
select(id) %>%
distinct() %>%
nrow()
## [1] 1746
|
Word Category
|
Relative Eff.(%)
|
P Value
|
|
Female
|
17.418
|
0.001
|
|
Anx
|
7.566
|
0.002
|
|
Family
|
6.452
|
0.005
|
|
Money
|
5.991
|
0.002
|
|
Relig
|
5.209
|
0.045
|
|
Achiev
|
3.813
|
0.011
|
|
They
|
3.397
|
0.034
|
|
Negate
|
2.889
|
0.003
|
|
Health
|
2.526
|
0.004
|
|
Power
|
2.458
|
0.010
|
|
Negemo
|
2.066
|
0.037
|
|
Informal
|
1.116
|
0.038
|
|
Ipron
|
-1.476
|
0.015
|
|
See
|
-2.036
|
0.025
|
|
You
|
-2.238
|
0.034
|
|
Differ
|
-2.694
|
0.006
|
|
Posemo
|
-3.277
|
0.001
|
|
Tentat
|
-3.347
|
0.001
|
|
Shehe
|
-7.042
|
0.020
|
|
Affiliation
|
-7.167
|
0.003
|
|
Friend
|
-16.459
|
0.041
|
CAUSAL IMPACT BY GENDER
Number of users by gender (1746 total):
neda_liwc_gender %>%
select(id, gender) %>%
distinct() %>%
count(gender)
## # A tibble: 3 x 2
## gender n
## <chr> <int>
## 1 f 762
## 2 m 313
## 3 u 671
Number of users in the baseline (2991 total):
baseline_liwc_gender %>%
select(id, gender) %>%
distinct() %>%
count(gender)
## # A tibble: 3 x 2
## gender n
## <chr> <int>
## 1 f 855
## 2 m 748
## 3 u 1388
baseline_liwc_gender %>%
select(-text, -created_at, -id_tweet, -name) %>%
group_by(gender, id, days_tweet) %>%
summarise_all(mean) %>%
ungroup() %>%
select(-id) %>%
group_by(gender, days_tweet) %>%
summarise_all(mean) %>%
ungroup() %>%
pivot_longer(cols = fun:filler, names_to = "categ", values_to = "values_baseline") -> ci_baseline
neda_liwc_gender %>%
select(-id_tweet, -name, -name_proc) %>%
group_by(gender, id, days_tweet) %>%
summarise_all(mean) %>%
ungroup() %>%
select(-id) %>%
group_by(gender, days_tweet) %>%
summarise_all(mean) %>%
ungroup() %>%
pivot_longer(fun:filler, names_to = "categ",
values_to = "values_neda") -> ci_neda_liwc
pre_period <- c(1, 16)
post_period <- c(17, 31)
ci_baseline %>%
full_join(ci_neda_liwc) -> d_second
d_second %>%
# count(gender, days_tweet) %>%
# view()
# view()
# select(categ, values_neda, values_baseline) %>%
# nest(data = - categ) %>%
select(gender, categ, values_neda, values_baseline) %>%
# filter(gender == "m") %>% view()
group_by(categ, gender) %>%
nest() %>%
ungroup() %>%
select(gender, categ, data) %>%
mutate(mod = map(data, ~CausalImpact::CausalImpact(.,
pre_period,
post_period))) -> ci
ci %>%
mutate(summary_mod = map(mod, "summary")) %>%
filter(!map_lgl(summary_mod, is.null)) -> ci_resul
ci_resul %>%
mutate(p = map(summary_mod, "p")) %>%
mutate(p = map_dbl(p, 1)) %>%
filter(categ %in% categories_in_gender) %>%
mutate(relative_effect = map(summary_mod, "RelEffect")) %>%
mutate(relative_effect = map_dbl(relative_effect, 2))-> sig_cat
# sig_cat %>%
# filter(categ %in% sel_cat) -> sig_cat
sig_cat %>%
arrange(gender, desc(relative_effect)) %>%
select(gender, categ,
second_relative_effect = relative_effect, p_second = p) -> second_ap
second_ap %>%
pivot_wider(id_cols = categ,
values_from = c(second_relative_effect, p_second),
names_from = gender) %>%
mutate(categ = str_to_title(categ)) %>%
mutate_at(vars(starts_with("second")), ~.*100) %>%
mutate_if(is.numeric, ~round(., digits = 3)) %>%
mutate_at(vars(starts_with("second")), function(x){
cell_spec(x, "html", color = spec_color(x), bold = T)
}) %>%
kable("html", escape = F,
align = "lrr",
col.names = c("Word Category", "FEMALE Relative Eff.(%)",
"MALE Relative Eff.(%)",
"UNKNOWN Relative Eff.(%)",
"FEMALE P Value",
"MALE P Value",
"UNKNOWN P Value")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
|
Word Category
|
FEMALE Relative Eff.(%)
|
MALE Relative Eff.(%)
|
UNKNOWN Relative Eff.(%)
|
FEMALE P Value
|
MALE P Value
|
UNKNOWN P Value
|
|
Female
|
13.112
|
24.294
|
23.987
|
0.001
|
0.001
|
0.001
|
|
Anx
|
13.005
|
-2.704
|
10.903
|
0.001
|
0.283
|
0.001
|
|
Family
|
6.994
|
4.204
|
15.167
|
0.003
|
0.092
|
0.001
|
|
They
|
5.827
|
1.555
|
0.672
|
0.010
|
0.330
|
0.415
|
|
Achiev
|
5.153
|
-2.931
|
4.033
|
0.018
|
0.078
|
0.030
|
|
Money
|
4.866
|
14.027
|
4.088
|
0.002
|
0.002
|
0.104
|
|
Negate
|
4.449
|
-6.645
|
6.511
|
0.001
|
0.025
|
0.008
|
|
Health
|
4.246
|
9.044
|
-2.452
|
0.010
|
0.001
|
0.132
|
|
Negemo
|
3.94
|
-3.356
|
0.572
|
0.001
|
0.055
|
0.388
|
|
See
|
2.013
|
0.151
|
-7.928
|
0.146
|
0.485
|
0.001
|
|
Power
|
1.014
|
1.295
|
4.504
|
0.253
|
0.213
|
0.001
|
|
Relig
|
0.745
|
8.26
|
12.547
|
0.426
|
0.079
|
0.012
|
|
Informal
|
-0.048
|
3.212
|
0.702
|
0.473
|
0.006
|
0.192
|
|
Ipron
|
-0.233
|
-3.002
|
-1.231
|
0.401
|
0.026
|
0.111
|
|
Differ
|
-0.539
|
-5.58
|
-0.83
|
0.340
|
0.006
|
0.224
|
|
Tentat
|
-0.93
|
-3.098
|
-5.979
|
0.233
|
0.050
|
0.001
|
|
You
|
-3.067
|
1.011
|
-2.898
|
0.014
|
0.249
|
0.046
|
|
Posemo
|
-3.119
|
-7.538
|
-1.434
|
0.001
|
0.001
|
0.099
|
|
Affiliation
|
-5.861
|
-8.878
|
-6.936
|
0.003
|
0.001
|
0.011
|
|
Shehe
|
-7.875
|
-1.764
|
-4.549
|
0.038
|
0.384
|
0.096
|
|
Friend
|
-11.06
|
-19.113
|
-19.873
|
0.135
|
0.002
|
0.054
|
Words inside categ:
The tables below show, for each category, the four words whose usage increased the most and the four words whose usage decreased the most (tied words are all shown, so a table can have more than eight rows). The first column is the word itself. changes_total is the number of times the word was used after the first NEDA-related tweet minus the number of times it was used before, counted over all users. changes_f, changes_m and changes_u are the same difference restricted to female, male and unknown-gender users, respectively.
library(tidyverse)
read_delim(here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
delim = "\t", skip = 1, col_names = c("number", "name"),
n_max = 73) -> categories_name
read_tsv(here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
skip = 75, col_names = paste0(c("x"), c("_"), 1:11),
guess_max = 6000, col_types = "ccccccccccc") %>%
mutate(x_1 = str_remove(x_1, "\\*")) -> words_dic
categories_name %>%
filter(name %in% c("female", "family", "anx", "shehe", "affiliation", "friend")) %>%
pull(number) -> number_top_categ
words_dic %>%
filter_at(vars(-x_1), any_vars(. %in% number_top_categ)) %>%
pull(x_1) -> words_cat
# neda_timelapse ----------------------------------------------------------
library(lubridate)
neda_hist <- read_rds(here::here("data", "NEDA_historical.rds")) %>%
distinct() %>%
filter(created_at_tweet >= ymd("2018-03-01")) %>%
mutate(text = str_to_lower(text),
neda_related = str_detect(text,
pattern = "#nedawareness|#comeasyouare|@nedastaff"))
gender_output <- read_tsv(here::here("data", "gender_extractor",
"neda_liwc_gender_output.tsv"),
col_names = c("id", "name", "name_proc", "gender"))
first_tweet <- neda_hist %>%
select(created_at_tweet, text, id, id_tweet, neda_related) %>%
filter(neda_related) %>%
arrange(created_at_tweet) %>%
group_by(id) %>%
slice(1) %>%
ungroup() %>%
select(cero_date = created_at_tweet, id) %>%
filter(cero_date >= ymd("2019-01-01"))
neda_change <- neda_hist %>%
select(id_tweet, text, created_at_tweet, id) %>%
inner_join(first_tweet) %>%
mutate(days_tweet = interval(start = cero_date, end = created_at_tweet),
days_tweet = round(time_length(days_tweet, unit = "days"))) %>%
select(-cero_date) %>%
filter(days_tweet >= -15, days_tweet <= 15)
neda_timelapse <- neda_change %>%
count(id, before_after = sign(days_tweet), sort = T) %>%
mutate(before_after = case_when(before_after == -1 ~ "before",
before_after == 1 ~ "after",
TRUE ~ "cero")) %>%
pivot_wider(values_from = n, names_from = before_after) %>%
filter(before >= 15 & after >= 15) %>%
select(-cero) %>%
semi_join(x = neda_change, y = .)
neda_timelapse %>%
inner_join(gender_output) -> neda_timelapse
library(tidytext)
replace_reg1 <- "https://t.co/[A-Za-z]\\d]+|"
replace_reg2 <- "https://t.co/[A-Za-z]\\d]+|&|<|>|RT|https"
replace_reg <- paste0(replace_reg1, replace_reg2)
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
tidy_tweets <- neda_timelapse %>%
mutate(text = str_to_lower(text)) %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
unnest_tokens(word, text, token = "regex", pattern = unnest_reg)
tidy_tweets %>%
filter(word %in% words_cat) %>%
mutate(days_tweet = case_when(days_tweet < 0 ~ "before",
days_tweet > 0 ~ "after",
TRUE ~ "cero")) %>%
filter(days_tweet != "cero") -> tidy_words
neda_liwc <- read_rds(here::here("data", "neda_liwc.rds")) %>%
select(-created_at_tweet) %>%
filter(abs(days_tweet) <= 15)
FEMALE CATEGORY
# female ------------------------------------------------------------------
# female -> 43
words_dic %>%
filter_at(vars(-x_1), any_vars(. == 43)) %>%
pull(x_1) -> words_female
tidy_words %>%
filter(word %in% words_female) %>%
count(days_tweet, word) %>%
mutate(gender = "t") %>%
bind_rows(tidy_words %>%
filter(word %in% words_female) %>%
count(gender, days_tweet, word)) %>%
pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(changes_total = after_t - before_t,
changes_f = after_f - before_f,
changes_m = after_m - before_m,
changes_u = after_u - before_u) %>%
select(word, starts_with("changes")) %>%
arrange(desc(changes_total)) -> f
f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)
f_top %>%
bind_rows(f_bottom) %>%
gt::gt()
| word |
changes_total |
changes_f |
changes_m |
changes_u |
| women |
2117 |
903 |
403 |
811 |
| her |
320 |
251 |
-18 |
87 |
| she |
228 |
296 |
-30 |
-38 |
| woman |
207 |
90 |
37 |
80 |
| gf |
-30 |
-20 |
-8 |
-2 |
| wife |
-78 |
-37 |
-43 |
2 |
| queen |
-96 |
1 |
-32 |
-65 |
| lady |
-158 |
-115 |
-22 |
-21 |
FAMILY
words_dic %>%
filter_at(vars(-x_1), any_vars(. == 41)) %>%
pull(x_1) -> words_family
tidy_words %>%
filter(word %in% words_family) %>%
count(days_tweet, word) %>%
mutate(gender = "t") %>%
bind_rows(tidy_words %>%
filter(word %in% words_family) %>%
count(gender, days_tweet, word)) %>%
pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(changes_total = after_t - before_t,
changes_f = after_f - before_f,
changes_m = after_m - before_m,
changes_u = after_u - before_u) %>%
select(word, starts_with("changes")) %>%
arrange(desc(changes_total)) -> f
f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)
f_top %>%
bind_rows(f_bottom) %>%
gt::gt()
| word |
changes_total |
changes_f |
changes_m |
changes_u |
| ma |
132 |
52 |
49 |
31 |
| family |
122 |
97 |
11 |
14 |
| daughter |
115 |
36 |
26 |
53 |
| families |
88 |
47 |
21 |
20 |
| mama |
-19 |
-24 |
-2 |
7 |
| pa |
-19 |
-10 |
7 |
-16 |
| daddy |
-23 |
-10 |
-6 |
-7 |
| bro |
-63 |
7 |
30 |
-100 |
| wife |
-78 |
-37 |
-43 |
2 |
ANXIETY
# anxiety -----------------------------------------------------------------
# anxiety -> 33
words_dic %>%
filter_at(vars(-x_1), any_vars(. == 33)) %>%
pull(x_1) -> words_anxiety
tidy_words %>%
filter(word %in% words_anxiety) %>%
count(days_tweet, word) %>%
mutate(gender = "t") %>%
bind_rows(tidy_words %>%
filter(word %in% words_anxiety) %>%
count(gender, days_tweet, word)) %>%
pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(changes_total = after_t - before_t,
changes_f = after_f - before_f,
changes_m = after_m - before_m,
changes_u = after_u - before_u) %>%
select(word, starts_with("changes")) %>%
arrange(desc(changes_total)) -> f
f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)
f_top %>%
bind_rows(f_bottom) %>%
gt::gt()
| word |
changes_total |
changes_f |
changes_m |
changes_u |
| risk |
190 |
16 |
117 |
57 |
| stress |
78 |
40 |
12 |
26 |
| upset |
73 |
32 |
13 |
28 |
| worried |
70 |
33 |
14 |
23 |
| confuse |
-10 |
-6 |
1 |
-5 |
| horrible |
-11 |
-5 |
3 |
-9 |
| doubt |
-21 |
-11 |
-1 |
-9 |
| scared |
-31 |
16 |
-13 |
-34 |
---
title: "Causal Impact" 
clean: true
output:
  bookdown::html_document2:
    number_sections: false
    code_download: true
    code_folding: hide
    self_contained: true
    toc: true
    toc_float: false
---

```{r setup, include=FALSE}
# Chunk defaults: show the code, keep warnings and messages out of the report.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)

# Attach the packages used throughout the document.
xfun::pkg_attach(c("tidyverse", "lubridate", "kableExtra"))

# Default ggplot2 theme for every plot in the report.
theme_set(theme_linedraw())
```

## Causal Impact 

```{r echo=FALSE}
# LIWC scores for the baseline users.
baseline_liwc <- here::here("data", "baseline_liwc.rds") %>%
  read_rds()

# LIWC scores for the NEDA users, restricted to the 15 days on either side of
# day 0; the tweet text and timestamp are not needed for the pooled analysis.
neda_liwc <- here::here("data", "neda_liwc.rds") %>%
  read_rds() %>%
  filter(abs(days_tweet) <= 15) %>%
  select(-text, -created_at_tweet)

# LIWC categories that are reported in the results tables.
sel_cat <- c(
  "i", "we", "you", "shehe", "they", "ipron", "negate", "compare",
  "posemo", "negemo", "anx", "anger", "sad", "social", "family", "friend",
  "female", "male", "insight", "cause", "discrep", "tentat", "certain",
  "differ", "see", "hear", "feel", "body", "health", "sexual", "ingest",
  "affiliation", "achiev", "power", "reward", "risk", "focuspast",
  "focuspresent", "focusfuture", "relativ", "work", "leisure", "home",
  "money", "relig", "death", "informal", "swear", "assent", "nonflu",
  "filler"
)
```

Number of total users:
```{r}
# One row per distinct user id, then count the rows.
neda_liwc %>%
  distinct(id) %>%
  nrow()
```

```{r echo=FALSE}
# Daily baseline series: average every LIWC column per user and day first,
# then average those per-user means across users for each day.
ci_baseline <- baseline_liwc %>%
  select(-text, -created_at, -id_tweet) %>%
  group_by(days_tweet, id) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  select(-id) %>%
  group_by(days_tweet) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  pivot_longer(
    cols = fun:filler,
    names_to = "categ",
    values_to = "values_baseline"
  )

# Same two-step daily average for the NEDA users.
ci_neda_liwc <- neda_liwc %>%
  select(-id_tweet) %>%
  group_by(days_tweet, id) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  select(-id) %>%
  group_by(days_tweet) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  pivot_longer(-days_tweet, names_to = "categ", values_to = "values_neda")

# Pre/post periods are ROW indices into each per-category series. With one
# row per day from -15 to 15 (assumed; the data are filtered to that range),
# rows 1-16 are days -15..0 and rows 17-31 are days 1..15.
pre_period <- c(1, 16)
post_period <- c(17, 31)

# One row per (day, category) holding both the NEDA and the baseline value.
d_second <- ci_baseline %>%
  inner_join(ci_neda_liwc, by = c("days_tweet", "categ"))

# Fit one CausalImpact model per category: response = NEDA series (first
# column), covariate = baseline series. The rows are sorted by day before
# days_tweet is dropped, because the periods above rely on row order.
ci <- d_second %>%
  arrange(days_tweet) %>%
  select(categ, values_neda, values_baseline) %>%
  nest(data = -categ) %>%
  mutate(
    mod = map(data, ~ CausalImpact::CausalImpact(., pre_period, post_period))
  )

# Keep only the models that produced a summary table.
ci_resul <- ci %>%
  mutate(summary_mod = map(mod, "summary")) %>%
  filter(!map_lgl(summary_mod, is.null))

# Significant categories: first entry of the summary's `p` column and second
# entry of its `RelEffect` column, limited to the categories in `sel_cat`.
sig_cat <- ci_resul %>%
  mutate(p = map(summary_mod, "p")) %>%
  mutate(p = map_dbl(p, 1)) %>%
  filter(p < 0.05) %>%
  mutate(relative_effect = map(summary_mod, "RelEffect")) %>%
  mutate(relative_effect = map_dbl(relative_effect, 2)) %>%
  filter(categ %in% sel_cat)

# Categories that the per-gender analysis further down reports on.
categories_in_gender <- sig_cat %>% pull(categ)

second_ap <- sig_cat %>%
  arrange(desc(relative_effect)) %>%
  select(categ, second_relative_effect = relative_effect, p_second = p)

# Results table: relative effect as a percentage, coloured by magnitude.
second_ap %>%
  mutate(categ = str_to_title(categ)) %>%
  mutate_at(vars(second_relative_effect), ~ . * 100) %>%
  mutate_if(is.numeric, ~ round(., digits = 3)) %>%
  mutate_at(vars(second_relative_effect), function(x) {
    cell_spec(x, "html", color = spec_color(x), bold = TRUE)
  }) %>%
  kable(
    "html",
    escape = FALSE,
    align = "lrr",
    col.names = c("Word Category", "Relative Eff.(%)", "P Value")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )
```

## CAUSAL IMPACT BY GENDER

```{r echo=FALSE}
# Baseline LIWC scores with the inferred gender of each user.
baseline_liwc_gender <- here::here("data", "baseline_liwc_gender.rds") %>%
  read_rds()

# NEDA LIWC scores with gender, restricted to the 15 days on either side of
# day 0. No columns are dropped at load time.
neda_liwc_gender <- here::here("data", "neda_liwc_gender.rds") %>%
  read_rds() %>%
  filter(abs(days_tweet) <= 15)

# LIWC categories that are reported in the results tables.
sel_cat <- c(
  "i", "we", "you", "shehe", "they", "ipron", "negate", "compare",
  "posemo", "negemo", "anx", "anger", "sad", "social", "family", "friend",
  "female", "male", "insight", "cause", "discrep", "tentat", "certain",
  "differ", "see", "hear", "feel", "body", "health", "sexual", "ingest",
  "affiliation", "achiev", "power", "reward", "risk", "focuspast",
  "focuspresent", "focusfuture", "relativ", "work", "leisure", "home",
  "money", "relig", "death", "informal", "swear", "assent", "nonflu",
  "filler"
)
```

Number of users by gender (1746 total):
```{r}
# One row per (user, gender) pair, then the number of users in each gender.
neda_liwc_gender %>%
  distinct(id, gender) %>%
  count(gender)
```

Number of users in the baseline (2991 total):
```{r}
# One row per (user, gender) pair, then the number of users in each gender.
baseline_liwc_gender %>%
  distinct(id, gender) %>%
  count(gender)
```

```{r }
# Daily baseline series per gender: average every LIWC column per user and
# day first, then average those per-user means within each gender and day.
ci_baseline <- baseline_liwc_gender %>%
  select(-text, -created_at, -id_tweet, -name) %>%
  group_by(gender, id, days_tweet) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  select(-id) %>%
  group_by(gender, days_tweet) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  pivot_longer(
    cols = fun:filler,
    names_to = "categ",
    values_to = "values_baseline"
  )

# Same two-step daily average for the NEDA users.
ci_neda_liwc <- neda_liwc_gender %>%
  select(-id_tweet, -name, -name_proc) %>%
  group_by(gender, id, days_tweet) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  select(-id) %>%
  group_by(gender, days_tweet) %>%
  summarise_all(mean) %>%
  ungroup() %>%
  pivot_longer(
    cols = fun:filler,
    names_to = "categ",
    values_to = "values_neda"
  )

# Pre/post periods are ROW indices into each per-(category, gender) series.
# With one row per day from -15 to 15 (assumed; the NEDA data are filtered to
# that range), rows 1-16 are days -15..0 and rows 17-31 are days 1..15.
pre_period <- c(1, 16)
post_period <- c(17, 31)

# Joined on every column the two tables share.
d_second <- ci_baseline %>%
  full_join(ci_neda_liwc)

# Fit one CausalImpact model per (category, gender): response = NEDA series
# (first column), covariate = baseline series. A full join appends the rows
# that exist only in the NEDA table after all baseline rows, so the data are
# sorted by day before days_tweet is dropped; the periods above rely on the
# rows of every series being in time order.
ci <- d_second %>%
  arrange(days_tweet) %>%
  select(gender, categ, values_neda, values_baseline) %>%
  group_by(categ, gender) %>%
  nest() %>%
  ungroup() %>%
  select(gender, categ, data) %>%
  mutate(
    mod = map(data, ~ CausalImpact::CausalImpact(., pre_period, post_period))
  )

# Keep only the models that produced a summary table.
ci_resul <- ci %>%
  mutate(summary_mod = map(mod, "summary")) %>%
  filter(!map_lgl(summary_mod, is.null))

# First entry of the summary's `p` column and second entry of its `RelEffect`
# column. Rows are limited to the categories in `categories_in_gender`
# (defined earlier in the document); no p-value filter is applied here.
sig_cat <- ci_resul %>%
  mutate(p = map(summary_mod, "p")) %>%
  mutate(p = map_dbl(p, 1)) %>%
  filter(categ %in% categories_in_gender) %>%
  mutate(relative_effect = map(summary_mod, "RelEffect")) %>%
  mutate(relative_effect = map_dbl(relative_effect, 2))

# Sorting by gender first fixes the order of the widened columns below
# (f, m, u), which the hard-coded column headers depend on.
second_ap <- sig_cat %>%
  arrange(gender, desc(relative_effect)) %>%
  select(
    gender, categ,
    second_relative_effect = relative_effect, p_second = p
  )

# Results table: one row per category, relative effects as percentages
# (coloured by magnitude) followed by the p-values, one column per gender.
second_ap %>%
  pivot_wider(
    id_cols = categ,
    values_from = c(second_relative_effect, p_second),
    names_from = gender
  ) %>%
  mutate(categ = str_to_title(categ)) %>%
  mutate_at(vars(starts_with("second")), ~ . * 100) %>%
  mutate_if(is.numeric, ~ round(., digits = 3)) %>%
  mutate_at(vars(starts_with("second")), function(x) {
    cell_spec(x, "html", color = spec_color(x), bold = TRUE)
  }) %>%
  kable(
    "html",
    escape = FALSE,
    # One alignment letter per column: label left, six numeric columns right.
    align = "lrrrrrr",
    col.names = c(
      "Word Category",
      "FEMALE Relative Eff.(%)",
      "MALE Relative Eff.(%)",
      "UNKNOWN Relative Eff.(%)",
      "FEMALE P Value",
      "MALE P Value",
      "UNKNOWN P Value"
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )
```

## Words inside categ:

The tables below show, for each category, the four words whose usage increased the most and the four words whose usage decreased the most (tied words are all shown, so a table can have more than eight rows). The first column is the word itself. `changes_total` is the number of times the word was used after the first NEDA-related tweet minus the number of times it was used before, counted over all users. `changes_f`, `changes_m` and `changes_u` are the same difference restricted to female, male and unknown-gender users, respectively.

```{r}
library(tidyverse)

# LIWC 2015 dictionary, header part: category number / category name pairs.
categories_name <- read_delim(
  here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
  delim = "\t", skip = 1, col_names = c("number", "name"),
  n_max = 73
)

# LIWC 2015 dictionary, word part: x_1 is the dictionary entry and x_2..x_11
# hold the category numbers it belongs to (all read as character). The
# wildcard "*" is stripped, so stems are matched as literal tokens.
words_dic <- read_tsv(
  here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
  skip = 75, col_names = paste0("x_", 1:11),
  guess_max = 6000, col_types = "ccccccccccc"
) %>%
  mutate(x_1 = str_remove(x_1, "\\*"))

# Numbers of the categories that are broken down word by word below.
number_top_categ <- categories_name %>%
  filter(
    name %in% c("female", "family", "anx", "shehe", "affiliation", "friend")
  ) %>%
  pull(number)

# Every dictionary entry that belongs to at least one of those categories.
words_cat <- words_dic %>%
  filter_at(vars(-x_1), any_vars(. %in% number_top_categ)) %>%
  pull(x_1)


# neda_timelapse ----------------------------------------------------------

library(lubridate)

# Tweet history since 2018-03-01, lower-cased, with a flag for tweets that
# mention one of the NEDA hashtags or the NEDA account.
neda_hist <- read_rds(here::here("data", "NEDA_historical.rds")) %>%
  distinct() %>%
  filter(created_at_tweet >= ymd("2018-03-01")) %>%
  mutate(
    text = str_to_lower(text),
    neda_related = str_detect(
      text,
      pattern = "#nedawareness|#comeasyouare|@nedastaff"
    )
  )

gender_output <- read_tsv(
  here::here("data", "gender_extractor", "neda_liwc_gender_output.tsv"),
  col_names = c("id", "name", "name_proc", "gender")
)

# Day 0 of each user: the timestamp of their earliest NEDA-related tweet,
# kept only if it falls on or after 2019-01-01.
first_tweet <- neda_hist %>%
  select(created_at_tweet, text, id, id_tweet, neda_related) %>%
  filter(neda_related) %>%
  arrange(created_at_tweet) %>%
  group_by(id) %>%
  slice(1) %>%
  ungroup() %>%
  select(cero_date = created_at_tweet, id) %>%
  filter(cero_date >= ymd("2019-01-01"))

# Tweets within 15 days of each user's day 0, with the rounded day offset.
neda_change <- neda_hist %>%
  select(id_tweet, text, created_at_tweet, id) %>%
  inner_join(first_tweet, by = "id") %>%
  mutate(
    days_tweet = interval(start = cero_date, end = created_at_tweet),
    days_tweet = round(time_length(days_tweet, unit = "days"))
  ) %>%
  select(-cero_date) %>%
  filter(days_tweet >= -15, days_tweet <= 15)

# Users with at least 15 tweets before and at least 15 tweets after day 0.
active_users <- neda_change %>%
  count(id, before_after = sign(days_tweet), sort = TRUE) %>%
  mutate(before_after = case_when(
    before_after == -1 ~ "before",
    before_after == 1 ~ "after",
    TRUE ~ "cero"
  )) %>%
  pivot_wider(values_from = n, names_from = before_after) %>%
  filter(before >= 15 & after >= 15) %>%
  select(-cero)

# Tweets of those users, with the inferred gender attached.
neda_timelapse <- neda_change %>%
  semi_join(active_users, by = "id") %>%
  inner_join(gender_output, by = "id")

library(tidytext)

# Fragments removed before tokenising: t.co links, HTML entities, the
# retweet marker and leftover "https". Two fixes over the previous pattern:
# the character class was written "[A-Za-z]\\d]+", which only matches a
# letter, a digit and literal "]" characters, so links were never removed;
# and "RT" could not match because the text is lower-cased first, so it is
# now matched as the whole word "rt".
replace_reg1 <- "https://t.co/[A-Za-z\\d]+|"
replace_reg2 <- "https://t.co/[A-Za-z\\d]+|&amp;|&lt;|&gt;|\\brt\\b|https"
replace_reg <- paste0(replace_reg1, replace_reg2)
# Token separator: anything that is not a letter, digit, "_", "#", "@" or an
# apostrophe followed by one of those characters.
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

# One row per token.
tidy_tweets <- neda_timelapse %>%
  mutate(text = str_to_lower(text)) %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg)

# Tokens that are dictionary entries of the selected categories, labelled as
# written before or after day 0; tokens from day 0 itself are dropped.
tidy_words <- tidy_tweets %>%
  filter(word %in% words_cat) %>%
  mutate(days_tweet = case_when(
    days_tweet < 0 ~ "before",
    days_tweet > 0 ~ "after",
    TRUE ~ "cero"
  )) %>%
  filter(days_tweet != "cero")

# Reload the NEDA LIWC scores, this time keeping the tweet text, which a
# later chunk joins back onto the per-tweet category scores.
neda_liwc <- read_rds(here::here("data", "neda_liwc.rds")) %>%
  select(-created_at_tweet) %>%
  filter(abs(days_tweet) <= 15)

```

### FEMALE CATEGORY

```{r}
# female ------------------------------------------------------------------
# Dictionary entries tagged with LIWC category 43 (female).
words_female <- words_dic %>%
  filter_at(vars(-x_1), any_vars(. == 43)) %>%
  pull(x_1)

# Occurrences of those entries in the before/after tokens.
female_hits <- tidy_words %>%
  filter(word %in% words_female)

# Counts per word and period: pooled over all users (gender "t") stacked on
# top of the counts split by gender.
female_counts <- bind_rows(
  female_hits %>%
    count(days_tweet, word) %>%
    mutate(gender = "t"),
  female_hits %>%
    count(gender, days_tweet, word)
)

# One row per word with the after-minus-before change for each group;
# period/gender combinations with no occurrences count as 0.
f <- female_counts %>%
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>%
  mutate_if(is.numeric, ~ replace_na(., 0)) %>%
  mutate(
    changes_total = after_t - before_t,
    changes_f = after_f - before_f,
    changes_m = after_m - before_m,
    changes_u = after_u - before_u
  ) %>%
  select(word, starts_with("changes")) %>%
  arrange(desc(changes_total))

# Largest increases and largest decreases (ties are kept).
f_top <- top_n(f, 4, wt = changes_total)
f_bottom <- top_n(f, -4, wt = changes_total)

bind_rows(f_top, f_bottom) %>%
  gt::gt()
```

### FAMILY

```{r}
# family ------------------------------------------------------------------
# Dictionary entries tagged with LIWC category 41.
words_family <- words_dic %>%
  filter_at(vars(-x_1), any_vars(. == 41)) %>%
  pull(x_1)

# Occurrences of those entries in the before/after tokens.
family_hits <- tidy_words %>%
  filter(word %in% words_family)

# Counts per word and period: pooled over all users (gender "t") stacked on
# top of the counts split by gender.
family_counts <- bind_rows(
  family_hits %>%
    count(days_tweet, word) %>%
    mutate(gender = "t"),
  family_hits %>%
    count(gender, days_tweet, word)
)

# One row per word with the after-minus-before change for each group;
# period/gender combinations with no occurrences count as 0.
f <- family_counts %>%
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>%
  mutate_if(is.numeric, ~ replace_na(., 0)) %>%
  mutate(
    changes_total = after_t - before_t,
    changes_f = after_f - before_f,
    changes_m = after_m - before_m,
    changes_u = after_u - before_u
  ) %>%
  select(word, starts_with("changes")) %>%
  arrange(desc(changes_total))

# Largest increases and largest decreases (ties are kept).
f_top <- top_n(f, 4, wt = changes_total)
f_bottom <- top_n(f, -4, wt = changes_total)

bind_rows(f_top, f_bottom) %>%
  gt::gt()
```

### ANXIETY

```{r}
# anxiety -----------------------------------------------------------------
# Dictionary entries tagged with LIWC category 33 (anxiety).
words_anxiety <- words_dic %>%
  filter_at(vars(-x_1), any_vars(. == 33)) %>%
  pull(x_1)

# Occurrences of those entries in the before/after tokens.
anxiety_hits <- tidy_words %>%
  filter(word %in% words_anxiety)

# Counts per word and period: pooled over all users (gender "t") stacked on
# top of the counts split by gender.
anxiety_counts <- bind_rows(
  anxiety_hits %>%
    count(days_tweet, word) %>%
    mutate(gender = "t"),
  anxiety_hits %>%
    count(gender, days_tweet, word)
)

# One row per word with the after-minus-before change for each group;
# period/gender combinations with no occurrences count as 0.
f <- anxiety_counts %>%
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>%
  mutate_if(is.numeric, ~ replace_na(., 0)) %>%
  mutate(
    changes_total = after_t - before_t,
    changes_f = after_f - before_f,
    changes_m = after_m - before_m,
    changes_u = after_u - before_u
  ) %>%
  select(word, starts_with("changes")) %>%
  arrange(desc(changes_total))

# Largest increases and largest decreases (ties are kept).
f_top <- top_n(f, 4, wt = changes_total)
f_bottom <- top_n(f, -4, wt = changes_total)

bind_rows(f_top, f_bottom) %>%
  gt::gt()
```

## COMMON WORDS IN TWEETS BY WORDS CATEGORY:

```{r}
# Extra tokens to discard on top of the tidytext stop-word list.
extra_stop_words <- c("de", "4", "da", "la", "en", "le", "los", "3")
# Stop words as they appear once apostrophes are stripped (e.g. "dont").
stop_words_no_apostrophe <- str_remove_all(stop_words$word, "'")

# Tweets from day 0 onwards that score above 0 in family, female or anx,
# one row per (tweet, category), with the tweet text attached.
category_tweets <- neda_liwc_gender %>%
  filter(days_tweet >= 0) %>%
  select(id_tweet, id, gender, family, female, anx) %>%
  pivot_longer(
    cols = family:anx,
    names_to = "category",
    values_to = "value"
  ) %>%
  filter(value > 0) %>%
  left_join(
    neda_liwc %>%
      select(id, id_tweet, text)
  )

# Tokens of those tweets without stop words. Any run of non-space characters
# ending in an ellipsis character is removed before tokenising.
category_tokens <- category_tweets %>%
  mutate(text = str_remove_all(text, pattern = "[:graph:]+(…)")) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(
    !word %in% stop_words$word,
    !word %in% extra_stop_words,
    word != "rt",
    !word %in% stop_words_no_apostrophe
  )

# Blank out the words that define each category (plus a few variants), so
# the table shows what accompanies them rather than the words themselves.
category_tokens %>%
  mutate(word = case_when(
    category == "female" & word %in% c(words_female, "womens") ~ NA_character_,
    category == "family" &
      word %in% c(words_family, "parents", "brothers") ~ NA_character_,
    category == "anx" & word %in% words_anxiety ~ NA_character_,
    TRUE ~ word
  )) %>%
  filter(
    !is.na(word),
    word != "amp"
  ) %>%
  # Ten most frequent remaining words per gender and category (ties kept).
  count(category, gender, word, sort = TRUE) %>%
  group_by(gender, category) %>%
  top_n(10, wt = n) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = c(category, word),
    names_from = gender,
    values_from = n
  ) %>%
  mutate(category = str_to_title(category)) %>%
  gt::gt(
    groupname_col = "category",
    rowname_col = "word"
  )
  
```

