Causal Impact

CAUSAL IMPACT BY GENDER

baseline_liwc_gender %>% 
  select(-text, -created_at, -id_tweet, -name) %>% 
  group_by(gender, id, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  select(-id) %>% 
  group_by(gender, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  pivot_longer(cols = fun:filler, names_to = "categ", values_to = "values_baseline") -> ci_baseline

neda_liwc_gender %>% 
  select(-id_tweet, -name, -name_proc) %>% 
  group_by(gender, id, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  select(-id) %>% 
  group_by(gender, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  pivot_longer(fun:filler, names_to = "categ", 
               values_to = "values_neda") -> ci_neda_liwc 

pre_period <- c(1, 16)
post_period <- c(17, 31)

ci_baseline %>% 
  full_join(ci_neda_liwc) -> d_second

d_second %>% 
  # count(gender, days_tweet) %>% 
  # view()
  # view()
  # select(categ, values_neda, values_baseline) %>% 
  # nest(data = - categ) %>% 
  select(gender, categ, values_neda, values_baseline) %>%
  # filter(gender == "m") %>% view()
  group_by(categ, gender) %>% 
  nest() %>% 
  ungroup() %>% 
  select(gender, categ, data) %>% 
  mutate(mod = map(data, ~CausalImpact::CausalImpact(., 
                                                     pre_period,
                                                     post_period))) -> ci
  
ci %>% 
  mutate(summary_mod = map(mod, "summary")) %>% 
  filter(!map_lgl(summary_mod, is.null)) -> ci_resul 

ci_resul %>% 
  mutate(p = map(summary_mod, "p")) %>% 
  mutate(p = map_dbl(p, 1)) %>% 
  filter(categ %in% c("female",
                      "family",
                      "anx",
                      "relig",
                      "money",
                      "they",
                      "achiev",
                      "negate",
                      "health",
                      "power",
                      "negemo",
                      "informal",
                      "ipron",
                      "you",
                      "discrep",
                      "differ",
                      "tentat",
                      "posemo",
                      "shehe",
                      "affiliation")) %>% 
  mutate(relative_effect = map(summary_mod, "RelEffect")) %>% 
  mutate(relative_effect = map_dbl(relative_effect, 2))-> sig_cat

# sig_cat %>% 
#   filter(categ %in% sel_cat) -> sig_cat

sig_cat %>% 
  arrange(gender, desc(relative_effect)) %>% 
  select(gender, categ, 
         second_relative_effect = relative_effect, p_second = p) -> second_ap

second_ap %>% 
  pivot_wider(id_cols = categ, 
              values_from = c(second_relative_effect, p_second),
              names_from = gender) %>% 
  mutate(categ = str_to_title(categ)) %>% 
  mutate_at(vars(starts_with("second")), ~.*100) %>% 
  mutate_if(is.numeric, ~round(., digits = 3)) %>% 
  mutate_at(vars(starts_with("second")), function(x){
    cell_spec(x, "html", color = spec_color(x), bold = T)
  }) %>% 
  kable("html", escape = F,
        align = "lrr",
        col.names = c("Word Category", "FEMALE Relative Eff.(%)",
                      "MALE Relative Eff.(%)",
                      "UNKNOWN Relative Eff.(%)",
                      "FEMALE P Value",
                      "MALE P Value",
                      "UNKNOWN P Value")) %>%  
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Word Category FEMALE Relative Eff.(%) MALE Relative Eff.(%) UNKNOWN Relative Eff.(%) FEMALE P Value MALE P Value UNKNOWN P Value
Anx 13.196 1.808 10.928 0.001 0.313 0.005
Female 12.601 15.836 19.852 0.001 0.001 0.001
Family 7.904 1.151 14.691 0.012 0.350 0.001
Achiev 6.533 -1.29 0.71 0.004 0.289 0.384
Relig 5.685 -1.697 17.331 0.030 0.328 0.001
They 5.086 -1.85 3.6 0.005 0.291 0.057
Negemo 4.489 -4.545 -0.366 0.001 0.008 0.438
Money 4.037 16.973 5.994 0.035 0.001 0.021
Power 3.875 2.584 4.018 0.001 0.044 0.003
Negate 3.107 -2.063 3.02 0.015 0.158 0.104
Health 2.93 5.301 -4.463 0.070 0.060 0.009
Ipron 0.376 -1.107 -1.675 0.358 0.229 0.049
Informal 0.239 2.044 1.418 0.365 0.036 0.068
Tentat -0.532 -1.518 -6.818 0.380 0.176 0.001
Differ -1.323 -2.077 -0.054 0.137 0.070 0.485
Posemo -2.319 -5.974 -1.489 0.005 0.001 0.092
You -2.321 0.197 -3.912 0.021 0.451 0.002
Discrep -3.547 -5.811 0.608 0.006 0.023 0.334
Affiliation -5.274 -7.818 -7.205 0.004 0.001 0.011
Shehe -7.708 -3.775 -5.058 0.023 0.221 0.050

Words inside categ:

The tables below contain the changes for the 4 most changed words (positive and negative) within each of the categories. The first column is the specific word. Changes_total is the difference between the “before” and “after” frequency of all users. Changes_f same as Changes_total but for female users, Changes_m for male users and Changes_u for unknown users.

library(tidyverse)

read_delim(here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
           delim = "\t", skip = 1, col_names = c("number", "name"), 
           n_max = 73) -> categories_name 

read_tsv(here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
         skip = 75, col_names = paste0(c("x"), c("_"), 1:11),
         guess_max = 6000, col_types = "ccccccccccc") %>% 
  mutate(x_1 = str_remove(x_1, "\\*")) -> words_dic

categories_name %>% 
  filter(name %in% c("female", "family", "anx", "shehe", "affiliation", "friend")) %>% 
  pull(number) -> number_top_categ

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. %in% number_top_categ)) %>%  
  pull(x_1) -> words_cat


# neda_timelapse ----------------------------------------------------------

library(lubridate)

neda_hist <- read_rds(here::here("data", "NEDA_historical.rds")) %>% 
  distinct() %>% 
  filter(created_at_tweet >= ymd("2018-03-01")) %>% 
  mutate(text = str_to_lower(text),
         neda_related = str_detect(text, 
                                   pattern = "#nedawareness|#comeasyouare|@nedastaff")) 

gender_output <- read_tsv(here::here("data", "gender_extractor", 
                                     "neda_liwc_gender_output.tsv"),
                          col_names = c("id", "name", "name_proc", "gender"))

first_tweet <- neda_hist %>% 
  select(created_at_tweet, text, id, id_tweet, neda_related) %>% 
  filter(neda_related) %>% 
  arrange(created_at_tweet) %>% 
  group_by(id) %>% 
  slice(1) %>% 
  ungroup() %>% 
  select(cero_date = created_at_tweet, id) %>% 
  filter(cero_date >= ymd("2019-01-01"))

neda_change <- neda_hist %>% 
  select(id_tweet, text, created_at_tweet, id) %>% 
  inner_join(first_tweet) %>% 
  mutate(days_tweet = interval(start = cero_date, end = created_at_tweet),
         days_tweet = round(time_length(days_tweet, unit = "days"))) %>% 
  select(-cero_date) %>% 
  filter(days_tweet >= -15, days_tweet <= 15)

neda_timelapse <- neda_change %>%
  count(id, before_after = sign(days_tweet), sort = T) %>% 
  mutate(before_after = case_when(before_after == -1 ~ "before",
                                  before_after == 1 ~ "after",
                                  TRUE ~ "cero")) %>% 
  pivot_wider(values_from = n, names_from = before_after) %>% 
  filter(before >= 15 & after >= 15) %>% 
  select(-cero) %>% 
  semi_join(x = neda_change, y = .)

neda_timelapse %>% 
  inner_join(gender_output) -> neda_timelapse

library(tidytext)

replace_reg1 <- "https://t.co/[A-Za-z]\\d]+|"
replace_reg2 <- "https://t.co/[A-Za-z]\\d]+|&amp;|&lt;|&gt;|RT|https"
replace_reg <- paste0(replace_reg1, replace_reg2)
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

tidy_tweets <- neda_timelapse %>%
  mutate(text = str_to_lower(text)) %>% 
  mutate(text = str_replace_all(text, replace_reg, "")) %>% 
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg)

tidy_tweets %>% 
  filter(word %in% words_cat) %>% 
  mutate(days_tweet = case_when(days_tweet < 0 ~ "before",
                                days_tweet > 0 ~ "after",
                                TRUE ~ "cero")) %>% 
  filter(days_tweet != "cero") -> tidy_words

neda_liwc <- read_rds(here::here("data", "neda_liwc.rds")) %>% 
  select(-created_at_tweet) %>% 
  filter(abs(days_tweet) <= 15)

FEMALE CATEGORY

# female ------------------------------------------------------------------
# female -> 43

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. == 43)) %>% 
  pull(x_1) -> words_female

tidy_words %>% 
  filter(word %in% words_female) %>% 
  count(days_tweet, word) %>% 
  mutate(gender = "t") %>% 
  bind_rows(tidy_words %>% 
              filter(word %in% words_female) %>% 
              count(gender, days_tweet, word)) %>% 
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  mutate(changes_total = after_t - before_t,
         changes_f = after_f - before_f,
         changes_m = after_m - before_m,
         changes_u = after_u - before_u) %>% 
  select(word, starts_with("changes")) %>% 
  arrange(desc(changes_total)) -> f 

f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)

f_top %>% 
  bind_rows(f_bottom) %>% 
  gt::gt()
word changes_total changes_f changes_m changes_u
women 2117 896 404 817
her 320 265 -1 56
she 228 307 -27 -52
woman 207 82 34 91
gf -30 -19 -7 -4
wife -78 -37 -33 -8
queen -96 3 -32 -67
lady -158 -112 -21 -25

FAMILY

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. == 41)) %>% 
  pull(x_1) -> words_family

tidy_words %>% 
  filter(word %in% words_family) %>% 
  count(days_tweet, word) %>% 
  mutate(gender = "t") %>% 
  bind_rows(tidy_words %>% 
              filter(word %in% words_family) %>% 
              count(gender, days_tweet, word)) %>% 
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  mutate(changes_total = after_t - before_t,
         changes_f = after_f - before_f,
         changes_m = after_m - before_m,
         changes_u = after_u - before_u) %>% 
  select(word, starts_with("changes")) %>% 
  arrange(desc(changes_total))  -> f 

f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)

f_top %>% 
  bind_rows(f_bottom) %>% 
  gt::gt()
word changes_total changes_f changes_m changes_u
ma 132 52 54 26
family 122 98 14 10
daughter 115 39 24 52
families 88 49 22 17
mama -19 -23 0 4
pa -19 -14 7 -12
daddy -23 -9 -6 -8
bro -63 7 31 -101
wife -78 -37 -33 -8

ANXIETY

# anxiety -----------------------------------------------------------------
# anxiety -> 33

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. == 33)) %>% 
  pull(x_1) -> words_anxiety

tidy_words %>% 
  filter(word %in% words_anxiety) %>% 
  count(days_tweet, word) %>% 
  mutate(gender = "t") %>% 
  bind_rows(tidy_words %>% 
              filter(word %in% words_anxiety) %>% 
              count(gender, days_tweet, word)) %>% 
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  mutate(changes_total = after_t - before_t,
         changes_f = after_f - before_f,
         changes_m = after_m - before_m,
         changes_u = after_u - before_u) %>% 
  select(word, starts_with("changes")) %>% 
  arrange(desc(changes_total))  -> f 

f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)

f_top %>% 
  bind_rows(f_bottom) %>% 
  gt::gt()
word changes_total changes_f changes_m changes_u
risk 190 15 115 60
stress 78 38 13 27
upset 73 30 13 30
worried 70 31 15 24
confuse -10 -6 1 -5
horrible -11 -5 1 -7
doubt -21 -11 -1 -9
scared -31 16 -12 -35

COMMON WORDS IN TWEETS BY WORDS CATEGORY:

neda_liwc_gender %>% 
  filter(days_tweet >= 0) %>% 
  select(id_tweet, id, gender, family, female, anx) %>% 
  pivot_longer(cols = family:anx) %>% 
  filter(value > 0) %>% 
  left_join(neda_liwc %>% 
              select(id, id_tweet, text)) %>%  
  rename("category" = name) %>% 
  mutate(text = str_remove_all(text, pattern = "[:graph:]+(…)")) %>% 
  unnest_tokens(word, text, token = "tweets") %>% 
  filter(!word %in% stop_words$word,
         word != "rt",
         !word %in% str_remove_all(stop_words$word, "'")) %>% 
  mutate(word = case_when(category == "female" & word %in% c(words_female,
                                                             "womens") ~ NA_character_,
                          category == "family" & word %in% c(words_family,
                                                             "parents",
                                                             "brothers")~ NA_character_,
                          category == "anx" & word %in% words_anxiety ~ NA_character_,
                          TRUE ~ word)) %>% 
  filter(!is.na(word),
         word != "amp") %>% 
  count(category, gender, word, sort = T) %>%
  group_by(gender, category) %>% 
  top_n(10, wt = n) %>% 
  ungroup() %>% 
  pivot_wider(id_cols = c(category, word), names_from = gender, values_from = n) %>% 
  mutate(category = str_to_title(category)) %>% 
  gt::gt(groupname_col = "category",
         rowname_col = "word")   
f u m
Female
day 491 432 222
love 390 274 130
happy 314 326 138
time 305 214 114
international 261 229 122
people 261 209 NA
black 234 204 97
notre NA 227 NA
de NA 201 113
life 182 NA 91
#internationalwomensday 158 179 NA
white 173 NA NA
amazing NA NA 103
world NA NA 93
Anx
love 136 362 151
people 340 287 98
struggling 225 204 NA
feel 163 117 NA
@swampmusicinfo NA 157 70
time 155 NA NA
struggle 148 NA NA
@laurarjacobs NA 148 NA
mental 147 108 NA
laura NA 145 NA
jacobs NA 135 NA
depression 118 NA NA
life 118 NA NA
ft NA 117 NA
eating 112 NA NA
3 NA NA 94
sixty NA NA 87
$safe NA NA 85
cse NA NA 85
solutions NA NA 85
news NA NA 83
@financialbuzz NA NA 80
Family
love 248 161 76
day 209 143 88
people 194 169 52
kids 191 151 NA
time 190 NA 72
de NA 176 171
ur NA 173 NA
apologize NA 170 NA
4 NA 164 NA
children 149 163 NA
da NA 157 NA
friends 135 NA 60
women 135 NA NA
child 127 NA NA
pregnant 118 NA NA
la NA NA 98
life NA NA 79
en NA NA 56
fathersrightshq NA NA 52
---
title: "Causal Impact" 
clean: true
output:
  bookdown::html_document2:
    number_sections: false
    code_download: true
    code_folding: hide
    self_contained: true
    toc: true
    toc_float: false
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

xfun::pkg_attach("tidyverse", "lubridate", "kableExtra")

theme_set(theme_linedraw())
```

## Causal Impact 

```{r eval=FALSE, include=FALSE}
baseline_liwc <- read_rds(here::here("data", "baseline_liwc.rds"))
neda_liwc <- read_rds(here::here("data", "neda_liwc.rds")) %>% 
  select(-text, -created_at_tweet) %>% 
  filter(abs(days_tweet) <= 15) %>% 
  

sel_cat <- c(
  "i","we","you","shehe","they","ipron","negate","compare","posemo","negemo","anx","anger","sad","social","family","friend","female","male","insight","cause","discrep","tentat","certain","differ","see","hear","feel","body","health","sexual","ingest","affiliation","achiev","power","reward","risk","focuspast","focuspresent","focusfuture","relativ","work","leisure","home","money","relig","death","informal","swear","assent","nonflu","filler"
)
```

```{r eval=FALSE, include=FALSE}
baseline_liwc %>% 
  select(-text, -created_at, -id_tweet) %>% 
  group_by(days_tweet, id) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  select(-id) %>% 
  group_by(days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  pivot_longer(cols = fun:filler, names_to = "categ", values_to = "values_baseline") -> ci_baseline

neda_liwc %>% 
  select(-id_tweet) %>% 
  group_by(days_tweet, id) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  select(-id) %>% 
  group_by(days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  pivot_longer(-days_tweet, names_to = "categ", values_to = "values_neda") -> ci_neda_liwc 
  
pre_period <- c(1, 16)
post_period <- c(17, 31)

ci_baseline %>% 
  inner_join(ci_neda_liwc) -> d_second

d_second %>% 
  select(categ, values_neda, values_baseline) %>% 
  nest(data = - categ) %>% 
  mutate(mod = map(data, ~CausalImpact::CausalImpact(., pre_period, post_period))) -> ci
  
ci %>% 
  mutate(summary_mod = map(mod, "summary")) %>% 
  filter(!map_lgl(summary_mod, is.null)) -> ci_resul 

ci_resul %>% 
  mutate(p = map(summary_mod, "p")) %>% 
  mutate(p = map_dbl(p, 1)) %>% 
  filter(p < 0.05) %>% 
  mutate(relative_effect = map(summary_mod, "RelEffect")) %>% 
  mutate(relative_effect = map_dbl(relative_effect, 2))-> sig_cat

sig_cat %>% 
  filter(categ %in% sel_cat) -> sig_cat

sig_cat %>% 
  arrange(desc(relative_effect)) %>% 
  select(categ, second_relative_effect = relative_effect, p_second = p) -> second_ap

second_ap %>% 
  mutate(categ = str_to_title(categ)) %>% 
  mutate_at(vars(second_relative_effect), ~.*100) %>% 
  mutate_if(is.numeric, ~round(., digits = 3)) %>% 
  mutate_at(vars(second_relative_effect), function(x){
    cell_spec(x, "html", color = spec_color(x), bold = T)
  }) %>% 
  kable("html", escape = F,
        align = "lrr",
        col.names = c("Word Category", "Relative Eff.(%)",
                      "P Value")) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
```

## CAUSAL IMPACT BY GENDER

```{r echo=FALSE}
baseline_liwc_gender <- read_rds(here::here("data",
                                            "baseline_liwc_gender.rds"))

neda_liwc_gender <- read_rds(here::here("data", "neda_liwc_gender.rds")) %>% 
  # select(-text, -created_at_tweet) %>% 
  filter(abs(days_tweet) <= 15)

sel_cat <- c(
  "i","we","you","shehe","they","ipron","negate","compare","posemo","negemo","anx","anger","sad","social","family","friend","female","male","insight","cause","discrep","tentat","certain","differ","see","hear","feel","body","health","sexual","ingest","affiliation","achiev","power","reward","risk","focuspast","focuspresent","focusfuture","relativ","work","leisure","home","money","relig","death","informal","swear","assent","nonflu","filler"
)
```

```{r }
baseline_liwc_gender %>% 
  select(-text, -created_at, -id_tweet, -name) %>% 
  group_by(gender, id, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  select(-id) %>% 
  group_by(gender, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  pivot_longer(cols = fun:filler, names_to = "categ", values_to = "values_baseline") -> ci_baseline

neda_liwc_gender %>% 
  select(-id_tweet, -name, -name_proc) %>% 
  group_by(gender, id, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  select(-id) %>% 
  group_by(gender, days_tweet) %>% 
  summarise_all(mean) %>% 
  ungroup() %>% 
  pivot_longer(fun:filler, names_to = "categ", 
               values_to = "values_neda") -> ci_neda_liwc 

pre_period <- c(1, 16)
post_period <- c(17, 31)

ci_baseline %>% 
  full_join(ci_neda_liwc) -> d_second

d_second %>% 
  # count(gender, days_tweet) %>% 
  # view()
  # view()
  # select(categ, values_neda, values_baseline) %>% 
  # nest(data = - categ) %>% 
  select(gender, categ, values_neda, values_baseline) %>%
  # filter(gender == "m") %>% view()
  group_by(categ, gender) %>% 
  nest() %>% 
  ungroup() %>% 
  select(gender, categ, data) %>% 
  mutate(mod = map(data, ~CausalImpact::CausalImpact(., 
                                                     pre_period,
                                                     post_period))) -> ci
  
ci %>% 
  mutate(summary_mod = map(mod, "summary")) %>% 
  filter(!map_lgl(summary_mod, is.null)) -> ci_resul 

ci_resul %>% 
  mutate(p = map(summary_mod, "p")) %>% 
  mutate(p = map_dbl(p, 1)) %>% 
  filter(categ %in% c("female",
                      "family",
                      "anx",
                      "relig",
                      "money",
                      "they",
                      "achiev",
                      "negate",
                      "health",
                      "power",
                      "negemo",
                      "informal",
                      "ipron",
                      "you",
                      "discrep",
                      "differ",
                      "tentat",
                      "posemo",
                      "shehe",
                      "affiliation")) %>% 
  mutate(relative_effect = map(summary_mod, "RelEffect")) %>% 
  mutate(relative_effect = map_dbl(relative_effect, 2))-> sig_cat

# sig_cat %>% 
#   filter(categ %in% sel_cat) -> sig_cat

sig_cat %>% 
  arrange(gender, desc(relative_effect)) %>% 
  select(gender, categ, 
         second_relative_effect = relative_effect, p_second = p) -> second_ap

second_ap %>% 
  pivot_wider(id_cols = categ, 
              values_from = c(second_relative_effect, p_second),
              names_from = gender) %>% 
  mutate(categ = str_to_title(categ)) %>% 
  mutate_at(vars(starts_with("second")), ~.*100) %>% 
  mutate_if(is.numeric, ~round(., digits = 3)) %>% 
  mutate_at(vars(starts_with("second")), function(x){
    cell_spec(x, "html", color = spec_color(x), bold = T)
  }) %>% 
  kable("html", escape = F,
        align = "lrr",
        col.names = c("Word Category", "FEMALE Relative Eff.(%)",
                      "MALE Relative Eff.(%)",
                      "UNKNOWN Relative Eff.(%)",
                      "FEMALE P Value",
                      "MALE P Value",
                      "UNKNOWN P Value")) %>%  
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
```

## Words inside categ:

The tables below contain the changes for the 4 most changed words (positive and negative) within each of the categories. The first column is the specific word. Changes_total is the difference between the "before" and "after" frequency of all users. Changes_f same as Changes_total but for female users, Changes_m for male users and Changes_u for unknown users.

```{r}
library(tidyverse)

read_delim(here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
           delim = "\t", skip = 1, col_names = c("number", "name"), 
           n_max = 73) -> categories_name 

read_tsv(here::here("data", "liwc", "LIWC2015_English_Flat.dic"),
         skip = 75, col_names = paste0(c("x"), c("_"), 1:11),
         guess_max = 6000, col_types = "ccccccccccc") %>% 
  mutate(x_1 = str_remove(x_1, "\\*")) -> words_dic

categories_name %>% 
  filter(name %in% c("female", "family", "anx", "shehe", "affiliation", "friend")) %>% 
  pull(number) -> number_top_categ

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. %in% number_top_categ)) %>%  
  pull(x_1) -> words_cat


# neda_timelapse ----------------------------------------------------------

library(lubridate)

neda_hist <- read_rds(here::here("data", "NEDA_historical.rds")) %>% 
  distinct() %>% 
  filter(created_at_tweet >= ymd("2018-03-01")) %>% 
  mutate(text = str_to_lower(text),
         neda_related = str_detect(text, 
                                   pattern = "#nedawareness|#comeasyouare|@nedastaff")) 

gender_output <- read_tsv(here::here("data", "gender_extractor", 
                                     "neda_liwc_gender_output.tsv"),
                          col_names = c("id", "name", "name_proc", "gender"))

first_tweet <- neda_hist %>% 
  select(created_at_tweet, text, id, id_tweet, neda_related) %>% 
  filter(neda_related) %>% 
  arrange(created_at_tweet) %>% 
  group_by(id) %>% 
  slice(1) %>% 
  ungroup() %>% 
  select(cero_date = created_at_tweet, id) %>% 
  filter(cero_date >= ymd("2019-01-01"))

neda_change <- neda_hist %>% 
  select(id_tweet, text, created_at_tweet, id) %>% 
  inner_join(first_tweet) %>% 
  mutate(days_tweet = interval(start = cero_date, end = created_at_tweet),
         days_tweet = round(time_length(days_tweet, unit = "days"))) %>% 
  select(-cero_date) %>% 
  filter(days_tweet >= -15, days_tweet <= 15)

neda_timelapse <- neda_change %>%
  count(id, before_after = sign(days_tweet), sort = T) %>% 
  mutate(before_after = case_when(before_after == -1 ~ "before",
                                  before_after == 1 ~ "after",
                                  TRUE ~ "cero")) %>% 
  pivot_wider(values_from = n, names_from = before_after) %>% 
  filter(before >= 15 & after >= 15) %>% 
  select(-cero) %>% 
  semi_join(x = neda_change, y = .)

neda_timelapse %>% 
  inner_join(gender_output) -> neda_timelapse

library(tidytext)

replace_reg1 <- "https://t.co/[A-Za-z]\\d]+|"
replace_reg2 <- "https://t.co/[A-Za-z]\\d]+|&amp;|&lt;|&gt;|RT|https"
replace_reg <- paste0(replace_reg1, replace_reg2)
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

tidy_tweets <- neda_timelapse %>%
  mutate(text = str_to_lower(text)) %>% 
  mutate(text = str_replace_all(text, replace_reg, "")) %>% 
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg)

tidy_tweets %>% 
  filter(word %in% words_cat) %>% 
  mutate(days_tweet = case_when(days_tweet < 0 ~ "before",
                                days_tweet > 0 ~ "after",
                                TRUE ~ "cero")) %>% 
  filter(days_tweet != "cero") -> tidy_words

neda_liwc <- read_rds(here::here("data", "neda_liwc.rds")) %>% 
  select(-created_at_tweet) %>% 
  filter(abs(days_tweet) <= 15)

```

### FEMALE CATEGORY

```{r}
# female ------------------------------------------------------------------
# female -> 43

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. == 43)) %>% 
  pull(x_1) -> words_female

tidy_words %>% 
  filter(word %in% words_female) %>% 
  count(days_tweet, word) %>% 
  mutate(gender = "t") %>% 
  bind_rows(tidy_words %>% 
              filter(word %in% words_female) %>% 
              count(gender, days_tweet, word)) %>% 
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  mutate(changes_total = after_t - before_t,
         changes_f = after_f - before_f,
         changes_m = after_m - before_m,
         changes_u = after_u - before_u) %>% 
  select(word, starts_with("changes")) %>% 
  arrange(desc(changes_total)) -> f 

f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)

f_top %>% 
  bind_rows(f_bottom) %>% 
  gt::gt()
```

### FAMILY

```{r}
words_dic %>% 
  filter_at(vars(-x_1), any_vars(. == 41)) %>% 
  pull(x_1) -> words_family

tidy_words %>% 
  filter(word %in% words_family) %>% 
  count(days_tweet, word) %>% 
  mutate(gender = "t") %>% 
  bind_rows(tidy_words %>% 
              filter(word %in% words_family) %>% 
              count(gender, days_tweet, word)) %>% 
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  mutate(changes_total = after_t - before_t,
         changes_f = after_f - before_f,
         changes_m = after_m - before_m,
         changes_u = after_u - before_u) %>% 
  select(word, starts_with("changes")) %>% 
  arrange(desc(changes_total))  -> f 

f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)

f_top %>% 
  bind_rows(f_bottom) %>% 
  gt::gt()
```

### ANXIETY

```{r}
# anxiety -----------------------------------------------------------------
# anxiety -> 33

words_dic %>% 
  filter_at(vars(-x_1), any_vars(. == 33)) %>% 
  pull(x_1) -> words_anxiety

tidy_words %>% 
  filter(word %in% words_anxiety) %>% 
  count(days_tweet, word) %>% 
  mutate(gender = "t") %>% 
  bind_rows(tidy_words %>% 
              filter(word %in% words_anxiety) %>% 
              count(gender, days_tweet, word)) %>% 
  pivot_wider(names_from = c(days_tweet, gender), values_from = n) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) %>% 
  mutate(changes_total = after_t - before_t,
         changes_f = after_f - before_f,
         changes_m = after_m - before_m,
         changes_u = after_u - before_u) %>% 
  select(word, starts_with("changes")) %>% 
  arrange(desc(changes_total))  -> f 

f_top <- f %>% top_n(4, wt = changes_total)
f_bottom <- f %>% top_n(-4, wt = changes_total)

f_top %>% 
  bind_rows(f_bottom) %>% 
  gt::gt()
```

## COMMON WORDS IN TWEETS BY WORDS CATEGORY:

```{r}
neda_liwc_gender %>% 
  filter(days_tweet >= 0) %>% 
  select(id_tweet, id, gender, family, female, anx) %>% 
  pivot_longer(cols = family:anx) %>% 
  filter(value > 0) %>% 
  left_join(neda_liwc %>% 
              select(id, id_tweet, text)) %>%  
  rename("category" = name) %>% 
  mutate(text = str_remove_all(text, pattern = "[:graph:]+(…)")) %>% 
  unnest_tokens(word, text, token = "tweets") %>% 
  filter(!word %in% stop_words$word,
         word != "rt",
         !word %in% str_remove_all(stop_words$word, "'")) %>% 
  mutate(word = case_when(category == "female" & word %in% c(words_female,
                                                             "womens") ~ NA_character_,
                          category == "family" & word %in% c(words_family,
                                                             "parents",
                                                             "brothers")~ NA_character_,
                          category == "anx" & word %in% words_anxiety ~ NA_character_,
                          TRUE ~ word)) %>% 
  filter(!is.na(word),
         word != "amp") %>% 
  count(category, gender, word, sort = T) %>%
  group_by(gender, category) %>% 
  top_n(10, wt = n) %>% 
  ungroup() %>% 
  pivot_wider(id_cols = c(category, word), names_from = gender, values_from = n) %>% 
  mutate(category = str_to_title(category)) %>% 
  gt::gt(groupname_col = "category",
         rowname_col = "word")   
  
```

