# Load the tidy book corpus and number the rows within each document.
INFILE <- "analysis/11_by_book_analyses/data/tidy_lcnl_kidbook_corpus.csv"
tidy_book <- read_csv(here(INFILE)) %>%
  group_by(doc_id) %>%
  # row_number() is the safe equivalent of 1:n(): it cannot produce the
  # backwards sequence c(1, 0) for a zero-length group.
  # The result is left grouped by doc_id, as before.
  mutate(word_id = row_number())

Number of words by book

# Histogram of per-title row counts (n = number of rows in tidy_book
# for each title; tidy_book is still grouped by doc_id at this point).
title_counts <- count(tidy_book, title)
ggplot(title_counts, aes(x = n)) +
  geom_histogram()

Sentiment Measure

Is it correlated with the Warriner measure?

# Read the per-word sentiment scores (project-relative path via here()).
SENTIMENT_PATH <- "analysis/12_text_analysis_exploration/corpus_word_valence_sub.csv"
sentiments <- here(SENTIMENT_PATH) %>%
  read_csv()

# Read the Warriner et al. ratings file and keep only the word and its
# v_mean_sum column (column names are normalised by clean_names()).
# NOTE(review): this is an absolute, user-specific path, unlike the
# here()-based paths used elsewhere in this file -- it will not resolve on
# another machine. Consider moving the file into the project.
human_sentiments <-
  "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/Ratings_Warriner_et_al.csv" %>%
  read_csv() %>%
  clean_names() %>%
  select(word, v_mean_sum)

# Keep only words present in both the corpus sentiment scores and the human
# ratings. The key is spelled out so the join cannot silently pick up any
# additional shared column.
# (assumes `sentiments` has a `word` column -- human_sentiments only carries
# `word` and `v_mean_sum`.)
all_sents <- inner_join(sentiments, human_sentiments, by = "word")

# Scatter of pos_score against log(v_mean_sum) with a linear fit.
# NOTE(review): the x-axis is log-transformed here, whereas the cor.test()
# call in this file uses the untransformed v_mean_sum.
all_sents %>%
  ggplot(aes(x = log(v_mean_sum), y = pos_score)) +
  geom_point(size = .2) +
  geom_smooth(method = "lm")

# Pearson correlation between the human rating and the corpus pos_score.
cor.test(x = all_sents$v_mean_sum, y = all_sents$pos_score)
## 
##  Pearson's product-moment correlation
## 
## data:  all_sents$v_mean_sum and all_sents$pos_score
## t = 30.362, df = 3976, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4082615 0.4587266
## sample estimates:
##       cor 
## 0.4338342

Merge sentiment with book data

# Read the book length metadata (project-relative path via here()).
LENGTH_META <- "analysis/11_by_book_analyses/data/length_metadata.csv"
meta_data <- here(LENGTH_META) %>%
  read_csv()

# Attach sentiment scores and length metadata to every corpus row, drop short
# books and unscored words, then bin each row into one of 250 position tiles.
MIN_NUM_WORDS <- 100
tidy_book_with_sentiments <- tidy_book %>%
  # Join keys are inferred from the shared column names of each pair of tables.
  left_join(sentiments) %>%
  left_join(meta_data) %>%
  # Both conditions must hold (equivalent to two successive filter() calls).
  filter(n_tokens >= MIN_NUM_WORDS, !is.na(pos_score)) %>%
  # word_id is numbered within doc_id, so the position tiles must also be
  # computed within doc_id. Grouping by author (as before) pooled the word
  # positions of every book by the same author into a single ranking, so
  # tiles were not comparable across books for multi-book authors.
  group_by(doc_id) %>%
  mutate(word_id_tile = ntile(word_id, 250))

Plot book means

# Average pos_score for every (book, position tile) cell.
# summarise() drops only the last grouping level, so the result remains
# grouped by doc_id.
book_tile_groups <- group_by(tidy_book_with_sentiments, doc_id, word_id_tile)
book_means <- summarise(book_tile_groups, mean_pos_score = mean(pos_score))

# One smoothed sentiment trajectory per book across the position tiles.
book_means %>%
  #filter(doc_id %in% c(1,4)) %>%
  ggplot(aes(x = word_id_tile, y = mean_pos_score, group = doc_id)) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE, size = .2)

Add in book gender

# Per-book gender score: keep only the "mean_gender_rating" measure and label
# each book "girl" when token_mean exceeds 3, otherwise "boy".
BOOKGENDERS <- "analysis/11_by_book_analyses/data/gender_by_book_token.csv"
book_gender <- here(BOOKGENDERS) %>%
  read_csv() %>%
  filter(gender_measure == "mean_gender_rating") %>%
  mutate(
    gender_type = ifelse(token_mean > 3, "girl", "boy")
  ) %>%
  select(doc_id, gender_type, token_mean)

# Attach each book's gender label and score to its per-tile sentiment means.
# doc_id is the only column the two tables share (book_means: doc_id,
# word_id_tile, mean_pos_score; book_gender: doc_id, gender_type, token_mean),
# so the key is stated explicitly instead of being inferred.
book_means_with_gender <- book_means %>%
  left_join(book_gender, by = "doc_id") %>%
  ungroup()

Book means by gender tile

Tile 4 = the most girl-related books (highest mean gender rating).

quartile

# Split books into TILE groups by gender score and average the sentiment
# trajectory within each group.
TILE <- 4
book_means_by_gender <- book_means_with_gender %>%
  # Assign the tile once per book. Calling ntile() directly on this table
  # ranks (book x position) rows: tied rows of a book sitting on a boundary
  # get split across two tiles, and books with more rows weigh more in the
  # cut points. Books without a token_mean keep an NA tile, as before.
  left_join(
    book_means_with_gender %>%
      distinct(doc_id, token_mean) %>%
      mutate(girl_tile = ntile(token_mean, TILE)),
    by = c("doc_id", "token_mean")
  ) %>%
  group_by(girl_tile, word_id_tile) %>%
  summarize(mean_pos_score = mean(mean_pos_score)) %>%
  ungroup() %>%
  mutate(girl_tile = as.factor(girl_tile))

# One panel per gender tile: quadratic fit (solid) plus the default smoother
# (dashed) of mean sentiment over position in the book.
ggplot(
  book_means_by_gender,
  aes(x = word_id_tile, y = mean_pos_score, color = girl_tile)
) +
  facet_wrap(~girl_tile) +
  geom_smooth(size = 1, method = "lm", formula = y~poly(x,2)) +
  geom_smooth(size = 1, linetype = 2)

Median

# Median split: same aggregation as the quartile version, with two groups.
TILE <- 2
book_means_by_gender <- book_means_with_gender %>%
  # Assign the tile once per book. Calling ntile() directly on this table
  # ranks (book x position) rows: tied rows of a book sitting on the boundary
  # get split across both halves, and books with more rows weigh more in the
  # cut point. Books without a token_mean keep an NA tile, as before.
  left_join(
    book_means_with_gender %>%
      distinct(doc_id, token_mean) %>%
      mutate(girl_tile = ntile(token_mean, TILE)),
    by = c("doc_id", "token_mean")
  ) %>%
  group_by(girl_tile, word_id_tile) %>%
  summarize(mean_pos_score = mean(mean_pos_score)) %>%
  ungroup() %>%
  mutate(girl_tile = as.factor(girl_tile))

# Both gender halves overlaid in one panel: quadratic fit (solid) plus the
# default smoother (dashed) of mean sentiment over position in the book.
ggplot(
  book_means_by_gender,
  aes(x = word_id_tile, y = mean_pos_score, color = girl_tile)
) +
  #facet_wrap(~girl_tile) +
  geom_smooth(size = 1, method = "lm", formula = y~poly(x,2)) +
  geom_smooth(size = 1, linetype = 2)

Girl books have less conflict and less variability.