# Path of the tidy corpus file, resolved against the project root by here().
INFILE <- "analysis/11_by_book_analyses/data/tidy_lcnl_kidbook_corpus.csv"

# Read the corpus and number the rows within each document (word_id runs
# 1..n per doc_id, in file order).
# NOTE(review): presumably one row per word token -- confirm against the CSV.
tidy_book <- read_csv(here(INFILE)) %>%
  group_by(doc_id) %>%
  # row_number() is the idiomatic per-group counter and, unlike 1:n(), is
  # safe on empty input.
  mutate(word_id = row_number()) %>%
  # Drop the grouping so it does not leak into later, unrelated steps.
  ungroup()

# Histogram of the number of rows per book. The grouping key is spelled out
# (doc_id, title) instead of relying on a leftover group_by(doc_id).
tidy_book %>%
  count(doc_id, title) %>%
  ggplot(aes(x = n)) +
  geom_histogram()
Correlated with the Warriner et al. measure?
# Corpus-derived word valence scores (path resolved via here()).
SENTIMENT_PATH <- "analysis/12_text_analysis_exploration/corpus_word_valence_sub.csv"
# Human valence ratings file; absolute, machine-specific path.
HUMAN_SENTIMENT_PATH <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/6_by_word_analyses/data/Ratings_Warriner_et_al.csv"

sentiments <- read_csv(here(SENTIMENT_PATH))

# Keep only the word and its mean valence rating from the human norms.
human_sentiments <- select(
  clean_names(read_csv(HUMAN_SENTIMENT_PATH)),
  word, v_mean_sum
)

# Words present in both sources (natural join on the shared columns).
all_sents <- sentiments %>%
  inner_join(human_sentiments)

# Scatter of corpus score against human rating, with a linear fit.
# NOTE(review): the x axis is log(v_mean_sum) here, while the correlation
# test below uses the raw v_mean_sum -- confirm which one is intended.
all_sents %>%
  ggplot(aes(x = log(v_mean_sum), y = pos_score)) +
  geom_point(size = .2) +
  geom_smooth(method = "lm")

# Pearson correlation between the human and corpus-derived scores.
cor.test(all_sents$v_mean_sum,
         all_sents$pos_score)
##
## Pearson's product-moment correlation
##
## data: all_sents$v_mean_sum and all_sents$pos_score
## t = 30.362, df = 3976, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4082615 0.4587266
## sample estimates:
## cor
## 0.4338342
# Per-book length metadata (path resolved via here()).
LENGTH_META <- "analysis/11_by_book_analyses/data/length_metadata.csv"
meta_data <- read_csv(here(LENGTH_META))

# Books with fewer than this many tokens (n_tokens) are excluded.
MIN_NUM_WORDS <- 100
# Number of equal-sized position bins each book is divided into.
N_WORD_TILES <- 250

# Attach sentiment scores and length metadata to every corpus row, keep only
# sufficiently long books and rows that have a sentiment score, then bin each
# remaining row by its position within its book.
tidy_book_with_sentiments <- tidy_book %>%
  left_join(sentiments) %>%
  left_join(meta_data) %>%
  filter(n_tokens >= MIN_NUM_WORDS) %>%
  filter(!is.na(pos_score)) %>%
  # Bin within each book (doc_id), not within each author: word_id is
  # numbered per doc_id and the tiles are later averaged per doc_id, so
  # grouping by author would pool the words of every book by the same author
  # and the tile would no longer be a within-book position.
  group_by(doc_id) %>%
  mutate(word_id_tile = ntile(word_id, N_WORD_TILES)) %>%
  ungroup()
Plot book means
# Mean sentiment score for each book at each position bin.
book_means <- tidy_book_with_sentiments %>%
  group_by(doc_id, word_id_tile) %>%
  summarize(mean_pos_score = mean(pos_score)) %>%
  # summarize() only peels off the last grouping variable; drop the rest so
  # the result is a plain, ungrouped table.
  ungroup()

# One smoothed sentiment trajectory per book across its position bins.
book_means %>%
  # filter(doc_id %in% c(1, 4)) %>%
  ggplot(aes(x = word_id_tile, y = mean_pos_score, group = doc_id)) +
  geom_smooth(se = FALSE, size = .2)
Add in book gender
# Per-book gender scores (path resolved via here()).
BOOKGENDERS <- "analysis/11_by_book_analyses/data/gender_by_book_token.csv"

# Books whose mean gender rating exceeds this value are labelled "girl",
# all others "boy".
# NOTE(review): presumably the midpoint of the rating scale -- confirm.
GIRL_RATING_CUTOFF <- 3

# Keep the mean-gender-rating measure only and label each book; a missing
# token_mean yields a missing gender_type.
book_gender <- read_csv(here(BOOKGENDERS)) %>%
  filter(gender_measure == "mean_gender_rating") %>%
  mutate(
    gender_type = ifelse(token_mean > GIRL_RATING_CUTOFF, "girl", "boy")
  ) %>%
  select(doc_id, gender_type, token_mean)

# Attach the gender label and score to the per-book sentiment trajectories.
# doc_id is the only column the two tables share, so the key is named
# explicitly instead of being guessed by the join.
book_means_with_gender <- book_means %>%
  left_join(book_gender, by = "doc_id") %>%
  ungroup()
Tile 4 = the most girl-related books (highest mean gender rating).
# Average the per-book sentiment trajectories within bins of book gender.
#
# book_means: table with doc_id, token_mean, word_id_tile, mean_pos_score.
# n_tiles:    number of equal-sized gender bins (higher bin = higher
#             token_mean).
# Returns one row per (girl_tile, word_id_tile) with the mean of
# mean_pos_score; girl_tile is a factor. Books with a missing token_mean end
# up in an NA bin.
summarize_by_gender_tile <- function(book_means, n_tiles) {
  # Bin once per book. ntile() ignores ties, so binning the long
  # book x position table directly could place rows of the same book in two
  # different bins whenever that book straddles a bin boundary.
  book_tiles <- book_means %>%
    ungroup() %>%
    distinct(doc_id, .keep_all = TRUE) %>%
    mutate(girl_tile = ntile(token_mean, n_tiles)) %>%
    select(doc_id, girl_tile)

  book_means %>%
    ungroup() %>%
    left_join(book_tiles, by = "doc_id") %>%
    group_by(girl_tile, word_id_tile) %>%
    summarize(mean_pos_score = mean(mean_pos_score)) %>%
    ungroup() %>%
    mutate(girl_tile = as.factor(girl_tile))
}

# Plot mean sentiment against position bin, coloured by gender bin, with a
# quadratic fit (solid) and the default smoother (dashed). With
# facet = TRUE each gender bin gets its own panel.
plot_by_gender_tile <- function(tile_means, facet = FALSE) {
  p <- ggplot(tile_means,
              aes(x = word_id_tile,
                  y = mean_pos_score,
                  color = girl_tile)) +
    geom_smooth(size = 1, method = "lm", formula = y ~ poly(x, 2)) +
    geom_smooth(size = 1, linetype = 2)
  if (facet) {
    p <- p + facet_wrap(~girl_tile)
  }
  p
}

# Quartiles of book gender, one panel per quartile.
TILE <- 4
book_means_by_gender <- summarize_by_gender_tile(book_means_with_gender, TILE)
plot_by_gender_tile(book_means_by_gender, facet = TRUE)

# Median split of book gender, both halves in a single panel.
TILE <- 2
book_means_by_gender <- summarize_by_gender_tile(book_means_with_gender, TILE)
plot_by_gender_tile(book_means_by_gender)
Girl books have less conflict and less variability.