library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidytext)
library(DT)
library(RColorBrewer)
Load your data for ZIPF tests and terms counts
load( here::here("data", "tidy_data", "my_books_tfidf.rda") )
load( here::here("data", "tidy_data", "my_books_stopped.rda") )
Please run the ZIPF test and provide a brief analysis.
my_books_tfidf %>%
ggplot( aes(x = zipf_rank, y = tf, color = title) )+
geom_line(size = 1.25, alpha = 0.75) +
scale_y_log10() +
scale_x_log10() +
labs(title = "Zipf test: My Books",
x = "Rank (log10)",
y = "Term Frequency (log10)",
color = "Novel")
Doing a formal linear model is optional. If you did so, your results below.
Do you have a data subset or summary stats to present? This is optional.
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(wordcloud)
library(tidytext)
library(tidyverse)
library(here)
## here() starts at /cloud/project
library(RColorBrewer)
library(radarchart)
library(reshape2)
library(wordcloud)
library(DT)
We can use Wordclouds to map out key terms and key differences. Remember, you have four different types of Wordclouds at your disposable. You can also do a TF-IDF Wordcloud.
Term Frequency or TF-IDF.
my_books_tfidf %>%
group_by(title) %>%
slice_max(n, n =10) %>%
DT::datatable()
Do you want to make additional graphs and choose the most useful? One code section per graph. But make as many as you please.
Shows only the terms shared by all your texts in the corpus.
rm(list = ls()) # clear the environment
# Your data
load( here::here("data", "tidy_data", "hgwells.rda") )
load( here::here("data", "tidy_data", "my_books_tfidf.rda") )
load( here::here("data", "tidy_data", "my_books_stopped.rda") )
load( here::here("data", "tidy_data", "my_books_tidy_all.rda") )
# sentiment lexicons
load( here::here("data", "tidy_data", "sentiment_lexicons.RData") ) # Everyone
dark2 <- brewer.pal(8, "Dark2")
set1 <- brewer.pal(8, "Set1")
accent <- rev(brewer.pal(8, "Accent"))
paired_cl <- brewer.pal(12, "Paired")
my_books_stopped %>%
group_by(title) %>%
count(word) %>%
reshape2::acast(word ~ title, value.var = "n", fill = 0) %>%
commonality.cloud(color = dark2, scale = c(3, 0.33),
random.order = FALSE, max.words = 100)
Shows key differences between texts.
my_books_stopped %>%
group_by(title) %>%
count(word) %>%
reshape2::acast(word ~ title, value.var = "n", fill = 0) %>%
comparison.cloud(color = dark2, title.size = 1,
scale = c(3, 0.33), random.order = FALSE, max.words = 100)
## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## notch could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## spring could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## inside could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## triggers could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## game could not be fit on page. It will not be plotted.
top_25_cleaned <- my_books_stopped %>%
group_by(title) %>%
count(word) %>%
slice_max(n, n = 25)
top_25_cleaned %>% glimpse()
## Rows: 102
## Columns: 3
## Groups: title [4]
## $ title <chr> "Birds_and_Man", "Birds_and_Man", "Birds_and_Man", "Birds_and_Ma…
## $ word <chr> "bird", "birds", "species", "human", "time", "song", "beautiful"…
## $ n <int> 299, 293, 136, 125, 115, 104, 94, 90, 90, 87, 86, 85, 85, 80, 77…
top_25_cleaned <- top_25_cleaned %>%
group_by(word) %>%
mutate(tc = sum(n)) %>%
arrange(desc(tc)) %>%
ungroup()
top_25_cleaned %>% glimpse()
## Rows: 102
## Columns: 4
## $ title <chr> "Deadfalls_and_Snares", "The_Bird_Book", "Birds_and_Man", "Buffa…
## $ word <chr> "illustration", "illustration", "white", "white", "white", "eggs…
## $ n <int> 106, 1548, 64, 110, 1189, 1333, 1220, 136, 791, 737, 723, 293, 3…
## $ tc <int> 1654, 1654, 1363, 1363, 1363, 1333, 1220, 927, 927, 737, 723, 66…
top_25_cleaned %>%
filter(tc >= 60) %>%
ggplot( aes(x = reorder(word, tc), y = n, fill= title) ) +
geom_col() +
coord_flip()
You might have done this earlier.
top_25_cleaned %>%
ggplot( aes(x = reorder(word, tc), y = n, fill= title) ) +
geom_col() +
coord_flip()
top_twenty_tf <- my_books_tfidf %>%
group_by(title) %>%
slice_max(tf, n =20) %>%
ungroup()
top_twenty_tf %>%
ggplot( aes(x = word, y = tf, fill= title) )+
geom_col() +
coord_flip() +
facet_wrap(~title, scales = "free") +
guides(fill = "none")
top_twenty_tf <- my_books_tfidf %>%
group_by(title) %>%
slice_max(tf, n =20) %>%
ungroup()
top_twenty_tf %>%
ggplot( aes(x = word, y = tf, fill= title) )+
geom_col() +
coord_flip() +
facet_wrap(~title, scales = "free") +
guides(fill = "none")
top_twenty_tfidf <- my_books_tfidf %>%
group_by(title) %>%
slice_max(tf_idf, n =20) %>%
ungroup()
top_twenty_tfidf %>%
ggplot( aes(x = word, y = tf_idf, fill= title) )+
geom_col() +
coord_flip() +
facet_wrap(~title, scales = "free") +
guides(fill = "none")
You can skip if you are doing Word Pairs and/or Word Correlation Analyses.
library(tidytext)
library(here)
load(here::here("data", "tidy_data", "ngrams_data.rda"))
bigrams_tfidf <- tidy_bigrams_all %>%
group_by(title) %>%
count(title, bigram) %>%
bind_tf_idf(bigram, title, n) %>%
ungroup() %>%
mutate(across(where(is.numeric), round, 4))
stats_bigrams <- bigrams_tfidf %>%
group_by(title) %>%
summarize(average_bigram = mean(n, na.rm = TRUE),
median_bigram = median(n , na.rm = TRUE),
max_bigram = max(n , na.rm = TRUE),
sd_bigram = sd(n , na.rm = TRUE)) %>%
ungroup() %>%
mutate(across(where(is.numeric), round, 4))
remove_old_common <- bind_rows(tibble(word = c("thou", "thee", "hath" , "hast", "thy", "thyself", "canst", "dost" , "thine", "shalt", "didst", "ye", "wilt"), lexicon = c("custom")))
word_pairs_stopped2 <- word_pairs_stopped %>%
filter(!word1 %in% remove_old_common$word) %>%
filter(!word2 %in% remove_old_common$word) %>%
na.omit()
network_wordpairs <- function(countedpairs) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
countedpairs %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
library(ggraph)
## The following object is masked from 'package:tibble':
##
## as_data_frame
#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
countedpairs %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
bigrams_tfidf <- tidy_bigrams_all %>%
group_by(title) %>%
count(title, bigram) %>%
bind_tf_idf(bigram, title, n) %>%
ungroup() %>%
mutate(across(where(is.numeric), round, 4))
stats_bigrams <- bigrams_tfidf %>%
group_by(title) %>%
summarize(average_bigram = mean(n, na.rm = TRUE),
median_bigram = median(n , na.rm = TRUE),
max_bigram = max(n , na.rm = TRUE),
sd_bigram = sd(n , na.rm = TRUE)) %>%
ungroup() %>%
mutate(across(where(is.numeric), round, 4))
network_wordpairs <- function(countedpairs) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
countedpairs %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
one_not <- word_pairs_all %>%
group_by(title) %>%
filter(word1 == "bird") %>%
count(title, word1, word2, sort = TRUE) %>%
ungroup()
one_not %>% DT::datatable(., caption = "If word1 is bird, ....")
## For Word Network Graphs
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
countedpairs %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
bl_wp_network_3 <- word_pairs_stopped2 %>%
filter(title == "Buffalo_Land") %>%
count(word1, word2, sort = TRUE) %>%
filter(n >= 3) %>%
network_wordpairs() +
ggtitle("Key Word-Pairs in Buffalo_Land")
bl_wp_network_3
top_forty_bigrams <- bigrams_tfidf %>%
slice_max(n, n = 50) %>%
group_by(bigram) %>%
mutate(tc = sum(n)) %>%
ungroup() %>%
ggplot(aes(x = reorder(bigram, tc) , y = n, fill = title ) )+
geom_col() +
xlab(NULL) +
coord_flip() +
theme(legend.position = c(0.8, 0.3) ) +
labs(title = "Top 50 Bigrams by Count",
y = "Bigram Count",
subtitle ="Some may occur in more than one text",
fill = "Title")
top_forty_bigrams
Please do not edit the code below unless you are sure abot it.
## For Word Network Graphs
library(widyr)
library(igraph)
library(ggraph)
#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
countedpairs %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
##
#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
countedpairs %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
Do you have interesting results for Word Pair searches? As many good examples as you please. We shared these as both data tables and visualizations.
one_love <- word_pairs_all %>%
group_by(title) %>%
filter(word1 == "bird") %>%
count(title, word1, word2, sort = TRUE) %>%
ungroup()
one_love %>% DT::datatable(., caption = "If word1 is love, ....")
One code block per example, please.
one_love <- word_pairs_all %>%
group_by(title) %>%
filter(word1 == "bird") %>%
count(title, word1, word2, sort = TRUE) %>%
ungroup()
one_love %>% DT::datatable(., caption = "If word1 is love, ....")
One code block per example, please.
We can make the network graph, the regular bar graph, and data tables.
## Network Graph Code
network_cors <- function(word_cors_book) {
set.seed(2016)
word_cors_book %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "tomato1", size = 4) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
}
## Summary Stats Code
sum_stats_word_cors <- function(word_cors_book) {
word_cors_book %>%
summarize(median_correlation = median(correlation, na.rm = TRUE ),
average_correlation= mean(correlation, na.rm = TRUE),
max_correlation = max(correlation, na.rm = TRUE ),
sd_correlation = sd(correlation, na.rm = TRUE ) ) %>%
mutate(across(where(is.numeric),round, 3))
}
## end
network_cors <- function(word_cors_book) {
set.seed(2016)
word_cors_book %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "tomato1", size = 4) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
}
More than one possible here.
Gender Analysis meets (replaces) the Word Pairs / Word Correlations requirement.
load( here::here("data", "raw_data", "my_books_raw.rda") )
data_gender <- my_books_raw
rm(my_books_raw) # Keep it safe by removing it
data_gender$text <- tolower(data_gender$text) # all lowercase
## Get Rid of the Underline Marks if you have NOT already done so
str_detect(data_gender$text, "_") %>% sum() # How many?
## [1] 0
# See examples
us_where <- str_detect(data_gender$text, "_") # creates Index
data_gender$text[us_where] %>% head(12) # shows first 12
## character(0)
# Replace them with whitespace
data_gender$text <- gsub("_", " ", data_gender$text)
# check
str_detect(data_gender$text, "_") %>% sum() # How many? Should be 0 now.
## [1] 0
str_detect(data_gender$text, "her") %>% sum() # How many?
## [1] 4670
## See examples We can not simply swap out "her"
dex_her <- str_detect(data_gender$text, "her")
data_gender$text[dex_her] %>% head(n = 10)
## [1] "bundle of dead feathers; crystal globes may be put into the empty"
## [2] "\"natural surroundings,\" i very naturally avoid the places where they are"
## [3] "visited and stayed in that town, there was no inclination to see the"
## [4] "not only was the bird in his brightest feathers, but his surroundings"
## [5] "his appearance, as i saw him then and on many other occasions in"
## [6] "metallic feathers, and, more rarely, in other kinds. thus the"
## [7] "still on a tuft of heather among the furze-bushes, and at"
## [8] "flew to a bush within twelve yards of where i sat, and perching on"
## [9] "he flit to another spray than it vanished, and he was once"
## [10] "dark but semi-translucent delicate feathers of his mantle; but its"
str_detect(data_gender$text, "he'd") %>% sum() # How many?
## [1] 5
str_detect(data_gender$text, "he'll") %>% sum() # How many?
## [1] 1
str_detect(data_gender$text, "he's") %>% sum() # How many?
## [1] 7
str_detect(data_gender$text, "she'd") %>% sum() # How many?
## [1] 0
str_detect(data_gender$text, "she'll") %>% sum() # How many?
## [1] 1
str_detect(data_gender$text, "she's") %>% sum() # How many?
## [1] 5
# Replace HE words
data_gender$text <- gsub("\\<he's\\>", "he", data_gender$text)
data_gender$text <- gsub("\\<he'd\\>" , "he", data_gender$text)
data_gender$text <- gsub("\\<he'll\\>" , "he", data_gender$text)
# Replace SHE words
data_gender$text <- gsub("\\<she's\\>", "she", data_gender$text)
data_gender$text <- gsub("\\<she'd\\>" , "she", data_gender$text)
data_gender$text <- gsub("\\<she'll\\>" , "she", data_gender$text)
# Check results
str_detect(data_gender$text , "he'd") %>% sum() # How many?
## [1] 0
str_detect(data_gender$text , "he'll") %>% sum() # How many?
## [1] 0
str_detect(data_gender$text , "he's") %>% sum() # How many?
## [1] 0
str_detect(data_gender$text, "she'd") %>% sum() # How many?
## [1] 0
str_detect(data_gender$text, "she'll") %>% sum() # How many?
## [1] 0
str_detect(data_gender$text, "she's") %>% sum() # How many?
## [1] 0
### Top Twenty Data Subsets?
```r
top_twenty_tfidf <- my_books_tfidf %>%
group_by(title) %>%
slice_max(tf_idf, n =20) %>%
ungroup()
top_twenty_tfidf %>%
ggplot( aes(x = word, y = tf_idf, fill= title) )+
geom_col() +
coord_flip() +
facet_wrap(~title, scales = "free") +
guides(fill = "none")
We load our radar chart library and custom functions.
library(radarchart)
source(here::here("scripts", "custom_functions.R"), local = knitr::knit_global())
source(here::here("scripts", "custom_functions.R"), local = knitr::knit_global())
my_books_stopped %>% radar_Plutchik()
We also have the four-poster version of this.
my_books_stopped %>%
joy_sad() +
facet_wrap(~title, ncol = 2, scales = "free_x") +
ggtitle("Joy vs. Sadness My Books")
I recommend you create a new directory: dash_data, for example. Remember, we cannot save Wordclouds as objects. We must cook them fresh. We can save, however, all the other graphs, and our data subsets and summaries.