Key Results

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(tidytext)
library(DT)
library(RColorBrewer)

Load your data for ZIPF tests and terms counts

load( here::here("data", "tidy_data",  "my_books_tfidf.rda") )
load( here::here("data", "tidy_data",  "my_books_stopped.rda") )

ZIPF test and results

Please run the ZIPF test and provide a brief analysis.

Your ZIPF test plot.

my_books_tfidf %>%
  ggplot( aes(x = zipf_rank, y = tf, color = title) )+ 
  geom_line(size = 1.25, alpha = 0.75) +
  scale_y_log10() +
  scale_x_log10() +
  labs(title = "Zipf test: My Books",
       x = "Rank (log10)", 
       y = "Term Frequency (log10)",
       color = "Novel")

Optional LM

Doing a formal linear model is optional. If you did so, your results below.

Data subset or summary stats?

Do you have a data subset or summary stats to present? This is optional.

Term Count Visualizations

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(wordcloud)
library(tidytext)
library(tidyverse)
library(here)

## here() starts at /cloud/project

library(RColorBrewer)
library(radarchart)
library(reshape2) 
library(wordcloud) 
library(DT)

We can use Wordclouds to map out key terms and key differences. Remember, you have four different types of Wordclouds at your disposable. You can also do a TF-IDF Wordcloud.

Wordcloud or Bar Graph

Term Frequency or TF-IDF.

my_books_tfidf %>%
  group_by(title) %>%
  slice_max(n, n =10)  %>% 
  DT::datatable()

Do you want to make additional graphs and choose the most useful? One code section per graph. But make as many as you please.

Commonality Cloud?

Shows only the terms shared by all your texts in the corpus.

rm(list = ls()) # clear the environment

# Your data
load( here::here("data", "tidy_data",  "hgwells.rda") )
load( here::here("data", "tidy_data",  "my_books_tfidf.rda") )
load( here::here("data", "tidy_data",  "my_books_stopped.rda") )
load( here::here("data", "tidy_data",  "my_books_tidy_all.rda") )
# sentiment lexicons
load( here::here("data", "tidy_data", "sentiment_lexicons.RData") ) # Everyone

dark2 <- brewer.pal(8, "Dark2")
set1 <- brewer.pal(8, "Set1")
accent <- rev(brewer.pal(8, "Accent"))
paired_cl <-  brewer.pal(12, "Paired")

my_books_stopped %>% 
  group_by(title) %>%
  count(word) %>%
  reshape2::acast(word ~ title, value.var = "n", fill = 0) %>%
  commonality.cloud(color = dark2, scale = c(3,  0.33), 
                   random.order = FALSE,  max.words = 100)

Comparison Cloud?

Shows key differences between texts.

my_books_stopped %>% 
  group_by(title) %>%
  count(word) %>%
  reshape2::acast(word ~ title, value.var = "n", fill = 0) %>%
  comparison.cloud(color = dark2, title.size = 1, 
                   scale = c(3,  0.33), random.order = FALSE,  max.words = 100)

## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## notch could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## spring could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## inside could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## triggers could not be fit on page. It will not be plotted.

## Warning in comparison.cloud(., color = dark2, title.size = 1, scale = c(3, :
## game could not be fit on page. It will not be plotted.

Other graph?

Data subset or summary stats?

top_25_cleaned <- my_books_stopped %>% 
  group_by(title) %>%
  count(word) %>% 
  slice_max(n,  n = 25) 

top_25_cleaned %>% glimpse()

## Rows: 102
## Columns: 3
## Groups: title [4]
## $ title <chr> "Birds_and_Man", "Birds_and_Man", "Birds_and_Man", "Birds_and_Ma…
## $ word  <chr> "bird", "birds", "species", "human", "time", "song", "beautiful"…
## $ n     <int> 299, 293, 136, 125, 115, 104, 94, 90, 90, 87, 86, 85, 85, 80, 77…

top_25_cleaned <- top_25_cleaned %>%
  group_by(word) %>%
  mutate(tc = sum(n)) %>%
  arrange(desc(tc)) %>%
  ungroup()

top_25_cleaned %>% glimpse()

## Rows: 102
## Columns: 4
## $ title <chr> "Deadfalls_and_Snares", "The_Bird_Book", "Birds_and_Man", "Buffa…
## $ word  <chr> "illustration", "illustration", "white", "white", "white", "eggs…
## $ n     <int> 106, 1548, 64, 110, 1189, 1333, 1220, 136, 791, 737, 723, 293, 3…
## $ tc    <int> 1654, 1654, 1363, 1363, 1363, 1333, 1220, 927, 927, 737, 723, 66…

top_25_cleaned %>% 
  filter(tc >= 60) %>% 
  ggplot( aes(x = reorder(word, tc), y = n, fill= title) ) +
  geom_col() +
  coord_flip()

TF-IDF Analysis

You might have done this earlier.

Key Visualization

top_25_cleaned %>% 
  ggplot( aes(x = reorder(word, tc), y = n, fill= title) ) +
  geom_col() +
  coord_flip()

Second Visualization?

top_twenty_tf <- my_books_tfidf %>%
  group_by(title) %>%
  slice_max(tf, n =20) %>%
  ungroup()


top_twenty_tf %>%
  ggplot( aes(x = word, y = tf, fill= title) )+
  geom_col() +
  coord_flip() +
  facet_wrap(~title, scales = "free") +
  guides(fill = "none")

top_twenty_tf <- my_books_tfidf %>%
  group_by(title) %>%
  slice_max(tf, n =20) %>%
  ungroup()


top_twenty_tf %>%
  ggplot( aes(x = word, y = tf, fill= title) )+
  geom_col() +
  coord_flip() +
  facet_wrap(~title, scales = "free") +
  guides(fill = "none")

Data subset or summary stats?

top_twenty_tfidf <- my_books_tfidf %>%
  group_by(title) %>%
  slice_max(tf_idf, n =20) %>%
  ungroup()


top_twenty_tfidf %>%
  ggplot( aes(x = word, y = tf_idf, fill= title) )+
  geom_col() +
  coord_flip() +
  facet_wrap(~title, scales = "free") +
  guides(fill = "none")

Bigrams / Trigrams

You can skip if you are doing Word Pairs and/or Word Correlation Analyses.

Key Visualization

library(tidytext)
library(here)

load(here::here("data", "tidy_data", "ngrams_data.rda"))

bigrams_tfidf <-  tidy_bigrams_all %>%  
  group_by(title) %>%
  count(title, bigram) %>%
  bind_tf_idf(bigram, title, n) %>%
  ungroup() %>%
  mutate(across(where(is.numeric), round, 4))


  
  
stats_bigrams <-  bigrams_tfidf %>%
  group_by(title) %>%
  summarize(average_bigram = mean(n, na.rm = TRUE),  
            median_bigram = median(n , na.rm = TRUE), 
            max_bigram = max(n , na.rm = TRUE), 
            sd_bigram = sd(n , na.rm = TRUE))  %>% 
  ungroup() %>%
  mutate(across(where(is.numeric), round, 4))

remove_old_common  <- bind_rows(tibble(word = c("thou", "thee", "hath" , "hast", "thy", "thyself", "canst",  "dost" , "thine", "shalt", "didst", "ye", "wilt"), lexicon = c("custom")))

word_pairs_stopped2  <- word_pairs_stopped %>% 
  filter(!word1 %in% remove_old_common$word) %>%
  filter(!word2 %in% remove_old_common$word) %>%
  na.omit()

network_wordpairs <- function(countedpairs) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  countedpairs %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

library(ggraph)
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  countedpairs %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

bigrams_tfidf <-  tidy_bigrams_all %>%  
  group_by(title) %>%
  count(title, bigram) %>%
  bind_tf_idf(bigram, title, n) %>%
  ungroup() %>%
  mutate(across(where(is.numeric), round, 4))


  
  
stats_bigrams <-  bigrams_tfidf %>%
  group_by(title) %>%
  summarize(average_bigram = mean(n, na.rm = TRUE),  
            median_bigram = median(n , na.rm = TRUE), 
            max_bigram = max(n , na.rm = TRUE), 
            sd_bigram = sd(n , na.rm = TRUE))  %>% 
  ungroup() %>%
  mutate(across(where(is.numeric), round, 4))

network_wordpairs <- function(countedpairs) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  countedpairs %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

one_not <- word_pairs_all %>%
  group_by(title) %>%
  filter(word1 == "bird") %>%
  count(title, word1, word2, sort = TRUE) %>%
  ungroup()

one_not %>% DT::datatable(., caption = "If word1 is bird, ....")

Second Visualization?

## For Word Network Graphs
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  countedpairs %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

bl_wp_network_3 <-  word_pairs_stopped2   %>%
  filter(title == "Buffalo_Land") %>%
  count(word1, word2, sort = TRUE) %>% 
  filter(n >= 3) %>% 
  network_wordpairs() +
   ggtitle("Key Word-Pairs in Buffalo_Land")


bl_wp_network_3

Data subset or summary stats?

top_forty_bigrams <- bigrams_tfidf %>%
  slice_max(n, n = 50) %>%
  group_by(bigram)  %>%
  mutate(tc = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(bigram, tc) ,  y = n, fill = title ) )+
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  theme(legend.position = c(0.8, 0.3) ) +
  labs(title = "Top 50 Bigrams by Count",
       y = "Bigram Count", 
       subtitle ="Some may occur in more than one text",
       fill = "Title")


top_forty_bigrams

Word Pairs

Please do not edit the code below unless you are sure abot it.

## For Word Network Graphs
library(widyr)
library(igraph)
library(ggraph)

#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  countedpairs %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}   

##

Word Pair Graph?

#leave this alone, ok -- unless you are really sure
network_wordpairs <- function(countedpairs) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  countedpairs %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

Word One, Word Two?

Do you have interesting results for Word Pair searches? As many good examples as you please. We shared these as both data tables and visualizations.

one_love <- word_pairs_all %>%
  group_by(title) %>%
  filter(word1 == "bird") %>%
  count(title, word1, word2, sort = TRUE) %>%
  ungroup()

one_love %>% DT::datatable(., caption = "If word1 is love, ....")

One code block per example, please.

one_love <- word_pairs_all %>%
  group_by(title) %>%
  filter(word1 == "bird") %>%
  count(title, word1, word2, sort = TRUE) %>%
  ungroup()

one_love %>% DT::datatable(., caption = "If word1 is love, ....")

One code block per example, please.

Word Correlations

We can make the network graph, the regular bar graph, and data tables.

## Network Graph Code
network_cors <- function(word_cors_book) {
  set.seed(2016)
  word_cors_book %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "tomato1", size = 4) +
    geom_node_text(aes(label = name), repel = TRUE) +
    theme_void()
  
}


## Summary Stats Code
sum_stats_word_cors <- function(word_cors_book) {
  word_cors_book %>%
    summarize(median_correlation = median(correlation, na.rm = TRUE ), 
              average_correlation= mean(correlation, na.rm = TRUE),   
              max_correlation = max(correlation, na.rm = TRUE ),  
              sd_correlation = sd(correlation, na.rm = TRUE ) ) %>%
    mutate(across(where(is.numeric),round, 3))
}
## end

Network Graph?

network_cors <- function(word_cors_book) {
  set.seed(2016)
  word_cors_book %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "tomato1", size = 4) +
    geom_node_text(aes(label = name), repel = TRUE) +
    theme_void()
  
}

Bar graph?

Data subset or summary stats?

More than one possible here.

Gender Analysis

Gender Analysis meets (replaces) the Word Pairs / Word Correlations requirement.

HE / SHE Graph

load( here::here("data", "raw_data", "my_books_raw.rda") )

data_gender <- my_books_raw

rm(my_books_raw) # Keep it safe by removing it

data_gender$text <- tolower(data_gender$text) # all lowercase

## Get Rid of the Underline Marks if you have NOT already done so
str_detect(data_gender$text, "_") %>% sum() # How many?

## [1] 0

# See examples
us_where <- str_detect(data_gender$text, "_") # creates Index
data_gender$text[us_where] %>% head(12) # shows first 12

## character(0)

# Replace them with whitespace
data_gender$text <- gsub("_", " ", data_gender$text) 
# check
str_detect(data_gender$text, "_") %>% sum() # How many?  Should be 0 now.

## [1] 0

str_detect(data_gender$text, "her") %>% sum() # How many?

## [1] 4670

## See examples We can not simply swap out "her"
dex_her <- str_detect(data_gender$text, "her")

data_gender$text[dex_her] %>% head(n = 10)

##  [1] "bundle of dead feathers; crystal globes may be put into the empty"         
##  [2] "\"natural surroundings,\" i very naturally avoid the places where they are"
##  [3] "visited and stayed in that town, there was no inclination to see the"      
##  [4] "not only was the bird in his brightest feathers, but his surroundings"     
##  [5] "his appearance, as i saw him then and on many other occasions in"          
##  [6] "metallic feathers, and, more rarely, in other kinds. thus the"             
##  [7] "still on a tuft of heather among the furze-bushes, and at"                 
##  [8] "flew to a bush within twelve yards of where i sat, and perching on"        
##  [9] "he flit to another spray than it vanished, and he was once"                
## [10] "dark but semi-translucent delicate feathers of his mantle; but its"

HIS / HERS Graph

str_detect(data_gender$text, "he'd") %>% sum() # How many?

## [1] 5

str_detect(data_gender$text, "he'll") %>% sum() # How many?

## [1] 1

str_detect(data_gender$text, "he's") %>% sum() # How many?

## [1] 7

str_detect(data_gender$text, "she'd") %>% sum() # How many?

## [1] 0

str_detect(data_gender$text, "she'll") %>% sum() # How many?

## [1] 1

str_detect(data_gender$text, "she's") %>% sum() # How many?

## [1] 5

# Replace HE words
data_gender$text <- gsub("\\<he's\\>", "he", data_gender$text) 
data_gender$text <- gsub("\\<he'd\\>" , "he", data_gender$text)
data_gender$text  <- gsub("\\<he'll\\>" , "he", data_gender$text) 

# Replace SHE words
data_gender$text <- gsub("\\<she's\\>", "she", data_gender$text) 
data_gender$text <- gsub("\\<she'd\\>" , "she", data_gender$text)
data_gender$text  <- gsub("\\<she'll\\>" , "she", data_gender$text) 

# Check results
str_detect(data_gender$text  , "he'd") %>% sum() # How many?

## [1] 0

str_detect(data_gender$text  , "he'll") %>% sum() # How many?

## [1] 0

str_detect(data_gender$text  , "he's") %>% sum() # How many?

## [1] 0

str_detect(data_gender$text, "she'd") %>% sum() # How many?

## [1] 0

str_detect(data_gender$text, "she'll") %>% sum() # How many?

## [1] 0

str_detect(data_gender$text, "she's") %>% sum() # How many?

## [1] 0



### Top Twenty Data Subsets?


```r
top_twenty_tfidf <- my_books_tfidf %>%
  group_by(title) %>%
  slice_max(tf_idf, n =20) %>%
  ungroup()


top_twenty_tfidf %>%
  ggplot( aes(x = word, y = tf_idf, fill= title) )+
  geom_col() +
  coord_flip() +
  facet_wrap(~title, scales = "free") +
  guides(fill = "none")

Sentiment Analysis

We load our radar chart library and custom functions.

library(radarchart)

source(here::here("scripts", "custom_functions.R"), local = knitr::knit_global())

Plutchik Radar Chart?

source(here::here("scripts", "custom_functions.R"), local = knitr::knit_global())

 my_books_stopped %>% radar_Plutchik()

SA Lines Across All?

Competing Emotions?

We also have the four-poster version of this.

my_books_stopped %>% 
  joy_sad() + 
  facet_wrap(~title, ncol = 2, scales = "free_x") + 
  ggtitle("Joy vs. Sadness My Books")

Save Your Results

I recommend you create a new directory: dash_data, for example. Remember, we cannot save Wordclouds as objects. We must cook them fresh. We can save, however, all the other graphs, and our data subsets and summaries.

Key Results

2022-12-22

ZIPF test and results

Your ZIPF test plot.

Optional LM

Data subset or summary stats?

Term Count Visualizations

Wordcloud or Bar Graph

Commonality Cloud?

Comparison Cloud?

Other graph?

Data subset or summary stats?

TF-IDF Analysis

Key Visualization

Second Visualization?

Data subset or summary stats?

Bigrams / Trigrams

Key Visualization

Second Visualization?

Data subset or summary stats?

Word Pairs

Word Pair Graph?

Word One, Word Two?

Word Correlations

Network Graph?

Bar graph?

Data subset or summary stats?

Gender Analysis

HE / SHE Graph

HIS / HERS Graph

Sentiment Analysis

Plutchik Radar Chart?

SA Lines Across All?

Competing Emotions?

Save Your Results