Today we will take a look at the first part of Chapter 4, about n-grams and the Case Study in Chapter 8, mining NASA metadata. NASA data is available online in JSON format.
Consecutive sequence of words.
library(dplyr)
library(tidytext)
library(janeaustenr)
austen_bigrams <- austen_books() %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
austen_bigrams
austen_bigrams %>%
count(bigram, sort = TRUE)
library(tidyr)
bigrams_separated <- austen_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united
austen_books() %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
bigrams_filtered %>%
filter(word2 == "street") %>%
count(book, word1, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn")
AFINN
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
not_words
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE) %>%
ungroup()
library(igraph)
# original counts
bigram_counts
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
filter(n > 20) %>%
graph_from_data_frame()
bigram_graph
IGRAPH e854329 DN-- 91 77 --
+ attr: name (v/c), n (e/n)
+ edges from e854329 (vertex names):
[1] sir ->thomas miss ->crawford captain ->wentworth
[4] miss ->woodhouse frank ->churchill lady ->russell
[7] lady ->bertram sir ->walter miss ->fairfax
[10] colonel ->brandon miss ->bates lady ->catherine
[13] sir ->john jane ->fairfax miss ->tilney
[16] lady ->middleton miss ->bingley thousand->pounds
[19] miss ->dashwood miss ->bennet john ->knightley
[22] miss ->morland captain ->benwick dear ->miss
+ ... omitted several edges
library(ggraph)
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
Get the metadata about the datasets and variables.
library(jsonlite)
metadata <- fromJSON("https://data.nasa.gov/data.json")
names(metadata$dataset)
[1] "_id" "@type" "accessLevel"
[4] "accrualPeriodicity" "bureauCode" "contactPoint"
[7] "description" "distribution" "identifier"
[10] "issued" "keyword" "landingPage"
[13] "language" "modified" "programCode"
[16] "publisher" "spatial" "temporal"
[19] "theme" "title" "license"
[22] "references" "rights" "describedBy"
class(metadata$dataset$title)
[1] "character"
class(metadata$dataset$description)
[1] "character"
class(metadata$dataset$keyword)
[1] "list"
Title, description, and keywords.
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
nasa_title <- data_frame(id = metadata$dataset$`_id`$`$oid`,
title = metadata$dataset$title)
nasa_title
nasa_desc <- data_frame(id = metadata$dataset$`_id`$`$oid`,
desc = metadata$dataset$description)
nasa_desc %>%
select(desc) %>%
sample_n(5)
library(tidyr)
nasa_keyword <- data_frame(id = metadata$dataset$`_id`$`$oid`,
keyword = metadata$dataset$keyword) %>%
unnest(keyword)
nasa_keyword
library(tidytext)
nasa_title <- nasa_title %>%
unnest_tokens(word, title) %>%
anti_join(stop_words)
Joining, by = "word"
nasa_desc <- nasa_desc %>%
unnest_tokens(word, desc) %>%
anti_join(stop_words)
Joining, by = "word"
nasa_title
nasa_desc
Word counts.
nasa_title %>%
count(word, sort = TRUE)
nasa_desc %>%
count(word, sort = TRUE)
my_stopwords <- data_frame(word = c(as.character(1:10),
"v1", "v03", "l2", "l3", "l4", "v5.2.0",
"v003", "v004", "v005", "v006", "v7"))
nasa_title <- nasa_title %>%
anti_join(my_stopwords)
Joining, by = "word"
nasa_desc <- nasa_desc %>%
anti_join(my_stopwords)
Joining, by = "word"
nasa_keyword %>%
group_by(keyword) %>%
count(sort = TRUE)
nasa_keyword <- nasa_keyword %>%
mutate(keyword = toupper(keyword))
library(widyr)
title_word_pairs <- nasa_title %>%
pairwise_count(word, id, sort = TRUE, upper = FALSE)
title_word_pairs
Plot co-occuring words. In the description field.
library(ggplot2)
library(igraph)
Attaching package: ‘igraph’
The following object is masked from ‘package:tidyr’:
crossing
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
library(ggraph)
set.seed(1234)
title_word_pairs %>%
filter(n >= 250) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
Network of Kewords.
keyword_pairs <- nasa_keyword %>%
pairwise_count(keyword, id, sort = TRUE, upper = FALSE)
keyword_pairs
set.seed(1234)
keyword_pairs %>%
filter(n >= 700) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()