4.1 Tokenizing by n-gram
library(dplyr)
library(tidytext)
library(janeaustenr)
lyric_bigrams <- lyric_data[,1:6] %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
lyric_bigrams
4.1.1 Counting and filtering n-grams
lyric_bigrams %>%
count(bigram, sort = TRUE)
library(tidyr)
package ‘tidyr’ was built under R version 3.4.4
bigrams_separated <-lyric_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united
lyric_data[,1:6] %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
4.1.2 Analyzing bigrams
bigrams_filtered %>%
filter(word2 == "street") %>%
count(album, word1, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
count(album, bigram) %>%
bind_tf_idf(bigram, album, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
4.1.3 Using bigrams to provide context in sentiment analysis
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn")
AFINN
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, score, sort = TRUE) %>%
ungroup()
not_words
library(ggplot2)
not_words %>%
mutate(contribution = n * score) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * score, fill = n * score > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment score * number of occurrences") +
coord_flip()

negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, score, sort = TRUE) %>%
ungroup()
4.1.4 Visualizing a network of bigrams with ggraph
library(igraph)
Attaching package: ‘igraph’
The following object is masked from ‘package:tidyr’:
crossing
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
# original counts
bigram_counts
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
filter(n > 20) %>%
graph_from_data_frame()
bigram_graph
IGRAPH 8466b6a DN-- 131 126 --
+ attr: name (v/c), n (e/n)
+ edges from 8466b6a (vertex names):
[1] la ->la yeah ->yeah
[3] da ->da prince ->miscellaneous
[5] na ->na ha ->ha
[7] baby ->baby doo ->doo
[9] ooh ->ooh rock ->rock
[11] hey ->hey sexy ->er
[13] dance ->dance 4 ->love
[15] ooh ->baby uh ->uh
+ ... omitted several edges
library(ggraph)
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)

set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()

4.1.5 Visualizing bigrams in other texts
library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)
count_bigrams <- function(dataset) {
dataset %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
bigrams %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
4.2.2 Pairwise correlation
# we need to filter for at least relatively common words first
word_cors <- lyric_section_words %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, section, sort = TRUE)
word_cors
word_cors %>%
filter(item1 == "alfred")
word_cors %>%
filter(item1 %in% c("alfred", "chelsea", "parker", "witness")) %>%
group_by(item1) %>%
top_n(6) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
Selecting by correlation

set.seed(2016)
word_cors %>%
filter(correlation > .9) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()

---
title: "Relationships between words: n-grams and correlations"
author: '劉育銘'
output: html_notebook
---


```{r}
library(data.table)
library(dplyr) #data manipulation
library(ggplot2) #visualizations
library(gridExtra) #viewing multiple plots together
library(tidytext) #text mining
#library(wordcloud2) #creative visualizations
```

##### Load data

```{r}
# Read the raw lyrics table with data.table::fread(); the path is relative,
# so it depends on the directory the notebook is run from.
lyric_data <- fread("../data/prince_raw_data.csv")
```



### 4.1 Tokenizing by n-gram

```{r}
library(dplyr)
library(tidytext)
library(janeaustenr)

# Tokenize the `text` column into two-word n-grams, one row per bigram in a
# new `bigram` column. Only the first six columns of the raw table are used.
lyric_bigrams <- unnest_tokens(
  lyric_data[, 1:6],
  output = bigram,
  input = text,
  token = "ngrams",
  n = 2
)

lyric_bigrams

```


#### 4.1.1 Counting and filtering n-grams

```{r}
# Tally each distinct bigram, most frequent first.
count(lyric_bigrams, bigram, sort = TRUE)
```


```{r}
library(tidyr)

# Split every bigram at the space into its two component words.
bigrams_separated <- separate(
  lyric_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only the bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Frequency of each remaining (word1, word2) pair, most common first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

bigram_counts
```



```{r}
# Paste word1 and word2 back together into one space-separated `bigram`
# column (the inverse of the earlier separate() step).
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")

bigrams_united
```



```{r}
# Same analysis with three-word n-grams: tokenize, split into three word
# columns, drop any trigram containing a stop word, then count.
lyric_data[, 1:6] %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word |
      word2 %in% stop_words$word |
      word3 %in% stop_words$word)
  ) %>%
  count(word1, word2, word3, sort = TRUE)
```



#### 4.1.2 Analyzing bigrams

```{r}
# Per album, which words most often come directly before "street"?
filter(bigrams_filtered, word2 == "street") %>%
  count(album, word1, sort = TRUE)
```



```{r}
# Count bigrams per album, attach tf-idf statistics with `album` as the
# document column, and order by descending tf-idf.
bigram_tf_idf <- count(bigrams_united, album, bigram) %>%
  bind_tf_idf(bigram, album, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
```

#### 4.1.3 Using bigrams to provide context in sentiment analysis
```{r}
# Bigrams whose first word is "not", counted by the word that follows.
# Uses the unfiltered bigrams, so stop words are still present.
filter(bigrams_separated, word1 == "not") %>%
  count(word1, word2, sort = TRUE)
```


```{r}
# Load the AFINN sentiment lexicon through tidytext.
AFINN <- get_sentiments(lexicon = "afinn")

AFINN
```



```{r}
# Sentiment-bearing words that directly follow "not": match the second word
# of each "not ..." bigram against AFINN and count each (word, score) pair.
# NOTE(review): this relies on the lexicon's numeric column being named
# `score`; newer tidytext/textdata releases may call it `value` - confirm
# against the installed version.
not_words <- filter(bigrams_separated, word1 == "not") %>%
  inner_join(AFINN, by = c("word2" = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

not_words
```


```{r}
library(ggplot2)

# Bar chart of the 20 words following "not" with the largest absolute
# contribution (sentiment score times number of occurrences). Bars are
# ordered by contribution and filled by its sign.
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(x = word2, y = contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = "Words preceded by \"not\"",
    y = "Sentiment score * number of occurrences"
  ) +
  coord_flip()
```


```{r}
# Words treated as negations when they appear first in a bigram.
negation_words <- c("not", "no", "never", "without")

# AFINN-scored words that directly follow any of the negation words, counted
# per (negation, word, score) combination.
negated_words <- filter(bigrams_separated, word1 %in% negation_words) %>%
  inner_join(AFINN, by = c("word2" = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  ungroup()
```



#### 4.1.4 Visualizing a network of bigrams with ggraph

```{r}
library(igraph)

# Original counts: display the stop-word-filtered (word1, word2, n) table
# again, since it is the input for the graph built in the next chunk.
bigram_counts
```



```{r}
# Keep only bigrams seen more than 20 times and turn them into an igraph
# object: word1 -> word2 become the edges, `n` is kept as an edge attribute.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 20))

bigram_graph
```


```{r}
library(ggraph)

# Seed the RNG before the layout is computed so the plot can be reproduced.
set.seed(2017)

# Plain network plot: edges, nodes, and a text label per node.
bigram_graph %>%
  ggraph(layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
```



```{r}
# Seed the RNG before the layout is computed so the plot can be reproduced.
set.seed(2016)

# Closed arrow head used to show the direction of each edge.
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Polished network plot: edge transparency follows the bigram count `n`,
# and end_cap stops each arrow short of the node it points to.
bigram_graph %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = a,
    end_cap = circle(.07, "inches")
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
```



#### 4.1.5 Visualizing bigrams in other texts


```{r}
library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)

# Count the stop-word-free bigrams of a text data frame.
#
# dataset: a data frame with a `text` column to tokenize.
# Returns the (word1, word2) pairs with their counts `n`, most frequent
# first.
count_bigrams <- function(dataset) {
  word_pairs <- dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, into = c("word1", "word2"), sep = " ")

  # Drop a pair as soon as either of its words is a stop word.
  word_pairs %>%
    filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word)) %>%
    count(word1, word2, sort = TRUE)
}

# Draw a directed network of bigrams.
#
# bigrams: a data frame whose first two columns are the "from" and "to"
#   words and which has a count column `n` (e.g. the result of
#   count_bigrams()).
# Returns the ggraph/ggplot object.
visualize_bigrams <- function(bigrams) {
  # Seed the RNG before the layout is computed so the plot is reproducible.
  # Note that this resets the caller's global RNG state.
  set.seed(2016)
  arrow_head <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  bigram_network <- graph_from_data_frame(bigrams)

  ggraph(bigram_network, layout = "fr") +
    geom_edge_link(
      aes(edge_alpha = n),
      show.legend = FALSE,
      arrow = arrow_head
    ) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
```






### 4.2 Counting and correlating pairs of words with the widyr package


#### 4.2.1 Counting and correlating among sections

```{r}
# Assign every row to a "section" of ten consecutive rows (row number
# integer-divided by 10), discard section 0, then tokenize `text` into
# single words and remove stop words.
# NOTE(review): `section > 0` removes the first nine rows of the table; if
# each row is one song, that drops nine songs - confirm this is intended.
lyric_section_words <- lyric_data[, 1:6] %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(output = word, input = text) %>%
  filter(!(word %in% stop_words$word))

lyric_section_words
```

```{r}
library(widyr)

# Count pairs of words co-occurring within the same section, most frequent
# pairs first.
word_pairs <- pairwise_count(
  lyric_section_words,
  word,
  section,
  sort = TRUE
)

word_pairs
```


```{r}
# Pairs whose first item is "prince".
filter(word_pairs, item1 == "prince")
```



#### 4.2.2 Pairwise correlation

```{r}
# We need to filter for at least relatively common words first: only words
# occurring 20 or more times are kept. The remaining words are then
# correlated pairwise by the sections they appear in, with results sorted.
# NOTE(review): the data is still grouped by `word` when it reaches
# pairwise_cor(); confirm the installed widyr version treats grouped input
# as intended.
word_cors <- lyric_section_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

word_cors
```

```{r}
# Correlations for pairs whose first item is "alfred".
filter(word_cors, item1 == "alfred")
```


```{r}
# For four words of interest, plot the six most strongly correlated partner
# words, one facet per word.
word_cors %>%
  filter(item1 %in% c("alfred", "chelsea", "parker", "witness")) %>%
  group_by(item1) %>%
  # Rank explicitly by `correlation` rather than relying on top_n()'s
  # implicit "last column" default, which also prints a
  # "Selecting by correlation" message.
  top_n(6, correlation) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  # geom_col() is the direct equivalent of geom_bar(stat = "identity").
  geom_col() +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
```


```{r}
# Seed the RNG before the layout is computed so the plot can be reproduced.
set.seed(2016)

# Network of word pairs whose correlation exceeds 0.9; edge transparency
# follows the correlation and node labels are repelled to reduce overlap.
graph_from_data_frame(filter(word_cors, correlation > .9)) %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation),
    show.legend = FALSE
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
```


<style>

em {
    color: #FFEA6C;
    background: #7D7D7D;
}

.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

.qiz {
  line-height: 1.75;
  background: #f0f0f0;
  border-left: 12px solid #ccffcc;
  padding: 4px;
  padding-left: 10px;
  color: #009900;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #0066ff;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}


h3{
  color: #b36b00;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #006000;
  background: #f8f8f8;
  line-height: 1.5;
  font-weight: bold;
}

h6 {
    color: #006000;
    background: #00ffff;
    line-height: 2;
    font-weight: bold;
}

</style>
