library(data.table)
library(dplyr) #data manipulation
library(ggplot2) #visualizations
library(gridExtra) #viewing multiple plots together
library(tidytext) #text mining
#library(wordcloud2) #creative visualizations
load data
lyric_data=fread('../data/prince_raw_data.csv')

2.1 The sentiments dataset

library(tidytext)
sentiments  

The three general-purpose lexicons are

get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")

2.2 Sentiment analysis with inner join

library(janeaustenr)
library(dplyr)
library(stringr)
original_songs <- lyric_data[,1:6] %>%
  group_by(album) %>%
  mutate(linenumber = row_number()) %>% 
  ungroup()
tidy_songs<-original_songs %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")
tidy_songs %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
Joining, by = "word"
tidy_songs %>% 
  group_by(.,album) %>%
  summarise(
    count=n()
  ) %>% .[order(.$count,decreasing = T),]
library(tidyr)
tidy_4albums<-tidy_songs %>%
  filter(.,album %in% 
           c('Other Songs',
             'Emancipation',
             'Crystal Ball',
             'Crystal Ball'))
song_lyric_sentiment <-tidy_4albums %>%
  inner_join(get_sentiments("bing")) %>%
  count(album, index = linenumber %/% 2, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
Joining, by = "word"
library(ggplot2)
ggplot(song_lyric_sentiment, 
       aes(index, sentiment, fill = album)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~album, ncol = 2, scales = "free_x")

2.3 Comparing the three sentiment dictionaries

afinn <- tidy_4albums %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 2) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN")
Joining, by = "word"
bing_and_nrc <- bind_rows(tidy_4albums %>% 
                                        inner_join(get_sentiments("bing")) %>%
          mutate(method = "Bing et al."),
          tidy_4albums %>% 
          inner_join(get_sentiments("nrc") %>% 
          filter(sentiment %in% 
                   c("positive", 
                     "negative"))) %>%
                  mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 2, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
Joining, by = "word"
Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
get_sentiments("bing") %>% 
  count(sentiment)

2.4 Most common positive and negative words

bing_word_counts <- tidy_songs %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining, by = "word"
bing_word_counts
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
Selecting by n

custom_stop_words <- bind_rows(tibble(word = c("miss"), 
                                          lexicon = c("custom")), 
                               stop_words)
custom_stop_words

2.5 Wordclouds

library(wordcloud)
Loading required package: RColorBrewer
tidy_songs %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
Joining, by = "word"

library(reshape2)
tidy_songs %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining, by = "word"

2.6 Looking at units beyond just words

斷句

對original_songs$text 斷句

PandP_sentences <- original_songs %>% 
  unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
[1] "was i the first?"
Regex斷句
austen_books() %>% head(20)
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")
wordcounts <- tidy_songs %>%
  group_by(album) %>%
  summarize(words = n())
tidy_songs %>%
  semi_join(bingnegative) %>%
  group_by(album) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c('album')) %>%
  mutate(ratio = negativewords/words) %>%
 
  ungroup()
Joining, by = "word"
---
title: "CH2 Sentiment analysis with tidy data"
output: html_notebook
---


```{r}
library(data.table)
library(dplyr) #data manipulation
library(ggplot2) #visualizations
library(gridExtra) #viewing multiple plots together
library(tidytext) #text mining
#library(wordcloud2) #creative visualizations
```


##### Load data

```{r}
# Read the raw Prince lyrics CSV (relative path) with data.table's fread().
# Later chunks keep only the first six columns, which include `album` and `text`.
lyric_data <- fread("../data/prince_raw_data.csv")
```



### 2.1 The sentiments dataset
```{r}
library(tidytext)

# Auto-print tidytext's bundled `sentiments` table for a first look at the
# lexicon data used throughout this chapter.
sentiments  
```

The three general-purpose lexicons are:

+ AFINN from Finn Årup Nielsen,
+ bing from Bing Liu and collaborators, and
+ nrc from Saif Mohammad and Peter Turney.


```{r}
# Preview the AFINN lexicon; a later chunk sums its `score` column per index bucket.
get_sentiments("afinn")
```

```{r}
# Preview the Bing lexicon; later chunks rely on its `sentiment` values
# `positive` and `negative`.
get_sentiments("bing")
```

```{r}
# Preview the NRC lexicon; later chunks filter its `sentiment` column for
# "joy", "positive" and "negative".
get_sentiments("nrc")
```


### 2.2 Sentiment analysis with inner join

```{r}
library(janeaustenr)
library(dplyr)
library(stringr)

# Keep the first six columns of the raw lyrics and number the rows within each
# album (the counter restarts at 1 for every album); grouping is dropped again
# afterwards.
# NOTE(review): whether one row is a lyric line or a whole song depends on the
# CSV layout, which is not visible here -- confirm before reading `linenumber`
# as a true line number.
original_songs <- lyric_data[, 1:6] %>%
  group_by(album) %>%
  mutate(linenumber = row_number()) %>%
  ungroup()

# Tokenise the `text` column into a `word` column.
tidy_songs <- unnest_tokens(original_songs, output = word, input = text)
```


```{r}
# NRC entries tagged with the "joy" sentiment.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent "joy" words across all lyrics.
# The join key is spelled out so the match does not depend on whichever column
# names the two tables happen to share, and no "Joining, by" message is emitted.
tidy_songs %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
```




```{r}
# Number of tokenised words per album, largest first.
# arrange(desc()) replaces base `[order(..., decreasing = T), ]` indexing:
# it stays inside the pipeline and avoids the reassignable `T` shorthand.
tidy_songs %>%
  group_by(album) %>%
  summarise(count = n()) %>%
  arrange(desc(count))


```


```{r}
library(tidyr)

# Restrict the word table to the albums compared below.
# NOTE(review): the object name says four albums, but only three distinct
# titles are listed here ("Crystal Ball" used to appear twice, which has no
# effect on `%in%`). Confirm which fourth album, if any, was intended.
tidy_4albums <- tidy_songs %>%
  filter(album %in% c("Other Songs", "Emancipation", "Crystal Ball"))

# Net Bing sentiment (positive minus negative word count) per album and index
# bucket, where a bucket is `linenumber %/% 2`, i.e. two consecutive row numbers.
# `fill = 0` gives buckets that lack one polarity a zero instead of NA.
song_lyric_sentiment <- tidy_4albums %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(album, index = linenumber %/% 2, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
```



```{r}
library(ggplot2)

# Bar chart of net sentiment per index bucket: one facet per album (two
# columns), each facet with its own x range; the fill legend is hidden because
# the facets already identify the album.
song_lyric_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = album)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ album, ncol = 2, scales = "free_x")
```


### 2.3 Comparing the three sentiment dictionaries

```{r}
# AFINN: total word score per index bucket (`linenumber %/% 2`).
# The bucket is not split by album, and `linenumber` restarts for every album,
# so each bucket pools words from all albums in `tidy_4albums`.
# NOTE(review): `score` is the AFINN column name in the tidytext release this
# notebook was written against; newer lexicon releases reportedly name it
# `value` -- confirm against the installed version.
afinn <- tidy_4albums %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = linenumber %/% 2) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN")

# Bing and NRC: net sentiment (positive minus negative word count) per method
# and index bucket. Both lexicons are joined on `word` and tagged with their
# method name before being stacked; NRC is first reduced to its
# positive/negative entries so both methods use the same two categories.
bing_and_nrc <- bind_rows(
  tidy_4albums %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  tidy_4albums %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 2, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
```


```{r}
# Stack the three per-method tables and draw one facet per method in a single
# column; each facet gets its own y scale.
afinn %>%
  bind_rows(bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ method, ncol = 1, scales = "free_y")
```



```{r}
# How many NRC entries are tagged positive vs. negative.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
```


```{r}
# Number of Bing lexicon entries per sentiment value.
count(get_sentiments("bing"), sentiment)
```


### 2.4 Most common positive and negative words

```{r}
# Frequency of every Bing-lexicon word in the lyrics together with its
# sentiment, most frequent first. The join key is explicit so the match does
# not depend on incidental shared column names.
bing_word_counts <- tidy_songs %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
```


```{r}
# Top 10 words per sentiment by count (ties at the cut-off are all kept),
# shown as horizontal bars ordered by frequency.
# `wt = n` names the ranking column explicitly instead of relying on top_n()'s
# "use the last column" default, which also prints "Selecting by n".
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10, wt = n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
```


```{r}
# Stop-word list with one extra entry, "miss", tagged with lexicon "custom" and
# placed ahead of tidytext's `stop_words`.
custom_stop_words <- tibble(word = "miss", lexicon = "custom") %>%
  bind_rows(stop_words)

custom_stop_words
```

### 2.5 Wordclouds

```{r}
library(wordcloud)

# Word cloud of up to 100 words from the lyrics after dropping stop words.
# The anti-join key is explicit so only `word` decides what is removed.
tidy_songs %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
```


```{r}
library(reshape2)

# Comparison cloud of Bing-lexicon words split by sentiment.
# acast() turns the long word/sentiment counts into a word-by-sentiment matrix
# (missing combinations filled with 0), which is the input comparison.cloud()
# is given here.
tidy_songs %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

```


### 2.6 Looking at units beyond just words


##### 斷句


對 `original_songs$text` 斷句

```{r}
# Split the lyrics text into sentences instead of single words.
# (The object name is kept as-is; the data here are song lyrics.)
PandP_sentences <- unnest_tokens(
  original_songs,
  output = sentence,
  input = text,
  token = "sentences"
)

# Show the second sentence as a spot check.
PandP_sentences[["sentence"]][2]
```


##### Regex 斷句

```{r}
# First 20 rows of the Jane Austen corpus from janeaustenr.
head(austen_books(), 20)
```



```{r}
# Split each book's text into chunks at a chapter-heading regex, one row per
# chunk in the `chapter` column; grouping by book is removed afterwards.
# NOTE(review): `|` binds loosest, so the pattern matches either the literal
# "Chapter" or "CHAPTER " followed by one digit / Roman-numeral character --
# the character class applies only to the upper-case branch. Confirm this is
# the intended split rule.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

# Number of chunks produced per book.
austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
```





```{r}
# Bing-lexicon entries with negative sentiment.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total number of tokenised words per album (denominator of the ratio).
wordcounts <- tidy_songs %>%
  group_by(album) %>%
  summarize(words = n())

# Share of each album's words that appear in the negative list.
# semi_join() keeps matching rows without adding lexicon columns; its key is
# explicit so only `word` is compared.
tidy_songs %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(album) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = "album") %>%
  mutate(ratio = negativewords / words) %>%
  ungroup()
```




<style>

/* Custom styles for the rendered notebook. */

/* Emphasised text: coloured text on a coloured background. */
em {
    color: #FFEA6C;
    background: #7D7D7D;
}

/* Captions: muted colour with some space above. */
.caption {
  color: #777;
  margin-top: 10px;
}
/* Inline code inherits the paragraph's white-space handling. */
p code {
  white-space: inherit;
}
/* Preformatted blocks: no forced word breaking, tight line spacing. */
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
/* Body text font stack, including a Traditional Chinese face. */
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

/* Elements with class "r" (presumably the R code blocks -- TODO confirm). */
.r{
  line-height: 1.2;
}

/* Callout box style; no element in the visible document uses this class. */
.qiz {
  line-height: 1.75;
  background: #f0f0f0;
  border-left: 12px solid #ccffcc;
  padding: 4px;
  padding-left: 10px;
  color: #009900;
}

/* NOTE(review): `title` selects the <title> element, which is not displayed
   in the page body; if the visible document title was meant, a class selector
   is probably needed -- confirm against the rendered HTML. */
title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

/* Shared heading colour and font; h3 and h5 override the colour below. */
h1,h2,h3,h4,h5{
  color: #0066ff;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}


/* Section headings (the "###" lines in the notebook). */
h3{
  color: #b36b00;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

/* Sub-headings (the "#####" lines in the notebook). */
h5{
  color: #006000;
  background: #f8f8f8;
  line-height: 1.5;
  font-weight: bold;
}

/* Level-6 headings; none appear in the visible document. */
h6 {
    color: #006000;
    background: #00ffff;
    line-height: 2;
    font-weight: bold;
}

</style>
