dreamtextmining

Preprocessing

STR_Dream_Log <- read_excel("STR_Dream Log(1).xlsx", col_types = c("date", "text", "text", "text", "text"), skip = 1)
names(STR_Dream_Log) <- c("Date","Id","Sleep","Wake","Dream")
STR_Description_Log <- read_excel("STR_Description Log.xlsx")

ud_model <- udpipe_download_model(language = "english")

## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.4/master/inst/udpipe-ud-2.4-190531/english-ewt-ud-2.4-190531.udpipe to C:/Users/folde/Downloads/dreamlogs/english-ewt-ud-2.4-190531.udpipe

## Visit https://github.com/jwijffels/udpipe.models.ud.2.4 for model license details

ud_model <- udpipe_load_model(ud_model$file_model)
x <- udpipe_annotate(ud_model, x = STR_Dream_Log$Dream)
y <- udpipe_annotate(ud_model, x = STR_Description_Log$Description)

x <- as.data.frame(x)
y <- as.data.frame(y)

Keyword extraction - completely aggregated (not split by participants)

Just to show some options… ## copied wholly from https://www.r-bloggers.com/an-overview-of-keyword-extraction-techniques/

Option 1: Extracting only nouns

An easy way in order to find keywords is by looking at nouns. As each term has a Parts of Speech tag if you annotated text using the udpipe package, you can easily do this as follows.

## Warning: package 'lattice' was built under R version 3.6.2

Option 2: Collocation & co-occurrences

Although nouns are a great start, you are probably interested in multi-word expressions. You can get multi-word expression by looking either at collocations (words following one another), at word co-occurrences within each sentence or at word co-occurrences of words which are close in the neighbourhood of one another. These approaches can be executed as follows using the udpipe R package. If we combine this with selecting only the nouns and adjectives, this becomes already nice.

##   term1  term2 cooc
## 1 group people    9
## 2  last  night    8
## 3 dream   last    4
## 4 dream  night    4
## 5  real   life    3
## 6 sleep    lab    3

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

## Warning: package 'ggraph' was built under R version 3.6.2

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Option 3: Textrank (word network ordered by Google Pagerank)

Another approach for keyword detection is Textrank. Textrank is an algorithm implemented in the textrank R package. The algorithm allows to summarise text and as well allows to extract keywords. This is done by constructing a word network by looking if words are following one another. On top of that network the ‘Google Pagerank’ algorithm is applied to extract relevant words after which relevant words which are following one another are combined to get keywords. In the below example, we are interested in finding keywords using that algorithm of either nouns or adjectives following one another. You can see from the plot below that the keywords combines words together into multi-word expressions.

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.6.2

## Loading required package: RColorBrewer

stats <- textrank_keywords(x$lemma, 
                          relevant = x$upos %in% c("NOUN", "ADJ"), 
                          ngram_max = 8, sep = " ")
stats <- subset(stats$keywords, ngram > 1 & freq >= 2)

wordcloud(words = stats$keyword, freq = stats$freq)

stats <- textrank_keywords(y$lemma, 
                          relevant = y$upos %in% c("NOUN", "ADJ"), 
                          ngram_max = 8, sep = " ")
stats <- subset(stats$keywords, ngram > 1 & freq >= 2)

wordcloud(words = stats$keyword, freq = stats$freq)

Option 4: Rapid Automatic Keyword Extraction: RAKE

Next basic algorithm is called RAKE which is an acronym for Rapid Automatic Keyword Extraction. It looks for keywords by looking to a contiguous sequence of words which do not contain irrelevant words. Namely by

<!-- calculating a score for each word which is part of any candidate keyword, this is done by -->
<!--     among the words of the candidate keywords, the algorithm looks how many times each word is occurring and how many times it co-occurs with other words -->
<!--     each word gets a score which is the ratio of the word degree (how many times it co-occurs with other words) to the word frequency -->
<!-- a RAKE score for the full candidate keyword is calculated by summing up the scores of each of the words which define the candidate keyword -->

stats <- keywords_rake(x = x, 
                      term = "token", group = c("doc_id", "paragraph_id", "sentence_id"),
                      relevant = x$upos %in% c("NOUN", "ADJ"),
                      ngram_max = 4)
head(subset(stats, freq > 3))

##       keyword ngram freq      rake
## 8  last night     2    4 1.7857143
## 27      party     1    4 0.8571429
## 29      night     1    5 0.7857143
## 31  chocolate     1    4 0.7500000
## 39       time     1    4 0.6666667
## 43        car     1    4 0.5714286

stats <- keywords_rake(x = y, 
                      term = "token", group = c("doc_id", "paragraph_id", "sentence_id"),
                      relevant = y$upos %in% c("NOUN", "ADJ"),
                      ngram_max = 4)
head(subset(stats, freq > 3))

##          keyword ngram freq      rake
## 4    test phases     2    4 2.1111111
## 5  dream journal     2    5 2.0000000
## 18         sleep     1    4 0.7500000
## 20      patterns     1    7 0.3636364
## 21           lab     1    4 0.3333333
## 24          task     1    7 0.3000000

Option 5: Phrases

Next option is to extract phrases. These are defined as a sequence of Parts of Speech Tags. Common type of phrases are noun phrases or verb phrases. How does this work? Parts of Speech tags are recoded to one of the following one-letters: (A: adjective, C: coordinating conjuction, D: determiner, M: modifier of verb, N: noun or proper noun, P: preposition). Next you can define a regular expression to indicate a sequence of parts of speech tags which you want to extract from the text.

## Simple noun phrases (a adjective+noun, pre/postposition, optional determiner and another adjective+noun)
x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
stats <- keywords_phrases(x = x$phrase_tag, term = x$token, 
                         pattern = "(A|N)+N(P+D*(A|N)*N)*", 
                         is_regex = TRUE, ngram_max = 4, detailed = FALSE)
head(subset(stats, ngram > 2))

##               keyword ngram freq
## 3        dream that I     3   12
## 29       Last night I     3    2
## 40       last dream I     3    2
## 46      first dream I     3    2
## 49     my best friend     3    2
## 56 my extended family     3    2

y$phrase_tag <- as_phrasemachine(y$upos, type = "upos")
stats <- keywords_phrases(x = y$phrase_tag, term = y$token, 
                         pattern = "(A|N)+N(P+D*(A|N)*N)*", 
                         is_regex = TRUE, ngram_max = 4, detailed = FALSE)
head(subset(stats, ngram > 2))

##                     keyword ngram freq
## 13    specific sleep stages     3    2
## 19 female faces with sounds     4    2
## 23     first couple of days     4    2
## 27      several test phases     3    2
## 28             it more time     3    2
## 36          first task that     3    1

Option 6: Use dependency parsing output to get the nominal subject and the adjective of it

In the last option, we will show how to use the results of the dependency parsing. When you executed the annotation using udpipe, the dep_rel field indicates how words are related to one another. A token is related to the parent using token_id and head_token_id. The dep_rel field indicates how words are linked to one another. The type of relations are defined at http://universaldependencies.org/u/dep/index.html. For this exercise we are going to take the words which have as dependency relation nsubj indicating the nominal subject and we are adding to that the adjective which is changing the nominal subject.

In this way we can combine what are people talking about with the adjective they use when they talk about the subject.

stats <- merge(x, x, 
           by.x = c("doc_id", "paragraph_id", "sentence_id", "head_token_id"),
           by.y = c("doc_id", "paragraph_id", "sentence_id", "token_id"),
           all.x = TRUE, all.y = FALSE, 
           suffixes = c("", "_parent"), sort = FALSE)
stats <- subset(stats, dep_rel %in% "nsubj" & upos %in% c("NOUN") & upos_parent %in% c("ADJ"))
stats$term <- paste(stats$lemma_parent, stats$lemma, sep = " ")
stats <- txt_freq(stats$term)
head(stats)

##                key freq freq_pct
## 1       more dream    1        5
## 2      vivid dream    1        5
## 3        odd dream    1        5
## 4      scary dream    1        5
## 5       black wall    1        5
## 6 clear appearance    1        5

library(wordcloud)
wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, max.words = 100,
          random.order = FALSE, colors = brewer.pal(6, "Dark2"))

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## clear appearance could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## obvious answer could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## uncomfortable electrode could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## aggressive people could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## important pizza could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## evil scientist could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## kind dream could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## expensive pastry could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## confusing bit could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## heavy camera could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## close family could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## strict descent could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## involved graph could not be fit on page. It will not be plotted.

stats <- merge(y, y, 
           by.x = c("doc_id", "paragraph_id", "sentence_id", "head_token_id"),
           by.y = c("doc_id", "paragraph_id", "sentence_id", "token_id"),
           all.x = TRUE, all.y = FALSE, 
           suffixes = c("", "_parent"), sort = FALSE)
stats <- subset(stats, dep_rel %in% "nsubj" & upos %in% c("NOUN") & upos_parent %in% c("ADJ"))
stats$term <- paste(stats$lemma_parent, stats$lemma, sep = " ")
stats <- txt_freq(stats$term)
head(stats)

##                      key freq freq_pct
## 1            sure tasnks    1       20
## 2       frustrating task    1       20
## 3 uncomfortable headband    1       20
## 4          fine headband    1       20
## 5        amusing journal    1       20

library(wordcloud)
wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, max.words = 100,
          random.order = FALSE, colors = brewer.pal(6, "Dark2"))

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## uncomfortable headband could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = stats$key, freq = stats$freq, min.freq = 2, :
## amusing journal could not be fit on page. It will not be plotted.

Subject level analysis

based on this https://paldhous.github.io/NICAR/2019/r-text-analysis.html

library(dplyr) # SQL-style data processing
library(tidytext) # text analysis in R

## Warning: package 'tidytext' was built under R version 3.6.2

library(stringr) # working with text strings
library(lubridate) # working with times and dates

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:igraph':
## 
##     %--%

## The following object is masked from 'package:base':
## 
##     date

#library(jsonlite) # reading and writing JSON
library(tidyr) # data reshaping

STR_Dream_Log.per_code <- STR_Dream_Log %>% group_by(Id) %>% mutate(Log = paste0(Dream, collapse = " ")) %>% select(Id, Log) %>% unique()
subjects_data <-   STR_Dream_Log.per_code %>% left_join(STR_Description_Log) %>% gather(condition, text, Log:Description)

## Joining, by = "Id"

words <- subjects_data %>% unnest_tokens(text, text, token = "words")
words <- words %>%  anti_join(stop_words,by = c("text" = "word"))

words_count <- words %>%
  group_by(Id, condition, text) %>%
  count()

#Word frequency plot for words used more than twice, as an example
words_count$condition <- factor(words_count$condition,levels(factor(words_count$condition))[c(2,1)])
words$condition <- factor(words$condition,levels(factor(words$condition))[c(2,1)])


ggplot(words_count%>% filter(n>3), aes(x = reorder(text, n), y = n)) +
    geom_col() +
    labs(title="Dream log ",
         x = NULL,
         y = "Frequency") + 
    coord_flip() + facet_grid(Id ~ condition, scales = "free_y", space = "free_y") + theme(axis.title.y = element_text(size = rel(1.5), angle = 90))

for (val in unique(words_count$Id)){
  print(val)
  plot <- ggplot(filter(words_count, Id==val), aes(x = reorder(text, n), y = n)) +
    geom_col() +
    labs(title=val,
         x = NULL,
         y = "Frequency") + 
    coord_flip() + theme(axis.title.y = element_text(size = rel(1.5), angle = 90))
  
  print(plot)
}

## [1] "negative control"

## [1] "pilot101"

## [1] "pilot102"

## [1] "pilot103"

## [1] "positive control"

## [1] "str101"

## [1] "str102"

Sentiment analysis

nrc <- get_sentiments("nrc")

# join sentiments
sentiments <- words %>%
  inner_join(nrc, by = c("text" = "word"))

head(sentiments)

## # A tibble: 6 x 4
## # Groups:   Id [1]
##   Id               condition text   sentiment   
##   <chr>            <fct>     <chr>  <chr>       
## 1 negative control Log       church anticipation
## 2 negative control Log       church joy         
## 3 negative control Log       church positive    
## 4 negative control Log       church trust       
## 5 negative control Log       church anticipation
## 6 negative control Log       church joy

sentiments %>% ggplot(aes(x = sentiment, fill = sentiment)) + 
  geom_bar(aes(y = ..count..)) + facet_grid(Id ~ condition)

bing <- get_sentiments("bing")

# join sentiments
sentiments <- words %>%
  inner_join(bing, by = c("text" = "word"))

head(sentiments)

## # A tibble: 6 x 4
## # Groups:   Id [1]
##   Id               condition text     sentiment
##   <chr>            <fct>     <chr>    <chr>    
## 1 negative control Log       loud     negative 
## 2 negative control Log       scratch  negative 
## 3 negative control Log       jealous  negative 
## 4 negative control Log       worried  negative 
## 5 negative control Log       restless negative 
## 6 negative control Log       loved    positive

sentiments %>% ggplot(aes(x = sentiment, fill = sentiment)) + 
  geom_bar(aes(y = ..count..)) + facet_grid(Id ~ condition)