Preliminary descriptive and word clouds available here.

Load in libraries:

#load in initial necessary libraries

library(quanteda)
library(readr)
library(dplyr)
library(stringr)
library(tidytext)

Here I’m going to lay out the steps as I think they need to happen to think this all through [Based on Week 9 Topic Modeling Tutorial code]:

Load in data as vector (preserving rows) Select topic (ethics or Abstract) Clean Use quanteda functions for tokenizing Model

Load in and do an initial cleaning of data

library(tm)
library(lexicon)
library(wordcloud)
library(textstem)
library(tidyverse)
library(quanteda)
library(purrr)
library(rvest)
library(curl)
library(httr)
library(text2vec)
library(LDAvis)
library(caret)
library(randomForest)
library(caTools)
library(ldatuning)

Load in data from csv

setwd("~/DACCS R/Text as Data/Final Project TaD R")
library(readr)

authors_abstracts <- read_csv("authors_abstracts.csv", show_col_types = FALSE)

Create character vector

Going to try something:

abstractsV <- authors_abstracts$Abstract
#abstractsV <- pull(abstractsV$Abstract)
class(abstractsV)

[1] "character"

#abstractsV

[NOTE - THIS FAILS - IT CONDENSES IT ALL DOWN INTO ONE VECTOR RATHER THAN KEEPING THE SEPARATE ELEMENTS - okay - did this above and I think it worked?]

#abstracts.df <- authors_abstracts %>% select(Abstract)
#abstracts.df <- as.character(abstracts.df)

Data Cleaning

remove numbers

abstractsV <- str_remove_all(abstractsV, "[:digit:]")

remove urls

#library(stringr)
url_regex <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

abstractsV <- str_remove_all(abstractsV, url_regex)

remove emails? [Don’t know how to do this! Update: found code online.]

#Function containing regex pattern to remove email id
RemoveEmail <- function(x) {
  require(stringr)
  str_replace_all(x,"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+", "")
} 

abstractsV <- RemoveEmail(abstractsV)

convert to lowercase

abstractsV <- str_to_lower(abstractsV, locale = "en")

convert to dataframe and add unique document ID

abstracts.df <- as.data.frame(abstractsV)
abstracts.df <- rename(abstracts.df, text = abstractsV)
abstracts.df$ID <- 1:nrow(abstracts.df)
dim(abstracts.df)

[1] 128   2

Tokenize

tokens <- abstracts.df %>% 
  dplyr::select(text) %>%
  unnest_tokens(word, text)

dim(tokens)

[1] 18228     1

Plot Top 30

library(ggplot2)
# plot the top 30 words
tokens %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
      labs(x = "Count",
      y = "Unique words",
      title = "Count of Unique Words Found in Abstracts")

STOP WORDS

data("stop_words")
# how many words do you have including the stop words?
nrow(tokens)

[1] 18228

tokensClean <- tokens %>%
  anti_join(stop_words)

# how many words after removing the stop words?
nrow(tokensClean)

[1] 9826

Replot top 30

# plot the top 30 words -- notice any issues?
tokensClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
      labs(x = "Count",
      y = "Unique words",
      title = "Count of Unique Words Found in Abstracts (Stop Words Removed)")

A quick look suggests we might want to find a way to stem words like experiment/experiments and combine them (this may be beyond me technically at this point.)

Iteration

create iterator

# Iterates over each token
it <- itoken(as.list(tokensClean), ids = abstracts.df$ID, progressbar = FALSE)

#Note: I am using the cleaned tokens list for this iterator - I think that's correct

# Prints iterator
it

<itoken>
  Inherits from: <CallbackIterator>
  Public:
    callback: function (x) 
    clone: function (deep = FALSE) 
    initialize: function (x, callback = identity) 
    is_complete: active binding
    length: active binding
    move_cursor: function () 
    nextElem: function () 
    x: GenericIterator, iterator, R6

Vocabulary-Based Vectorization

# Built the vocabulary
v <- create_vocabulary(it)

# Print vocabulary
#v
class(v)

[1] "text2vec_vocabulary" "data.frame"

This is not giving me the correct doc ID counts:

corpus <- corpus(abstractsV)
tokens2 <- quanteda::tokens(corpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
tokens2 <- tokens_select(tokens2, pattern = c(stopwords("en")), selection = "remove")
#tokens2 <- tokens_select(tokens2, pattern = "[:alnum:] ", selection = "remove", valuetype = c("regex") )

Recreate iterator:

# Iterates over each token
it2 <- itoken(as.list(tokens2), ids = abstracts.df$ID, progressbar = FALSE)

#Note: I am using the cleaned tokens list for this iterator - I think that's correct

# Prints iterator
it2

<itoken>
  Inherits from: <CallbackIterator>
  Public:
    callback: function (x) 
    clone: function (deep = FALSE) 
    initialize: function (x, callback = identity) 
    is_complete: active binding
    length: active binding
    move_cursor: function () 
    nextElem: function () 
    x: GenericIterator, iterator, R6

Vocabulary-Based Vectorization

# Built the vocabulary
v2 <- create_vocabulary(it2)

# Print vocabulary
#v
class(v2)

[1] "text2vec_vocabulary" "data.frame"

Okay, this gives us multiple document counts - note to self, USE THE QUANTEDA!

I am not going to prune any terms at this point because I’ve already got not a lot of data.

Vectorize Vocabulary Words (v2)

# Creates a closure that helps transform list of tokens into vector space
vectorizer <- vocab_vectorizer(v2)

Create DTM:

# Creates document term matrix
dtm <- create_dtm(it2, vectorizer, type = "dgTMatrix")

From here, we create our LDA model:

# Creates new LDA model
lda_model <- LDA$new(n_topics = 10, doc_topic_prior = 0.1, topic_word_prior = 0.01)

# Print other methods for LDA
lda_model

<WarpLDA>
  Inherits from: <LDA>
  Public:
    clone: function (deep = FALSE) 
    components: active binding
    fit_transform: function (x, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 10, 
    get_top_words: function (n = 10, topic_number = 1L:private$n_topics, lambda = 1) 
    initialize: function (n_topics = 10L, doc_topic_prior = 50/n_topics, topic_word_prior = 1/n_topics, 
    plot: function (lambda.step = 0.1, reorder.topics = FALSE, doc_len = private$doc_len, 
    topic_word_distribution: active binding
    transform: function (x, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 10, 
  Private:
    calc_pseudo_loglikelihood: function (ptr = private$ptr) 
    check_convert_input: function (x) 
    components_: NULL
    doc_len: NULL
    doc_topic_distribution: function () 
    doc_topic_distribution_with_prior: function () 
    doc_topic_matrix: NULL
    doc_topic_prior: 0.1
    fit_transform_internal: function (model_ptr, n_iter, convergence_tol, n_check_convergence, 
    get_c_all: function () 
    get_c_all_local: function () 
    get_doc_topic_matrix: function (prt, nr) 
    get_topic_word_count: function () 
    init_model_dtm: function (x, ptr = private$ptr) 
    internal_matrix_formats: list
    is_initialized: FALSE
    n_iter_inference: 10
    n_topics: 10
    ptr: NULL
    reset_c_local: function () 
    run_iter_doc: function (update_topics = TRUE, ptr = private$ptr) 
    run_iter_word: function (update_topics = TRUE, ptr = private$ptr) 
    seeds: 147355803.362764 883971418.676737
    set_c_all: function (x) 
    set_internal_matrix_formats: function (sparse = NULL, dense = NULL) 
    topic_word_distribution_with_prior: function () 
    topic_word_prior: 0.01
    transform_internal: function (x, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 10, 
    vocabulary: NULL

Now we fit our model:

# Fitting model
doc_topic_distr <- 
  lda_model$fit_transform(x = dtm, n_iter = 1000, 
                          convergence_tol = 0.001, n_check_convergence = 25, 
                          progressbar = FALSE)

INFO  [17:23:52.293] early stopping at 100 iteration 
INFO  [17:23:52.878] early stopping at 75 iteration

Topic Distribution:

barplot(doc_topic_distr[1, ], xlab = "topic", 
        ylab = "proportion", ylim = c(0, 1), 
        names.arg = 1:ncol(doc_topic_distr))

Describing Topics with Top Words:

#with a lambda of 1

# Get top n words for topics 1, 5, and 10
set.seed(1010)
lda_model$get_top_words(n = 10, topic_number = c(1L, 5L, 10L), lambda = 1)

      [,1]          [,2]           [,3]        
 [1,] "campaign"    "voters"       "field"     
 [2,] "information" "election"     "results"   
 [3,] "behavior"    "turnout"      "effects"   
 [4,] "effect"      "vote"         "first"     
 [5,] "electoral"   "voter"        "policy"    
 [6,] "experiment"  "voting"       "suggest"   
 [7,] "conducted"   "effects"      "compliance"
 [8,] "elections"   "candidates"   "one"       
 [9,] "campaigns"   "mobilization" "whether"   
[10,] "finance"     "increase"     "influence"

With a lambda of .2

# Get top n words for topics 1, 5, and 10
set.seed(1010)
lda_model$get_top_words(n = 10, topic_number = c(1L, 5L, 10L), lambda = .2)

      [,1]           [,2]           [,3]          
 [1,] "campaign"     "election"     "first"       
 [2,] "finance"      "voters"       "compliance"  
 [3,] "us"           "turnout"      "contribute"  
 [4,] "campaigns"    "vote"         "network"     
 [5,] "transparency" "voter"        "participants"
 [6,] "audits"       "candidates"   "positive"    
 [7,] "effect"       "mobilization" "one"         
 [8,] "revealed"     "candidate"    "second"      
 [9,] "behavior"     "partisan"     "average"     
[10,] "negative"     "increase"     "conflict"

Now I’m going to try to visualize these topics:

{r} #system(‘npm install -g localtunnel’) #this fails but I don’t think I actually need it?

library(servr) library(LDAvis)


_run model_

{r}
# Creating plot (Ignore the link)
lda_model$plot(open.browser = TRUE)

So k=10 gives us relatively distinct models, but I’d also like to run a k=5, lambda=.2, and see what we have.

# Creates new LDA model
lda_model1 <- LDA$new(n_topics = 5, doc_topic_prior = 0.1, topic_word_prior = 0.01)

# Print other methods for LDA
lda_model1

<WarpLDA>
  Inherits from: <LDA>
  Public:
    clone: function (deep = FALSE) 
    components: active binding
    fit_transform: function (x, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 10, 
    get_top_words: function (n = 10, topic_number = 1L:private$n_topics, lambda = 1) 
    initialize: function (n_topics = 10L, doc_topic_prior = 50/n_topics, topic_word_prior = 1/n_topics, 
    plot: function (lambda.step = 0.1, reorder.topics = FALSE, doc_len = private$doc_len, 
    topic_word_distribution: active binding
    transform: function (x, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 10, 
  Private:
    calc_pseudo_loglikelihood: function (ptr = private$ptr) 
    check_convert_input: function (x) 
    components_: NULL
    doc_len: NULL
    doc_topic_distribution: function () 
    doc_topic_distribution_with_prior: function () 
    doc_topic_matrix: NULL
    doc_topic_prior: 0.1
    fit_transform_internal: function (model_ptr, n_iter, convergence_tol, n_check_convergence, 
    get_c_all: function () 
    get_c_all_local: function () 
    get_doc_topic_matrix: function (prt, nr) 
    get_topic_word_count: function () 
    init_model_dtm: function (x, ptr = private$ptr) 
    internal_matrix_formats: list
    is_initialized: FALSE
    n_iter_inference: 10
    n_topics: 5
    ptr: NULL
    reset_c_local: function () 
    run_iter_doc: function (update_topics = TRUE, ptr = private$ptr) 
    run_iter_word: function (update_topics = TRUE, ptr = private$ptr) 
    seeds: 1186111629.39535 410701117.617505
    set_c_all: function (x) 
    set_internal_matrix_formats: function (sparse = NULL, dense = NULL) 
    topic_word_distribution_with_prior: function () 
    topic_word_prior: 0.01
    transform_internal: function (x, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 10, 
    vocabulary: NULL

Now we fit our model k=5 model:

# Fitting model
doc_topic_distr1 <- 
  lda_model1$fit_transform(x = dtm, n_iter = 1000, 
                          convergence_tol = 0.001, n_check_convergence = 25, 
                          progressbar = FALSE)

INFO  [17:23:53.811] early stopping at 100 iteration 
INFO  [17:23:54.182] early stopping at 75 iteration

Topic Distribution:

barplot(doc_topic_distr1[1, ], xlab = "topic", 
        ylab = "proportion", ylim = c(0, 1), 
        names.arg = 1:ncol(doc_topic_distr1))

Describing Topics with Top Words:

With a lambda of .2

# Get top n words for topics 1, 5, and 10
set.seed(1010)
lda_model1$get_top_words(n = 10, topic_number = c(1L, 2L, 5L, 4L, 5L), lambda = .2)

      [,1]             [,2]             [,3]          
 [1,] "social"         "local"          "public"      
 [2,] "also"           "elections"      "may"         
 [3,] "campaigns"      "increased"      "messages"    
 [4,] "experiments"    "women"          "candidate"   
 [5,] "discrimination" "partisan"       "suggest"     
 [6,] "treatment"      "party"          "experimental"
 [7,] "randomized"     "increase"       "first"       
 [8,] "impact"         "politicians"    "state"       
 [9,] "cooperation"    "responsiveness" "three"       
[10,] "networks"       "incumbent"      "research"    
      [,4]            [,5]          
 [1,] "effects"       "public"      
 [2,] "policy"        "may"         
 [3,] "whether"       "messages"    
 [4,] "participation" "candidate"   
 [5,] "officials"     "suggest"     
 [6,] "show"          "experimental"
 [7,] "often"         "first"       
 [8,] "however"       "state"       
 [9,] "performance"   "three"       
[10,] "states"        "research"

Now I’m going to try to visualize these topics:

{r} #system(‘npm install -g localtunnel’) #this fails but I don’t think I actually need it?

library(servr) library(LDAvis)



{r}
# Creating plot (Ignore the link)
lda_model1$plot(open.browser = TRUE)

Going to stop the server for a bit while I move on:

{r} servr::daemon_stop(3)


[Note: I do not know what this code means and Google was not helpful.]

### Repeating All Steps with the Ethics Paragraphs

_Create character vector_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='va'>authors_abstracts</span><span class='op'>$</span><span class='va'>Paragraph</span>
<span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/r/base/as.data.frame.html'>as.data.frame</a></span><span class='op'>(</span><span class='va'>ethicsV</span><span class='op'>)</span>
<span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='va'>ethicsV</span><span class='op'>[</span><span class='fu'><a href='https://rdrr.io/r/stats/complete.cases.html'>complete.cases</a></span><span class='op'>(</span><span class='va'>ethicsV</span><span class='op'>)</span>,<span class='op'>]</span>  <span class='co'>#removing NA rows where ethics not mentioned</span>

<span class='fu'><a href='https://rdrr.io/r/base/class.html'>class</a></span><span class='op'>(</span><span class='va'>ethicsV</span><span class='op'>)</span>
</code></pre></div>

[1] “character”


</div>


_Data Cleaning_

+   remove numbers

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://stringr.tidyverse.org/reference/str_remove.html'>str_remove_all</a></span><span class='op'>(</span><span class='va'>ethicsV</span>, <span class='st'>"[:digit:]"</span><span class='op'>)</span>
</code></pre></div>

</div>


+   remove urls

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'>#library(stringr)</span>
<span class='va'>url_regex</span> <span class='op'>&lt;-</span> <span class='st'>"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&amp;+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"</span>

<span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://stringr.tidyverse.org/reference/str_remove.html'>str_remove_all</a></span><span class='op'>(</span><span class='va'>ethicsV</span>, <span class='va'>url_regex</span><span class='op'>)</span>
</code></pre></div>

</div>


+   remove emails (not 100% sure this works)

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'>#Function containing regex pattern to remove email id</span>
<span class='va'>RemoveEmail</span> <span class='op'>&lt;-</span> <span class='kw'>function</span><span class='op'>(</span><span class='va'>x</span><span class='op'>)</span> <span class='op'>{</span>
  <span class='kw'><a href='https://rdrr.io/r/base/library.html'>require</a></span><span class='op'>(</span><span class='va'><a href='http://stringr.tidyverse.org'>stringr</a></span><span class='op'>)</span>
  <span class='fu'><a href='https://stringr.tidyverse.org/reference/str_replace.html'>str_replace_all</a></span><span class='op'>(</span><span class='va'>x</span>,<span class='st'>"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"</span>, <span class='st'>""</span><span class='op'>)</span>
<span class='op'>}</span> 

<span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='fu'>RemoveEmail</span><span class='op'>(</span><span class='va'>ethicsV</span><span class='op'>)</span>
</code></pre></div>

</div>


+   convert to lowercase

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>ethicsV</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://stringr.tidyverse.org/reference/case.html'>str_to_lower</a></span><span class='op'>(</span><span class='va'>ethicsV</span>, locale <span class='op'>=</span> <span class='st'>"en"</span><span class='op'>)</span>
</code></pre></div>

</div>


_convert to dataframe and add unique document ID_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>ethics.df</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/r/base/as.data.frame.html'>as.data.frame</a></span><span class='op'>(</span><span class='va'>ethicsV</span><span class='op'>)</span>
<span class='va'>ethics.df</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://dplyr.tidyverse.org/reference/rename.html'>rename</a></span><span class='op'>(</span><span class='va'>ethics.df</span>, text <span class='op'>=</span> <span class='va'>ethicsV</span><span class='op'>)</span>
<span class='va'>ethics.df</span><span class='op'>$</span><span class='va'>ID</span> <span class='op'>&lt;-</span> <span class='fl'>1</span><span class='op'>:</span><span class='fu'><a href='https://rdrr.io/r/base/nrow.html'>nrow</a></span><span class='op'>(</span><span class='va'>ethics.df</span><span class='op'>)</span>
<span class='fu'><a href='https://rdrr.io/r/base/dim.html'>dim</a></span><span class='op'>(</span><span class='va'>ethics.df</span><span class='op'>)</span>
</code></pre></div>

[1] 54 2


</div>


### Tokenize

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>tokensE</span> <span class='op'>&lt;-</span> <span class='va'>ethics.df</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span> 
  <span class='fu'>dplyr</span><span class='fu'>::</span><span class='fu'><a href='https://dplyr.tidyverse.org/reference/select.html'>select</a></span><span class='op'>(</span><span class='va'>text</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://rdrr.io/pkg/tidytext/man/unnest_tokens.html'>unnest_tokens</a></span><span class='op'>(</span><span class='va'>word</span>, <span class='va'>text</span><span class='op'>)</span>

<span class='fu'><a href='https://rdrr.io/r/base/dim.html'>dim</a></span><span class='op'>(</span><span class='va'>tokensE</span><span class='op'>)</span>
</code></pre></div>

[1] 8077 1


</div>


Plot Top 30

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='kw'><a href='https://rdrr.io/r/base/library.html'>library</a></span><span class='op'>(</span><span class='va'><a href='https://ggplot2.tidyverse.org'>ggplot2</a></span><span class='op'>)</span>
<span class='co'># plot the top 30 words</span>
<span class='va'>tokensE</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'>dplyr</span><span class='fu'>::</span><span class='fu'><a href='https://dplyr.tidyverse.org/reference/count.html'>count</a></span><span class='op'>(</span><span class='va'>word</span>, sort <span class='op'>=</span> <span class='cn'>TRUE</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/top_n.html'>top_n</a></span><span class='op'>(</span><span class='fl'>30</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/mutate.html'>mutate</a></span><span class='op'>(</span>word <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/stats/reorder.factor.html'>reorder</a></span><span class='op'>(</span><span class='va'>word</span>, <span class='va'>n</span><span class='op'>)</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/ggplot.html'>ggplot</a></span><span class='op'>(</span><span class='fu'><a href='https://ggplot2.tidyverse.org/reference/aes.html'>aes</a></span><span class='op'>(</span>x <span class='op'>=</span> <span class='va'>word</span>, y <span class='op'>=</span> <span class='va'>n</span><span class='op'>)</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/geom_bar.html'>geom_col</a></span><span class='op'>(</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/labs.html'>xlab</a></span><span class='op'>(</span><span class='cn'>NULL</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/coord_flip.html'>coord_flip</a></span><span class='op'>(</span><span class='op'>)</span> <span class='op'>+</span>
      <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/labs.html'>labs</a></span><span class='op'>(</span>x <span class='op'>=</span> <span class='st'>"Count"</span>,
      y <span class='op'>=</span> <span class='st'>"Unique words"</span>,
      title <span class='op'>=</span> <span class='st'>"Count of Unique Words Found in Ethics Paragraphs"</span><span class='op'>)</span>
</code></pre></div>
<img src="FINAL-LDA-MODELS-I-SWEAR-TO-GOD_files/figure-html5/unnamed-chunk-38-1.png" width="624" />

</div>


_STOP WORDS_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='fu'><a href='https://rdrr.io/r/utils/data.html'>data</a></span><span class='op'>(</span><span class='st'>"stop_words"</span><span class='op'>)</span>
<span class='co'># how many words do you have including the stop words?</span>
<span class='fu'><a href='https://rdrr.io/r/base/nrow.html'>nrow</a></span><span class='op'>(</span><span class='va'>tokensE</span><span class='op'>)</span>
</code></pre></div>

[1] 8077


<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>tokensEclean</span> <span class='op'>&lt;-</span> <span class='va'>tokensE</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/filter-joins.html'>anti_join</a></span><span class='op'>(</span><span class='va'>stop_words</span><span class='op'>)</span>

<span class='co'># how many words after removing the stop words?</span>
<span class='fu'><a href='https://rdrr.io/r/base/nrow.html'>nrow</a></span><span class='op'>(</span><span class='va'>tokensEclean</span><span class='op'>)</span>
</code></pre></div>

[1] 4045


</div>


Replot top 30

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># plot the top 30 words -- notice any issues?</span>
<span class='va'>tokensEclean</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'>dplyr</span><span class='fu'>::</span><span class='fu'><a href='https://dplyr.tidyverse.org/reference/count.html'>count</a></span><span class='op'>(</span><span class='va'>word</span>, sort <span class='op'>=</span> <span class='cn'>TRUE</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/top_n.html'>top_n</a></span><span class='op'>(</span><span class='fl'>30</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/mutate.html'>mutate</a></span><span class='op'>(</span>word <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/stats/reorder.factor.html'>reorder</a></span><span class='op'>(</span><span class='va'>word</span>, <span class='va'>n</span><span class='op'>)</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/ggplot.html'>ggplot</a></span><span class='op'>(</span><span class='fu'><a href='https://ggplot2.tidyverse.org/reference/aes.html'>aes</a></span><span class='op'>(</span>x <span class='op'>=</span> <span class='va'>word</span>, y <span class='op'>=</span> <span class='va'>n</span><span class='op'>)</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/geom_bar.html'>geom_col</a></span><span class='op'>(</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/labs.html'>xlab</a></span><span class='op'>(</span><span class='cn'>NULL</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/coord_flip.html'>coord_flip</a></span><span class='op'>(</span><span class='op'>)</span> <span class='op'>+</span>
      <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/labs.html'>labs</a></span><span class='op'>(</span>x <span class='op'>=</span> <span class='st'>"Count"</span>,
      y <span class='op'>=</span> <span class='st'>"Unique words"</span>,
      title <span class='op'>=</span> <span class='st'>"Count of Unique Words Found in Ethics Paragraphs (Stop Words Removed)"</span><span class='op'>)</span>
</code></pre></div>
<img src="FINAL-LDA-MODELS-I-SWEAR-TO-GOD_files/figure-html5/unnamed-chunk-40-1.png" width="624" />

</div>


Here I'm going to remove si manually, as I don't think it adds value.

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='fu'><a href='https://rdrr.io/r/base/nrow.html'>nrow</a></span><span class='op'>(</span><span class='va'>tokensEclean</span><span class='op'>)</span>
</code></pre></div>

[1] 4045


<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>tokensEclean</span> <span class='op'>&lt;-</span> <span class='va'>tokensEclean</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span> <span class='fu'><a href='https://dplyr.tidyverse.org/reference/filter.html'>filter</a></span><span class='op'>(</span><span class='op'>!</span><span class='va'>word</span> <span class='op'>==</span> <span class='st'>"si"</span><span class='op'>)</span>
<span class='fu'><a href='https://rdrr.io/r/base/nrow.html'>nrow</a></span><span class='op'>(</span><span class='va'>tokensEclean</span><span class='op'>)</span>
</code></pre></div>

[1] 4032


</div>


_And, final replot - top 30_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># plot the top 30 words -- notice any issues?</span>
<span class='va'>tokensEclean</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'>dplyr</span><span class='fu'>::</span><span class='fu'><a href='https://dplyr.tidyverse.org/reference/count.html'>count</a></span><span class='op'>(</span><span class='va'>word</span>, sort <span class='op'>=</span> <span class='cn'>TRUE</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/top_n.html'>top_n</a></span><span class='op'>(</span><span class='fl'>30</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://dplyr.tidyverse.org/reference/mutate.html'>mutate</a></span><span class='op'>(</span>word <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/stats/reorder.factor.html'>reorder</a></span><span class='op'>(</span><span class='va'>word</span>, <span class='va'>n</span><span class='op'>)</span><span class='op'>)</span> <span class='op'><a href='https://stringr.tidyverse.org/reference/pipe.html'>%&gt;%</a></span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/ggplot.html'>ggplot</a></span><span class='op'>(</span><span class='fu'><a href='https://ggplot2.tidyverse.org/reference/aes.html'>aes</a></span><span class='op'>(</span>x <span class='op'>=</span> <span class='va'>word</span>, y <span class='op'>=</span> <span class='va'>n</span><span class='op'>)</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/geom_bar.html'>geom_col</a></span><span class='op'>(</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/labs.html'>xlab</a></span><span class='op'>(</span><span class='cn'>NULL</span><span class='op'>)</span> <span class='op'>+</span>
  <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/coord_flip.html'>coord_flip</a></span><span class='op'>(</span><span class='op'>)</span> <span class='op'>+</span>
      <span class='fu'><a href='https://ggplot2.tidyverse.org/reference/labs.html'>labs</a></span><span class='op'>(</span>x <span class='op'>=</span> <span class='st'>"Count"</span>,
      y <span class='op'>=</span> <span class='st'>"Unique words"</span>,
      title <span class='op'>=</span> <span class='st'>"Count of Unique Words Found in Ethics Paragraphs (Stop Words Removed)"</span><span class='op'>)</span>
</code></pre></div>
<img src="FINAL-LDA-MODELS-I-SWEAR-TO-GOD_files/figure-html5/unnamed-chunk-42-1.png" width="624" />

</div>


### Iteration

_create iterator_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># Iterates over each token</span>
<span class='va'>itE</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/pkg/text2vec/man/itoken.html'>itoken</a></span><span class='op'>(</span><span class='fu'><a href='https://rdrr.io/r/base/list.html'>as.list</a></span><span class='op'>(</span><span class='va'>tokensEclean</span><span class='op'>)</span>, ids <span class='op'>=</span> <span class='va'>ethics.df</span><span class='op'>$</span><span class='va'>ID</span>, progressbar <span class='op'>=</span> <span class='cn'>FALSE</span><span class='op'>)</span>

<span class='co'>#Note: I am using the cleaned tokens list for this iterator - I think that's correct</span>

<span class='co'># Prints iterator</span>
<span class='va'>itE</span>
</code></pre></div>

Inherits from: Public: callback: function (x) clone: function (deep = FALSE) initialize: function (x, callback = identity) is_complete: active binding length: active binding move_cursor: function () nextElem: function () x: GenericIterator, iterator, R6


</div>


### Vocabulary-Based Vectorization

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># Built the vocabulary</span>
<span class='va'>vE</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/pkg/text2vec/man/create_vocabulary.html'>create_vocabulary</a></span><span class='op'>(</span><span class='va'>itE</span><span class='op'>)</span>

<span class='co'># Print vocabulary</span>
<span class='co'>#v</span>
<span class='fu'><a href='https://rdrr.io/r/base/class.html'>class</a></span><span class='op'>(</span><span class='va'>vE</span><span class='op'>)</span>
</code></pre></div>

[1] “text2vec_vocabulary” “data.frame”


</div>


This is not giving me the correct doc ID counts:

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='va'>corpusE</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/corpus.html'>corpus</a></span><span class='op'>(</span><span class='va'>ethicsV</span><span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'>quanteda</span><span class='fu'>::</span><span class='fu'><a href='https://quanteda.io/reference/tokens.html'>tokens</a></span><span class='op'>(</span><span class='va'>corpusE</span>, remove_punct <span class='op'>=</span> <span class='cn'>TRUE</span>, remove_numbers <span class='op'>=</span> <span class='cn'>TRUE</span>, remove_symbols <span class='op'>=</span> <span class='cn'>TRUE</span><span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_select</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='fu'><a href='https://rdrr.io/r/base/c.html'>c</a></span><span class='op'>(</span><span class='fu'><a href='https://rdrr.io/pkg/tm/man/stopwords.html'>stopwords</a></span><span class='op'>(</span><span class='st'>"en"</span><span class='op'>)</span><span class='op'>)</span>, selection <span class='op'>=</span> <span class='st'>"remove"</span><span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_remove</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='st'>"si"</span>, valuetype <span class='op'>=</span> <span class='st'>"fixed"</span> <span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_remove</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='st'>"e.g"</span>, valuetype <span class='op'>=</span> <span class='st'>"fixed"</span> <span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_remove</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='st'>"however"</span>, valuetype <span class='op'>=</span> <span class='st'>"fixed"</span> <span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_remove</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='st'>"s"</span>, valuetype <span class='op'>=</span> <span class='st'>"fixed"</span> <span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_remove</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='st'>"b"</span>, valuetype <span class='op'>=</span> <span class='st'>"fixed"</span> <span class='op'>)</span>
<span class='va'>tokensE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://quanteda.io/reference/tokens_select.html'>tokens_remove</a></span><span class='op'>(</span><span class='va'>tokensE2</span>, pattern <span class='op'>=</span> <span class='st'>"c"</span>, valuetype <span class='op'>=</span> <span class='st'>"fixed"</span> <span class='op'>)</span>
<span class='co'>#additional data cleaning from topic models</span>
</code></pre></div>

</div>


REMINDER TO SELF - USE QUANTEDA FOR THIS!!!

Recreate iterator:

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># Iterates over each token</span>
<span class='va'>itE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/pkg/text2vec/man/itoken.html'>itoken</a></span><span class='op'>(</span><span class='fu'><a href='https://rdrr.io/r/base/list.html'>as.list</a></span><span class='op'>(</span><span class='va'>tokensE2</span><span class='op'>)</span>, ids <span class='op'>=</span> <span class='va'>ethics.df</span><span class='op'>$</span><span class='va'>ID</span>, progressbar <span class='op'>=</span> <span class='cn'>FALSE</span><span class='op'>)</span>

<span class='co'>#Note: I am using the cleaned tokens list for this iterator - I think that's correct</span>

<span class='co'># Prints iterator</span>
<span class='va'>itE2</span>
</code></pre></div>


</div>


### Vocabulary-Based Vectorization

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># Built the vocabulary</span>
<span class='va'>vE2</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/pkg/text2vec/man/create_vocabulary.html'>create_vocabulary</a></span><span class='op'>(</span><span class='va'>itE2</span><span class='op'>)</span>

<span class='co'># Print vocabulary</span>
<span class='co'>#v</span>
<span class='fu'><a href='https://rdrr.io/r/base/class.html'>class</a></span><span class='op'>(</span><span class='va'>vE2</span><span class='op'>)</span>
</code></pre></div>

[1] “text2vec_vocabulary” “data.frame”


</div>


Okay, this gives us multiple document counts - note to self, USE THE QUANTEDA!

I am not going to prune any terms at this point because I've already got not a lot of data.

_Vectorize Vocabulary Words (v2)_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># Creates a closure that helps transform list of tokens into vector space</span>
<span class='va'>vectorizer</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/pkg/text2vec/man/vectorizers.html'>vocab_vectorizer</a></span><span class='op'>(</span><span class='va'>vE2</span><span class='op'>)</span>
</code></pre></div>

</div>


_Create DTM:_

<div class="layout-chunk" data-layout="l-body">
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class='co'># Creates document term matrix</span>
<span class='va'>dtm</span> <span class='op'>&lt;-</span> <span class='fu'><a href='https://rdrr.io/pkg/text2vec/man/create_dtm.html'>create_dtm</a></span><span class='op'>(</span><span class='va'>itE2</span>, <span class='va'>vectorizer</span>, type <span class='op'>=</span> <span class='st'>"dgTMatrix"</span><span class='op'>)</span>
</code></pre></div>

</div>


[NOTE: for the purposes of knitting and publishing this, I'm going to take these out as active code blocks]

_From here, we create our LDA model, k=10:_

{r}
# Creates new LDA model
lda_modelE <- LDA$new(n_topics = 10, doc_topic_prior = 0.1, topic_word_prior = 0.01)

# Print other methods for LDA
lda_modelE

Now we fit our model:

{r} # Fitting model doc_topic_distrE <- lda_modelE$fit_transform(x = dtm, n_iter = 1000, convergence_tol = 0.001, n_check_convergence = 25, progressbar = FALSE)


_Topic Distribution:_

{r}
barplot(doc_topic_distrE[1, ], xlab = "topic", 
        ylab = "proportion", ylim = c(0, 1), 
        names.arg = 1:ncol(doc_topic_distrE))

Describing Topics with Top Words:

{r} #with a lambda of 1

Get top n words for topics 1, 5, and 10

set.seed(1010) lda_modelE$get_top_words(n = 10, topic_number = c(1L, 5L, 10L), lambda = 1)


With a lambda of .2

{r}
# Get top n words for topics 1, 5, and 10
set.seed(1010)
lda_modelE$get_top_words(n = 10, topic_number = c(1L, 5L, 10L), lambda = .2)

Now I’m going to try to visualize these topics:

{r} #system(‘npm install -g localtunnel’) #this fails but I don’t think I actually need it?

library(servr) library(LDAvis)


_run model_

{r}
# Creating plot (Ignore the link)
lda_modelE$plot(open.browser = TRUE)

So k=10 gives us relatively distinct models, but I’d also like to run a k=5, lambda=.2, and see what we have.

{r} # Creates new LDA model lda_modelE1 <- LDA$new(n_topics = 5, doc_topic_prior = 0.1, topic_word_prior = 0.01)

Print other methods for LDA

lda_modelE1


_Now we fit our model k=5 model:_

{r}
# Fitting model
doc_topic_distrE1 <- 
  lda_modelE1$fit_transform(x = dtm, n_iter = 1000, 
                          convergence_tol = 0.001, n_check_convergence = 25, 
                          progressbar = FALSE)

Topic Distribution:

{r} barplot(doc_topic_distrE1[1, ], xlab = “topic”, ylab = “proportion”, ylim = c(0, 1), names.arg = 1:ncol(doc_topic_distrE1))


_Describing Topics with Top Words:_

With a lambda of .2

{r}
# Get top n words for topics 1, 5, and 10
set.seed(1010)
lda_modelE1$get_top_words(n = 10, topic_number = c(1L, 2L, 5L, 4L, 5L), lambda = .2)

As we can see, we have some words that are messy in here: “e.g.” “si” “however” “may”

Now I’m going to try to visualize these topics:

{r} #system(‘npm install -g localtunnel’) #this fails but I don’t think I actually need it?

library(servr) library(LDAvis)



{r}
# Creating plot (Ignore the link)
lda_modelE1$plot(open.browser = TRUE)

Now there’s one last thing I want to do. I know from my review of the articles where ethic* is mentioned that not all of the mentions are what I would call “substantive” - i.e, actually discussing the ethical considertation in designing human subjects research. In fact, I know that only 44.4% of the articles that contain an ethic* term are actually substantive discussions of the relevant interests. Therefore, I want to drill down to just those paragraphs, and rerun my wordclouds and top 30 word plots. I don’t think I’ll have enough data to actually run models, but that is something that could easily be done. I’m going to do this in a separate document, as this has gotten unweildy.

[Note: I’m including this in here because I got it to run, but I am not sure how to interpret.]

Now we’ll try some STM!

Required Libraries

{r} library(stm) library(quanteda)


We've already cleaned our original text file, so we'll continue to use that.

{r}
myDfm <- dfm(abstractsV, tolower=TRUE,
  remove = stopwords('en'), 
  remove_punct = TRUE
  )

dim(myDfm)

Correlated Topic Model

{r} cor_topic_model <- stm(myDfm, K = 5, verbose = FALSE, init.type = “Spectral”)


Labels

{r}
labelTopics(cor_topic_model)

Find Thoughts

{r} findThoughts(cor_topic_model, texts = authors_abstracts$Abstract, topics = c(1:5), n = 1)

```{.r .distill-force-highlighting-css}

Final LDA Model for TaD Final Project: Ethics and Abstracts

Load in libraries:

Load in and do an initial cleaning of data

Tokenize

Iteration

Vocabulary-Based Vectorization

Vocabulary-Based Vectorization

Get top n words for topics 1, 5, and 10

Print other methods for LDA