Preparation and Preprocessing

library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
# Use the built-in inaugural-address corpus that ships with the package.
# Tokenise first and then build the document-feature matrix: calling dfm()
# directly on a corpus, and passing it 'stem', are both deprecated.

dtm <- dfm(tokens(data_corpus_inaugural))

# Rule-based approach: specify our own dictionary.
# Each entry maps a category name to the patterns counted under it.
myDict <- dictionary(list(terror = c("terror*"),
                          economy = c("job*", "business*", "econom*")))
# This means that any word starting with 'terror' (e.g. "terrorist") is
# counted under the category 'terror'.

# Apply myDict to dtm.
# nomatch => features that match no dictionary entry are still counted,
# under the "_unmatched" column.
dict_dtm <- dfm_lookup(dtm, myDict, nomatch = "_unmatched")

Supervised Models

library(quanteda.textmodels)

# Basic pre-processing.
# dfm() no longer accepts a corpus or the 'stem' / 'remove' / tokens()
# arguments without deprecation warnings, so each step is done explicitly:
#   1. tokenise and strip punctuation,
#   2. build the document-feature matrix,
#   3. drop English stopwords,
#   4. stem the remaining features.
dtm <- dfm(tokens(data_corpus_inaugural, remove_punct = TRUE))
dtm <- dfm_remove(dtm, stopwords("english"))
dtm <- dfm_wordstem(dtm)

# docvars() shows the document-level variables in the dataset
docvars(dtm)

Filtering to see pre-war documents => before 1945

# Add a new document variable, "is_prewar": TRUE when Year is before 1945
dtm$is_prewar <- dtm$Year < 1945
docvars(dtm)

Creating training and testing document term matrix

# Fix the RNG state so the random split below is reproducible
set.seed(1)

# Training data: 40 documents sampled from the full matrix
train_dtm <- dfm_sample(dtm, size = 40)

# Testing data: every document that was not drawn into the training set
test_dtm <- dtm[!docnames(dtm) %in% docnames(train_dtm), ]

Training a text model using a Naive Bayes classifier: textmodel_nb(training_data, y = [labels to train on])

# Fit a Naive Bayes classifier on the training documents, using the
# is_prewar document variable as the class label
nb_model <- textmodel_nb(x = train_dtm, y = train_dtm$is_prewar)

# Predict the label of each held-out document
pred_nb <- predict(nb_model, newdata = test_dtm)

# Print the predictions so they can be checked
pred_nb
## 1793-Washington  1805-Jefferson    1829-Jackson   1837-VanBuren     1849-Taylor 
##            TRUE            TRUE            TRUE            TRUE            TRUE 
##     1853-Pierce    1861-Lincoln      1873-Grant   1881-Garfield   1889-Harrison 
##            TRUE            TRUE            TRUE            TRUE            TRUE 
##   1897-McKinley       1909-Taft     1949-Truman    1961-Kennedy       1989-Bush 
##            TRUE            TRUE            TRUE           FALSE           FALSE 
##    1993-Clinton    1997-Clinton      2009-Obama      2017-Trump 
##           FALSE           FALSE           FALSE           FALSE 
## Levels: FALSE TRUE
# Cross-tabulate predicted labels (rows) against true labels (columns)
table(prediction = pred_nb, is_prewar = test_dtm$is_prewar)
##           is_prewar
## prediction FALSE TRUE
##      FALSE     6    0
##      TRUE      1   12

Only one false prediction. Of the 13 speeches that we predicted to be pre-war, 12 were actually pre-war => precision 12/13. Of the 6 speeches that we predicted to not be pre-war, all were predicted correctly.

Unsupervised Models

Train a model that does something useful with texts without requiring annotated texts as training data

Topic modelling

Have a computer come up with ‘topics’ and classify documents according to these topics

library(topicmodels)

# Separate the corpus into paragraphs so we have more documents to work with
texts <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")

# Pre-processing.
# Calling dfm() on a corpus with 'stem' / 'remove' / tokens() arguments is
# deprecated, so tokenise first and apply each step explicitly:
# strip punctuation, build the dfm, drop English stopwords, then stem.
par_dtm <- dfm(tokens(texts, remove_punct = TRUE))
par_dtm <- dfm_remove(par_dtm, stopwords("english"))
par_dtm <- dfm_wordstem(par_dtm)

# Drop terms with a frequency below 5
par_dtm <- dfm_trim(par_dtm, min_termfreq = 5)

# Convert the dfm so it can be used with the topicmodels functions
par_dtm <- convert(par_dtm, to = "topicmodels")

Training and testing models with unsupervised machine learning

# Fix the RNG state before fitting
set.seed(3)

# k is the number of topics; method selects Gibbs sampling for estimation
lda_model <- topicmodels::LDA(par_dtm, k = 10, method = "Gibbs")

# Show the top 5 terms of every topic
terms(lda_model, 5)
##      Topic 1   Topic 2 Topic 3  Topic 4    Topic 5 Topic 6   Topic 7  Topic 8 
## [1,] "may"     "us"    "peopl"  "world"    "time"  "shall"   "law"    "nation"
## [2,] "can"     "must"  "nation" "new"      "now"   "duti"    "peopl"  "peac"  
## [3,] "countri" "can"   "upon"   "american" "year"  "citizen" "govern" "war"   
## [4,] "everi"   "let"   "free"   "america"  "hope"  "offic"   "upon"   "great" 
## [5,] "never"   "work"  "everi"  "freedom"  "one"   "confid"  "execut" "justic"
##      Topic 9     Topic 10  
## [1,] "state"     "countri" 
## [2,] "govern"    "public"  
## [3,] "power"     "govern"  
## [4,] "constitut" "interest"
## [5,] "right"     "use"

As can be seen, the relationship between the words within each topic is not very clear - this suggests that we may need a bigger dataset in order to obtain more clearly defined topics.