library(quanteda)
## Package version: 1.1.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(e1071)
# Load the NSDMC training set from the local Downloads folder.
rawdata <- read.csv(
  file = "~/Downloads/NSDMC_training_data.csv - NSDMC_training_data.csv.csv"
)

# Inspect how the subject-line column is stored.
class(rawdata[["Subject"]])
## [1] "factor"

# corpus() needs a character vector, so coerce the column in place.
rawdata[["Subject"]] <- as.character(rawdata[["Subject"]])

# Build the quanteda corpus from the subject lines.
text_corpus <- corpus(rawdata[["Subject"]])

# Let's see what we've got.

# Peek at the first document to confirm the corpus was built as expected.
text_corpus[1]
##                                      text1 
## "The World Map This First Of The New Year"
# Attach the class labels as a document-level variable. The field is named
# explicitly so it can be looked up later as
# docvars(text_corpus, "class_labels") instead of under an auto-generated
# name.
docvars(text_corpus, "class_labels") <- rawdata$class_labels
# Indexing the corpus prints only the text, so the output below is unchanged
# by the docvars assignment.
text_corpus[1]
##                                      text1 
## "The World Map This First Of The New Year"
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:quanteda':
## 
##     as.DocumentTermMatrix, stopwords
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Tokenise every subject line in the corpus.
mytokens <- tokens(text_corpus)

# Normalise the tokens before building the document-feature matrix.
# Stop words are removed *before* stemming: the stop-word list holds
# unstemmed, lower-case forms, so stemming first (e.g. "because" -> "becaus",
# "very" -> "veri") lets stemmed stop words slip past the filter.
# stopwords() is namespace-qualified because tm, attached after quanteda,
# masks it.
newtokens <- tokens_tolower(mytokens)
newtokens <- tokens_remove(newtokens, quanteda::stopwords("en"))
newtokens <- tokens_wordstem(newtokens)
mydfm <- dfm(newtokens)

# Drop rare features: keep terms that occur at least 4 times overall and in
# at least 4 documents.
# NOTE(review): newer quanteda releases appear to rename `min_count` to
# `min_termfreq` -- confirm against the installed version before upgrading.
trimdfm <- dfm_trim(mydfm, min_count = 4, min_docfreq = 4)

# Split by row position: the first n_train documents train the model and the
# rows after them, up to n_total, are held out for testing.
n_train <- 16089L
n_total <- 22984L
# Fail early with a clear message if the data is smaller than the split.
stopifnot(ndoc(trimdfm) >= n_total, nrow(rawdata) >= n_train)
dfm_train <- trimdfm[seq_len(n_train), ]
dfm_test <- trimdfm[(n_train + 1L):n_total, ]

# Fit a Naive Bayes text classifier on the training rows.
# NOTE(review): the training labels come from `click_through_rate`, while the
# document variable attached to the corpus is `class_labels` -- confirm this
# is the intended response (presumably textmodel_nb() treats each distinct
# value as its own class).
x <- rawdata$click_through_rate[seq_len(n_train)]
nb_model <- textmodel_nb(dfm_train, x)