NSDMC model attempt

library(quanteda)

## Package version: 1.1.1

## Parallel computing: 2 of 8 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
## 
##     View

library(e1071)

# Load the NSDMC training data from the local Downloads folder.
rawdata <- read.csv(
  "~/Downloads/NSDMC_training_data.csv - NSDMC_training_data.csv.csv"
)

# Check how the Subject column was stored when the CSV was read.
class(rawdata[["Subject"]])

## [1] "factor"

# corpus() needs a character vector, so coerce the column
# before building the corpus from it.
rawdata[["Subject"]] <- as.character(rawdata[["Subject"]])

# One corpus document per Subject entry.
text_corpus <- corpus(rawdata[["Subject"]])

Let’s see what we’ve got.

# Peek at the first document of the corpus.
text_corpus[1]

##                                      text1 
## "The World Map This First Of The New Year"

# Attach the class labels as a document-level variable. The field is named
# explicitly: assigning a bare vector without a field name leaves the docvar
# under an auto-generated name, which makes later lookups by name awkward.
# NOTE(review): the original comment calls this the ham/spam variable --
# confirm that is what class_labels holds.
docvars(text_corpus, "class_labels") <- rawdata$class_labels

# Adding a docvar does not change the document text itself.
text_corpus[1]

##                                      text1 
## "The World Map This First Of The New Year"

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'tm'

## The following objects are masked from 'package:quanteda':
## 
##     as.DocumentTermMatrix, stopwords

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

# Tokenise each document in the corpus.
mytokens <- tokens(text_corpus)

# Drop stopwords BEFORE stemming. Stemming first turns words such as "this"
# or "was" into stems that no longer match the stopword list, so they would
# slip through the filter. stopwords("en") is the same English list that the
# bare stopwords() call resolved to.
nostop_tokens <- tokens_remove(mytokens, stopwords("en"))

# Reduce the remaining tokens to their stems.
newtokens <- tokens_wordstem(nostop_tokens)

# Document-feature matrix of the stemmed, stopword-free tokens.
mydfm <- dfm(newtokens)

# Keep only features that occur at least 4 times overall and in at least
# 4 documents. min_termfreq is the current name of the former min_count
# argument of dfm_trim().
trimdfm <- dfm_trim(mydfm, min_termfreq = 4, min_docfreq = 4)

# Fixed row ranges for the split: the first 16089 documents are used for
# training, documents 16090 to 22984 are held out for testing.
train_rows <- 1:16089
test_rows <- 16090:22984

dfm_train <- trimdfm[train_rows, ]
dfm_test <- trimdfm[test_rows, ]

# Training targets, taken from the same rows as dfm_train.
# NOTE(review): textmodel_nb() fits a Naive Bayes classifier; confirm that
# click_through_rate holds discrete class labels and that class_labels was
# not the intended target here.
x <- rawdata$click_through_rate[train_rows]
nb_model <- textmodel_nb(dfm_train, x)

NSDMC model attempt

Chrissy Pace

Today