Final submission for NSDMC
# Load these Libraries
library(dplyr)
library(caret)
library(e1071)
library(knitr)
library(rmarkdown)
library(quanteda)
Import Data
# Import the raw training data supplied for the competition
rawdata <- read.csv("NSDMC_training_data.csv - NSDMC_training_data.csv.csv")
# Draw a random subset of rows (without replacement) to keep the pipeline
# fast. The population size is taken from the data itself rather than being
# hard-coded, so a file with a different number of rows cannot produce
# out-of-range (all-NA) rows or silently ignore part of the data.
sample_size <- 1500
stopifnot(nrow(rawdata) >= sample_size)
scramble <- sample(nrow(rawdata), size = sample_size)
rawdata <- rawdata[scramble, ]
Make a corpus
# Coerce the subject lines to plain character strings and store them back
subject_text <- as.character(rawdata$Subject)
rawdata$Subject <- subject_text
# Build the corpus from the subject lines (one document per row)
datacorp <- corpus(subject_text)
# Spot-check a single document
datacorp[10]
## text10
## "Coupons from Walmart, Target & Whole Foods"
Tokenize data
# Tokenize the corpus, dropping punctuation.
# (The result is stored as `tokens`; calls to tokens() still resolve to the
# function because R skips non-function bindings when looking up a call.)
tokens <- tokens(datacorp, remove_punct = TRUE)
# Lowercase BEFORE stemming: the stemmer matches lowercase suffixes, so
# stemming first leaves upper-case words (e.g. "COUPONS") unstemmed and they
# would then lowercase to a feature distinct from their stemmed form.
newtokens <- tokens_tolower(tokens)
# Word-stem the lowercased tokens; `newtokens2` remains the final token set
# consumed by the DFM step.
newtokens2 <- tokens_wordstem(newtokens)
# See what the data looks like
newtokens2[10]
## tokens from 1 document.
## text10 :
## [1] "coupon" "from" "walmart" "target" "whole" "food"
Create a DFM
# Build the document-feature matrix, then drop English stop words.
# dfm_remove() replaces the deprecated `remove` argument of dfm().
# NOTE(review): the tokens are already stemmed at this point, so stop words
# whose stem differs from the dictionary form may not be matched here -
# confirm whether stop words should be removed before stemming instead.
mydfm <- dfm_remove(dfm(newtokens2), pattern = stopwords())
# Trim the DFM: keep features occurring at least 15 times overall and in at
# least 8 documents
trimdfm <- dfm_trim(mydfm, min_termfreq = 15, min_docfreq = 8)
# Turn the DFM into a data frame. convert() is the supported route; calling
# data.frame() directly on a dfm is deprecated.
mydf <- convert(trimdfm, to = "data.frame")
# Drop the document-id column by position: it is always the first column,
# but its name differs between quanteda versions ("document" vs "doc_id").
mydf <- mydf[, -1, drop = FALSE]
# Prepend the response as a column named "class_labels". as.factor() makes
# the response a factor even when read.csv() returned character strings, and
# binding by name fails loudly if the label column is missing instead of
# silently renaming the first feature column.
mydf2 <- cbind(class_labels = as.factor(rawdata$class_labels), mydf)
Splitting Data
# Split mydf2: the first `n_train` rows train the model, every remaining row
# is held out for testing. The test range is derived from nrow(mydf2) so a
# different sample size cannot yield all-NA rows or silently drop rows.
n_train <- 500
stopifnot(nrow(mydf2) > n_train)
traindf <- mydf2[seq_len(n_train), ]
testdf <- mydf2[seq(n_train + 1, nrow(mydf2)), ]
Train Model on training set
# Response: the class label of each training row
train_response <- traindf[["class_labels"]]
# Features: every remaining column of the training set
train_features <- traindf %>% select(-class_labels)
Model xgboost
# Create Model
# Fit an "xgbTree" model with train() on the training features/response.
# No trControl or tuneGrid is supplied, so train()'s default resampling and
# tuning settings apply. No seed is set in this script, so results can vary
# between runs.
mod <- train(x = train_features, y = train_response, method = "xgbTree")
# Make predictions on test data
# testdf is passed whole, i.e. it still contains the class_labels column.
preds <- predict(mod, testdf)
# Confusion table: rows are predicted classes, columns are the actual labels
tbl <- table(preds, testdf$class_labels)
# Percent accuracy from a confusion table.
#
# tbl: two-dimensional contingency table (or matrix) of predicted vs. actual
#      classes, with both dimensions listing the classes in the same order.
# Returns the share of observations lying on the main diagonal (correct
# predictions) as a percentage between 0 and 100, or NA_real_ when the table
# contains no observations.
#
# Works for any number of classes and any number of observations; for a
# 3-class table holding 1000 observations the result is the same as the
# previous hard-coded formula.
percentage <- function(tbl) {
  total <- sum(tbl)
  if (total == 0) {
    # Nothing to score: avoid a 0/0 division
    return(NA_real_)
  }
  sum(diag(tbl)) / total * 100
}
# Report the accuracy (in percent) on the held-out rows
percentage(tbl)
## [1] 54.2
# Persist the fitted model to disk for later use
saveRDS(object = mod, file = "xgbmodel.Rds")