This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install required packages

# Set a default CRAN repository to avoid the "mirror not set" error
options(repos = c(CRAN = "https://cran.r-project.org"))

# Sentiment Analysis with Machine Learning Algorithms
# We compare the accuracy of SVM, Naive Bayes, Random Forest, and GLMNet classifiers

install.packages('data.table')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'data.table' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('tm')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'tm' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('e1071')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('gmodels')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'gmodels' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('caret')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('stringr')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'stringr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('purrrlyr')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'purrrlyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('tidyverse')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('text2vec')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'text2vec' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('glmnet')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'glmnet' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('ggrepel')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'ggrepel' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('randomForest')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'randomForest' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
install.packages('SnowballC')
## Installing package into 'C:/Users/Rowen/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'SnowballC' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rowen\AppData\Local\Temp\Rtmpk9wDzp\downloaded_packages
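
As an aside, a more compact pattern (a sketch, not what was run above) installs only the packages that are missing:

# Illustrative alternative: install only the packages not already present
pkgs <- c("data.table", "tm", "e1071", "gmodels", "caret", "stringr", "purrrlyr",
          "tidyverse", "text2vec", "glmnet", "ggrepel", "randomForest", "SnowballC")
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)
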
library(data.table)
library(tm)
## Loading required package: NLP
library(e1071)
library(gmodels)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringr)
library(purrrlyr)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.1       v purrr   0.3.2  
## v tidyr   0.8.3       v dplyr   0.8.0.1
## v readr   1.3.1       v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::between()    masks data.table::between()
## x dplyr::filter()     masks stats::filter()
## x dplyr::first()      masks data.table::first()
## x dplyr::lag()        masks stats::lag()
## x dplyr::last()       masks data.table::last()
## x purrr::lift()       masks caret::lift()
## x purrr::transpose()  masks data.table::transpose()
library(text2vec)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-16
library(ggrepel)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(SnowballC)

Load 1.6M labelled tweets for training data

tweets_classified <- readRDS("tweets_classified.RDS")

# To save time, start with a small stratified subset: p = 0.001 of 1.6M tweets is ~1,600 rows
set.seed(2340)
tweets_Index <- createDataPartition(tweets_classified$sentiment, p = 0.001,
                                    list = FALSE,
                                    times = 1)
train_tweets <- tweets_classified[tweets_Index, ]

# Divide into 1,200 training tweets and the remainder (~400) for testing
train_index <- sample(nrow(train_tweets), 1200)
train_data <- train_tweets[train_index, ]
test_data <- train_tweets[-train_index, ]

# Create a DTM over train and test together so both splits share one vocabulary
data <- rbind(train_data, test_data)
data_corpus <- VCorpus(VectorSource(data$text))
data_dtm <- DocumentTermMatrix(data_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
data_dtm <- as.matrix(data_dtm)

train_dtm <- data_dtm[seq_len(nrow(train_data)), ]
test_dtm <- data_dtm[(nrow(train_data) + 1):nrow(data), ]
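
As a quick sanity check (illustrative, not part of the original run), confirm the split dimensions and glance at the most frequent stems:

# Sanity check: row counts should match the train/test split above
dim(train_dtm)  # expected: 1200 documents x <vocabulary size> terms
dim(test_dtm)   # expected: ~400 documents x <vocabulary size> terms
head(sort(colSums(train_dtm), decreasing = TRUE), 10)  # ten most frequent stems
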

Naive Bayes

# Train Naive Bayes on the raw term counts and score the held-out tweets
bayes_classifier <- naiveBayes(train_dtm, as.factor(train_data$sentiment))
bayes_test_pred <- predict(bayes_classifier, test_dtm)
CrossTable(bayes_test_pred, test_data$sentiment,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  400 
## 
##  
##              | actual 
##    predicted |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       195 |       162 |       357 | 
##              |     0.899 |     0.885 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        22 |        21 |        43 | 
##              |     0.101 |     0.115 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       217 |       183 |       400 | 
##              |     0.542 |     0.458 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
auc(as.numeric(test_data$sentiment), bayes_test_pred)
## [1] 0.5066858
# AUC 0.5066858 on the 400 test observations
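
A plausible explanation for the near-chance AUC: e1071's naiveBayes() models numeric features as Gaussian, which fits sparse term counts poorly. A common remedy (shown here as a sketch, not part of the results above) is to recode counts as categorical presence/absence features:

# Illustrative remedy: convert term counts to "Yes"/"No" presence indicators
convert_counts <- function(x) ifelse(x > 0, "Yes", "No")
train_dtm_cat <- apply(train_dtm, MARGIN = 2, convert_counts)
test_dtm_cat  <- apply(test_dtm, MARGIN = 2, convert_counts)
bayes_classifier2 <- naiveBayes(train_dtm_cat, as.factor(train_data$sentiment), laplace = 1)
bayes_test_pred2  <- predict(bayes_classifier2, test_dtm_cat)
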

SVM: Support Vector Machine

# Train an SVM (default radial kernel) on the raw term counts
svm_classifier <- svm(train_dtm, as.factor(train_data$sentiment))
## Warning in svm.default(train_dtm, as.factor(train_data$sentiment)):
## Variable(s) 'aaaal' and 'aamyhaanson' and 'abba' and 'abil' and ...
## [warning truncated: several hundred zero-variance terms that svm
## cannot scale, elided here]
svm_test_pred <- predict(svm_classifier, test_dtm)
CrossTable(svm_test_pred, as.factor(test_data$sentiment),
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  400 
## 
##  
##               | as.factor(test_data$sentiment) 
## svm_test_pred |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##             1 |       217 |       183 |       400 | 
## --------------|-----------|-----------|-----------|
##  Column Total |       217 |       183 |       400 | 
## --------------|-----------|-----------|-----------|
## 
## 
auc(as.numeric(test_data$sentiment), svm_test_pred)
## [1] 0.5
# AUC 0.5 on the 400 test observations: the SVM predicted every tweet as class 1
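
The collapsed predictions and the long warning above share a cause: hundreds of stems never appear in the training rows, leaving constant columns that svm() cannot scale. A possible fix (a sketch, untested on this data) is to drop zero-variance columns before fitting:

# Illustrative fix: keep only columns with nonzero variance in the training DTM
nonconstant <- apply(train_dtm, 2, var) > 0
svm_classifier2 <- svm(train_dtm[, nonconstant],
                       as.factor(train_data$sentiment),
                       kernel = "linear")  # a linear kernel is a common default for text
svm_test_pred2 <- predict(svm_classifier2, test_dtm[, nonconstant])
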

Random Forest

# 100 trees keeps training time manageable on this small sample
randomforest_classifier <- randomForest(train_dtm, as.factor(train_data$sentiment), ntree = 100)
rf_test_pred <- predict(randomforest_classifier, test_dtm)
CrossTable(rf_test_pred, as.factor(test_data$sentiment),
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  400 
## 
##  
##              | actual 
##    predicted |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       107 |        27 |       134 | 
##              |     0.493 |     0.148 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       110 |       156 |       266 | 
##              |     0.507 |     0.852 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       217 |       183 |       400 | 
##              |     0.542 |     0.458 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
auc(as.numeric(test_data$sentiment), rf_test_pred)
## [1] 0.6727733
plot(randomforest_classifier)  # OOB error rate as trees are added

# AUC 0.6727733 on the 400 test observations
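
To see which stemmed terms the forest relies on (illustrative; varImpPlot() ships with the randomForest package):

# Illustrative: top 20 terms by mean decrease in Gini impurity
varImpPlot(randomforest_classifier, n.var = 20, main = "Most important terms")
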

GLMNet: Lasso-Regularized Generalized Linear Model

# text2vec pipeline: lowercase, then split into word tokens
prep_fun <- tolower
tok_fun <- word_tokenizer

# progressbar = FALSE keeps the knitted output clean
it_train <- itoken(train_data$text,
                   preprocessor = prep_fun,
                   tokenizer = tok_fun,
                   ids = train_data$id,
                   progressbar = FALSE)
it_test <- itoken(test_data$text,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = test_data$id,
                  progressbar = FALSE)
vocab <- create_vocabulary(it_train)
vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, vectorizer)
dtm_test <- create_dtm(it_test, vectorizer)
tfidf <- TfIdf$new()
# Fit the tf-idf weights on the training DTM only, then apply them unchanged to test
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- transform(dtm_test, tfidf)
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
                               y = train_data[['sentiment']], 
                               family = 'binomial', 
                               # L1 penalty
                               alpha = 1,
                               # interested in the area under ROC curve
                               type.measure = "auc",
                               # 5-fold cross-validation
                               nfolds = 5,
                               # a higher convergence threshold trades accuracy for speed
                               thresh = 1e-3,
                               # likewise, capping the iteration count shortens training
                               maxit = 1e3)

preds <- predict(glmnet_classifier, dtm_test_tfidf, type = 'response')[ ,1]
auc(as.numeric(test_data$sentiment), preds)
## [1] 0.6741205
# test-set AUC 0.6741205

plot(glmnet_classifier)  # cross-validated AUC across the lambda path

print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.6933"
# max cross-validated AUC = 0.6933
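
To inspect which tokens the lasso actually kept (illustrative; coef() on a cv.glmnet object is standard glmnet usage):

# Illustrative: nonzero coefficients at the lambda with the best CV AUC
coef_mat <- coef(glmnet_classifier, s = "lambda.min")
coefs <- setNames(as.numeric(coef_mat), rownames(coef_mat))
nonzero <- coefs[coefs != 0]
head(nonzero[order(abs(nonzero), decreasing = TRUE)], 20)  # strongest tokens
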

# GLMNet performs best overall: test-set AUC (Area Under the ROC Curve) of 0.6741205,
# with a maximum cross-validated AUC of 0.6933, i.e. it separates positive from negative
# tweets better than Naive Bayes (0.5066858), SVM (0.5, which collapsed to a single
# predicted class), or Random Forest (0.6727733). Note that 0.6933 is measured on the
# training folds, so the test-set AUCs are the like-for-like comparison.

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.