Read in data

tweets <- read.csv('tweets.csv', stringsAsFactors=FALSE)
str(tweets)
## 'data.frame':    1181 obs. of  2 variables:
##  $ Tweet: chr  "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore" "iOS 7 is so fricking smooth & beautiful!! #ThanxApple @Apple" "LOVE U @APPLE" "Thank you @apple, loving my new iPhone 5S!!!!!  #apple #iphone5S pic.twitter.com/XmHJCU4pcb" ...
##  $ Avg  : num  2 2 1.8 1.8 1.8 1.8 1.8 1.6 1.6 1.6 ...
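
Before labeling tweets, it can help to sanity-check the distribution of the sentiment scores (output not shown):

summary(tweets$Avg)   # five-number summary of the average sentiment scores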

If average sentiment is <= -1, treat tweet as negative

tweets$Negative <- as.factor(tweets$Avg <= -1)
table(tweets$Negative)
## 
## FALSE  TRUE 
##   999   182

Packages to help with text analytics

library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.1.3

Create a corpus - a collection of documents

corpus <- Corpus(VectorSource(tweets$Tweet))
corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1181
corpus[[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 101

Change all text in the tweets to lower case and remove punctuation. Since tolower is a base R function rather than a tm transformation, the corpus has to be converted back to PlainTextDocument afterwards. Note that removePunctuation also strips the @ and # from handles and hashtags.

corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, PlainTextDocument)
(corpus[[1]]$content)
## [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
corpus <- tm_map(corpus, removePunctuation)
(corpus[[1]]$content)
## [1] "i have to say apple has by far the best customer care service i have ever received apple appstore"

The tm package provides stop word lists. Show the first 10 English stop words.

stopwords("english")[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"       
##  [6] "our"       "ours"      "ourselves" "you"       "your"
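
The first ten are only a sample; the size of the full list tm ships can be checked directly:

length(stopwords("english"))   # total number of English stop words in tm's list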

Remove all stop words, plus the word "apple": since every tweet in this dataset is about Apple, the word itself carries no information.

corpus <- tm_map(corpus, removeWords, c("apple", stopwords("english")))
(corpus[[1]]$content)
## [1] "   say    far  best customer care service   ever received  appstore"

Stem the words, reducing them to their roots (e.g. customer -> custom)

corpus <- tm_map(corpus, stemDocument)
(corpus[[1]]$content)
## [1] "   say    far  best custom care servic   ever receiv  appstor"
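
tm's stemDocument uses the Porter stemmer from SnowballC; wordStem shows the same reductions on individual words:

wordStem(c("customer", "service", "received"))   # "custom" "servic" "receiv"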

Generate the document-term matrix - each row is a document, each column is a term, and each cell holds that term's frequency in the document.

frequencies <- DocumentTermMatrix(corpus)
frequencies
## <<DocumentTermMatrix (documents: 1181, terms: 3289)>>
## Non-/sparse entries: 8980/3875329
## Sparsity           : 100%
## Maximal term length: 115
## Weighting          : term frequency (tf)
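
The 100% sparsity figure is rounded; the exact density follows from the entry counts printed above:

8980 / (1181 * 3289)   # fraction of non-zero cells, about 0.0023 (99.8% sparse)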

Inspect a small section of the matrix: documents 1000 to 1005, terms 505 to 515. (The document names show up as character(0), a side effect of the earlier PlainTextDocument conversion.)

inspect(frequencies[1000:1005, 505:515])
## <<DocumentTermMatrix (documents: 6, terms: 11)>>
## Non-/sparse entries: 1/65
## Sparsity           : 98%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           cheapen cheaper check cheep cheer cheerio cherylcol chief
##   character(0)       0       0     0     0     0       0         0     0
##   character(0)       0       0     0     0     0       0         0     0
##   character(0)       0       0     0     0     0       0         0     0
##   character(0)       0       0     0     0     0       0         0     0
##   character(0)       0       0     0     0     0       0         0     0
##   character(0)       0       0     0     0     1       0         0     0
##               Terms
## Docs           chiiiiqu child children
##   character(0)        0     0        0
##   character(0)        0     0        0
##   character(0)        0     0        0
##   character(0)        0     0        0
##   character(0)        0     0        0
##   character(0)        0     0        0

Look for words which appear at least 20 times

findFreqTerms(frequencies, lowfreq=20)
##  [1] "android"              "anyon"                "app"                 
##  [4] "appl"                 "back"                 "batteri"             
##  [7] "better"               "buy"                  "can"                 
## [10] "cant"                 "come"                 "dont"                
## [13] "fingerprint"          "freak"                "get"                 
## [16] "googl"                "ios7"                 "ipad"                
## [19] "iphon"                "iphone5"              "iphone5c"            
## [22] "ipod"                 "ipodplayerpromo"      "itun"                
## [25] "just"                 "like"                 "lol"                 
## [28] "look"                 "love"                 "make"                
## [31] "market"               "microsoft"            "need"                
## [34] "new"                  "now"                  "one"                 
## [37] "phone"                "pleas"                "promo"               
## [40] "promoipodplayerpromo" "realli"               "releas"              
## [43] "samsung"              "say"                  "store"               
## [46] "thank"                "think"                "time"                
## [49] "twitter"              "updat"                "use"                 
## [52] "via"                  "want"                 "well"                
## [55] "will"                 "work"
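
Only 56 of the 3289 terms clear this bar, so most words are rare. That motivates trimming the matrix in the next step:

length(findFreqTerms(frequencies, lowfreq=20))   # 56 terms appear at least 20 times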

Only keep terms which appear in 0.5% or more of the tweets:

(sparse <- removeSparseTerms(frequencies, 0.995))
## <<DocumentTermMatrix (documents: 1181, terms: 309)>>
## Non-/sparse entries: 4669/360260
## Sparsity           : 99%
## Maximal term length: 20
## Weighting          : term frequency (tf)
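
With 1181 tweets, the 0.995 threshold means a term must appear in roughly 1181 * 0.005 of them, i.e. at least 6 tweets, to survive:

ceiling(1181 * (1 - 0.995))   # minimum number of tweets a surviving term appears in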

Convert sparse matrix into data frame

tweetSparse <- as.data.frame(as.matrix(sparse))

Some column names start with numbers, which are not syntactically valid R names, so convert them:

(colnames(tweetSparse)[1:10])
##  [1] "244tsuyoponzu" "7evenstarz"    "actual"        "add"          
##  [5] "alreadi"       "alway"         "amaz"          "amazon"       
##  [9] "android"       "announc"
colnames(tweetSparse) <- make.names(colnames(tweetSparse))
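
make.names prefixes an "X" to any name that starts with a digit (and substitutes dots for other invalid characters), for example:

make.names(c("244tsuyoponzu", "7evenstarz"))   # "X244tsuyoponzu" "X7evenstarz"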

Now add the dependent variable

tweetSparse$Negative <- tweets$Negative

Split into training (70%) and test (30%) sets

library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
set.seed(123)
split <- sample.split(tweetSparse$Negative, SplitRatio=0.7)
trainSparse <- subset(tweetSparse, split==TRUE)
testSparse <- subset(tweetSparse, split==FALSE)
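
sample.split stratifies on the outcome, so both sets keep roughly the same share of negative tweets; a quick check (output not shown):

prop.table(table(trainSparse$Negative))   # class proportions in the training set
prop.table(table(testSparse$Negative))    # should be nearly identical here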

Use CART to build a classification tree model

library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
tweetCART <- rpart(Negative ~ ., data=trainSparse, method="class")
prp(tweetCART)

Use the model against the test set

predictCART <- predict(tweetCART, newdata=testSparse, type="class")
(confmat <- table(testSparse$Negative, predictCART))
##        predictCART
##         FALSE TRUE
##   FALSE   294    6
##   TRUE     37   18
sum(diag(confmat)) / nrow(testSparse)
## [1] 0.8788732
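
Accuracy alone hides the class imbalance; sensitivity and specificity can be read straight off the confusion matrix:

confmat[2, 2] / sum(confmat[2, ])   # sensitivity: 18/55, about 0.33
confmat[1, 1] / sum(confmat[1, ])   # specificity: 294/300, 0.98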

Accuracy of the baseline model, which always predicts the most common class (not negative)

(table(testSparse$Negative))
## 
## FALSE  TRUE 
##   300    55
300 / 355
## [1] 0.8450704
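
The same baseline accuracy can be computed without hard-coding the counts:

max(table(testSparse$Negative)) / nrow(testSparse)   # always predict the majority class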

Use random forest

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
set.seed(123)
tweetRF <- randomForest(Negative ~ ., data=trainSparse)
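
A forest has no single tree to plot, but randomForest reports which terms it leans on most (output not shown):

imp <- importance(tweetRF)                             # MeanDecreaseGini per term
head(imp[order(imp, decreasing=TRUE), , drop=FALSE])   # most important terms first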

Predict against test set and get accuracy

predictRF <- predict(tweetRF, newdata=testSparse)
(confmat <- table(testSparse$Negative, predictRF))
##        predictRF
##         FALSE TRUE
##   FALSE   293    7
##   TRUE     34   21
sum(diag(confmat)) / nrow(testSparse)
## [1] 0.884507

Use logistic regression

tweetLog <- glm(Negative ~ ., data=trainSparse, family=binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Predict against the test set and get accuracy. Given the warnings above (fitted probabilities of exactly 0 or 1 are a sign of overfitting), expect weaker out-of-sample performance.

predictLog <- predict(tweetLog, newdata=testSparse, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
(confmat <- table(testSparse$Negative, predictLog > 0.5))
##        
##         FALSE TRUE
##   FALSE   253   47
##   TRUE     22   33
sum(diag(confmat)) / nrow(testSparse)
## [1] 0.8056338
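
Logistic regression lands below even the baseline here, while CART and the random forest both beat it. Note also that the 0.5 cutoff is just a default; a stricter cutoff trades false positives for false negatives (output not shown):

table(testSparse$Negative, predictLog > 0.7)   # call a tweet negative only when more confident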