Read in data
# Load the labelled tweets, keeping the text column as character (not factor),
# then show the structure of the resulting data frame.
tweets <- read.csv("tweets.csv", stringsAsFactors = FALSE)
str(tweets)
## 'data.frame': 1181 obs. of 2 variables:
## $ Tweet: chr "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore" "iOS 7 is so fricking smooth & beautiful!! #ThanxApple @Apple" "LOVE U @APPLE" "Thank you @apple, loving my new iPhone 5S!!!!! #apple #iphone5S pic.twitter.com/XmHJCU4pcb" ...
## $ Avg : num 2 2 1.8 1.8 1.8 1.8 1.8 1.6 1.6 1.6 ...
If average sentiment is <= -1, treat tweet as negative
# Flag a tweet as negative when its average sentiment score is -1 or lower,
# then count how many tweets fall into each class.
tweets[["Negative"]] <- as.factor(tweets[["Avg"]] <= -1)
table(tweets[["Negative"]])
##
## FALSE TRUE
## 999 182
Packages to help with text analytics
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.1.3
Create corpus - a collection of documents
# Wrap the tweet texts in a tm corpus (one document per tweet) and print a
# summary of the corpus.
corpus <- Corpus(VectorSource(tweets[["Tweet"]]))
print(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1181
# Print a summary of the first document.
print(corpus[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 101
Change all text in tweets to lower case and remove punctuation
# Lower-case every document. tolower() is a plain character function, so it is
# wrapped in content_transformer() to keep each element a tm document; this
# avoids re-wrapping with PlainTextDocument afterwards, which discards the
# per-document metadata (e.g. document IDs).
corpus <- tm_map(corpus, content_transformer(tolower))
# Show the text of the first document after lower-casing.
(corpus[[1]]$content)
## [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
# Strip punctuation from every document, then show the first document's text.
corpus <- tm_map(
  corpus,
  removePunctuation
)
print(corpus[[1]]$content)
## [1] "i have to say apple has by far the best customer care service i have ever received apple appstore"
The tm package provides a list of stop words. Show the first 10 English stop words:
# First ten entries of tm's English stop-word list.
head(stopwords("english"), n = 10)
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
Remove all stop words and the word "apple"
# Drop the English stop words plus "apple" from every document, then show the
# first document's text.
corpus <- tm_map(
  corpus,
  removeWords,
  c("apple", stopwords("english"))
)
print(corpus[[1]]$content)
## [1] " say far best customer care service ever received appstore"
Apply stemming to reduce words to their stems (customer -> custom)
# Stem every document, then show the first document's text.
corpus <- tm_map(
  corpus,
  stemDocument
)
print(corpus[[1]]$content)
## [1] " say far best custom care servic ever receiv appstor"
Generate document term matrix - each row is a document, each column is a term and its frequency.
# Build the document-term matrix from the cleaned corpus and print its summary.
frequencies <- DocumentTermMatrix(corpus)
print(frequencies)
## <<DocumentTermMatrix (documents: 1181, terms: 3289)>>
## Non-/sparse entries: 8980/3875329
## Sparsity : 100%
## Maximal term length: 115
## Weighting : term frequency (tf)
Inspect a small section of matrix. Documents 1000 to 1005, terms 505 to 515
# Display the counts for documents 1000-1005 across terms 505-515.
inspect(frequencies[seq(1000, 1005), seq(505, 515)])
## <<DocumentTermMatrix (documents: 6, terms: 11)>>
## Non-/sparse entries: 1/65
## Sparsity : 98%
## Maximal term length: 9
## Weighting : term frequency (tf)
##
## Terms
## Docs cheapen cheaper check cheep cheer cheerio cherylcol chief
## character(0) 0 0 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0 0 0
## character(0) 0 0 0 0 1 0 0 0
## Terms
## Docs chiiiiqu child children
## character(0) 0 0 0
## character(0) 0 0 0
## character(0) 0 0 0
## character(0) 0 0 0
## character(0) 0 0 0
## character(0) 0 0 0
Look for words which appear at least 20 times
# List the terms whose total count across all tweets is 20 or more.
findFreqTerms(frequencies, lowfreq = 20)
## [1] "android" "anyon" "app"
## [4] "appl" "back" "batteri"
## [7] "better" "buy" "can"
## [10] "cant" "come" "dont"
## [13] "fingerprint" "freak" "get"
## [16] "googl" "ios7" "ipad"
## [19] "iphon" "iphone5" "iphone5c"
## [22] "ipod" "ipodplayerpromo" "itun"
## [25] "just" "like" "lol"
## [28] "look" "love" "make"
## [31] "market" "microsoft" "need"
## [34] "new" "now" "one"
## [37] "phone" "pleas" "promo"
## [40] "promoipodplayerpromo" "realli" "releas"
## [43] "samsung" "say" "store"
## [46] "thank" "think" "time"
## [49] "twitter" "updat" "use"
## [52] "via" "want" "well"
## [55] "will" "work"
Only keep terms which appear in 0.5% or more of the tweets:
# Drop terms that exceed the 0.995 sparsity threshold, then print a summary of
# the reduced matrix.
sparse <- removeSparseTerms(frequencies, 0.995)
sparse
## <<DocumentTermMatrix (documents: 1181, terms: 309)>>
## Non-/sparse entries: 4669/360260
## Sparsity : 99%
## Maximal term length: 20
## Weighting : term frequency (tf)
Convert sparse matrix into data frame
# Expand the sparse matrix to a dense matrix, then to a data frame with one
# column per term.
tweetSparse <- as.data.frame(
  as.matrix(sparse)
)
Since R struggles with column names that start with numbers, convert them:
# Preview the first ten term names; some begin with a digit.
head(colnames(tweetSparse), n = 10)
## [1] "244tsuyoponzu" "7evenstarz" "actual" "add"
## [5] "alreadi" "alway" "amaz" "amazon"
## [9] "android" "announc"
# Rewrite every column name into a syntactically valid R name.
names(tweetSparse) <- make.names(names(tweetSparse))
Now add dependent variable
# Attach the outcome label from the original data to the term-count data frame.
tweetSparse[["Negative"]] <- tweets[["Negative"]]
Split into test and training sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Logical vector marking which rows go to the training set (SplitRatio = 0.7).
split <- sample.split(tweetSparse$Negative, SplitRatio = 0.7)
# Index with the logical vector directly. subset() would look up `split` among
# the data frame's columns first, so a term column that happened to be named
# "split" would silently replace the split vector in the filter.
trainSparse <- tweetSparse[split, ]
testSparse <- tweetSparse[!split, ]
Using CART to build model
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
# Fit a classification tree predicting Negative from every term column, then
# plot the fitted tree.
tweetCART <- rpart(
  Negative ~ .,
  data = trainSparse,
  method = "class"
)
prp(tweetCART)
Use model against test set
# Predict class labels for the held-out tweets with the CART model.
predictCART <- predict(tweetCART, newdata = testSparse, type = "class")
# Confusion matrix: rows are actual labels, columns are predictions.
confmat <- table(testSparse$Negative, predictCART)
confmat
## predictCART
## FALSE TRUE
## FALSE 294 6
## TRUE 37 18
# Accuracy = correctly classified tweets (the diagonal) over all test tweets.
sum(diag(confmat)) / nrow(testSparse)
## [1] 0.8788732
Accuracy of baseline model
# Class counts in the test set.
table(testSparse$Negative)
##
## FALSE TRUE
## 300 55
# Baseline accuracy: always predict the most common class. Computed from the
# data instead of hard-coded counts so it stays correct if the data or the
# split changes.
max(table(testSparse$Negative)) / nrow(testSparse)
## [1] 0.8450704
Use random forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Random forests are randomized, so fix the seed for reproducibility before
# fitting on every term column.
set.seed(123)
tweetRF <- randomForest(
  Negative ~ .,
  data = trainSparse
)
Predict against test set and get accuracy
# Predict class labels for the held-out tweets with the random forest.
predictRF <- predict(tweetRF, newdata = testSparse)
# Confusion matrix: rows are actual labels, columns are predictions.
confmat <- table(testSparse$Negative, predictRF)
confmat
## predictRF
## FALSE TRUE
## FALSE 293 7
## TRUE 34 21
# Accuracy = correctly classified tweets (the diagonal) over all test tweets.
sum(diag(confmat)) / nrow(testSparse)
## [1] 0.884507
Use logistic regression
# Fit a logistic regression of Negative on every term column.
# NOTE(review): the recorded warnings (non-convergence, fitted probabilities of
# 0 or 1) suggest the model is over-parameterised for this data - confirm.
tweetLog <- glm(
  Negative ~ .,
  data = trainSparse,
  family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Predict against test set and get accuracy
# Predicted probabilities for the held-out tweets from the logistic model.
predictLog <- predict(tweetLog, newdata = testSparse, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Confusion matrix using a 0.5 probability cut-off: rows are actual labels,
# columns are whether the predicted probability exceeds the cut-off.
confmat <- table(testSparse$Negative, predictLog > 0.5)
confmat
##
## FALSE TRUE
## FALSE 253 47
## TRUE 22 33
# Accuracy = correctly classified tweets (the diagonal) over all test tweets.
sum(diag(confmat)) / nrow(testSparse)
## [1] 0.8056338