# Packages required for this analysis
packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
# Install any that are not already present
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Clear the workspace
rm(list=ls(all=TRUE))
# Sys.setlocale("LC_ALL","C")
options(digits=5, scipen=10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


# Read in the data
tweets = read.csv("data/tweets.csv", stringsAsFactors=FALSE)
Adjust the target variable

Because the sentiment scores skew high, we first recode the target variable so that the split between the two classes is more balanced:

tweets$Negative = as.factor(tweets$Avg <= -1)
prop.table(table(tweets$Negative))
## 
##   FALSE    TRUE 
## 0.84589 0.15411
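The -1 cutoff is easier to judge after a look at the raw score distribution; a quick check (not part of the original run) could be:

summary(tweets$Avg)   # the scores skew toward the non-negative side
# hist(tweets$Avg)    # optional: visualize the skew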

Corpus preparation

Create the corpus
# create corpus from vector
corpus = Corpus(VectorSource(tweets$Tweet))
corpus[[1]]$content    # content of the first document
## [1] "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore"
Convert to lowercase
corpus = tm_map(corpus, content_transformer(tolower))
corpus[[1]]$content    # content of the first document
## [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
Remove punctuation
# some versions of tm may require the command below
# corpus = tm_map(corpus, PlainTextDocument)
corpus = tm_map(corpus, removePunctuation)
corpus[[1]]$content
## [1] "i have to say apple has by far the best customer care service i have ever received apple appstore"
Remove stop words
stopwords("english")[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"       
##  [6] "our"       "ours"      "ourselves" "you"       "your"
corpus = tm_map(corpus, removeWords, c("apple", stopwords("english")))
corpus[[1]]$content
## [1] "   say    far  best customer care service   ever received  appstore"
Stemming
corpus = tm_map(corpus, stemDocument)
corpus[[1]]$content
## [1] "say far best custom care servic ever receiv appstor"


Document-term matrix (term-frequency table, DTM)

Build the document-term matrix
frequencies = DocumentTermMatrix(corpus)
frequencies
## <<DocumentTermMatrix (documents: 1181, terms: 3289)>>
## Non-/sparse entries: 8980/3875329
## Sparsity           : 100%
## Maximal term length: 115
## Weighting          : term frequency (tf)
# Look at matrix 
inspect(frequencies[1000:1005,505:515])
## <<DocumentTermMatrix (documents: 6, terms: 11)>>
## Non-/sparse entries: 1/65
## Sparsity           : 98%
## Maximal term length: 23
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   asap courtsideassistappforio current follow idea kickbutt preinstal
##   1000    0                       0       0      0    1        0         0
##   1001    0                       0       0      0    0        0         0
##   1002    0                       0       0      0    0        0         0
##   1003    0                       0       0      0    0        0         0
##   1004    0                       0       0      0    0        0         0
##   1005    0                       0       0      0    0        0         0
##       Terms
## Docs   save ssd support
##   1000    0   0       0
##   1001    0   0       0
##   1002    0   0       0
##   1003    0   0       0
##   1004    0   0       0
##   1005    0   0       0
findFreqTerms(frequencies, lowfreq=20)
##  [1] "say"                  "love"                 "iphon"               
##  [4] "iphone5"              "new"                  "thank"               
##  [7] "phone"                "can"                  "make"                
## [10] "market"               "one"                  "will"                
## [13] "cant"                 "get"                  "just"                
## [16] "updat"                "fingerprint"          "iphone5c"            
## [19] "store"                "time"                 "come"                
## [22] "now"                  "use"                  "back"                
## [25] "anyon"                "work"                 "app"                 
## [28] "android"              "think"                "ipad"                
## [31] "well"                 "freak"                "dont"                
## [34] "via"                  "better"               "like"                
## [37] "pleas"                "samsung"              "want"                
## [40] "batteri"              "ios7"                 "microsoft"           
## [43] "itun"                 "buy"                  "releas"              
## [46] "look"                 "appl"                 "need"                
## [49] "googl"                "twitter"              "ipod"                
## [52] "ipodplayerpromo"      "promoipodplayerpromo" "lol"                 
## [55] "realli"               "promo"
Remove low-frequency (sparse) terms
sparse = removeSparseTerms(frequencies, 0.995)
sparse
## <<DocumentTermMatrix (documents: 1181, terms: 309)>>
## Non-/sparse entries: 4669/360260
## Sparsity           : 99%
## Maximal term length: 20
## Weighting          : term frequency (tf)
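The 0.995 threshold means a term is kept only if its sparsity is below 99.5%, i.e. it must appear in more than 1181 * (1 - 0.995) ≈ 5.9 documents, so in at least 6. A sketch that reproduces the same count by hand (assuming the full matrix fits in memory):

docFreq = colSums(as.matrix(frequencies) > 0)    # documents containing each term
sum(docFreq > nDocs(frequencies) * (1 - 0.995))  # should match the 309 terms above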


Models and predictions

Convert to a data frame
# Convert to a data frame
tweetsSparse = as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(tweetsSparse) = make.names(colnames(tweetsSparse))
# Add target variable
tweetsSparse$Negative = tweets$Negative
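A quick sanity check on the result (a routine call, output not shown in the original):

dim(tweetsSparse)   # expect 1181 rows and 309 + 1 = 310 columns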
Split the data
library(caTools)
set.seed(123)
split = sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse = subset(tweetsSparse, split==TRUE)
testSparse = subset(tweetsSparse, split==FALSE)
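sample.split() stratifies on the outcome, so both partitions should keep the roughly 85/15 class ratio seen earlier; a quick check (output not shown in the original):

prop.table(table(trainSparse$Negative))   # expect roughly 0.85 / 0.15
prop.table(table(testSparse$Negative))    # same ratio in the test set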
# quiz: find the words whose frequency is greater than or equal to 100
findFreqTerms(frequencies, lowfreq=100)
## [1] "iphon" "new"   "itun"


Classification tree (CART) model

library(rpart)
library(rpart.plot)
tweetCART = rpart(Negative ~ ., data=trainSparse, method="class")
prp(tweetCART)
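rpart keeps a cross-validation table for the complexity parameter; inspecting it is a standard way to judge whether the tree should be pruned (a routine rpart call, output omitted here):

printcp(tweetCART)   # cp table with cross-validated error at each split
# plotcp(tweetCART)  # graphical version of the same table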

Evaluate the model's accuracy
# Evaluate the performance of the model
predictCART = predict(tweetCART, newdata=testSparse, type="class")
table(testSparse$Negative, predictCART)
##        predictCART
##         FALSE TRUE
##   FALSE   294    6
##   TRUE     37   18
(294+18)/(294+6+37+18)  # ACC = 0.87887
## [1] 0.87887
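Accuracy alone hides the class imbalance; from the counts in the confusion matrix above, the class-wise rates are:

18/(37+18)    # sensitivity (recall of the TRUE class) ≈ 0.327
294/(294+6)   # specificity = 0.98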
Compare with the baseline accuracy
# Baseline accuracy 
table(testSparse$Negative)
## 
## FALSE  TRUE 
##   300    55
300/(300+55)            # ACC = 0.84507
## [1] 0.84507


Random forest model

library(randomForest)
set.seed(123)
tweetRF = randomForest(Negative ~ ., data=trainSparse)
# Make predictions:
predictRF = predict(tweetRF, newdata=testSparse)
table(testSparse$Negative, predictRF) %>% {sum(diag(.)) / sum(.)}  # overall accuracy
## [1] 0.88732
# note: exact confusion-matrix counts (and hence accuracy) may vary slightly across randomForest versions
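randomForest also records how much each term contributes to the fit; a standard follow-up (not in the original run) is to look at variable importance:

# importance(tweetRF)   # mean decrease in Gini per term
varImpPlot(tweetRF)     # plot the most influential terms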


Generalized linear model (logistic regression)

## quiz: glm model
glm1 = glm(Negative ~ ., data=trainSparse, family = 'binomial')
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred = predict(glm1, testSparse, type='response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
table(testSparse$Negative, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.80845
# Test ACC = 0.80845
pred2 = predict(glm1, type='response')
table(trainSparse$Negative, pred2 > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.94915
# Train ACC = 0.94915
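The gap between the training accuracy (0.94915) and the test accuracy (0.80845) is a clear sign of overfitting, consistent with the rank-deficiency warning. Since ROCR is already loaded, a threshold-free comparison via AUC is a natural extra check (a sketch; the value is not from the original run):

rocrPred = prediction(pred, testSparse$Negative)
performance(rocrPred, "auc")@y.values[[1]]   # test-set AUC for the glm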


Discussion topics:
  ■ The text-analytics workflow
    ● Build a corpus from a vector of documents
    ● Clean the corpus (character encoding, lowercasing, punctuation, stop words, stemming)
    ● Build the document-term matrix
    ● Convert it to a data frame
    ● Build models and make predictions

  ■ Compare the accuracy of the three models. What patterns do you observe?
    ●
    ●