# Packages required for this analysis
packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
# Install any that are not already present
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Clear the workspace
rm(list=ls(all=TRUE))
# Sys.setlocale("LC_ALL","C")
options(digits=5, scipen=10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


# Read in the data
tweets = read.csv("data/tweets.csv", stringsAsFactors=FALSE)
Adjust the target variable

Because the sentiment scores skew high, we first recode the target variable so that the split between the two classes is more balanced:

tweets$Negative = as.factor(tweets$Avg <= -1)
prop.table(table(tweets$Negative))
## 
##   FALSE    TRUE 
## 0.84589 0.15411
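The -1 cutoff is easier to judge after a look at the raw score distribution; a quick check (not part of the original run) could be:

summary(tweets$Avg)   # the scores skew toward the non-negative side
# hist(tweets$Avg)    # optional: visualize the skew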

Corpus preparation

Create the corpus
# create corpus from vector
corpus = Corpus(VectorSource(tweets$Tweet))
corpus[[1]]$content    # content of the first document
## [1] "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore"
Convert to lowercase
corpus = tm_map(corpus, content_transformer(tolower))
corpus[[1]]$content    # content of the first document
## [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
Remove punctuation
# some versions of tm may require the command below
# corpus = tm_map(corpus, PlainTextDocument)
corpus = tm_map(corpus, removePunctuation)
corpus[[1]]$content
## [1] "i have to say apple has by far the best customer care service i have ever received apple appstore"
Remove stop words
stopwords("english")[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"       
##  [6] "our"       "ours"      "ourselves" "you"       "your"
corpus = tm_map(corpus, removeWords, c("apple", stopwords("english")))
corpus[[1]]$content
## [1] "   say    far  best customer care service   ever received  appstore"
Stemming
corpus = tm_map(corpus, stemDocument)
corpus[[1]]$content
## [1] "say far best custom care servic ever receiv appstor"


Document-term matrix (term-frequency table, DTM)

Build the document-term matrix
frequencies = DocumentTermMatrix(corpus)
frequencies
## <<DocumentTermMatrix (documents: 1181, terms: 3289)>>
## Non-/sparse entries: 8980/3875329
## Sparsity           : 100%
## Maximal term length: 115
## Weighting          : term frequency (tf)
# Look at matrix 
inspect(frequencies[1000:1005,505:515])
## <<DocumentTermMatrix (documents: 6, terms: 11)>>
## Non-/sparse entries: 1/65
## Sparsity           : 98%
## Maximal term length: 23
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   asap courtsideassistappforio current follow idea kickbutt preinstal
##   1000    0                       0       0      0    1        0         0
##   1001    0                       0       0      0    0        0         0
##   1002    0                       0       0      0    0        0         0
##   1003    0                       0       0      0    0        0         0
##   1004    0                       0       0      0    0        0         0
##   1005    0                       0       0      0    0        0         0
##       Terms
## Docs   save ssd support
##   1000    0   0       0
##   1001    0   0       0
##   1002    0   0       0
##   1003    0   0       0
##   1004    0   0       0
##   1005    0   0       0
findFreqTerms(frequencies, lowfreq=20)
##  [1] "say"                  "love"                 "iphon"               
##  [4] "iphone5"              "new"                  "thank"               
##  [7] "phone"                "can"                  "make"                
## [10] "market"               "one"                  "will"                
## [13] "cant"                 "get"                  "just"                
## [16] "updat"                "fingerprint"          "iphone5c"            
## [19] "store"                "time"                 "come"                
## [22] "now"                  "use"                  "back"                
## [25] "anyon"                "work"                 "app"                 
## [28] "android"              "think"                "ipad"                
## [31] "well"                 "freak"                "dont"                
## [34] "via"                  "better"               "like"                
## [37] "pleas"                "samsung"              "want"                
## [40] "batteri"              "ios7"                 "microsoft"           
## [43] "itun"                 "buy"                  "releas"              
## [46] "look"                 "appl"                 "need"                
## [49] "googl"                "twitter"              "ipod"                
## [52] "ipodplayerpromo"      "promoipodplayerpromo" "lol"                 
## [55] "realli"               "promo"
Remove low-frequency (sparse) terms
sparse = removeSparseTerms(frequencies, 0.995)
sparse
## <<DocumentTermMatrix (documents: 1181, terms: 309)>>
## Non-/sparse entries: 4669/360260
## Sparsity           : 99%
## Maximal term length: 20
## Weighting          : term frequency (tf)
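The 0.995 threshold means a term is kept only if its sparsity is below 99.5%, i.e. it must appear in more than 1181 * (1 - 0.995) ≈ 5.9 documents, so in at least 6. A sketch that reproduces the same count by hand (assuming the full matrix fits in memory):

docFreq = colSums(as.matrix(frequencies) > 0)    # documents containing each term
sum(docFreq > nDocs(frequencies) * (1 - 0.995))  # should match the 309 terms above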


Models and predictions

Convert to a data frame
# Convert to a data frame
tweetsSparse = as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(tweetsSparse) = make.names(colnames(tweetsSparse))
# Add target variable
tweetsSparse$Negative = tweets$Negative
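A quick sanity check on the result (a routine call, output not shown in the original):

dim(tweetsSparse)   # expect 1181 rows and 309 + 1 = 310 columns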
Split the data
library(caTools)
set.seed(123)
split = sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse = subset(tweetsSparse, split==TRUE)
testSparse = subset(tweetsSparse, split==FALSE)
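sample.split() stratifies on the outcome, so both partitions should keep the roughly 85/15 class ratio seen earlier; a quick check (output not shown in the original):

prop.table(table(trainSparse$Negative))   # expect roughly 0.85 / 0.15
prop.table(table(testSparse$Negative))    # same ratio in the test set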
# quiz: find the words whose frequency is greater than or equal to 100
findFreqTerms(frequencies, lowfreq=100)
## [1] "iphon" "new"   "itun"


Classification tree (CART) model

library(rpart)
library(rpart.plot)
tweetCART = rpart(Negative ~ ., data=trainSparse, method="class")
prp(tweetCART)
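rpart keeps a cross-validation table for the complexity parameter; inspecting it is a standard way to judge whether the tree should be pruned (a routine rpart call, output omitted here):

printcp(tweetCART)   # cp table with cross-validated error at each split
# plotcp(tweetCART)  # graphical version of the same table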

Evaluate the model's accuracy
# Evaluate the performance of the model
predictCART = predict(tweetCART, newdata=testSparse, type="class")
table(testSparse$Negative, predictCART)
##        predictCART
##         FALSE TRUE
##   FALSE   294    6
##   TRUE     37   18
(294+18)/(294+6+37+18)  # ACC = 0.87887
## [1] 0.87887
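Accuracy alone hides the class imbalance; from the counts in the confusion matrix above, the class-wise rates are:

18/(37+18)    # sensitivity (recall of the TRUE class) ≈ 0.327
294/(294+6)   # specificity = 0.98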
Compare with the baseline accuracy
# Baseline accuracy 
table(testSparse$Negative)
## 
## FALSE  TRUE 
##   300    55
300/(300+55)            # ACC = 0.84507
## [1] 0.84507


Random forest model

library(randomForest)
set.seed(123)
tweetRF = randomForest(Negative ~ ., data=trainSparse)
# Make predictions:
predictRF = predict(tweetRF, newdata=testSparse)
table(testSparse$Negative, predictRF) %>% {sum(diag(.)) / sum(.)}  # overall accuracy
## [1] 0.88732
# note: exact confusion-matrix counts (and hence accuracy) may vary slightly across randomForest versions
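randomForest also records how much each term contributes to the fit; a standard follow-up (not in the original run) is to look at variable importance:

# importance(tweetRF)   # mean decrease in Gini per term
varImpPlot(tweetRF)     # plot the most influential terms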


Generalized linear model (logistic regression)

## quiz: glm model
glm1 = glm(Negative ~ ., data=trainSparse, family = 'binomial')
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred = predict(glm1, testSparse, type='response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
table(testSparse$Negative, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.80845
# Test ACC = 0.80845
pred2 = predict(glm1, type='response')
table(trainSparse$Negative, pred2 > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.94915
# Train ACC = 0.94915
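The gap between the training accuracy (0.94915) and the test accuracy (0.80845) is a clear sign of overfitting, consistent with the rank-deficiency warning. Since ROCR is already loaded, a threshold-free comparison via AUC is a natural extra check (a sketch; the value is not from the original run):

rocrPred = prediction(pred, testSparse$Negative)
performance(rocrPred, "auc")@y.values[[1]]   # test-set AUC for the glm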


Discussion topics:
  ■ The text-analytics workflow
    ● Build a corpus from a vector of documents
    ● Clean the corpus (character encoding, lowercasing, punctuation, stop words, stemming)
    ● Build the document-term matrix
    ● Convert it to a data frame
    ● Build models and make predictions

  ■ Compare the accuracy of the three models. What patterns do you observe?
    ●
    ●