packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
# Sys.setlocale("LC_ALL","C")
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)

# Read in the data
tweets = read.csv("data/tweets.csv", stringsAsFactors=FALSE)

# The sentiment scores skew high, so first recode the target variable so that
# the 1s and 0s are more balanced:
tweets$Negative = as.factor(tweets$Avg <= -1)
prop.table(table(tweets$Negative))
##
## FALSE TRUE
## 0.84589 0.15411
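# A quick look at the raw sentiment scores motivates the -1 cutoff; a small
# sketch, assuming Avg holds the average rater score for each tweet:
summary(tweets$Avg)      # distribution of the average sentiment scores
table(tweets$Avg <= -1)  # counts on either side of the cutoff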
# create corpus from vector
corpus = Corpus(VectorSource(tweets$Tweet))
corpus[[1]]$content # content of the first document
## [1] "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore"
corpus = tm_map(corpus, content_transformer(tolower))
corpus[[1]]$content # content of the first document
## [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
# some versions of tm may require the command below
# corpus = tm_map(corpus, PlainTextDocument)
corpus = tm_map(corpus, removePunctuation)
corpus[[1]]$content## [1] "i have to say apple has by far the best customer care service i have ever received apple appstore"
stopwords("english")[1:10]## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
corpus = tm_map(corpus, removeWords, c("apple", stopwords("english")))
corpus[[1]]$content## [1] " say far best customer care service ever received appstore"
corpus = tm_map(corpus, stemDocument)
corpus[[1]]$content## [1] "say far best custom care servic ever receiv appstor"
frequencies = DocumentTermMatrix(corpus)
frequencies
## <<DocumentTermMatrix (documents: 1181, terms: 3289)>>
## Non-/sparse entries: 8980/3875329
## Sparsity : 100%
## Maximal term length: 115
## Weighting : term frequency (tf)
# Look at matrix
inspect(frequencies[1000:1005, 505:515])
## <<DocumentTermMatrix (documents: 6, terms: 11)>>
## Non-/sparse entries: 1/65
## Sparsity : 98%
## Maximal term length: 23
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs asap courtsideassistappforio current follow idea kickbutt preinstal
## 1000 0 0 0 0 1 0 0
## 1001 0 0 0 0 0 0 0
## 1002 0 0 0 0 0 0 0
## 1003 0 0 0 0 0 0 0
## 1004 0 0 0 0 0 0 0
## 1005 0 0 0 0 0 0 0
## Terms
## Docs save ssd support
## 1000 0 0 0
## 1001 0 0 0
## 1002 0 0 0
## 1003 0 0 0
## 1004 0 0 0
## 1005 0 0 0
findFreqTerms(frequencies, lowfreq=20)
## [1] "say" "love" "iphon"
## [4] "iphone5" "new" "thank"
## [7] "phone" "can" "make"
## [10] "market" "one" "will"
## [13] "cant" "get" "just"
## [16] "updat" "fingerprint" "iphone5c"
## [19] "store" "time" "come"
## [22] "now" "use" "back"
## [25] "anyon" "work" "app"
## [28] "android" "think" "ipad"
## [31] "well" "freak" "dont"
## [34] "via" "better" "like"
## [37] "pleas" "samsung" "want"
## [40] "batteri" "ios7" "microsoft"
## [43] "itun" "buy" "releas"
## [46] "look" "appl" "need"
## [49] "googl" "twitter" "ipod"
## [52] "ipodplayerpromo" "promoipodplayerpromo" "lol"
## [55] "realli" "promo"
sparse = removeSparseTerms(frequencies, 0.995)
sparse
## <<DocumentTermMatrix (documents: 1181, terms: 309)>>
## Non-/sparse entries: 4669/360260
## Sparsity : 99%
## Maximal term length: 20
## Weighting : term frequency (tf)
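# The 0.995 threshold keeps only terms appearing in at least 0.5% of the 1181
# tweets (about 6 documents). A sketch of how the vocabulary shrinks as the
# threshold tightens:
for (s in c(0.999, 0.995, 0.99, 0.98)) {
  cat(s, "->", ncol(removeSparseTerms(frequencies, s)), "terms\n")
}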
# Convert to a data frame
tweetsSparse = as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(tweetsSparse) = make.names(colnames(tweetsSparse))
# Add target variable
tweetsSparse$Negative = tweets$Negative

library(caTools)
set.seed(123)
split = sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse = subset(tweetsSparse, split==TRUE)
testSparse = subset(tweetsSparse, split==FALSE)
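# sample.split stratifies on the outcome, so both partitions should keep the
# roughly 85/15 class ratio; a quick check:
prop.table(table(trainSparse$Negative))
prop.table(table(testSparse$Negative))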
# quiz: find the words whose frequency is greater than or equal to 100
findFreqTerms(frequencies, lowfreq=100)
## [1] "iphon" "new" "itun"
library(rpart)
library(rpart.plot)
tweetCART = rpart(Negative ~ ., data=trainSparse, method="class")
prp(tweetCART)

# Evaluate the performance of the model
predictCART = predict(tweetCART, newdata=testSparse, type="class")
table(testSparse$Negative, predictCART)
## predictCART
## FALSE TRUE
## FALSE 294 6
## TRUE 37 18
(294+18)/(294+6+37+18) # ACC = 0.87887
## [1] 0.87887
# Baseline accuracy
table(testSparse$Negative)
##
## FALSE TRUE
## 300 55
300/(300+55) # ACC = 0.84507
## [1] 0.84507
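# Accuracy at a single 0.5 cutoff is only part of the picture. ROCR (loaded
# above but not used yet) gives a threshold-free view; a sketch of the
# test-set ROC curve and AUC for the CART model:
predCARTprob = predict(tweetCART, newdata=testSparse)[,2]  # P(Negative = TRUE)
rocrPred = prediction(predCARTprob, testSparse$Negative)
plot(performance(rocrPred, "tpr", "fpr"))                  # ROC curve
as.numeric(performance(rocrPred, "auc")@y.values)          # AUC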
library(randomForest)
set.seed(123)
tweetRF = randomForest(Negative ~ ., data=trainSparse)

# Make predictions:
predictRF = predict(tweetRF, newdata=testSparse)
table(testSparse$Negative, predictRF) %>% {sum(diag(.)) / sum(.)}
## [1] 0.88732
# (293+21)/(293+7+34+23) # ACC = 0.87955
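# randomForest also records which stemmed terms drive its votes; a sketch
# using its built-in Gini importance:
imp = importance(tweetRF)
head(imp[order(-imp[, "MeanDecreaseGini"]), , drop=FALSE])  # top terms
varImpPlot(tweetRF)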
# quiz: glm model
glm1 = glm(Negative ~ ., data=trainSparse, family = 'binomial')
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred = predict(glm1, testSparse, type='response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == :
## prediction from a rank-deficient fit may be misleading
table(testSparse$Negative, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.80845
# Test ACC = 0.80845

pred2 = predict(glm1, type='response')
table(trainSparse$Negative, pred2 > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.94915
# Train ACC = 0.94915
# The wide gap between training (0.94915) and test (0.80845) accuracy suggests
# the unregularized logistic model overfits the ~300 term predictors.
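# Putting the three classifiers on the same threshold-free scale makes the
# comparison easier; a sketch reusing ROCR (the helper auc() is illustrative):
auc = function(p, labels) as.numeric(performance(prediction(p, labels), "auc")@y.values)
auc(predict(tweetCART, newdata=testSparse)[,2], testSparse$Negative)            # CART
auc(predict(tweetRF, newdata=testSparse, type="prob")[,2], testSparse$Negative) # random forest
auc(pred, testSparse$Negative)                                                  # logistic regression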
Discussion topics:
■ The text-analytics workflow
● Build a corpus from a vector of documents
● Clean the corpus (encoding, lowercasing, punctuation, stop words, stemming)
● Build the document-term matrix
● Convert it to a data frame
● Build the models and make predictions
■ Compare the accuracy of the three models. What patterns can you observe?
●
●