# References:
#   https://github.com/victorneo/Twitter-Sentimental-Analysis/blob/master/happy.txt
#   https://chengjun.github.io/en/2014/04/sentiment-analysis-with-machine-learning-in-R/
#   http://www.svm-tutorial.com/2014/11/svm-classify-text-r/
# This tutorial illustrates how to use R for machine learning.
########################################################################################
# "Sentiment analysis with machine learning"
# Under the hood, RTextTools uses the e1071 package, which is an R wrapper around libsvm.
#######################################################################################
library(RTextTools)
library(e1071)
####
# TMS.R is an rJava wrapper around the Korean Morphological Analyzer from Saltlux Inc.
####
# Load TMS.R, which is expected to define the `TMS` object used below.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where that file exists.
source("/Users/ivan/R/TMS.R")

# Sample Korean sentence used to demonstrate the three TMS calls.
content <- "중국 간쑤성에서 한 남성이 총기를 난사해 11명이 죽거나 다치는 사건이 발생했다."

# 1) Part-of-speech tagging of the sample sentence.
posTag <- TMS$getPOS(content)
posTag
## [1] "KOR `중국`[`중국`/NN:0:2] `간쑤성에서`[`간쑤성`/NN:3:6+`에서`/J:6:8] `한`[`한`/M:9:10] `남성이`[`남성`/NN:11:13+`이`/J:13:14] `총기를`[`총기`/NN:15:17+`를`/J:17:18] `난사해`[`난사`/NN:19:21+`해`/X:21:22] `11명이`[`11`/SN:23:25+`명`/NN:25:26+`이`/J:26:27] `죽거나`[`죽`/VV:28:29+`거나`/E:29:31] `다치는`[`다치`/VV:32:34+`는`/E:34:35] `사건이`[`사건`/NN:36:38+`이`/J:38:39] `발생했다.`[`발생`/NN:40:42+`했`/X:42:43+`다`/E:43:44+`.`/S:44:45]\n"

# 2) Chunking of the same sentence.
chunked <- TMS$getChunked(content)
chunked
## [1] "중국_간쑤성 중국 간쑤성\n남성\n총기\n난사\n11명\n사건\n발생\n\n"

# 3) Feature string ("token/TAG" pairs); this is the call used to build the
#    training and test rows further down.
om_features <- TMS$getFeaturesOM(content)
om_features
## [1] "중국/NN 간쑤성/NN 한/M 남성/NN 총기/NN 난사/NN 해/X 명/NN 죽/VV 다치/VV 사건/NN 발생/NN 했/X"
# Training examples labelled "positive". Each row is c(features, label), where
# the features come from TMS$getFeaturesOM() applied to the raw tweet text.
# NOTE(review): the fourth tweet ("정말 재미 하나도 없는 런닝맨..") reads as
# negative -- it calls the show not funny at all and expects it to be
# cancelled -- yet it is labelled "positive". Confirm the intended label; it is
# left unchanged here so the recorded outputs below stay reproducible.
pos_tweets <- rbind(
  c(TMS$getFeaturesOM("차사고 싶다. 쉐보레껄루.. 이너넷보면 흉기차흉기차하고 삼성차도 별로란 거 보면 그나마 쉐보레가 낫지 않을까. 연비 좀 떨어져도..ㅠㅠ 비싸도.ㅠㅠ 지인은 평소엔 당연 연비생각했는데, 사고 한번 나보니까 다음엔 무조건 SUV타고 싶단다.ㅋㅋ"), "positive"),
  c(TMS$getFeaturesOM("소니가 렌즈는 괜찮은데 아직까진 캐논이 갑..."), "positive"),
  c(TMS$getFeaturesOM("겨울왕국다시봐도 감동 꿀잼이다ㅜㅜ"), "positive"),
  c(TMS$getFeaturesOM("정말 재미 하나도 없는 런닝맨..게스트가 지난주에 이어 더욱 형편 없는 돌대가리를 선정했군.. 이제 런닝맨도 폐지될 날만 남은 듯.."), "positive"),
  c(TMS$getFeaturesOM("런닝맨이 정말 재미있어"), "positive")
)
# Training examples labelled "negative". Each row is c(features, label), where
# the features come from TMS$getFeaturesOM() applied to the raw tweet text.
neg_tweets <- rbind(
  c(TMS$getFeaturesOM("런닝맨에서 실망한거 신화방송에서 힐링ㅋㅋ"), "negative"),
  c(TMS$getFeaturesOM("겨울왕국 유치하고 재미없더라.."), "negative"),
  c(TMS$getFeaturesOM("코카콜라 전제품 가격 또 올렸다면서?ㅋ 안마셔 이 개새끼들아!"), "negative"),
  c(TMS$getFeaturesOM("아런닝맨 좀 폐지하면 안되냐 재미 드럽게 없는데"), "negative"),
  c(TMS$getFeaturesOM("코카콜라만 먹지마라는게 아님 코카콜라라벨붙은건 다 먹지마세여"), "negative")
)
# Held-out evaluation set, built the same way as the training sets:
# each row is c(features, label).
# Numeric label codes used later: 2 = positive ; 1 = negative
test_tweets <- rbind(
  c(TMS$getFeaturesOM("런닝맨이 보고있다. 많이 웃고싶어. 정말 재미있네~"), "positive"),
  c(TMS$getFeaturesOM("극장을 못가서 올레tv로 겨울왕국 사서 봤는데 시발 존잼 꿀잼"), "positive"),
  c(TMS$getFeaturesOM("내가 볼때 참 신기한 예능프로는 런닝맨. 이건 진짜 뭔 재미로 보는건지"), "negative"),
  c(TMS$getFeaturesOM("까놓고 말함 코카콜라 개새끼 망해라"), "negative"),
  c(TMS$getFeaturesOM("미친 코카콜라가 3000원을넘기다니 "), "negative")
)
# Stack the labelled sets: training rows first (positive, then negative),
# held-out test rows last. Column 1 holds the feature string, column 2 the label.
tweets <- rbind(pos_tweets, neg_tweets, test_tweets)

# Derive the train/test row ranges from the actual set sizes rather than
# hard-coding 1:10 and 11:15, so adding or removing tweets above cannot leave
# the split silently out of date. For the current data this yields 1:10 / 11:15.
n_train <- nrow(pos_tweets) + nrow(neg_tweets)
train_rows <- seq_len(n_train)
test_rows <- n_train + seq_len(nrow(test_tweets))

# Build the document-term matrix from the feature strings.
# NOTE(review): `matrix` shadows the name of base::matrix; the name is kept
# because later statements in this script refer to it.
# NOTE(review): language is "english" although the text is Korean; presumably
# harmless because stop-word removal and stemming are both disabled -- confirm
# against the RTextTools documentation.
matrix <- create_matrix(
  tweets[, 1],
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE, # we can also removeSparseTerms
  stemWords = FALSE
)

# Bundle the matrix with the response variable and the train/test split.
# Labels are coded via as.factor(), whose levels are sorted, so
# "negative" -> 1 and "positive" -> 2.
container <- create_container(
  matrix,
  as.numeric(as.factor(tweets[, 2])),
  trainSize = train_rows,
  testSize = test_rows,
  virgin = FALSE
)
# Train a linear-kernel SVM (cost = 1) on the container's training rows.
model_svm <- train_model(container, "SVM", kernel = "linear", cost = 1)

# Rows of `tweets` / `matrix` that hold the test set: everything after the two
# training sets. This must match the testSize given to create_container()
# (11:15 for the current data).
test_rows <- nrow(pos_tweets) + nrow(neg_tweets) + seq_len(nrow(test_tweets))

# True label codes for the test rows. The labels are coded over the FULL label
# column and then subset, so the codes always agree with the ones passed to
# create_container(). Coding only the test subset (as.factor on
# tweets[11:15, 2]) would renumber the classes if the test set happened to
# contain a single class.
test_codes <- as.numeric(as.factor(tweets[, 2]))[test_rows]

# Test the validity: predict the held-out rows.
predicted <- predict(model_svm, matrix[test_rows, ])
predicted
## 1 2 3 4 5
## 2 1 1 1 1
## Levels: 1 2
# Confusion matrix: true label (rows) against predicted code (columns).
table(tweets[test_rows, 2], predicted)
## predicted
## 1 2
## negative 3 0
## positive 1 1
# Share of test rows whose predicted code equals the true code.
recall_accuracy(test_codes, predicted)
## [1] 0.8
# Train several algorithms on the same container and classify the test rows
# with each of them (input for the ensemble analytics).
# Label codes: 2 = positive ; 1 = negative
models <- train_models(
  container,
  algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE")
)
results <- classify_models(container, models)

# Rows of `tweets` that hold the test set: everything after the two training
# sets. This must match the testSize given to create_container() (11:15 for the
# current data).
test_rows <- nrow(pos_tweets) + nrow(neg_tweets) + seq_len(nrow(test_tweets))

# True label codes for the test rows, computed once instead of being repeated
# in every call below. The labels are coded over the FULL label column and then
# subset, so the codes always agree with the ones passed to create_container();
# coding only the test subset would renumber the classes if the test set
# happened to contain a single class.
test_codes <- as.numeric(as.factor(tweets[, 2]))[test_rows]

# Accuracy tables: true code (rows) against predicted code (columns).
# deparse.level = 0 keeps the dimension headers blank, as in the output below.
table(test_codes, results[, "FORESTS_LABEL"], deparse.level = 0)
##
## 1 2
## 1 3 0
## 2 1 1
table(test_codes, results[, "MAXENTROPY_LABEL"], deparse.level = 0)
##
## 1 2
## 1 2 1
## 2 1 1
# Recall accuracy per algorithm.
recall_accuracy(test_codes, results[, "FORESTS_LABEL"])
## [1] 0.8
recall_accuracy(test_codes, results[, "MAXENTROPY_LABEL"])
## [1] 0.6
recall_accuracy(test_codes, results[, "TREE_LABEL"])
## [1] 0.6
recall_accuracy(test_codes, results[, "BAGGING_LABEL"])
## [1] 0.8
recall_accuracy(test_codes, results[, "SVM_LABEL"])
## [1] 0.4
# Model summary: combine the container and the per-algorithm classification
# results into an analytics object, then print its summary.
analytics <- create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.80
## n >= 2 1.0 0.80
## n >= 3 1.0 0.80
## n >= 4 0.8 0.75
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.415 0.415 0.400
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 0.875 0.750 0.765
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.875 0.750 0.765
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 0.300 0.500 0.375
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.585 0.585 0.585

# Per-document view: each algorithm's label and probability next to the manual
# code and the consensus columns (first rows only).
head(analytics@document_summary)
## MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB FORESTS_LABEL
## 1 2 0.9999615 1 0.9866280 2
## 2 1 0.8676100 2 0.5794699 1
## 3 2 0.7206822 1 0.6577971 1
## 4 1 0.9669875 2 0.7324802 1
## 5 1 0.8912894 2 0.6607808 1
## FORESTS_PROB BAGGING_LABEL BAGGING_PROB TREE_LABEL TREE_PROB MANUAL_CODE
## 1 0.620 2 0.60 1 0.5 2
## 2 0.650 1 0.96 1 0.5 2
## 3 0.595 1 0.72 1 0.5 1
## 4 0.740 1 0.80 1 0.5 1
## 5 0.720 1 0.80 1 0.5 1
## CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE
## 1 2 3 0 2
## 2 1 4 1 1
## 3 1 4 0 2
## 4 1 4 0 1
## 5 1 4 0 1
## PROBABILITY_INCORRECT
## 1 0
## 2 1
## 3 1
## 4 0
## 5 0

# Ensemble agreement table on its own (coverage and recall by n-ensemble).
analytics@ensemble_summary
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.80
## n >= 2 1.0 0.80
## n >= 3 1.0 0.80
## n >= 4 0.8 0.75