## Text classification example ####
## Set the working directory; the load() calls further down use bare file
## names, so they are resolved relative to this folder.
## NOTE(review): this is a hard-coded, machine-specific absolute path.
setwd("/Users/Daitu/数据分析/CCF/Sogou画像")
getwd()
## [1] "/Users/daitu/数据分析/CCF/Sogou画像"
## 加载所需要的包
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(tm)
## Loading required package: NLP
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## 1. Load the segmented documents and draw the working samples ----
## Training set: the code below indexes `results` (documents) and `resultsID`
## (label rows), so the two .RData files are assumed to provide those objects
## -- TODO confirm against the files.
load("训练集分词结果.RData")
load("训练集分词标记.RData")
set.seed(1234)
## Draw 1000 labelled documents; the same index selects their label rows.
index <- base::sample(seq_len(nrow(resultsID)), 1000)
train_samp <- results[index]
trainID_sap <- resultsID[index, ]
## Prediction set: assumed to provide `resultstest` and `resultstestID`.
load("预测集分词标记.RData")
load("预测集分词结果.RData")
## Draw 100 prediction documents. The index range must come from the
## prediction set itself: sampling from nrow(resultsID) would yield NA rows
## when the prediction set is smaller than the training set, and would never
## reach its tail when it is larger.
index <- base::sample(seq_len(nrow(resultstestID)), 100)
test_samp <- resultstest[index]
test_sap <- data.frame(ID = resultstestID[index, ])
## Combine the segmented documents: training documents occupy positions
## 1-1000 and prediction documents positions 1001-1100.
traindata <- c(train_samp, test_samp)
length(traindata)
## [1] 1100
# Build the document-term matrix ----
# A single corpus holds every element of `traindata`, so all documents share
# one vocabulary and one set of TF-IDF weights.
corpus <- Corpus(VectorSource(traindata))
train_dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    wordLengths = c(1, Inf), # keep terms of any length, including 1 character
    weighting = weightTfIdf
  )
)
## Inspect the number of terms
train_dtm
## <<DocumentTermMatrix (documents: 1100, terms: 64934)>>
## Non-/sparse entries: 216005/71211395
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Reduce the dimensionality of the sparse matrix #####
# Drop very rare terms: with a threshold of 0.99 only terms whose sparsity is
# at most 99% are kept, i.e. terms present in roughly 1% or more of the
# documents.
train_dtmr <- removeSparseTerms(train_dtm, 0.99)
train_dtmr
## <<DocumentTermMatrix (documents: 1100, terms: 2940)>>
## Non-/sparse entries: 99609/3134391
## Sparsity : 97%
## Maximal term length: 11
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Hold-out evaluation on the labelled sample ----
## Rows 1-800 of the matrix train the model; rows 801-1000 are classified and
## scored against their known Gender labels.
container <- create_container(
  matrix = train_dtmr,
  labels = trainID_sap$Gender,
  trainSize = 1:800,
  testSize = 801:1000,
  virgin = FALSE
)
## Train the SVM (timed)
system.time({
  SVM <- train_model(container, algorithm = "SVM")
})
## user system elapsed
## 3.737 0.016 3.756
## Classify the held-out rows with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, model = SVM)
})
## user system elapsed
## 0.381 0.001 0.381
## 5. Analytics
analytics <- create_analytics(container, classification_results = SVM_CLASSIFY)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1 0.8
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.800 0.805 0.800
# Pull the individual summary data frames out of the analytics object
topic_summary <- analytics@label_summary
alg_summary <- analytics@algorithm_summary
ens_summary <- analytics@ensemble_summary
doc_summary <- analytics@document_summary
## Prediction: Gender ------------------------------------------------------
## Train on all 1000 labelled sample documents (matrix rows 1-1000) and
## classify the 100 unlabelled prediction documents (rows 1001-1100).
## virgin = TRUE because the rows being classified have no manual codes:
## trainID_sap$Gender holds only 1000 labels, so with virgin = FALSE the
## precision/recall/F-score summary came out as NaN and MANUAL_CODE as NA.
container <- create_container(train_dtmr, trainID_sap$Gender,
                              trainSize = 1:1000, testSize = 1001:1100,
                              virgin = TRUE)
## Train the SVM (timed)
system.time({
  SVM <- train_model(container, "SVM")
})
## Timing from an earlier run:
## user system elapsed
## 5.731 0.039 5.796
## Classify the prediction rows with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, SVM)
})
## Timing from an earlier run:
## user system elapsed
## 0.370 0.003 0.375
## 5. Analytics (label/document summaries only; there are no true labels to
## score against)
analytics <- create_analytics(container, SVM_CLASSIFY)
summary(analytics)
doc_summary <- analytics@document_summary
head(doc_summary)
## Take the predicted label straight from the classifier output, which always
## carries the SVM_LABEL column, rather than from the analytics summary.
test_sap$Gender <- SVM_CLASSIFY$SVM_LABEL
## Prediction: Age ---------------------------------------------------------
## Train on all 1000 labelled sample documents (matrix rows 1-1000) and
## classify the 100 unlabelled prediction documents (rows 1001-1100).
## virgin = TRUE because the rows being classified have no manual codes:
## trainID_sap$Age holds only 1000 labels, so with virgin = FALSE the
## precision/recall/F-score summary came out as NaN and MANUAL_CODE as NA.
container <- create_container(train_dtmr, trainID_sap$Age,
                              trainSize = 1:1000, testSize = 1001:1100,
                              virgin = TRUE)
## Train the SVM (timed)
system.time({
  SVM <- train_model(container, "SVM")
})
## Timing from an earlier run:
## user system elapsed
## 7.963 0.043 8.030
## Classify the prediction rows with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, SVM)
})
## Timing from an earlier run:
## user system elapsed
## 0.372 0.001 0.376
## 5. Analytics (label/document summaries only; there are no true labels to
## score against)
analytics <- create_analytics(container, SVM_CLASSIFY)
summary(analytics)
doc_summary <- analytics@document_summary
head(doc_summary)
## Take the predicted label straight from the classifier output, which always
## carries the SVM_LABEL column, rather than from the analytics summary.
test_sap$Age <- SVM_CLASSIFY$SVM_LABEL
## Prediction: Education ---------------------------------------------------
## Train on all 1000 labelled sample documents (matrix rows 1-1000) and
## classify the 100 unlabelled prediction documents (rows 1001-1100).
## virgin = TRUE because the rows being classified have no manual codes:
## trainID_sap$Education holds only 1000 labels, so with virgin = FALSE the
## precision/recall/F-score summary came out as NaN and MANUAL_CODE as NA.
container <- create_container(train_dtmr, trainID_sap$Education,
                              trainSize = 1:1000, testSize = 1001:1100,
                              virgin = TRUE)
## Train the SVM (timed)
system.time({
  SVM <- train_model(container, "SVM")
})
## Timing from an earlier run:
## user system elapsed
## 7.563 0.048 7.632
## Classify the prediction rows with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, SVM)
})
## Timing from an earlier run:
## user system elapsed
## 0.351 0.000 0.352
## 5. Analytics (label/document summaries only; there are no true labels to
## score against)
analytics <- create_analytics(container, SVM_CLASSIFY)
summary(analytics)
doc_summary <- analytics@document_summary
head(doc_summary)
## Take the predicted label straight from the classifier output, which always
## carries the SVM_LABEL column, rather than from the analytics summary.
test_sap$Education <- SVM_CLASSIFY$SVM_LABEL
##
## Preview the assembled predictions
head(test_sap)
## ID Gender Age Education
## 1 7F9C488CE0E596D65003F5E98A8C1C75 1 2 4
## 2 9BA77811D651837F38A1D09099272B9F 1 1 5
## 3 1C5A8199F6E8EFEE446F56A662CFF046 1 3 4
## 4 5497635A57A4BF6D77E00F6E137419FA 1 2 4
## 5 7EB0AA8CE69AD46271FD0C1951958546 1 2 4
## 6 CCEB0DF1CBE19CD10C5D0C888320B1DC 1 2 4
# Submission format:
# 1) The submitted file is a csv whose first column is the user ID, second
#    column the age label, third column the gender label and fourth column
#    the education label, all separated by a single space.
# 2) The file uses "GBK" encoding.
## Save the data
## test_sap holds its columns as ID, Gender, Age, Education (see the preview
## above), which is not the required order; select them explicitly as
## ID, Age, Gender, Education so each label lands in the column the format
## asks for.
submission <- test_sap[, c("ID", "Age", "Gender", "Education")]
write.table(submission, file = "samp_sub.csv", quote = FALSE, sep = " ",
            row.names = FALSE, col.names = FALSE, fileEncoding = "gbk")