## 文本分类示例####

## Set the working directory; the data files loaded later in this script are
## referenced by bare file name, i.e. relative to this directory.
## NOTE(review): this is a machine-specific absolute path.
setwd(dir = "/Users/Daitu/数据分析/CCF/Sogou画像")
getwd()
## [1] "/Users/daitu/数据分析/CCF/Sogou画像"
## 加载所需要的包
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(tm)
## Loading required package: NLP
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 1. Load the segmented corpora and draw the working samples
## NOTE(review): the training .RData files are presumed to provide `results`
## and `resultsID`, and the prediction-set files `resultstest` and
## `resultstestID` -- confirm against the files.
load("训练集分词结果.RData")
load("训练集分词标记.RData")

## Fix the RNG seed so that both samples drawn below are reproducible
set.seed(1234)
## Sample 1000 rows of the training set
index <- base::sample(seq_len(nrow(resultsID)), 1000)

train_samp <- results[index]
trainID_sap <- resultsID[index, ]

load("预测集分词标记.RData")
load("预测集分词结果.RData")

## Sample 100 rows of the prediction set. The indices are drawn from the
## prediction set's own row count, because they are used to subset
## `resultstest` and `resultstestID`; drawing them from the training table
## could yield out-of-range indices or never reach part of the prediction set.
index <- base::sample(seq_len(nrow(resultstestID)), 100)

test_samp <- resultstest[index]
test_sap <- data.frame(ID = resultstestID[index, ])
## Combine the segmentation results: training sample first, then the
## prediction sample, so a single document-term matrix covers both
traindata <- c(train_samp, test_samp)
length(traindata)
## [1] 1100
# Create the document-term matrix with tf-idf weighting; wordLengths starts
# at 1 so that single-character terms are kept
corpus <- Corpus(VectorSource(traindata))
train_dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    wordLengths = c(1, Inf),
    weighting = weightTfIdf
  )
)
## Inspect the number of terms
train_dtm
## <<DocumentTermMatrix (documents: 1100, terms: 64934)>>
## Non-/sparse entries: 216005/71211395
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Reduce the sparse document-term matrix
# Remove sparse terms: with sparse = 0.99 every term that is absent from at
# least 99% of the documents is dropped, so only terms with a sparsity below
# 0.99 remain (the matrix as a whole is still mostly empty, see below).
train_dtmr <- removeSparseTerms(train_dtm, sparse = 0.99)
train_dtmr
## <<DocumentTermMatrix (documents: 1100, terms: 2940)>>
## Non-/sparse entries: 99609/3134391
## Sparsity           : 97%
## Maximal term length: 11
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Hold-out evaluation of the Gender classifier on the labelled sample
## Build the container: rows 1-800 train the model, rows 801-1000 are
## classified and compared with their known labels (virgin = FALSE)
container <- create_container(
  train_dtmr,
  trainID_sap$Gender,
  trainSize = 1:800,
  testSize = 801:1000,
  virgin = FALSE
)

## Train the SVM model (timed)
system.time(SVM <- train_model(container, "SVM"))
##    user  system elapsed 
##   3.737   0.016   3.756

## Classify the hold-out rows with the trained model (timed)
system.time(SVM_CLASSIFY <- classify_model(container, SVM))
##    user  system elapsed 
##   0.381   0.001   0.381

## Analytics
analytics <- create_analytics(container, SVM_CLASSIFY)

summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                   1               0.8
## 
## 
## ALGORITHM PERFORMANCE
## 
## SVM_PRECISION    SVM_RECALL    SVM_FSCORE 
##         0.800         0.805         0.800
# Extract the summary data frames from the analytics object
doc_summary <- analytics@document_summary
ens_summary <- analytics@ensemble_summary
alg_summary <- analytics@algorithm_summary
topic_summary <- analytics@label_summary
## Prediction: Gender ------------------------------------------------------

## Build the container: rows 1-1000 (labelled training sample) train the
## model, rows 1001-1100 (prediction-set sample) are classified. Those rows
## have no labels, so the container is declared virgin; with virgin = FALSE
## the analytics compare against missing labels and report NaN scores.
container <- create_container(train_dtmr, trainID_sap$Gender,
                              trainSize = 1:1000, testSize = 1001:1100,
                              virgin = TRUE)

## Train the SVM model (timed)
system.time({
  SVM <- train_model(container, "SVM")
})
##    user  system elapsed 
##   5.731   0.039   5.796

## Classify rows 1001-1100 with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, SVM)
})
##    user  system elapsed 
##   0.370   0.003   0.375

## Analytics for unlabelled data (no precision/recall can be computed)
analytics <- create_analytics(container, SVM_CLASSIFY)

summary(analytics)

# Per-document summary, for inspection
doc_summary <- analytics@document_summary
head(doc_summary)

## Store the predicted gender; the label is read straight from the
## classification result, one row per classified document
test_sap$Gender <- SVM_CLASSIFY$SVM_LABEL

## Prediction: Age ---------------------------------------------------------

## Build the container: rows 1-1000 (labelled training sample) train the
## model, rows 1001-1100 (prediction-set sample) are classified. Those rows
## have no labels, so the container is declared virgin; with virgin = FALSE
## the analytics compare against missing labels and report NaN scores.
container <- create_container(train_dtmr, trainID_sap$Age,
                              trainSize = 1:1000, testSize = 1001:1100,
                              virgin = TRUE)

## Train the SVM model (timed)
system.time({
  SVM <- train_model(container, "SVM")
})
##    user  system elapsed 
##   7.963   0.043   8.030

## Classify rows 1001-1100 with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, SVM)
})
##    user  system elapsed 
##   0.372   0.001   0.376

## Analytics for unlabelled data (no precision/recall can be computed)
analytics <- create_analytics(container, SVM_CLASSIFY)

summary(analytics)

# Per-document summary, for inspection
doc_summary <- analytics@document_summary
head(doc_summary)

## Store the predicted age group; the label is read straight from the
## classification result, one row per classified document
test_sap$Age <- SVM_CLASSIFY$SVM_LABEL

## Prediction: Education ---------------------------------------------------

## Build the container: rows 1-1000 (labelled training sample) train the
## model, rows 1001-1100 (prediction-set sample) are classified. Those rows
## have no labels, so the container is declared virgin; with virgin = FALSE
## the analytics compare against missing labels and report NaN scores.
container <- create_container(train_dtmr, trainID_sap$Education,
                              trainSize = 1:1000, testSize = 1001:1100,
                              virgin = TRUE)

## Train the SVM model (timed)
system.time({
  SVM <- train_model(container, "SVM")
})
##    user  system elapsed 
##   7.563   0.048   7.632

## Classify rows 1001-1100 with the trained model (timed)
system.time({
  SVM_CLASSIFY <- classify_model(container, SVM)
})
##    user  system elapsed 
##   0.351   0.000   0.352

## Analytics for unlabelled data (no precision/recall can be computed)
analytics <- create_analytics(container, SVM_CLASSIFY)

summary(analytics)

# Per-document summary, for inspection
doc_summary <- analytics@document_summary
head(doc_summary)

## Store the predicted education level; the label is read straight from the
## classification result, one row per classified document
test_sap$Education <- SVM_CLASSIFY$SVM_LABEL


## Preview the assembled predictions
head(test_sap)
##                                 ID Gender Age Education
## 1 7F9C488CE0E596D65003F5E98A8C1C75      1   2         4
## 2 9BA77811D651837F38A1D09099272B9F      1   1         5
## 3 1C5A8199F6E8EFEE446F56A662CFF046      1   3         4
## 4 5497635A57A4BF6D77E00F6E137419FA      1   2         4
## 5 7EB0AA8CE69AD46271FD0C1951958546      1   2         4
## 6 CCEB0DF1CBE19CD10C5D0C888320B1DC      1   2         4
# Submission format:
# 1) The submitted data is a csv file: first column user ID, second column
#    age label, third column gender label, fourth column education label,
#    all separated by a single space.
#
# 2) The file uses "GBK" encoding.
## Save the data. test_sap holds its columns as ID, Gender, Age, Education,
## so they are selected by name here to get the required order
## ID, Age, Gender, Education in the written file.
write.table(test_sap[, c("ID", "Age", "Gender", "Education")],
            file = "samp_sub.csv", quote = FALSE, sep = " ",
            row.names = FALSE, col.names = FALSE, fileEncoding = "gbk")