Performing Text Mining with Machine Learning Algorithms for Document Classification of the Reuters Dataset, and Comparing the Performance of Different Classification Models
knitr::opts_chunk$set(message = FALSE, echo = TRUE)
library("tm")
library("plyr")
library("class")
library("SnowballC")
library("stringr")
library("wordcloud")
library("RTextTools")
library("e1071")
library("tidyr")
library("DT")
library("ggplot2")Downloaded the Reuters Dataset from
Reuters-21578 DataSet can be downloaded as zip from : http://disi.unitn.it/moschitti/corpora.htm
Uploaded at https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week10A/Data
For this assignment, the zip was downloaded and extracted locally for different categories . The Reuters dataset consists of 90 categories and classified into Test and Training as separate dataset for each category.
Here the categories of ‘Trade’ and ‘MoneyFx’ is used to perform document classification.
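As a quick sanity check before building the corpora, we can confirm the directory layout. This is a hedged sketch; the paths assume the zip was extracted to F:/Data/Reuters21578_90Cat as used below.
# List the category folders and count documents in the two chosen categories
length(list.dirs("F:/Data/Reuters21578_90Cat/training", recursive = FALSE))  # ~90 category folders
length(list.files("F:/Data/Reuters21578_90Cat/training/trade"))  # training documents for trade
length(list.files("F:/Data/Reuters21578_90Cat/test/trade"))      # test documents for trade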
options(stringsAsFactors = FALSE)
datapath <- "F:/Data/Reuters21578_90Cat/training"
category <- c("trade", "moneyfx")
# Training data for the trade and moneyfx categories
trade.directory <- "F:/Data/Reuters21578_90Cat/training/trade"
moneyfx.directory <- "F:/Data/Reuters21578_90Cat/training/moneyfx"
# Test Data for Trade and MoneyFx categories
tradetest.dir <- "F:/Data/Reuters21578_90Cat/test/trade"
moneyfxtest.dir <- "F:/Data/Reuters21578_90Cat/test/moneyfx"
The corpora for the 'trade' and 'moneyfx' categories need to be cleaned of punctuation, numbers, and common English stopwords. Doing so lets the relevant terms surface for text mining and helps build a better classification model.
cleanupDataCorpus <- function(dataCorpus) {
    # Strip punctuation and numbers
    cleanDataCorpus <- tm_map(dataCorpus, removePunctuation)
    cleanDataCorpus <- tm_map(cleanDataCorpus, removeNumbers)
    # Replace any remaining punctuation characters with spaces; wrapping
    # non-tm functions in content_transformer() keeps the corpus valid
    cleanDataCorpus <- tm_map(cleanDataCorpus, content_transformer(str_replace_all),
        pattern = "[[:punct:]]", replacement = " ")
    cleanDataCorpus <- tm_map(cleanDataCorpus, content_transformer(tolower))
    # Drop English stopwords plus corpus-specific noise words, then stem
    cleanDataCorpus <- tm_map(cleanDataCorpus, removeWords, c("said", "u.s", stopwords("english")))
    cleanDataCorpus <- tm_map(cleanDataCorpus, stemDocument)
    cleanDataCorpus <- tm_map(cleanDataCorpus, PlainTextDocument)
    return(cleanDataCorpus)
}
trade.corpus <- Corpus(DirSource(directory = trade.directory, encoding = "ASCII"))
moneyfx.corpus <- Corpus(DirSource(directory = moneyfx.directory, encoding = "ASCII"))
trade.cleancorpus <- cleanupDataCorpus(trade.corpus)
moneyfx.cleancorpus <- cleanupDataCorpus(moneyfx.corpus)
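# Hedged sanity check: peek at the start of the first cleaned document to
# confirm the transformations took effect (lower-cased, stopwords removed,
# stemmed). content() is tm's accessor for a document's text.
head(content(trade.cleancorpus[[1]]))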
trade.tdm <- TermDocumentMatrix(trade.cleancorpus)
moneyfx.tdm <- TermDocumentMatrix(moneyfx.cleancorpus)
trade.tdm <- removeSparseTerms(trade.tdm, 0.7)
moneyfx.tdm <- removeSparseTerms(moneyfx.tdm, 0.7)
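# A sparsity threshold of 0.7 keeps only terms that appear in at least 30%
# of the documents. A hedged before/after comparison (exact term counts
# depend on the corpus):
dim(TermDocumentMatrix(trade.cleancorpus))  # full vocabulary: terms x docs
dim(trade.tdm)                              # pruned vocabulary after removeSparseTerms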
trade.datamatx <- t(data.matrix(trade.tdm))
moneyfx.datamatx <- t(data.matrix(moneyfx.tdm))
head(trade.datamatx)
## Terms
## Docs billion deficit dlrs exports foreign imports japan last
## character(0) 3 0 3 1 3 0 0 1
## character(0) 4 0 1 3 0 3 1 0
## character(0) 2 1 2 0 0 1 0 1
## character(0) 0 0 0 0 0 0 0 0
## character(0) 1 3 1 0 1 1 8 2
## character(0) 2 0 2 1 1 0 0 3
## Terms
## Docs march mln surplus told trad trade washington will year
## character(0) 0 2 1 0 0 3 0 0 1
## character(0) 1 1 4 0 0 2 0 0 3
## character(0) 1 3 4 1 0 0 0 0 1
## character(0) 1 0 0 1 1 2 0 3 0
## character(0) 1 0 1 0 4 11 1 3 1
## character(0) 1 0 4 1 0 4 2 0 6
head(moneyfx.datamatx)
## Terms
## Docs bank billion central dollar exchange march market money pct
## character(0) 4 0 0 0 0 1 1 1 10
## character(0) 3 0 0 0 2 1 6 4 1
## character(0) 2 0 1 2 3 1 0 0 0
## character(0) 1 4 1 0 0 2 2 2 2
## character(0) 1 1 0 0 0 1 2 2 0
## character(0) 1 0 0 0 0 1 2 2 0
## Terms
## Docs rate today
## character(0) 0 0
## character(0) 0 0
## character(0) 1 0
## character(0) 0 0
## character(0) 0 2
## character(0) 0 0
trade.df <- as.data.frame(trade.datamatx)
moneyfx.df <- as.data.frame(moneyfx.datamatx)
tradetest.corpus <- Corpus(DirSource(directory = tradetest.dir, encoding = "ASCII"))
moneyfxtest.corpus <- Corpus(DirSource(directory = moneyfxtest.dir, encoding = "ASCII"))
tradetest.cleancorpus <- cleanupDataCorpus(tradetest.corpus)
moneyfxtest.cleancorpus <- cleanupDataCorpus(moneyfxtest.corpus)
tradetest.tdm <- TermDocumentMatrix(tradetest.cleancorpus)
moneyfxtest.tdm <- TermDocumentMatrix(moneyfxtest.cleancorpus)
tradetest.tdm <- removeSparseTerms(tradetest.tdm, 0.7)
moneyfxtest.tdm <- removeSparseTerms(moneyfxtest.tdm, 0.7)
tradetest.datamatx <- t(data.matrix(tradetest.tdm))
moneyfxtest.datamatx <- t(data.matrix(moneyfxtest.tdm))
head(tradetest.datamatx)
## Terms
## Docs also april billion deficit dlrs exports foreign imports
## character(0) 4 2 5 0 6 4 1 4
## character(0) 0 1 6 2 0 1 0 2
## character(0) 4 2 1 0 1 4 0 4
## character(0) 1 1 9 1 6 2 5 2
## character(0) 0 1 0 0 0 1 0 1
## character(0) 1 1 0 0 0 0 0 0
## Terms
## Docs japan june last market mln new pct surplus tariffs told
## character(0) 9 0 4 1 1 1 2 3 4 1
## character(0) 0 0 0 0 0 0 10 0 0 0
## character(0) 14 0 1 7 0 1 0 0 0 0
## character(0) 0 0 2 1 1 0 6 13 0 1
## character(0) 3 0 0 0 0 0 0 0 1 0
## character(0) 2 0 0 1 0 1 0 0 1 1
## Terms
## Docs trad trade washington will year
## character(0) 1 14 1 2 4
## character(0) 0 2 0 0 2
## character(0) 0 4 0 4 1
## character(0) 0 6 1 0 6
## character(0) 1 3 1 1 0
## character(0) 2 3 2 3 0
head(moneyfxtest.datamatx)
## Terms
## Docs april bank dollar market new pct rate rates west
## character(0) 2 2 0 4 0 2 1 1 0
## character(0) 1 1 0 3 0 0 0 0 0
## character(0) 1 4 7 3 0 2 1 4 0
## character(0) 1 3 3 0 1 0 0 0 0
## character(0) 1 3 0 2 0 2 0 0 0
## character(0) 1 0 4 0 0 1 2 3 7
tradetest.df <- as.data.frame(tradetest.datamatx)
moneyfxtest.df <- as.data.frame(moneyfxtest.datamatx)
Next we filter the test data down to the columns (terms) that are also present in the training data, and then add the columns present only in the training data, so that the test and training sets share the same set of terms.
trade.df.mat <- as.matrix(trade.df)
tradetest.df.mat <- as.matrix(tradetest.df)
tradetestfiltered1 <- data.frame(tradetest.df.mat[, intersect(colnames(tradetest.df.mat),
colnames(trade.df.mat))])
tradetestfiltered2 <- read.table(textConnection(""), col.names = colnames(trade.df.mat))
tradetestfiltered <- rbind.fill(tradetestfiltered1, tradetestfiltered2)
ncol(tradetestfiltered)
## [1] 17
moneyfx.df.mat <- as.matrix(moneyfx.df)
moneyfxtest.df.mat <- as.matrix(moneyfxtest.df)
moneyfxtestfiltered1 <- data.frame(moneyfxtest.df.mat[, intersect(colnames(moneyfxtest.df.mat),
colnames(moneyfx.df.mat))])
moneyfxtestfiltered2 <- read.table(textConnection(""), col.names = colnames(moneyfx.df.mat))
moneyfxtestfiltered <- rbind.fill(moneyfxtestfiltered1, moneyfxtestfiltered2)
ncol(moneyfxtestfiltered)
## [1] 11
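The column-alignment trick above is worth unpacking: read.table() on an empty text connection yields a zero-row data frame that carries the training column names, and plyr's rbind.fill() pads any columns missing from either side with NA. A toy illustration with hypothetical columns x, y, z:
df_a <- data.frame(x = 1:2, y = 3:4)
df_b <- read.table(textConnection(""), col.names = c("x", "y", "z"))  # 0 rows, 3 columns
rbind.fill(df_a, df_b)  # df_a gains column z, filled with NA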
trade.df <- cbind(trade.df, category = rep("trade"))
moneyfx.df <- cbind(moneyfx.df, category = rep("moneyfx"))
tradetestfiltered <- cbind(tradetestfiltered, category = rep("trade"))
moneyfxtestfiltered <- cbind(moneyfxtestfiltered, category = rep("moneyfx"))
We now have one data frame per category, with the documents as rows, the terms as columns, and a category column appended to denote the class label.
We now stack the training data frames for both categories, and likewise the test data frames.
tdm.stackedcategory <- rbind.fill(trade.df, moneyfx.df)
nrow(trade.df)
## [1] 369
nrow(moneyfx.df)
## [1] 538
nrow(tdm.stackedcategory)
## [1] 907
tdm.stackedcategory[is.na(tdm.stackedcategory)] <- 0
ncol(trade.df)
## [1] 18
ncol(moneyfx.df)
## [1] 12
ncol(tdm.stackedcategory)
## [1] 27
tdm.teststackedcategory <- rbind.fill(tradetestfiltered, moneyfxtestfiltered)
nrow(tradetestfiltered)
## [1] 117
nrow(moneyfxtestfiltered)
## [1] 179
nrow(tdm.teststackedcategory)
## [1] 296
tdm.teststackedcategory[is.na(tdm.teststackedcategory)] <- 0
ncol(tradetestfiltered)
## [1] 18
ncol(moneyfxtestfiltered)
## [1] 12
ncol(tdm.teststackedcategory)
## [1] 27
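The stacked matrices have 27 columns: the union of the two vocabularies plus the shared category column. A quick hedged check of the overlap between the two categories' vocabularies:
# Columns common to both category data frames (includes 'category' itself)
intersect(colnames(trade.df), colnames(moneyfx.df))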
The total number of rows in the stacked training and test matrices equals the sum of the rows of the per-category matrices, as expected.
# findAssocs operates on a TermDocumentMatrix, not a data frame,
# so we query the trade TDM built earlier
findAssocs(trade.tdm, terms = "billion", corlimit = 0.3)
Trade Word Cloud
We find that words like 'trade', 'billion', and 'year' are the most prominent.
# Word Cloud for Trade
wordcloud(trade.cleancorpus, max.words = 20, random.order = FALSE, colors = brewer.pal(8,
    "Dark2"), scale = c(5, 0.5), vfont = c("gothic english", "plain"))
MoneyFx Word Cloud
We find that words like 'bank', 'market', 'pct', and 'dollar' are the most prominent.
# Word Cloud for Moneyfx
wordcloud(moneyfx.cleancorpus, max.words = 20, random.order = FALSE, colors = brewer.pal(8,
"Dark2"), scale = c(5, 0.5), vfont = c("serif", "plain"))Here we separate the training data and test data both from the category column so as to provide to train the model and compare it later for accuracy.
# training data with only category column specified
tdm.traincategorycol <- tdm.stackedcategory[, "category"]
# training data without category column specified
tdm.train <- tdm.stackedcategory[, !colnames(tdm.stackedcategory) %in% "category"]
# test data with only category column specified
tdm.testcategorycol <- tdm.teststackedcategory[, "category"]
# test data without category column specified
tdm.test <- tdm.teststackedcategory[, !colnames(tdm.teststackedcategory) %in% "category"]
knn.pred <- knn(tdm.train, tdm.test, tdm.traincategorycol)
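# class::knn defaults to k = 1, so each test document takes the label of its
# single nearest neighbour. A hedged sketch of how accuracy responds to a few
# odd values of k (odd values avoid voting ties in a two-class problem):
for (k in c(1, 3, 5, 7)) {
    pred <- knn(tdm.train, tdm.test, tdm.traincategorycol, k = k)
    cat("k =", k, "accuracy =", mean(pred == tdm.testcategorycol), "\n")
}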
# Accuracy Test
confusion_matx <- table(Predictions = knn.pred, Actual = tdm.testcategorycol)
confusion_matx
## Actual
## Predictions moneyfx trade
## moneyfx 179 5
## trade 0 112
# Calculating Accuracy
accuracy_1 <- sum(diag(confusion_matx))/nrow(tdm.test)
accuracy_1
## [1] 0.9831081
# Another way to calculate accuracy
true_moneyfx <- 179
true_trade <- 112
false_moneyfx <- 5
false_trade <- 0
total <- true_moneyfx + true_trade + false_moneyfx + false_trade
accuracy_2 <- (true_moneyfx + true_trade)/total
accuracy_2
## [1] 0.9831081
error <- 1 - accuracy_2
error
## [1] 0.01689189
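# Accuracy alone can hide class-specific weaknesses. A hedged sketch of
# per-class precision and recall for 'trade', read straight off the same
# confusion matrix (rows = predictions, columns = actual):
precision_trade <- confusion_matx["trade", "trade"] / sum(confusion_matx["trade", ])
recall_trade <- confusion_matx["trade", "trade"] / sum(confusion_matx[, "trade"])
precision_trade  # fraction of 'trade' predictions that were correct
recall_trade     # fraction of actual 'trade' documents recovered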
container <- create_container(tdm.train, tdm.stackedcategory$category, trainSize = 1:907,
virgin = FALSE)
predtestcontainer <- create_container(tdm.test, labels = 0:296, testSize = 1:296,
    virgin = FALSE)
SVM <- train_model(container, "SVM")
results.SVM <- classify_model(predtestcontainer, SVM)
datatable(results.SVM)
MAXENT <- train_model(container, "MAXENT")
results.MAXENT <- classify_model(predtestcontainer, MAXENT)
datatable(results.MAXENT)
BAGGING <- train_model(container, "BAGGING")
results.BAGGING <- classify_model(predtestcontainer, BAGGING)
datatable(results.BAGGING)
RF <- train_model(container, "RF")
results.RF <- classify_model(predtestcontainer, RF)
datatable(results.RF)
TREE <- train_model(container, "TREE")
results.TREE <- classify_model(predtestcontainer, TREE)
datatable(results.TREE)
cat_check <- data.frame(correct_cat = tdm.testcategorycol[1:296], svm = as.character(results.SVM[,
    1]), maxent = as.character(results.MAXENT[, 1]), bagging = as.character(results.BAGGING[,
    1]), rf = as.character(results.RF[, 1]), tree = as.character(results.TREE[, 1]),
    stringsAsFactors = FALSE)
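As an aside, RTextTools can also train and apply several algorithms in one pass. A minimal sketch, equivalent to the five separate calls above and assuming the same container objects:
models <- train_models(container, algorithms = c("SVM", "MAXENT", "BAGGING", "RF", "TREE"))
results.all <- classify_models(predtestcontainer, models)  # one column pair per algorithm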
# SVM Performance
svm_table <- table(cat_check[, 1] == cat_check[, 2])
addmargins(svm_table)
##
## FALSE TRUE Sum
## 6 290 296
psvm <- prop.table(svm_table)
psvm <- as.data.frame(psvm)
psvm$Model <- "SVM"
# MAXENT Performance
maxent_table <- table(cat_check[, 1] == cat_check[, 3])
addmargins(maxent_table)
##
## FALSE TRUE Sum
## 47 249 296
pmaxent <- prop.table(maxent_table)
pmaxent <- as.data.frame(pmaxent)
pmaxent$Model <- "MAXENT"
# Bagging Performance
bagging_table <- table(cat_check[, 1] == cat_check[, 4])
addmargins(bagging_table)
##
## FALSE TRUE Sum
## 10 286 296
pbag <- prop.table(bagging_table)
pbag <- as.data.frame(pbag)
pbag$Model <- "BAGGING"
# RF Performance
rf_table <- table(cat_check[, 1] == cat_check[, 5])
addmargins(rf_table)
##
## FALSE TRUE Sum
## 2 294 296
prf <- prop.table(rf_table)
prf <- as.data.frame(prf)
prf$Model <- "RF"
# Tree Performance
tree_table <- table(cat_check[, 1] == cat_check[, 6])
addmargins(tree_table)
##
## FALSE TRUE Sum
## 56 240 296
ptree <- prop.table(tree_table)
ptree <- as.data.frame(ptree)
ptree$Model <- "TREE"pmodels <- rbind.fill(as.data.frame(psvm), as.data.frame(pmaxent), as.data.frame(pbag),
as.data.frame(prf), as.data.frame(ptree))
colnames(pmodels) <- c("Status", "Frequency", "Model")
pmodels <- rbind(pmodels, c("FALSE", error, "KNN"), c("TRUE", accuracy_2, "KNN"))
# rbind() with character vectors coerces Frequency to character,
# so convert it back to numeric before plotting
pmodels$Frequency <- as.numeric(pmodels$Frequency)
datatable(pmodels)
ggplot(pmodels, aes(x = Model, y = Frequency)) + ggtitle("Model Performance Plot") +
    geom_point(aes(colour = Status), size = 4) + theme(axis.text.x = element_text(angle = 45,
    hjust = 1))
We find that the Random Forest and k-Nearest Neighbour models give the best performance in classifying the documents correctly, with 99.32% and 98.31% accuracy respectively, followed by the Support Vector Machine at 97.97% and Bagging at 96.62% (286 of 296 correct). Hence Random Forest would be the best model, based on the saved models, for classifying further test documents in the 'trade' and 'moneyfx' categories.
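Finally, a minimal sketch (the file name is hypothetical) of persisting the chosen model so that future documents can be classified without retraining:
saveRDS(RF, file = "reuters_rf_model.rds")    # save the trained Random Forest
RF.loaded <- readRDS("reuters_rf_model.rds")  # reload later for classification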