Performing Text Mining with Machine Learning Algorithms for Document Classification of the Reuters Dataset, and Comparing the Performance of Different Classification Models
knitr::opts_chunk$set(message = FALSE, echo = TRUE)
library("tm")
library("plyr")
library("class")
library("SnowballC")
library("stringr")
library("wordcloud")
library("RTextTools")
library("e1071")
library("tidyr")
library("DT")
library("ggplot2")Downloaded the Reuters Dataset from
Reuters-21578 DataSet can be downloaded as zip from : http://disi.unitn.it/moschitti/corpora.htm
Uploaded at https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week10A/Data
For this assignment, the zip was downloaded and extracted locally for different categories . The Reuters dataset consists of 90 categories and classified into Test and Training as separate dataset for each category.
Here the categories of ‘Trade’ and ‘MoneyFx’ is used to perform document classification.
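As a quick sanity check before building the corpora, we can confirm the directory layout. This is a hedged sketch; the paths assume the zip was extracted to F:/Data/Reuters21578_90Cat as used below.
# List the category folders and count documents in the two chosen categories
length(list.dirs("F:/Data/Reuters21578_90Cat/training", recursive = FALSE))  # ~90 category folders
length(list.files("F:/Data/Reuters21578_90Cat/training/trade"))  # training documents for trade
length(list.files("F:/Data/Reuters21578_90Cat/test/trade"))      # test documents for trade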
options(stringsAsFactors = FALSE)
datapath <- "F:/Data/Reuters21578_90Cat/training"
category <- c("trade", "moneyfx")
# Training data for the trade and moneyfx categories
trade.directory <- "F:/Data/Reuters21578_90Cat/training/trade"
moneyfx.directory <- "F:/Data/Reuters21578_90Cat/training/moneyfx"
# Test Data for Trade and MoneyFx categories
tradetest.dir <- "F:/Data/Reuters21578_90Cat/test/trade"
moneyfxtest.dir <- "F:/Data/Reuters21578_90Cat/test/moneyfx"
The corpora for the 'trade' and 'moneyfx' categories need to be cleaned of punctuation, numbers, and common English stopwords. Doing so lets the relevant terms surface for text mining and helps build a better classification model.
cleanupDataCorpus <- function(dataCorpus) {
    # Strip punctuation and numbers
    cleanDataCorpus <- tm_map(dataCorpus, removePunctuation)
    cleanDataCorpus <- tm_map(cleanDataCorpus, removeNumbers)
    # Replace any remaining punctuation characters with spaces; wrapping
    # non-tm functions in content_transformer() keeps the corpus valid
    cleanDataCorpus <- tm_map(cleanDataCorpus, content_transformer(str_replace_all),
        pattern = "[[:punct:]]", replacement = " ")
    cleanDataCorpus <- tm_map(cleanDataCorpus, content_transformer(tolower))
    # Drop English stopwords plus corpus-specific noise words, then stem
    cleanDataCorpus <- tm_map(cleanDataCorpus, removeWords, c("said", "u.s", stopwords("english")))
    cleanDataCorpus <- tm_map(cleanDataCorpus, stemDocument)
    cleanDataCorpus <- tm_map(cleanDataCorpus, PlainTextDocument)
    return(cleanDataCorpus)
}
trade.corpus <- Corpus(DirSource(directory = trade.directory, encoding = "ASCII"))
moneyfx.corpus <- Corpus(DirSource(directory = moneyfx.directory, encoding = "ASCII"))
trade.cleancorpus <- cleanupDataCorpus(trade.corpus)
moneyfx.cleancorpus <- cleanupDataCorpus(moneyfx.corpus)
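# Hedged sanity check: peek at the start of the first cleaned document to
# confirm the transformations took effect (lower-cased, stopwords removed,
# stemmed). content() is tm's accessor for a document's text.
head(content(trade.cleancorpus[[1]]))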
trade.tdm <- TermDocumentMatrix(trade.cleancorpus)
moneyfx.tdm <- TermDocumentMatrix(moneyfx.cleancorpus)
trade.tdm <- removeSparseTerms(trade.tdm, 0.7)
moneyfx.tdm <- removeSparseTerms(moneyfx.tdm, 0.7)
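# A sparsity threshold of 0.7 keeps only terms that appear in at least 30%
# of the documents. A hedged before/after comparison (exact term counts
# depend on the corpus):
dim(TermDocumentMatrix(trade.cleancorpus))  # full vocabulary: terms x docs
dim(trade.tdm)                              # pruned vocabulary after removeSparseTerms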
trade.datamatx <- t(data.matrix(trade.tdm))
moneyfx.datamatx <- t(data.matrix(moneyfx.tdm))
head(trade.datamatx)
## Terms
## Docs billion deficit dlrs exports foreign imports japan last
## character(0) 3 0 3 1 3 0 0 1
## character(0) 4 0 1 3 0 3 1 0
## character(0) 2 1 2 0 0 1 0 1
## character(0) 0 0 0 0 0 0 0 0
## character(0) 1 3 1 0 1 1 8 2
## character(0) 2 0 2 1 1 0 0 3
## Terms
## Docs march mln surplus told trad trade washington will year
## character(0) 0 2 1 0 0 3 0 0 1
## character(0) 1 1 4 0 0 2 0 0 3
## character(0) 1 3 4 1 0 0 0 0 1
## character(0) 1 0 0 1 1 2 0 3 0
## character(0) 1 0 1 0 4 11 1 3 1
## character(0) 1 0 4 1 0 4 2 0 6
head(moneyfx.datamatx)
## Terms
## Docs bank billion central dollar exchange march market money pct
## character(0) 4 0 0 0 0 1 1 1 10
## character(0) 3 0 0 0 2 1 6 4 1
## character(0) 2 0 1 2 3 1 0 0 0
## character(0) 1 4 1 0 0 2 2 2 2
## character(0) 1 1 0 0 0 1 2 2 0
## character(0) 1 0 0 0 0 1 2 2 0
## Terms
## Docs rate today
## character(0) 0 0
## character(0) 0 0
## character(0) 1 0
## character(0) 0 0
## character(0) 0 2
## character(0) 0 0
trade.df <- as.data.frame(trade.datamatx)
moneyfx.df <- as.data.frame(moneyfx.datamatx)
tradetest.corpus <- Corpus(DirSource(directory = tradetest.dir, encoding = "ASCII"))
moneyfxtest.corpus <- Corpus(DirSource(directory = moneyfxtest.dir, encoding = "ASCII"))
tradetest.cleancorpus <- cleanupDataCorpus(tradetest.corpus)
moneyfxtest.cleancorpus <- cleanupDataCorpus(moneyfxtest.corpus)
tradetest.tdm <- TermDocumentMatrix(tradetest.cleancorpus)
moneyfxtest.tdm <- TermDocumentMatrix(moneyfxtest.cleancorpus)
tradetest.tdm <- removeSparseTerms(tradetest.tdm, 0.7)
moneyfxtest.tdm <- removeSparseTerms(moneyfxtest.tdm, 0.7)
tradetest.datamatx <- t(data.matrix(tradetest.tdm))
moneyfxtest.datamatx <- t(data.matrix(moneyfxtest.tdm))
head(tradetest.datamatx)
## Terms
## Docs also april billion deficit dlrs exports foreign imports
## character(0) 4 2 5 0 6 4 1 4
## character(0) 0 1 6 2 0 1 0 2
## character(0) 4 2 1 0 1 4 0 4
## character(0) 1 1 9 1 6 2 5 2
## character(0) 0 1 0 0 0 1 0 1
## character(0) 1 1 0 0 0 0 0 0
## Terms
## Docs japan june last market mln new pct surplus tariffs told
## character(0) 9 0 4 1 1 1 2 3 4 1
## character(0) 0 0 0 0 0 0 10 0 0 0
## character(0) 14 0 1 7 0 1 0 0 0 0
## character(0) 0 0 2 1 1 0 6 13 0 1
## character(0) 3 0 0 0 0 0 0 0 1 0
## character(0) 2 0 0 1 0 1 0 0 1 1
## Terms
## Docs trad trade washington will year
## character(0) 1 14 1 2 4
## character(0) 0 2 0 0 2
## character(0) 0 4 0 4 1
## character(0) 0 6 1 0 6
## character(0) 1 3 1 1 0
## character(0) 2 3 2 3 0
head(moneyfxtest.datamatx)
## Terms
## Docs april bank dollar market new pct rate rates west
## character(0) 2 2 0 4 0 2 1 1 0
## character(0) 1 1 0 3 0 0 0 0 0
## character(0) 1 4 7 3 0 2 1 4 0
## character(0) 1 3 3 0 1 0 0 0 0
## character(0) 1 3 0 2 0 2 0 0 0
## character(0) 1 0 4 0 0 1 2 3 7
tradetest.df <- as.data.frame(tradetest.datamatx)
moneyfxtest.df <- as.data.frame(moneyfxtest.datamatx)
Next we filter the test data down to the columns (terms) that are also present in the training data, and then add the columns present only in the training data, so that the test and training sets share the same set of terms.
trade.df.mat <- as.matrix(trade.df)
tradetest.df.mat <- as.matrix(tradetest.df)
tradetestfiltered1 <- data.frame(tradetest.df.mat[, intersect(colnames(tradetest.df.mat),
colnames(trade.df.mat))])
tradetestfiltered2 <- read.table(textConnection(""), col.names = colnames(trade.df.mat))
tradetestfiltered <- rbind.fill(tradetestfiltered1, tradetestfiltered2)
ncol(tradetestfiltered)
## [1] 17
moneyfx.df.mat <- as.matrix(moneyfx.df)
moneyfxtest.df.mat <- as.matrix(moneyfxtest.df)
moneyfxtestfiltered1 <- data.frame(moneyfxtest.df.mat[, intersect(colnames(moneyfxtest.df.mat),
colnames(moneyfx.df.mat))])
moneyfxtestfiltered2 <- read.table(textConnection(""), col.names = colnames(moneyfx.df.mat))
moneyfxtestfiltered <- rbind.fill(moneyfxtestfiltered1, moneyfxtestfiltered2)
ncol(moneyfxtestfiltered)
## [1] 11
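The column-alignment trick above is worth unpacking: read.table() on an empty text connection yields a zero-row data frame that carries the training column names, and plyr's rbind.fill() pads any columns missing from either side with NA. A toy illustration with hypothetical columns x, y, z:
df_a <- data.frame(x = 1:2, y = 3:4)
df_b <- read.table(textConnection(""), col.names = c("x", "y", "z"))  # 0 rows, 3 columns
rbind.fill(df_a, df_b)  # df_a gains column z, filled with NA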
trade.df <- cbind(trade.df, category = rep("trade"))
moneyfx.df <- cbind(moneyfx.df, category = rep("moneyfx"))
tradetestfiltered <- cbind(tradetestfiltered, category = rep("trade"))
moneyfxtestfiltered <- cbind(moneyfxtestfiltered, category = rep("moneyfx"))
We now have one data frame per category, with the documents as rows, the terms as columns, and a category column appended to denote the class label.
We now stack the training data frames for both categories, and likewise the test data frames.
tdm.stackedcategory <- rbind.fill(trade.df, moneyfx.df)
nrow(trade.df)
## [1] 369
nrow(moneyfx.df)
## [1] 538
nrow(tdm.stackedcategory)
## [1] 907
tdm.stackedcategory[is.na(tdm.stackedcategory)] <- 0
ncol(trade.df)
## [1] 18
ncol(moneyfx.df)
## [1] 12
ncol(tdm.stackedcategory)
## [1] 27
tdm.teststackedcategory <- rbind.fill(tradetestfiltered, moneyfxtestfiltered)
nrow(tradetestfiltered)
## [1] 117
nrow(moneyfxtestfiltered)
## [1] 179
nrow(tdm.teststackedcategory)
## [1] 296
tdm.teststackedcategory[is.na(tdm.teststackedcategory)] <- 0
ncol(tradetestfiltered)
## [1] 18
ncol(moneyfxtestfiltered)
## [1] 12
ncol(tdm.teststackedcategory)
## [1] 27
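The stacked matrices have 27 columns: the union of the two vocabularies plus the shared category column. A quick hedged check of the overlap between the two categories' vocabularies:
# Columns common to both category data frames (includes 'category' itself)
intersect(colnames(trade.df), colnames(moneyfx.df))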
The total number of rows in the stacked training and test matrices equals the sum of the rows of the per-category matrices, as expected.
# findAssocs operates on a TermDocumentMatrix, not a data frame,
# so we query the trade TDM built earlier
findAssocs(trade.tdm, terms = "billion", corlimit = 0.3)
Trade Word Cloud
We find that words like 'trade', 'billion', and 'year' are the most prominent.
# Word Cloud for Trade
wordcloud(trade.cleancorpus, max.words = 20, random.order = FALSE, colors = brewer.pal(8,
    "Dark2"), scale = c(5, 0.5), vfont = c("gothic english", "plain"))
MoneyFx Word Cloud
We find that words like 'bank', 'market', 'pct', and 'dollar' are the most prominent.
# Word Cloud for Moneyfx
wordcloud(moneyfx.cleancorpus, max.words = 20, random.order = FALSE, colors = brewer.pal(8,
"Dark2"), scale = c(5, 0.5), vfont = c("serif", "plain"))Here we separate the training data and test data both from the category column so as to provide to train the model and compare it later for accuracy.
# training data with only category column specified
tdm.traincategorycol <- tdm.stackedcategory[, "category"]
# training data without category column specified
tdm.train <- tdm.stackedcategory[, !colnames(tdm.stackedcategory) %in% "category"]
# test data with only category column specified
tdm.testcategorycol <- tdm.teststackedcategory[, "category"]
# test data without category column specified
tdm.test <- tdm.teststackedcategory[, !colnames(tdm.teststackedcategory) %in% "category"]
knn.pred <- knn(tdm.train, tdm.test, tdm.traincategorycol)
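# class::knn defaults to k = 1, so each test document takes the label of its
# single nearest neighbour. A hedged sketch of how accuracy responds to a few
# odd values of k (odd values avoid voting ties in a two-class problem):
for (k in c(1, 3, 5, 7)) {
    pred <- knn(tdm.train, tdm.test, tdm.traincategorycol, k = k)
    cat("k =", k, "accuracy =", mean(pred == tdm.testcategorycol), "\n")
}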
# Accuracy Test
confusion_matx <- table(Predictions = knn.pred, Actual = tdm.testcategorycol)
confusion_matx
## Actual
## Predictions moneyfx trade
## moneyfx 179 5
## trade 0 112
# Calculating Accuracy
accuracy_1 <- sum(diag(confusion_matx))/nrow(tdm.test)
accuracy_1
## [1] 0.9831081
# Another way to calculate accuracy
true_moneyfx <- 179
true_trade <- 112
false_moneyfx <- 5
false_trade <- 0
total <- true_moneyfx + true_trade + false_moneyfx + false_trade
accuracy_2 <- (true_moneyfx + true_trade)/total
accuracy_2
## [1] 0.9831081
error <- 1 - accuracy_2
error
## [1] 0.01689189
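# Accuracy alone can hide class-specific weaknesses. A hedged sketch of
# per-class precision and recall for 'trade', read straight off the same
# confusion matrix (rows = predictions, columns = actual):
precision_trade <- confusion_matx["trade", "trade"] / sum(confusion_matx["trade", ])
recall_trade <- confusion_matx["trade", "trade"] / sum(confusion_matx[, "trade"])
precision_trade  # fraction of 'trade' predictions that were correct
recall_trade     # fraction of actual 'trade' documents recovered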
container <- create_container(tdm.train, tdm.stackedcategory$category, trainSize = 1:907,
virgin = FALSE)
predtestcontainer <- create_container(tdm.test, labels = 0:296, testSize = 1:296,
    virgin = FALSE)
SVM <- train_model(container, "SVM")
results.SVM <- classify_model(predtestcontainer, SVM)
datatable(results.SVM)
MAXENT <- train_model(container, "MAXENT")
results.MAXENT <- classify_model(predtestcontainer, MAXENT)
datatable(results.MAXENT)
BAGGING <- train_model(container, "BAGGING")
results.BAGGING <- classify_model(predtestcontainer, BAGGING)
datatable(results.BAGGING)
RF <- train_model(container, "RF")
results.RF <- classify_model(predtestcontainer, RF)
datatable(results.RF)
TREE <- train_model(container, "TREE")
results.TREE <- classify_model(predtestcontainer, TREE)
datatable(results.TREE)
cat_check <- data.frame(correct_cat = tdm.testcategorycol[1:296], svm = as.character(results.SVM[,
    1]), maxent = as.character(results.MAXENT[, 1]), bagging = as.character(results.BAGGING[,
    1]), rf = as.character(results.RF[, 1]), tree = as.character(results.TREE[, 1]),
    stringsAsFactors = FALSE)
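As an aside, RTextTools can also train and apply several algorithms in one pass. A minimal sketch, equivalent to the five separate calls above and assuming the same container objects:
models <- train_models(container, algorithms = c("SVM", "MAXENT", "BAGGING", "RF", "TREE"))
results.all <- classify_models(predtestcontainer, models)  # one column pair per algorithm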
# SVM Performance
svm_table <- table(cat_check[, 1] == cat_check[, 2])
addmargins(svm_table)
##
## FALSE TRUE Sum
## 6 290 296
psvm <- prop.table(svm_table)
psvm <- as.data.frame(psvm)
psvm$Model <- "SVM"
# MAXENT Performance
maxent_table <- table(cat_check[, 1] == cat_check[, 3])
addmargins(maxent_table)
##
## FALSE TRUE Sum
## 47 249 296
pmaxent <- prop.table(maxent_table)
pmaxent <- as.data.frame(pmaxent)
pmaxent$Model <- "MAXENT"
# Bagging Performance
bagging_table <- table(cat_check[, 1] == cat_check[, 4])
addmargins(bagging_table)
##
## FALSE TRUE Sum
## 10 286 296
pbag <- prop.table(bagging_table)
pbag <- as.data.frame(pbag)
pbag$Model <- "BAGGING"
# RF Performance
rf_table <- table(cat_check[, 1] == cat_check[, 5])
addmargins(rf_table)
##
## FALSE TRUE Sum
## 2 294 296
prf <- prop.table(rf_table)
prf <- as.data.frame(prf)
prf$Model <- "RF"
# Tree Performance
tree_table <- table(cat_check[, 1] == cat_check[, 6])
addmargins(tree_table)
##
## FALSE TRUE Sum
## 56 240 296
ptree <- prop.table(tree_table)
ptree <- as.data.frame(ptree)
ptree$Model <- "TREE"pmodels <- rbind.fill(as.data.frame(psvm), as.data.frame(pmaxent), as.data.frame(pbag),
as.data.frame(prf), as.data.frame(ptree))
colnames(pmodels) <- c("Status", "Frequency", "Model")
pmodels <- rbind(pmodels, c("FALSE", error, "KNN"), c("TRUE", accuracy_2, "KNN"))
# rbind() with character vectors coerces Frequency to character,
# so convert it back to numeric before plotting
pmodels$Frequency <- as.numeric(pmodels$Frequency)
datatable(pmodels)
ggplot(pmodels, aes(x = Model, y = Frequency)) + ggtitle("Model Performance Plot") +
    geom_point(aes(colour = Status), size = 4) + theme(axis.text.x = element_text(angle = 45,
    hjust = 1))
We find that the Random Forest and k-Nearest Neighbour models give the best performance in classifying the documents correctly, with 99.32% and 98.31% accuracy respectively, followed by the Support Vector Machine at 97.97% and Bagging at 96.62% (286 of 296 correct). Hence Random Forest would be the best model, based on the saved models, for classifying further test documents in the 'trade' and 'moneyfx' categories.
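Finally, a minimal sketch (the file name is hypothetical) of persisting the chosen model so that future documents can be classified without retraining:
saveRDS(RF, file = "reuters_rf_model.rds")    # save the trained Random Forest
RF.loaded <- readRDS("reuters_rf_model.rds")  # reload later for classification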