Text Classification

http://www.cs.umb.edu/~smimarog/textmining/datasets/

Set the working directory to your own folder, then read the file.

# Point R at the folder that holds the R8 files (edit this path for your machine).
setwd("C:/Users/ngsook/Documents")
# Each line is "<class>\t<document text>". The text column is free-form, so
# quote and comment parsing are switched off: with read.table's defaults a
# stray ' or " would swallow the following lines into one field, and a #
# would truncate the rest of a line.
raw_text <- read.table('r8-train-all-terms.txt', header = FALSE, sep = '\t',
                       quote = "", comment.char = "")

Random Shuffle

help (nrow)

help (sample) ## Random shuffle

# Fix the RNG seed so the shuffle (and the train/test split built on it) is reproducible.
set.seed(10)
# Permute the row indices. seq_len() keeps this correct for an empty table,
# where 1:nrow() would count down as c(1, 0); for any non-empty table the
# permutation drawn is the same as before.
raw_random <- raw_text[sample(seq_len(nrow(raw_text))), ]
nrow(raw_text)

## [1] 5485

# Show the first row of the shuffled data (V1 = class label, V2 = document text).
head(raw_random, 1)

##       V1
## 491 earn
##                                                                                                                 V2
## 491 trimac ltd year net shr nine cts vs six cts net vs revs mln vs mln note year includes tax gain of dlrs reuter

Select the subset of documents whose class is ‘trade’, ‘crude’ or ‘money-fx’

# Keep only the rows labelled with one of the three classes of interest.
# %in% never yields NA, so the logical mask can index the rows directly.
raw_subset <- raw_random[
  raw_random$V1 %in% c('trade', 'crude', 'money-fx'),
]
head(raw_subset, n = 1)

##            V1
## 4440 money-fx
##                                                                                                                                                                                                                                                                                                                                                                                                                       V2
## 4440 fed will buy bills for customer after auction the federal reserve said it will enter the u s government securities market after the est weekly bill auction to purchase around mln dlrs of treasury bills for customers a spokesman said he said the fed will purchase bills with maturities from may through september dealers said federal funds were trading at pct when the fed announced the operation reuter

# Number of rows (documents) and columns in the selected subset.
dim(raw_subset)

## [1] 710   2

#710 docs in total are selected

Create Corpus

library(tm)
# Build the corpus from the document text only (column V2); the class labels
# in V1 stay in raw_subset and are used later as the targets.
corpus <- Corpus(VectorSource(raw_subset$V2))

Normalize

help (tm_map)

help (content_transformer)

# Text normalisation. The steps run in this exact order.
to_lower <- content_transformer(tolower)
english_stopwords <- stopwords('english')

corpus <- tm_map(corpus, to_lower)                        # convert to lower case
corpus <- tm_map(corpus, removeNumbers)                   # remove digits
corpus <- tm_map(corpus, removeWords, english_stopwords)  # stopword removal (before stemming)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument)                    # word stemming
corpus <- tm_map(corpus, removeWords, english_stopwords)  # stopword removal again, on the stemmed text
corpus <- tm_map(corpus, stripWhitespace)                 # collapse redundant whitespace: "a  b" -> "a b"

Create DTM Unigram+TFIDF as the feature set

# Document-term matrix of raw term counts: one row per document, one column per term.
dtm <- DocumentTermMatrix(corpus)
# Print the matrix summary (dimensions, sparsity, weighting) and a small sample.
inspect(dtm)

## <<DocumentTermMatrix (documents: 710, terms: 5287)>>
## Non-/sparse entries: 54285/3699485
## Sparsity           : 99%
## Maximal term length: 15
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  bank billion dlrs market mln oil reuter said trade will
##   17     0       1    1      0   0   2      1    7    28    6
##   214    2       6    4      1   2  10      2    7     0    4
##   248    0       6    7      1   0  18      1   10     3    7
##   287    0       0    0      2   2   0      1   18     0    5
##   31     5       6    6      2   4   0      1   12     8    6
##   358    1       0    0     10   0   0      1   16     5    2
##   527    0       0    0      2   0   2      1    0     2    0
##   637    1      11    5      2   1   1      1   14    18    9
##   697    7       3    1      6   0   0      2   18    11    0
##   91    15       0    4      9   0   0      1   13     5    0

# Re-weight the term counts as TF-IDF; this weighted matrix is the feature set
# the classifiers below are trained on.
dtm_ti <- weightTfIdf(dtm)
inspect(dtm_ti)

## <<DocumentTermMatrix (documents: 710, terms: 5287)>>
## Non-/sparse entries: 54285/3699485
## Sparsity           : 99%
## Maximal term length: 15
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##      Terms
## Docs        bank    billion crude dlrs fed        mln        oil
##   205 0.03728690 0.00000000     0    0   0 0.00000000 0.00000000
##   288 0.00000000 0.00000000     0    0   0 0.00000000 0.10065818
##   361 0.11250357 0.01659294     0    0   0 0.03969924 0.00000000
##   428 0.00000000 0.00000000     0    0   0 0.00000000 0.07139708
##   540 0.00000000 0.00000000     0    0   0 0.00000000 0.00000000
##   542 0.00000000 0.00000000     0    0   0 0.00000000 0.02693048
##   587 0.00000000 0.00000000     0    0   0 0.00000000 0.00000000
##   607 0.07661691 0.00000000     0    0   0 0.00000000 0.00000000
##   623 0.00000000 0.00000000     0    0   0 0.00000000 0.02146905
##   625 0.00000000 0.00000000     0    0   0 0.00000000 0.00000000
##      Terms
## Docs         say        stg trade
##   205 0.00000000 0.07034958     0
##   288 0.03092972 0.00000000     0
##   361 0.00000000 0.00000000     0
##   428 0.00000000 0.00000000     0
##   540 0.00000000 0.00000000     0
##   542 0.01655011 0.00000000     0
##   587 0.00000000 0.00000000     0
##   607 0.00000000 0.00000000     0
##   623 0.00000000 0.00000000     0
##   625 0.00000000 0.00000000     0

#inspect (dtm_ti)

Convert to matrix

# Expand the sparse DTM into an ordinary dense numeric matrix (documents x terms).
# NOTE(review): this materialises every zero cell, so memory grows with
# documents * terms; fine at this size, worth revisiting for larger corpora.
mat <- as.matrix(dtm_ti)
str(mat)

##  num [1:710, 1:5287] 0.0747 0 0 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Docs : chr [1:710] "1" "2" "3" "4" ...
##   ..$ Terms: chr [1:5287] "announc" "around" "auction" "bill" ...

# Number of documents (rows) available for the train/test split.
nrow(mat)

## [1] 710

Prepare the training and testing data (4:1) from this “training dataset”

# Hold out the last fifth of the (already shuffled) rows for testing.
# The cut-off is derived from nrow(mat) so the 4:1 ratio survives a change in
# subset size; for the 710 documents here it gives the same 568/142 split as
# the previously hard-coded indices.
n_docs <- nrow(mat)
n_train <- (n_docs * 4L) %/% 5L
train_rows <- seq_len(n_train)
test_rows <- setdiff(seq_len(n_docs), train_rows)

# drop = FALSE keeps a matrix even if a split ends up with a single row.
x_train <- data.frame(mat[train_rows, , drop = FALSE])
x_test <- data.frame(mat[test_rows, , drop = FALSE])

# V1 is the label column. factor() does two things here:
#  * it discards the R8 classes that were filtered out of raw_subset, so the
#    models are not told about classes that have no training documents;
#  * it guarantees a factor even when read.table returned character columns,
#    which the classification models below rely on.
# y_test reuses the training levels so that predictions and true labels can be
# compared with == (a test label unseen in training would become NA).
y_train <- factor(raw_subset$V1[train_rows])
y_test <- factor(raw_subset$V1[test_rows], levels = levels(y_train))

https://cran.r-project.org/web/packages/e1071/index.html

NaiveBayes

install.packages('e1071')

library(e1071)
# Fit Naive Bayes on the TF-IDF features, then classify the held-out documents.
bayes_model <- naiveBayes(x = x_train, y = y_train)
predicted <- predict(bayes_model, newdata = x_test)
# Confusion matrix: rows are the true classes, columns the predicted classes.
table(as.character(y_test), as.character(predicted))

##           
##            crude money-fx trade
##   crude       21       27     1
##   money-fx     0       35     3
##   trade        3       27    25

# Accuracy: share of test documents whose predicted class equals the true class.
sum(predicted==y_test)/length(y_test)

## [1] 0.5704225

# First few true labels of the test split.
head(y_test)

## [1] crude    trade    trade    trade    money-fx trade   
## Levels: acq crude earn grain interest money-fx ship trade

# First few Naive Bayes predictions, for a side-by-side look against the true labels.
head(predicted)

## [1] money-fx trade    money-fx trade    money-fx trade   
## Levels: acq crude earn grain interest money-fx ship trade

Use decision tree to build model

help(cbind)

# Put the label and the features in one data frame so the models below can use
# the formula interface (y_train ~ .). check.names = FALSE leaves the feature
# column names exactly as they are in x_train.
trainall <- data.frame(y_train = y_train, x_train, check.names = FALSE)

install.packages("rpart")

install.packages("rpart.plot")

library("rpart")
library("rpart.plot")

outcome ~ predictor1 + predictor2 + predictor3 + etc.

# Classification tree: y_train is the response and every other column of
# trainall is a candidate predictor.
tree_model <- rpart(
  formula = y_train ~ .,
  data = trainall,
  method = "class"
)
print(tree_model)

## n= 568 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 568 364 crude (0 0.36 0 0 0 0.3 0 0.35)  
##    2) oil>=0.01338797 182   2 crude (0 0.99 0 0 0 0 0 0.011) *
##    3) oil< 0.01338797 386 192 trade (0 0.062 0 0 0 0.44 0 0.5)  
##      6) trade< 0.01429346 187  45 money-fx (0 0.13 0 0 0 0.76 0 0.11)  
##       12) crude>=0.009719474 15   0 crude (0 1 0 0 0 0 0 0) *
##       13) crude< 0.009719474 172  30 money-fx (0 0.052 0 0 0 0.83 0 0.12)  
##         26) product>=0.01564713 12   4 trade (0 0.33 0 0 0 0 0 0.67) *
##         27) product< 0.01564713 160  18 money-fx (0 0.031 0 0 0 0.89 0 0.081) *
##      7) trade>=0.01429346 199  26 trade (0 0 0 0 0 0.13 0 0.87)  
##       14) currenc>=0.03458796 19   3 money-fx (0 0 0 0 0 0.84 0 0.16) *
##       15) currenc< 0.03458796 180  10 trade (0 0 0 0 0 0.056 0 0.94)  
##         30) reserv>=0.04443007 7   0 money-fx (0 0 0 0 0 1 0 0) *
##         31) reserv< 0.04443007 173   3 trade (0 0 0 0 0 0.017 0 0.98) *

# Draw the fitted tree on the current graphics device.
rpart.plot(tree_model)

Calculate the accuracy by applying the trained model to the test set

# Classify the held-out documents with the fitted tree; type = "class" asks
# for class labels.
y <- predict(tree_model, newdata = x_test, type = "class")
# Confusion matrix: rows are the true classes, columns the predicted classes.
table(as.character(y_test), as.character(y))

##           
##            crude money-fx trade
##   crude       45        2     2
##   money-fx     0       38     0
##   trade        5        6    44

# Accuracy of the decision tree on the test split.
sum(y==y_test)/length(y)

## [1] 0.8943662

Using SVM from e1071 to build the model

# Linear-kernel SVM on the same training frame: y_train is the response, all
# other columns of trainall are features.
svm_model <- svm(y_train ~ ., data = trainall, kernel = 'linear')
# predict() on a classification SVM returns class labels as is. The former
# type = "class" argument is not a predict.svm parameter: it was absorbed by
# `...` and ignored, so it is dropped to avoid implying it has an effect.
y2 <- predict(svm_model, x_test)
# Confusion matrix: rows are the true classes, columns the predicted classes.
table(as.character(y_test), as.character(y2))

##           
##            crude money-fx trade
##   crude       49        0     0
##   money-fx     0       36     2
##   trade        1        0    54

# Accuracy of the SVM on the test split.
sum(y2==y_test)/length(y2)

## [1] 0.9788732

Save the Model

setwd("C:/Users/ngsook/Documents")
# Serialize the fitted SVM into the working directory so it can be reused later.
saveRDS(object = svm_model, file = "./final_model.rds")

Read the Model

setwd("c:/Users/ngsook/Documents")
# Load the serialized SVM back from disk and print its summary.
final_model <- readRDS(file = "./final_model.rds")
print(final_model)

## 
## Call:
## svm(formula = y_train ~ ., data = trainall, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  395

Get the vocabulary used in the model [important when reusing models]

# The columns of the model's support-vector matrix carry the names of the
# features it was trained on; that list is the vocabulary new data must use.
vocab <- colnames(final_model$SV)
length(vocab)

## [1] 5287

# First few vocabulary terms.
head(vocab)

## [1] "announc" "around"  "auction" "bill"    "buy"     "custom"

# Last few vocabulary terms.
tail(vocab)

## [1] "arifin"  "flew"    "java"    "siregar" "wise"    "stoppag"

Let’s load a new set of documents for testing

setwd("c:/Users/ngsook/Documents")
# Same tab-separated "<class>\t<text>" layout as the training file. Quote and
# comment parsing are disabled because the text column is free-form.
raw_newset <- read.table('r8-test-all-terms.txt', header = FALSE, sep = '\t',
                         quote = "", comment.char = "")
set.seed(10)
# Shuffle the NEW documents. The rows must come from raw_newset itself:
# indexing raw_text here would silently hand the model rows of the training
# file and make the evaluation on "new" data meaningless.
raw_random <- raw_newset[sample(seq_len(nrow(raw_newset))), ]
# Keep the same three classes the model was trained on.
raw_subset <- raw_random[raw_random$V1 %in% c('trade', 'crude', 'money-fx'), ]

# Number of documents selected from the new set.
nrow(raw_subset)

## [1] 279

Preprocess the set in the same process

# Build the corpus for the new documents from their text column (V2).
corpus_new <- Corpus(VectorSource(raw_subset$V2))

# Apply the same normalisation steps, in the same order, as for the training corpus.
to_lower_new <- content_transformer(tolower)
english_stopwords_new <- stopwords('english')

corpus_new <- tm_map(corpus_new, to_lower_new)                        # lower case
corpus_new <- tm_map(corpus_new, removeNumbers)                       # remove digits
corpus_new <- tm_map(corpus_new, removeWords, english_stopwords_new)  # stopword removal (before stemming)
corpus_new <- tm_map(corpus_new, removePunctuation)
corpus_new <- tm_map(corpus_new, stemDocument)                        # word stemming
corpus_new <- tm_map(corpus_new, removeWords, english_stopwords_new)  # stopword removal again, on the stemmed text
corpus_new <- tm_map(corpus_new, stripWhitespace)                     # collapse redundant whitespace

Create the DTM using the SAME vocabulary as the training data (same set of features: unigram + TF-IDF)

# Build the DTM for the new documents with the model's vocabulary passed as the
# dictionary, and with TF-IDF weighting applied through the control list.
# NOTE(review): weightTfIdf is applied to the new documents here, so the IDF
# part presumably comes from this set rather than from the training set; the
# feature values may therefore not be on exactly the training scale. Consider
# reusing the training IDF -- TODO confirm.
dtm_new <- DocumentTermMatrix(corpus_new,
                           control = list(dictionary=vocab, 
                                          weighting = function(x) weightTfIdf(x,normalize =TRUE)))
# Dense matrix form, as passed to predict() below.
mat_new<- as.matrix(dtm_new)

Apply train model to the new matrix

# Classify the new documents with the reloaded SVM. predict() on a
# classification SVM returns class labels as is; the former type = "class"
# argument is not a predict.svm parameter (it was absorbed by `...` and
# ignored), so it is dropped.
y_new <- predict(final_model, mat_new)
# Count of predicted documents per class.
table(y_new)

## y_new
##      acq    crude     earn    grain interest money-fx     ship    trade 
##        0      142        0        0        0       48        0       89