Set the working directory to your own and read the file
# Point R at the folder that holds the data files (edit this path for your machine).
setwd("C:/Users/ngsook/Documents")
# Read the tab-separated training file: V1 is the class label, V2 the document text.
# colClasses pins V1 to a factor and V2 to character so the result does not
# depend on the R version's stringsAsFactors default (FALSE since R 4.0);
# the classifiers fitted later need a factor label.
raw_text <- read.table(
  "r8-train-all-terms.txt",
  header = FALSE,
  sep = "\t",
  colClasses = c("factor", "character")
)
Random Shuffle
# Open the help pages for the two functions used in the shuffle.
help(nrow)
help(sample) ## Random shuffle
# Fix the RNG seed so the shuffle is reproducible.
set.seed(10)
# Permute the row order. seq_len() replaces 1:nrow() so that an empty table
# yields an empty permutation instead of the indices c(1, 0); for a non-empty
# table the permutation is the same as before.
raw_random <- raw_text[sample(seq_len(nrow(raw_text))), ]
nrow(raw_text)
## [1] 5485
head(raw_random, 1)
## V1
## 491 earn
## V2
## 491 trimac ltd year net shr nine cts vs six cts net vs revs mln vs mln note year includes tax gain of dlrs reuter
Select the subset whose class is ‘trade’, ‘crude’ or ‘money-fx’
# Keep only the rows labelled with one of the three target classes
# (logical row indexing; %in% never yields NA, so no which() is needed).
raw_subset <- raw_random[raw_random$V1 %in% c('trade', 'crude', 'money-fx'), ]
head(raw_subset, n = 1)
## V1
## 4440 money-fx
## V2
## 4440 fed will buy bills for customer after auction the federal reserve said it will enter the u s government securities market after the est weekly bill auction to purchase around mln dlrs of treasury bills for customers a spokesman said he said the fed will purchase bills with maturities from may through september dealers said federal funds were trading at pct when the fed announced the operation reuter
dim(raw_subset)
## [1] 710 2
#710 docs in total are selected
Create Corpus
# Attach the text-mining package and wrap the document texts in a corpus.
library("tm")
corpus <- Corpus(VectorSource(raw_subset[["V2"]])) # only the text column V2 is used
Normalize
# Normalise the corpus text. The steps run in this exact order; each tm_map()
# call replaces `corpus` with the transformed corpus.
help (tm_map)
help (content_transformer)
corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lower case
corpus <- tm_map(corpus, removeNumbers) # remove digits
corpus <- tm_map(corpus, removeWords, stopwords('english')) # first stopword pass, before stemming
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument) # word stemming
corpus <- tm_map(corpus, removeWords, stopwords('english')) # second stopword pass, after stemming
corpus <- tm_map(corpus, stripWhitespace) # collapse redundant whitespace, e.g. "a  b" -> "a b"
Create DTM Unigram+TFIDF as the feature set
# Build the document-term matrix from the normalised corpus (one row per
# document, one column per term) and print a summary with a small sample.
dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 710, terms: 5287)>>
## Non-/sparse entries: 54285/3699485
## Sparsity : 99%
## Maximal term length: 15
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs bank billion dlrs market mln oil reuter said trade will
## 17 0 1 1 0 0 2 1 7 28 6
## 214 2 6 4 1 2 10 2 7 0 4
## 248 0 6 7 1 0 18 1 10 3 7
## 287 0 0 0 2 2 0 1 18 0 5
## 31 5 6 6 2 4 0 1 12 8 6
## 358 1 0 0 10 0 0 1 16 5 2
## 527 0 0 0 2 0 2 1 0 2 0
## 637 1 11 5 2 1 1 1 14 18 9
## 697 7 3 1 6 0 0 2 18 11 0
## 91 15 0 4 9 0 0 1 13 5 0
# Re-weight the raw term counts with TF-IDF and print a summary of the result.
dtm_ti <- weightTfIdf(dtm)
inspect(dtm_ti)
## <<DocumentTermMatrix (documents: 710, terms: 5287)>>
## Non-/sparse entries: 54285/3699485
## Sparsity : 99%
## Maximal term length: 15
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs bank billion crude dlrs fed mln oil
## 205 0.03728690 0.00000000 0 0 0 0.00000000 0.00000000
## 288 0.00000000 0.00000000 0 0 0 0.00000000 0.10065818
## 361 0.11250357 0.01659294 0 0 0 0.03969924 0.00000000
## 428 0.00000000 0.00000000 0 0 0 0.00000000 0.07139708
## 540 0.00000000 0.00000000 0 0 0 0.00000000 0.00000000
## 542 0.00000000 0.00000000 0 0 0 0.00000000 0.02693048
## 587 0.00000000 0.00000000 0 0 0 0.00000000 0.00000000
## 607 0.07661691 0.00000000 0 0 0 0.00000000 0.00000000
## 623 0.00000000 0.00000000 0 0 0 0.00000000 0.02146905
## 625 0.00000000 0.00000000 0 0 0 0.00000000 0.00000000
## Terms
## Docs say stg trade
## 205 0.00000000 0.07034958 0
## 288 0.03092972 0.00000000 0
## 361 0.00000000 0.00000000 0
## 428 0.00000000 0.00000000 0
## 540 0.00000000 0.00000000 0
## 542 0.01655011 0.00000000 0
## 587 0.00000000 0.00000000 0
## 607 0.00000000 0.00000000 0
## 623 0.00000000 0.00000000 0
## 625 0.00000000 0.00000000 0
#inspect (dtm_ti)
Convert to matrix
# Convert the weighted document-term matrix to an ordinary base-R matrix
# and show its structure.
mat <- as.matrix(dtm_ti)
str(mat)
## num [1:710, 1:5287] 0.0747 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ Docs : chr [1:710] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:5287] "announc" "around" "auction" "bill" ...
nrow(mat)
## [1] 710
Prepare the training and testing data (4:1) from this “training dataset”
# Split the shuffled documents 4:1 into training and test sets. The cut point
# is derived from the matrix size instead of being hard-coded (it evaluates to
# 568 of 710 rows for this data), so the split stays valid if the subset changes.
n_docs <- nrow(mat)
n_train <- (4L * n_docs) %/% 5L # integer arithmetic: first 4/5 of the rows
train_idx <- seq_len(n_train)
test_idx <- setdiff(seq_len(n_docs), train_idx)
# drop = FALSE keeps a matrix even if a split ends up with a single document.
x_train <- data.frame(mat[train_idx, , drop = FALSE])
y_train <- raw_subset$V1[train_idx] # V1 as the label column
x_test <- data.frame(mat[test_idx, , drop = FALSE])
y_test <- raw_subset$V1[test_idx]
NaiveBayes
# Install e1071 only when it is missing, then attach it. (The package name must
# be wrapped in plain ASCII quotes; typographic quotes are a parse error in R.)
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)
# Fit a naive Bayes classifier on the training features and labels.
bayes_model <- naiveBayes(x_train, y_train)
predicted <- predict(bayes_model, x_test) # apply the model on the test data
# Confusion matrix: rows are the true labels, columns the predicted labels.
table(as.character(y_test), as.character(predicted))
##
## crude money-fx trade
## crude 21 27 1
## money-fx 0 35 3
## trade 3 27 25
# Accuracy: the share of test documents whose predicted label equals the true label.
mean(predicted == y_test)
## [1] 0.5704225
head(y_test)
## [1] crude trade trade trade money-fx trade
## Levels: acq crude earn grain interest money-fx ship trade
head(predicted)
## [1] money-fx trade money-fx trade money-fx trade
## Levels: acq crude earn grain interest money-fx ship trade
Use decision tree to build model
help(cbind)
# Put the label and the features side by side in one data frame so the
# formula interface (y_train ~ .) can be used with data = trainall.
trainall <- cbind(y_train, x_train)
# Install the tree packages only when they are missing. (Package names must be
# wrapped in plain ASCII quotes; typographic quotes are a parse error in R.)
for (pkg in c("rpart", "rpart.plot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library("rpart")
library("rpart.plot")
Formula syntax: outcome ~ predictor1 + predictor2 + predictor3 + etc.
# Grow a classification tree that predicts the label from every other
# column of trainall, then print the fitted tree.
tree_model <- rpart(
  formula = y_train ~ .,
  data = trainall,
  method = "class"
)
tree_model
## n= 568
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 568 364 crude (0 0.36 0 0 0 0.3 0 0.35)
## 2) oil>=0.01338797 182 2 crude (0 0.99 0 0 0 0 0 0.011) *
## 3) oil< 0.01338797 386 192 trade (0 0.062 0 0 0 0.44 0 0.5)
## 6) trade< 0.01429346 187 45 money-fx (0 0.13 0 0 0 0.76 0 0.11)
## 12) crude>=0.009719474 15 0 crude (0 1 0 0 0 0 0 0) *
## 13) crude< 0.009719474 172 30 money-fx (0 0.052 0 0 0 0.83 0 0.12)
## 26) product>=0.01564713 12 4 trade (0 0.33 0 0 0 0 0 0.67) *
## 27) product< 0.01564713 160 18 money-fx (0 0.031 0 0 0 0.89 0 0.081) *
## 7) trade>=0.01429346 199 26 trade (0 0 0 0 0 0.13 0 0.87)
## 14) currenc>=0.03458796 19 3 money-fx (0 0 0 0 0 0.84 0 0.16) *
## 15) currenc< 0.03458796 180 10 trade (0 0 0 0 0 0.056 0 0.94)
## 30) reserv>=0.04443007 7 0 money-fx (0 0 0 0 0 1 0 0) *
## 31) reserv< 0.04443007 173 3 trade (0 0 0 0 0 0.017 0 0.98) *
rpart.plot(tree_model)

Calculate the accuracy on the test set using the model fitted on the training set
# Predict a class label for each test document with the fitted tree.
y<- predict(tree_model, x_test, type = "class")
# Confusion matrix: rows are the true labels, columns the predicted labels.
table(as.character(y_test), as.character(y))
##
## crude money-fx trade
## crude 45 2 2
## money-fx 0 38 0
## trade 5 6 44
# Accuracy of the decision tree: share of test documents predicted correctly.
mean(y == y_test)
## [1] 0.8943662
Using SVM from e1071 to build the model
# Fit a linear-kernel SVM that predicts the label from every other column
# of trainall.
svm_model <- svm(y_train ~ ., data = trainall, kernel = "linear")
# predict() for an svm model has no `type` argument; the former
# `type = "class"` was silently absorbed by `...` and had no effect, so it is
# dropped. The model was fitted on a factor response and returns class labels.
y2 <- predict(svm_model, x_test)
# Confusion matrix: rows are the true labels, columns the predicted labels.
table(as.character(y_test), as.character(y2))
##
## crude money-fx trade
## crude 49 0 0
## money-fx 0 36 2
## trade 1 0 54
# Accuracy of the SVM: share of test documents predicted correctly.
mean(y2 == y_test)
## [1] 0.9788732
Save the Model
# Switch to the output folder and serialise the fitted SVM to an .rds file there.
setwd("C:/Users/ngsook/Documents")
saveRDS(object = svm_model, file = "./final_model.rds")
Read the Model
# Switch to the folder holding the saved model, restore it from the .rds file,
# and print the restored model.
setwd("c:/Users/ngsook/Documents")
final_model <- readRDS(file = "./final_model.rds")
final_model
##
## Call:
## svm(formula = y_train ~ ., data = trainall, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 395
Get the vocabulary used in the model [important when reusing models]
# The column names of the support-vector matrix are the feature (term) names
# the model was trained on; keep them as the vocabulary and report its size.
vocab <- colnames(final_model$SV)
length(vocab)
## [1] 5287
head(vocab)
## [1] "announc" "around" "auction" "bill" "buy" "custom"
tail(vocab)
## [1] "arifin" "flew" "java" "siregar" "wise" "stoppag"
Let’s load a new set of documents for testing
setwd("c:/Users/ngsook/Documents")
# Read the held-out file: V1 is the class label, V2 the document text.
# colClasses makes the column types independent of the R version's defaults.
raw_newset <- read.table(
  "r8-test-all-terms.txt",
  header = FALSE,
  sep = "\t",
  colClasses = c("factor", "character")
)
set.seed(10)
# Shuffle the NEW documents. The previous version indexed `raw_text` (the
# training table) here, so the "new" set was really a sample of training rows.
# NOTE: outputs recorded in this document from this point on were produced by
# that earlier version and will change once this is re-run.
raw_random <- raw_newset[sample(seq_len(nrow(raw_newset))), ]
# Keep only the three target classes; this reuses the names raw_random and
# raw_subset, which the following steps read.
raw_subset <- raw_random[raw_random$V1 %in% c('trade', 'crude', 'money-fx'), ]
nrow(raw_subset)
## [1] 279
Preprocess the new set with the same steps as the training data
# Build a corpus from the text column of the new subset and apply the
# normalisation steps in the order listed below.
corpus_new <- Corpus(VectorSource(raw_subset$V2))
corpus_new <- tm_map(corpus_new, content_transformer(tolower)) # convert to lower case
corpus_new <- tm_map(corpus_new, removeNumbers) # remove digits
corpus_new <- tm_map(corpus_new, removeWords, stopwords('english')) # first stopword pass, before stemming
corpus_new <- tm_map(corpus_new, removePunctuation)
corpus_new <- tm_map(corpus_new, stemDocument) # word stemming
corpus_new <- tm_map(corpus_new, removeWords, stopwords('english')) # second stopword pass, after stemming
corpus_new <- tm_map(corpus_new, stripWhitespace) # collapse redundant whitespace
Create DTM using the SAME vocabulary as the training data (same set of features: unigram+tfidf)
# Build the document-term matrix for the new documents, restricted to the
# vocabulary taken from the saved model, with TF-IDF weighting.
# NOTE(review): weightTfIdf is applied to this new matrix, so the IDF part is
# presumably computed from the new documents rather than from the training
# set — confirm that this is intended.
dtm_new <- DocumentTermMatrix(corpus_new,
control = list(dictionary=vocab,
weighting = function(x) weightTfIdf(x,normalize =TRUE)))
# Convert to an ordinary matrix for predict().
mat_new<- as.matrix(dtm_new)
Apply train model to the new matrix
# Classify the new documents with the restored SVM. predict() for an svm model
# has no `type` argument; the former `type = "class"` was silently absorbed by
# `...` and had no effect, so it is dropped.
y_new <- predict(final_model, mat_new)
# Count how many documents were assigned to each class.
table(y_new)
## y_new
## acq crude earn grain interest money-fx ship trade
## 0 142 0 0 0 48 0 89