It is often useful to classify new “test” documents using documents that have already been classified (“training” documents). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).
#Import libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corpus)
library(tm)
## Loading required package: NLP
library(NLP)
library(SnowballC)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:NLP':
##
## annotate
#Import the data
#Data source: https://www.kaggle.com/team-ai/spam-text-message-classification/version/1#SPAM%20text%20message%2020170820%20-%20Data.csv
data <- read.csv("https://github.com/GehadGad/Spam-and-ham-data/raw/master/Data.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
#data <- read.csv("Data.csv", header=TRUE, sep=",", quote='\"\"', stringsAsFactors=FALSE)
#The full dataset is large, so only the first 300 messages are used here.
#Preview a single row.
data[2,]
## Category Message
## 2 ham Ok lar... Joking wif u oni...
data = data[1:300,]
# Convert the label to a factor and compute the proportion of ham and spam messages:
data$Category <- factor(data$Category)
prop.table(table(data$Category))
##
## ham spam
## 0.8533333 0.1466667
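With roughly 85% ham, a classifier that always predicts ham already scores about 85% accuracy, so that is the baseline any model has to beat (caret later reports it as the No Information Rate). A minimal sanity-check sketch:

#Baseline accuracy: always predict the majority class (ham).
max(prop.table(table(data$Category)))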
#Create a corpus, one document per message.
corpus = VCorpus(VectorSource(data$Message))
as.character(corpus[[1]])
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
#Change all words to lower case.
corpus = tm_map(corpus, content_transformer(tolower))
#Remove numbers.
corpus = tm_map(corpus, removeNumbers)
#Remove punctuation.
corpus = tm_map(corpus, removePunctuation)
#Remove stop words
corpus = tm_map(corpus, removeWords, stopwords("english"))
#Stem the words and collapse extra whitespace.
corpus = tm_map(corpus, stemDocument)
corpus = tm_map(corpus, stripWhitespace)
as.character(corpus[[1]])
## [1] "go jurong point crazi avail bugi n great world la e buffet cine got amor wat"
#Build the document-term matrix; very sparse terms are removed below.
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 300, terms: 1122)>>
## Non-/sparse entries: 2381/334219
## Sparsity : 99%
## Maximal term length: 35
## Weighting : term frequency (tf)
#Drop terms with sparsity above 99.99% (nothing is removed at this threshold).
dtm = removeSparseTerms(dtm, 0.9999)
dim(dtm)
## [1] 300 1122
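At a threshold of 0.9999, removeSparseTerms drops nothing here (all 1122 terms survive). A stricter threshold shrinks the matrix considerably; 0.99 below is an illustrative value, not a tuned one.

#Keep only terms that appear in more than about 1% of the documents.
dtm_dense <- removeSparseTerms(dtm, 0.99)
dim(dtm_dense)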
inspect(dtm[40:50, 10:15])
## <<DocumentTermMatrix (documents: 11, terms: 6)>>
## Non-/sparse entries: 0/66
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs account acoentri actin activ address admir
## 40 0 0 0 0 0 0
## 41 0 0 0 0 0 0
## 42 0 0 0 0 0 0
## 43 0 0 0 0 0 0
## 44 0 0 0 0 0 0
## 45 0 0 0 0 0 0
## 46 0 0 0 0 0 0
## 47 0 0 0 0 0 0
## 48 0 0 0 0 0 0
## 49 0 0 0 0 0 0
## 50 0 0 0 0 0 0
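The matrix above uses raw term frequencies. A common alternative, not used in the models below, is TF-IDF weighting, which down-weights terms that occur in many documents; a minimal sketch (dtm_tfidf is my own name for the result):

#Build the DTM with TF-IDF weights instead of raw counts.
dtm_tfidf <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
inspect(dtm_tfidf[1:3, 1:5])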
#Convert term counts to binary factors: 0 becomes "No", anything greater becomes "Yes".
convert_count <- function(x) {
y <- ifelse(x > 0, 1,0)
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
y
}
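As a quick check of the helper: counts of zero map to "No" and anything positive maps to "Yes".

#Expected result: No Yes Yes No
convert_count(c(0, 3, 1, 0))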
# Apply convert_count to every column of the DTM; the result is split into training and test sets below.
datasetNB <- apply(dtm, 2, convert_count)
dataset = as.data.frame(as.matrix(datasetNB))
#Total frequency of each term (column sums of the DTM).
freq<- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 10)
## call now like get just will free can dont time
## 38 29 22 21 20 20 19 17 15 15
#Terms appearing at least 60 times (none at this threshold).
findFreqTerms(dtm, lowfreq=60)
## character(0)
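No term appears 60 or more times in this 300-message sample, so a lower threshold and a quick plot of the most frequent stems are more informative; a minimal sketch using the freq vector computed above:

#Terms appearing at least 20 times.
findFreqTerms(dtm, lowfreq = 20)
#Bar plot of the ten most frequent stems.
barplot(head(freq, 10), las = 2, main = "Most frequent stems", ylab = "Count")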
#Attach the class label to the feature data frame.
dataset$Class = data$Category
#Split the data into training (75%) and test (25%) sets.
set.seed(222)
split = sample(2,nrow(dataset),prob = c(0.75,0.25),replace = TRUE)
train_set = dataset[split == 1,]
test_set = dataset[split == 2,]
#Class proportions in the training set.
prop.table(table(train_set$Class))
##
## ham spam
## 0.8558952 0.1441048
#Class proportions in the test set.
prop.table(table(test_set$Class))
##
## ham spam
## 0.8450704 0.1549296
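The split above uses simple random sampling, so the ham/spam ratio drifts slightly between the two sets (85.6% vs. 84.5% ham). Since caret is already loaded, a stratified split is an easy alternative; this is only a sketch and not the split used for the results below.

#Stratified 75/25 split that preserves the ham/spam ratio in both sets.
idx <- createDataPartition(dataset$Class, p = 0.75, list = FALSE)
train_strat <- dataset[idx, ]
test_strat <- dataset[-idx, ]
prop.table(table(train_strat$Class))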
#Train a random forest classifier on the term features (the Class column is excluded from the predictors).
rf_classifier = randomForest(x = train_set[, names(train_set) != "Class"],
                             y = train_set$Class,
                             ntree = 300)
#Predict on the test set.
rf_pred = predict(rf_classifier, newdata = test_set)
#Evaluate with a confusion matrix.
confusionMatrix(table(rf_pred,test_set$Class))
## Confusion Matrix and Statistics
##
##
## rf_pred ham spam
## ham 60 4
## spam 0 7
##
## Accuracy : 0.9437
## 95% CI : (0.862, 0.9844)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 0.009977
##
## Kappa : 0.7473
##
## Mcnemar's Test P-Value : 0.133614
##
## Sensitivity : 1.0000
## Specificity : 0.6364
## Pos Pred Value : 0.9375
## Neg Pred Value : 1.0000
## Prevalence : 0.8451
## Detection Rate : 0.8451
## Detection Prevalence : 0.9014
## Balanced Accuracy : 0.8182
##
## 'Positive' Class : ham
##
The random forest classifier reaches about 94% accuracy on the test set: it never flags ham as spam, but it misses 4 of the 11 spam messages (specificity 0.64).
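randomForest also records which terms drive the predictions; a quick sketch of its built-in importance measures:

#Ten most important terms by mean decrease in Gini impurity.
imp <- importance(rf_classifier)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(rf_classifier, n.var = 10)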
#Train a support vector machine on the same features.
svm_clf = svm(Class~.,data = train_set)
svm_pred = predict(svm_clf,test_set)
confusionMatrix(svm_pred,test_set$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 60 11
## spam 0 0
##
## Accuracy : 0.8451
## 95% CI : (0.7397, 0.92)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 0.579549
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8451
## Neg Pred Value : NaN
## Prevalence : 0.8451
## Detection Rate : 0.8451
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : ham
##
The support vector machine reaches about 84.5% accuracy, but that only matches the no-information rate: it predicts ham for every message (Kappa = 0) and catches no spam on this small, imbalanced sample.
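The binary Yes/No features (and the name datasetNB) suggest a Naive Bayes model as a natural third comparison; a minimal sketch with e1071, where laplace = 1 is an assumed smoothing value and the results are not shown here.

#Naive Bayes on the same binary term features, with Laplace smoothing.
nb_clf <- naiveBayes(Class ~ ., data = train_set, laplace = 1)
nb_pred <- predict(nb_clf, newdata = test_set)
confusionMatrix(nb_pred, test_set$Class)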