It is often useful to classify new “test” documents using documents that have already been classified (“training” documents). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).
#Import libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corpus)
library(tm)
## Loading required package: NLP
library(NLP)
library(SnowballC)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:NLP':
##
## annotate
#Import the data
#Data source: https://www.kaggle.com/team-ai/spam-text-message-classification/version/1#SPAM%20text%20message%2020170820%20-%20Data.csv
data <- read.csv("https://github.com/GehadGad/Spam-and-ham-data/raw/master/Data.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
#data <- read.csv("Data.csv", header=TRUE, sep=",", quote='\"\"', stringsAsFactors=FALSE)
#The full dataset is large, so only the first 300 messages are used here.
#Preview a single row.
data[2,]
## Category Message
## 2 ham Ok lar... Joking wif u oni...
data = data[1:300,]
# Convert the label to a factor and compute the proportion of ham and spam messages:
data$Category <- factor(data$Category)
prop.table(table(data$Category))
##
## ham spam
## 0.8533333 0.1466667
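With roughly 85% ham, a classifier that always predicts ham already scores about 85% accuracy, so that is the baseline any model has to beat (caret later reports it as the No Information Rate). A minimal sanity-check sketch:

#Baseline accuracy: always predict the majority class (ham).
max(prop.table(table(data$Category)))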
#Create a corpus, one document per message.
corpus = VCorpus(VectorSource(data$Message))
as.character(corpus[[1]])
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
#Change all words to lower case.
corpus = tm_map(corpus, content_transformer(tolower))
#Remove numbers.
corpus = tm_map(corpus, removeNumbers)
#Remove punctuation.
corpus = tm_map(corpus, removePunctuation)
#Remove stop words
corpus = tm_map(corpus, removeWords, stopwords("english"))
#Stem the words and collapse extra whitespace.
corpus = tm_map(corpus, stemDocument)
corpus = tm_map(corpus, stripWhitespace)
as.character(corpus[[1]])
## [1] "go jurong point crazi avail bugi n great world la e buffet cine got amor wat"
#Build the document-term matrix; very sparse terms are removed below.
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 300, terms: 1122)>>
## Non-/sparse entries: 2381/334219
## Sparsity : 99%
## Maximal term length: 35
## Weighting : term frequency (tf)
#Drop terms with sparsity above 99.99% (nothing is removed at this threshold).
dtm = removeSparseTerms(dtm, 0.9999)
dim(dtm)
## [1] 300 1122
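At a threshold of 0.9999, removeSparseTerms drops nothing here (all 1122 terms survive). A stricter threshold shrinks the matrix considerably; 0.99 below is an illustrative value, not a tuned one.

#Keep only terms that appear in more than about 1% of the documents.
dtm_dense <- removeSparseTerms(dtm, 0.99)
dim(dtm_dense)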
inspect(dtm[40:50, 10:15])
## <<DocumentTermMatrix (documents: 11, terms: 6)>>
## Non-/sparse entries: 0/66
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs account acoentri actin activ address admir
## 40 0 0 0 0 0 0
## 41 0 0 0 0 0 0
## 42 0 0 0 0 0 0
## 43 0 0 0 0 0 0
## 44 0 0 0 0 0 0
## 45 0 0 0 0 0 0
## 46 0 0 0 0 0 0
## 47 0 0 0 0 0 0
## 48 0 0 0 0 0 0
## 49 0 0 0 0 0 0
## 50 0 0 0 0 0 0
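The matrix above uses raw term frequencies. A common alternative, not used in the models below, is TF-IDF weighting, which down-weights terms that occur in many documents; a minimal sketch (dtm_tfidf is my own name for the result):

#Build the DTM with TF-IDF weights instead of raw counts.
dtm_tfidf <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
inspect(dtm_tfidf[1:3, 1:5])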
#Convert term counts to binary factors: 0 becomes "No", anything greater becomes "Yes".
convert_count <- function(x) {
y <- ifelse(x > 0, 1,0)
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
y
}
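As a quick check of the helper: counts of zero map to "No" and anything positive maps to "Yes".

#Expected result: No Yes Yes No
convert_count(c(0, 3, 1, 0))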
# Apply convert_count to every column of the DTM; the result is split into training and test sets below.
datasetNB <- apply(dtm, 2, convert_count)
dataset = as.data.frame(as.matrix(datasetNB))
#Total frequency of each term (column sums of the DTM).
freq<- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 10)
## call now like get just will free can dont time
## 38 29 22 21 20 20 19 17 15 15
#Terms appearing at least 60 times (none at this threshold).
findFreqTerms(dtm, lowfreq=60)
## character(0)
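No term appears 60 or more times in this 300-message sample, so a lower threshold and a quick plot of the most frequent stems are more informative; a minimal sketch using the freq vector computed above:

#Terms appearing at least 20 times.
findFreqTerms(dtm, lowfreq = 20)
#Bar plot of the ten most frequent stems.
barplot(head(freq, 10), las = 2, main = "Most frequent stems", ylab = "Count")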
#Attach the class label to the feature data frame.
dataset$Class = data$Category
#Split the data into training (75%) and test (25%) sets.
set.seed(222)
split = sample(2,nrow(dataset),prob = c(0.75,0.25),replace = TRUE)
train_set = dataset[split == 1,]
test_set = dataset[split == 2,]
#Class proportions in the training set.
prop.table(table(train_set$Class))
##
## ham spam
## 0.8558952 0.1441048
#Class proportions in the test set.
prop.table(table(test_set$Class))
##
## ham spam
## 0.8450704 0.1549296
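The split above uses simple random sampling, so the ham/spam ratio drifts slightly between the two sets (85.6% vs. 84.5% ham). Since caret is already loaded, a stratified split is an easy alternative; this is only a sketch and not the split used for the results below.

#Stratified 75/25 split that preserves the ham/spam ratio in both sets.
idx <- createDataPartition(dataset$Class, p = 0.75, list = FALSE)
train_strat <- dataset[idx, ]
test_strat <- dataset[-idx, ]
prop.table(table(train_strat$Class))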
#Train a random forest classifier on the term features (the Class column is excluded from the predictors).
rf_classifier = randomForest(x = train_set[, names(train_set) != "Class"],
                             y = train_set$Class,
                             ntree = 300)
#Predict on the test set.
rf_pred = predict(rf_classifier, newdata = test_set)
#Evaluate with a confusion matrix.
confusionMatrix(table(rf_pred,test_set$Class))
## Confusion Matrix and Statistics
##
##
## rf_pred ham spam
## ham 60 4
## spam 0 7
##
## Accuracy : 0.9437
## 95% CI : (0.862, 0.9844)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 0.009977
##
## Kappa : 0.7473
##
## Mcnemar's Test P-Value : 0.133614
##
## Sensitivity : 1.0000
## Specificity : 0.6364
## Pos Pred Value : 0.9375
## Neg Pred Value : 1.0000
## Prevalence : 0.8451
## Detection Rate : 0.8451
## Detection Prevalence : 0.9014
## Balanced Accuracy : 0.8182
##
## 'Positive' Class : ham
##
The random forest classifier reaches about 94% accuracy on the test set: it never flags ham as spam, but it misses 4 of the 11 spam messages (specificity 0.64).
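randomForest also records which terms drive the predictions; a quick sketch of its built-in importance measures:

#Ten most important terms by mean decrease in Gini impurity.
imp <- importance(rf_classifier)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(rf_classifier, n.var = 10)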
#Train a support vector machine on the same features.
svm_clf = svm(Class~.,data = train_set)
svm_pred = predict(svm_clf,test_set)
confusionMatrix(svm_pred,test_set$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 60 11
## spam 0 0
##
## Accuracy : 0.8451
## 95% CI : (0.7397, 0.92)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 0.579549
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.002569
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8451
## Neg Pred Value : NaN
## Prevalence : 0.8451
## Detection Rate : 0.8451
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : ham
##
The support vector machine reaches about 84.5% accuracy, but that only matches the no-information rate: it predicts ham for every message (Kappa = 0) and catches no spam on this small, imbalanced sample.
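The binary Yes/No features (and the name datasetNB) suggest a Naive Bayes model as a natural third comparison; a minimal sketch with e1071, where laplace = 1 is an assumed smoothing value and the results are not shown here.

#Naive Bayes on the same binary term features, with Laplace smoothing.
nb_clf <- naiveBayes(Class ~ ., data = train_set, laplace = 1)
nb_pred <- predict(nb_clf, newdata = test_set)
confusionMatrix(nb_pred, test_set$Class)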