Load the required package libraries and read the data from URLs into R.

options(warn = -1)
suppressMessages(library(tm))
suppressMessages(library(tidyr))
suppressMessages(library(stringr))
suppressMessages(library(dplyr))
suppressMessages(library(SnowballC))
suppressMessages(library(knitr))
suppressMessages(library(RTextTools))


spam <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/772b359fe7cb65505be4ef38b1541435949f5e12/spam.csv", header = FALSE, sep = ";")

ham <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/772b359fe7cb65505be4ef38b1541435949f5e12/ham.csv", header = FALSE, sep = ";")

spam_ham <- c(spam, ham)
str(spam_ham)
## List of 2
##  $ V1: Factor w/ 113 levels " "," --><FONT ",..: 92 103 87 99 54 56 49 100 53 58 ...
##  $ V1: Factor w/ 81 levels "    (8.11.6/8.11.6) with ESMTP id g7MBQPW13260",..: 52 74 48 68 36 37 31 71 35 38 ...
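
Note that read.csv() reads each column as a factor here (the str() output shows factor levels rather than raw text). To keep the messages as plain character strings instead, a minimal alternative, assuming the same URL, is to pass stringsAsFactors = FALSE (spam_chr is an illustrative name):

spam_chr <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/772b359fe7cb65505be4ef38b1541435949f5e12/spam.csv",
                     header = FALSE, sep = ";", stringsAsFactors = FALSE)
str(spam_chr$V1)  # now a plain character vector of message lines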

We combine the two files into a single object and load it into a corpus.

review_text <- paste(spam_ham, collapse = " ")
review_text
## [1] "c(92, 103, 87, 99, 54, 56, 49, 100, 53, 58, 51, 98, 55, 57, 50, 90, 101, 52, 108, 106, 86, 95, 94, 84, 47, 83, 60, 68, 70, 113, 69, 62, 76, 110, 105, 44, 46, 43, 24, 40, 46, 43, 24, 39, 2, 82, 59, 89, 66, 79, 89, 67, 64, 104, 65, 63, 71, 72, 73, 63, 74, 88, 45, 75, 88, 44, 46, 42, 91, 33, 93, 31, 107, 38, 46, 41, 30, 18, 19, 15, 5, 112, 9, 109, 11, 111, 8, 96, 10, 85, 4, 61, 21, 14, 17, 3, 7, 1, 6, 13, 12, 97, 16, 22, 20, 28, 77, 29, 23, 29, 27, 25, 26, 48, 35, 34, 102, 37, 80, 36, 32, 78, 81) c(52, 74, 48, 68, 36, 37, 31, 71, 35, 38, 33, 66, 16, 12, 32, 67, 21, 28, 14, 47, 65, 4, 17, 30, 3, 63, 19, 28, 13, 70, 20, 9, 29, 72, 2, 24, 64, 23, 25, 69, 1, 34, 5, 51, 78, 44, 76, 53, 73, 6, 7, 8, 61, 45, 27, 60, 80, 75, 49, 79, 81, 62, 55, 57, 58, 10, 56, 59, 11, 54, 46, 15, 18, 22, 26, 50, 77, 40, 39, 41, 42, 43)"
review_source <- VectorSource(spam_ham)
corpus <- Corpus(review_source)
corpus <- tm_map(corpus, removePunctuation)

head(inspect(corpus)) # Inspect the corpus format.
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2
## 
## $V1
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 3964
## 
## $V1
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 4180
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2
corpus <- tm_map(corpus, removeNumbers)   # Remove numbers
corpus <- tm_map(corpus, tolower)         # Convert to lower case
corpus <- tm_map(corpus, stripWhitespace) # Collapse extra whitespace
corpus <- tm_map(corpus, removeWords, stopwords("english")) # Remove English stop words

head(corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, PlainTextDocument)  
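
Passing a bare tolower to tm_map() (and restoring the documents with PlainTextDocument afterwards) is the older tm idiom; newer versions of tm expect base R functions to be wrapped in content_transformer(). A sketch of the same preprocessing in that style (corpus2 is an illustrative name):

corpus2 <- Corpus(VectorSource(spam_ham))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, removeNumbers)
corpus2 <- tm_map(corpus2, content_transformer(tolower)) # wrap base R functions
corpus2 <- tm_map(corpus2, stripWhitespace)
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
corpus2 <- tm_map(corpus2, stemDocument)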

We convert the corpus into a document-term matrix and remove sparse terms.

dtm <- DocumentTermMatrix(corpus)

dtm <- removeSparseTerms(dtm, 0.2)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 24)>>
## Non-/sparse entries: 48/0
## Sparsity           : 0%
## Maximal term length: 13
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           aug contenttype date deliveredto edt error esmtp fetchmail
##   character(0)   6           1    1           1   1     1     2         1
##   character(0)  13           1    2           2   3     2     6         1
##               Terms
## Docs           get ïfrom ist list localhost messageid mimeversion new
##   character(0)   1     1   1    1         3         1           1   1
##   character(0)   1     1   1    6         4         2           1   1
##               Terms
## Docs           postfix received returnpath singledrop subject thu wed
##   character(0)       1        4          1          1       1   4   1
##   character(0)       3       10          1          1       4  12   1
##               Terms
## Docs           zzzzlocalhost
##   character(0)             2
##   character(0)             2
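
With a sparsity threshold of 0.2 and only two documents, a term must appear in both documents to survive, which is why the matrix above shows 0% sparsity. A quick way to list the terms above a given overall frequency is findFreqTerms():

findFreqTerms(dtm, lowfreq = 5)  # terms occurring at least 5 times in total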

Next we compute some summary statistics on the term frequencies.

freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)   
head(table(freq), 20)  
## freq
##  2  3  4  5  7  8 14 16 19 
## 10  4  3  1  2  1  1  1  1
ord <- order(freq) 
ord
##  [1] 15 16 17 18 19 20 21 22 23 24 11 12 13 14  8  9 10  7  5  6  4  3  2
## [24]  1
wf <- data.frame(word=names(freq), freq=freq)   
head(wf)
##                word freq
## aug             aug   19
## thu             thu   16
## received   received   14
## esmtp         esmtp    8
## list           list    7
## localhost localhost    7
dim(wf)
## [1] 24  2
barplot(wf[1:10, ]$freq, las = 2, names.arg = wf[1:10, ]$word, col = "purple",
        main = "Most Frequent Words", ylab = "Word frequencies")

We check for associations between words, display the most frequent terms in a word cloud, and write the term frequencies to a .csv file.

findAssocs(dtm, "remove" , corlimit=0.5)
## $remove
## numeric(0)
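
The empty result is expected: "remove" is not among the 24 terms retained in the matrix, and with only two documents any term-term correlation is degenerate in any case. For illustration, the same query against a term that is in the matrix, such as "received":

findAssocs(dtm, "received", corlimit = 0.5)
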
library(wordcloud)
## Loading required package: RColorBrewer
word <- names(freq)

set.seed(142)   
dark2 <- brewer.pal(6, "Dark2")   
wordcloud(names(freq), freq, max.words=100, rot.per=0.2, colors=dark2)  

We now build a labeled data set from the raw messages and draw a random 20% sample for training.

all_text  <- c(as.character(spam$V1), as.character(ham$V1))
all_codes <- c(rep("spam", nrow(spam)), rep("ham", nrow(ham)))

train_idx      <- sample(seq_along(all_text), size = floor(0.2 * length(all_text)))
training_data  <- all_text[train_idx]
training_codes <- all_codes[train_idx]
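
RTextTools is loaded at the top but never used. As a sketch of where this split could lead, here is how the labeled text might feed an SVM classifier, assuming the archived RTextTools interface (create_matrix(), create_container(), train_model(), classify_model()); doc_matrix, container, and the model objects are illustrative names:

set.seed(142)
shuffle    <- sample(seq_along(all_text))  # mix spam and ham before splitting
doc_matrix <- create_matrix(all_text[shuffle], language = "english",
                            removeNumbers = TRUE, stemWords = TRUE)
n_train   <- floor(0.8 * length(all_text))
container <- create_container(doc_matrix, all_codes[shuffle],
                              trainSize = 1:n_train,
                              testSize  = (n_train + 1):length(all_text),
                              virgin = FALSE)
svm_model   <- train_model(container, "SVM")
svm_results <- classify_model(container, svm_model)
head(svm_results)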


setwd("c:/data")
write.csv(freq, file="dtm.csv")
