Introduction

The objective of this assignment is to work with sentiment analysis using a spam/ham dataset: the data is loaded and transformed into a corpus, which can then be analyzed using R.

Libraries

To perform this assignment, the following libraries are needed for cleaning, analyzing, and visualizing the text data.

library(tm)
library(knitr)
library(plyr)
library(wordcloud)

Import Data

Select the desired parameters and limits for the imported data.

# Settings used when loading the files into R.
# Maximum number of files to read from each directory (used by get_texts).
n_files <- 50

# Number of table rows to display in the kable() previews.
n_rows <- 10

# Intended number of terms to analyze.
# NOTE(review): n_terms is not referenced anywhere else in the visible code.
n_terms <- 999

Create a function to import text data into R from the desired directory.

# Read the files in a directory and return their text as a tm corpus.
#
# Args:
#   dir:       Path of the directory that holds the message files.
#   max_files: Maximum number of files to read. Defaults to the script-level
#              `n_files` setting, so existing calls get_texts(dir) are unchanged.
#
# Returns:
#   A corpus built with Corpus(VectorSource(...)). Every *line* read from the
#   files becomes its own document (not one document per file), which is why
#   length() of the result is much larger than the number of files read.
get_texts <- function(dir, max_files = n_files) {
  files <- list.files(path = dir, full.names = TRUE)

  # Keep only the first `max_files` entries, in list.files() order, instead of
  # iterating over every file and skipping the ones past the limit.
  files <- head(files, max(0, max_files))

  # Read everything first and flatten once; growing a vector with c() inside
  # a loop copies the whole vector on every iteration.
  contents <- unlist(lapply(files, readLines), use.names = FALSE)
  if (is.null(contents)) {
    # No files (or only empty files): hand VectorSource an empty character
    # vector rather than NULL.
    contents <- character(0)
  }

  Corpus(VectorSource(contents))
}

Import the text into r environment

# Get the Ham Texts:
ham_texts <- get_texts("C:/Users/Ali/Desktop/lab/easy_ham/")
length(ham_texts)
## [1] 4621
# Get the Spam Texts:
spam_texts <- get_texts("C:/Users/Ali/Desktop/lab/spam_2/")
length(spam_texts)
## [1] 8924

Transforming Data

Apply filtering and formatting to the data using a list of control parameters.

# General filtering: control options passed to TermDocumentMatrix() for both
# the ham and the spam corpus.
# NOTE(review): `minWordLength` and `stripWhitespace` do not appear to be
# current tm termFreq control options (the documented form of the length
# filter is `wordLengths = c(min, max)`), so they may be silently ignored --
# confirm against the installed tm version before relying on them.
trm <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)

# Basic data filtering and formatting 
ham_trm <- TermDocumentMatrix(ham_texts,control=trm)
ham_trm
## <<TermDocumentMatrix (terms: 3017, documents: 4621)>>
## Non-/sparse entries: 12425/13929132
## Sparsity           : 100%
## Maximal term length: 67
## Weighting          : term frequency (tf)
spam_trm <- TermDocumentMatrix(spam_texts,control=trm)
spam_trm
## <<TermDocumentMatrix (terms: 7068, documents: 8924)>>
## Non-/sparse entries: 22634/63052198
## Sparsity           : 100%
## Maximal term length: 251
## Weighting          : term frequency (tf)

Turn the results into R data frames, perform cleaning, rename columns, format data types, and display the results in tables.

# Build the per-term frequency table for the spam corpus.

# Expand the term-document matrix into long form: one row per (term, document).
spamdata <- as.data.frame(as.table(spam_trm))
names(spamdata) <- c("TERM", "SPAM_DOCS", "SPAM_FREQ")
# Tag every row with its class label and drop the document id column.
spamdata$TYPE_SPAM <- "SPAM"
spamdata$SPAM_DOCS <- NULL
# Treat missing counts as zero before summing.
spamdata$SPAM_FREQ[is.na(spamdata$SPAM_FREQ)] <- "0"
# Collapse to one row per term with its total frequency across all documents.
spamdata <- ddply(
  spamdata,
  .(TERM, TYPE_SPAM),
  summarize,
  SPAM_FREQ = sum(as.numeric(SPAM_FREQ))
)

# display spam data in table
kable(head(spamdata, n = n_rows))
TERM TYPE_SPAM SPAM_FREQ
“ecomog” SPAM 1
“on SPAM 2
aaa SPAM 1
aaaaabafeabaafaeefeebbbaafaabbc SPAM 2
aaaaadeaeeeafbdaecaabaeabaadecbfa SPAM 2
aaaabaaaeabaaeaaaadabeaabbafdfde SPAM 2
aaaafedcbafedc SPAM 2
aaabefaaaaabaffaadeaafaeaabb SPAM 2
aaacetcolnetco SPAM 1
aab SPAM 1
spamdatacount <- nrow(spamdata)
spamdatacount
## [1] 7068
# Build the per-term frequency table for the ham corpus.

# Expand the term-document matrix into long form: one row per (term, document).
hamdata <- as.data.frame(as.table(ham_trm))
names(hamdata) <- c("TERM", "HAM_DOCS", "HAM_FREQ")
# Tag every row with its class label and drop the document id column.
hamdata$TYPE_HAM <- "HAM"
hamdata$HAM_DOCS <- NULL
# Treat missing counts as zero before summing.
hamdata$HAM_FREQ[is.na(hamdata$HAM_FREQ)] <- "0"
# Collapse to one row per term with its total frequency across all documents.
hamdata <- ddply(
  hamdata,
  .(TERM, TYPE_HAM),
  summarize,
  HAM_FREQ = sum(as.numeric(HAM_FREQ))
)

# display ham data in table

kable(head(hamdata, n = n_rows))
TERM TYPE_HAM HAM_FREQ
aaf HAM 1
abef HAM 1
abefa HAM 1
ability HAM 2
able HAM 6
acc HAM 1
access HAM 4
accessible HAM 1
accomplish HAM 1
account HAM 2
hamdatacount <- nrow(hamdata)
hamdatacount
## [1] 3017
# Merge ham and spam into one data frame: a full outer join on TERM, so terms
# that occur in only one of the two corpora are kept.

ham_spam_df <- merge(x = hamdata, y = spamdata, by = "TERM", all = TRUE)

# A term seen in only one corpus has NA in the other corpus's columns.
# Fill the frequencies with numeric 0 (not the string '0'): assigning a
# string would silently turn the whole frequency column into character.

ham_spam_df$SPAM_FREQ[is.na(ham_spam_df$SPAM_FREQ)] <- 0
ham_spam_df$TYPE_SPAM[is.na(ham_spam_df$TYPE_SPAM)] <- 'SPAM'
ham_spam_df$HAM_FREQ[is.na(ham_spam_df$HAM_FREQ)] <- 0
ham_spam_df$TYPE_HAM[is.na(ham_spam_df$TYPE_HAM)] <- 'HAM'

# Spam weight of a term: spam frequency minus ham frequency, so positive
# values mean the term is more common in spam. as.numeric() is kept so the
# line also works if the frequency columns arrive as character.
# The tables below show the result sorted by HAM_FREQ (then SPAM_FREQ) desc.

ham_spam_df$SPAM_WEIGHT <- as.numeric(ham_spam_df$SPAM_FREQ) - as.numeric(ham_spam_df$HAM_FREQ)
kable(head(ham_spam_df[order(-as.numeric(ham_spam_df$HAM_FREQ)), ], n=n_rows))
TERM TYPE_HAM HAM_FREQ TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
164 aug HAM 543 SPAM 28 -515
2246 received HAM 414 SPAM 172 -242
2716 thu HAM 376 SPAM 55 -321
820 esmtp HAM 197 SPAM 73 -124
1627 localhost HAM 178 SPAM 23 -155
2136 postfix HAM 113 SPAM 50 -63
3013 zzzzlocalhost HAM 100 SPAM 0 -100
1600 list HAM 91 SPAM 40 -51
755 edt HAM 86 SPAM 13 -73
623 deliveredto HAM 85 SPAM 44 -41
kable(head(ham_spam_df[order(-as.numeric(ham_spam_df$SPAM_FREQ)), ], n=n_rows))
TERM TYPE_HAM HAM_FREQ TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
5131 font HAM 0 SPAM 284 284
5007 faceverdana HAM 0 SPAM 274 274
3334 arial HAM 0 SPAM 272 272
5440 helvetica HAM 0 SPAM 271 271
4146 colorfont HAM 0 SPAM 270 270
5270 geneva HAM 0 SPAM 270 270
7560 sansseriffont HAM 0 SPAM 270 270
783 email HAM 34 SPAM 178 144
2246 received HAM 414 SPAM 172 -242
2917 will HAM 27 SPAM 147 120

Visualization

Create a cloud chart of the ham and spam.

# Word cloud of the ham corpus: at most 200 words, drawn in blue;
# random.order = FALSE turns off random placement order.
# NOTE(review): this is given the raw corpus from get_texts(), not the
# filtered term-document matrix, so the words shown may not line up with the
# frequency tables above -- confirm this is intended.
wordcloud(ham_texts, max.words = 200, random.order = FALSE, colors=c('blue'))

# Word cloud of the spam corpus, same settings, drawn in red.
wordcloud(spam_texts, max.words = 200, random.order = FALSE, colors=c('red'))

Calculation

Create a function to calculate the spam score of a message.

# Calculate the spam score of one message file (positive means more likely to
# be spam).
#
# The file is split into lower-cased words and the SPAM_WEIGHT of every word
# that appears in `ham_spam_df$TERM` is added up; a word that occurs several
# times counts several times, and words not in the table contribute nothing.
#
# Args:
#   datapath: Path of the message file to score.
#
# Returns:
#   The total score, printed and returned invisibly (print() returns its
#   argument). Reads the script-level data frame `ham_spam_df`.
getspam_score <- function(datapath) {
  content <- readLines(datapath)
  a_string <- paste(content, collapse = " ")
  words <- tolower(unlist(strsplit(a_string, "\\W+")))
  # strsplit() yields an empty first token when the text starts with a
  # non-word character; drop empty tokens.
  words <- words[nzchar(words)]

  # Look each word up by value. Comparing the two vectors with `==` would
  # compare them position by position (with recycling), which is not a lookup
  # and can index past the end of SPAM_WEIGHT, giving an NA score.
  hits <- match(words, as.character(ham_spam_df$TERM))
  tscore <- sum(as.numeric(ham_spam_df$SPAM_WEIGHT[hits]), na.rm = TRUE)

  print(tscore)
}

Get the scores of some sample ham and spam messages.

# test with ham 

getspam_score("C:/Users/Ali/Desktop/lab/easy_ham/0044.f1db2c76854ee58bc73d0c85ca6a86d2")
## [1] 18
getspam_score("C:/Users/Ali/Desktop/lab/easy_ham/0076.f565b68778786f9b9736f779489331f0")
## [1] 0
getspam_score("C:/Users/Ali/Desktop/lab/easy_ham/0062.b675bdb7b9e2321dfe97e48037fe7782")
## [1] -187
# test with spam

getspam_score("C:/Users/Ali/Desktop/lab/spam_2/00018.336cb9e7b0358594cf002e7bf669eaf5")
## [1] 3
getspam_score("C:/Users/Ali/Desktop/lab/spam_2/00036.5b5e714c8d5b1050a392e55c42070f2c")
## [1] 0
getspam_score("C:/Users/Ali/Desktop/lab/spam_2/00118.141d803810acd9d4fc23db103dddfcd9")
## [1] 16

Conclusion

In conclusion, sentiment data can be imported and analyzed with R. Sentiment analysis is a powerful tool for assessing and categorizing texts and turning them into meaningful data for opinion mining. We can determine emotional tones, attitudes, and opinions based on a set of text data.

To analyze sentiment, we need to define a bag of words, get and reduce terms, filter data, convert inflected terms to their roots, extract words, and train the classifier.