The objective of this assignment is to work with sentiments using a spam/ham dataset: the data is used for training and is transformed into a corpus that can be analyzed using R programming.
To perform this assignment, the following libraries are needed for cleaning, analyzing, and visualizing the sentiment data.
library(tm)
library(knitr)
library(plyr)
library(wordcloud)
Select the desired parameters and boundaries for the imported data.
# Parameters controlling how much data is loaded into R and displayed
# Maximum number of files to load from each directory (read by get_texts)
n_files <- 50
# Number of table rows to display (passed to head() before kable())
n_rows <- 10
# Define how many variables to analyze
# NOTE(review): n_terms is not referenced in the visible code - confirm it is still needed
n_terms <- 999
Create a function to import text data into R from the desired directory.
# Read up to `n_max` files from a directory and return their lines as a corpus.
#
# Args:
#   dir:   Path of the directory containing the text files.
#   n_max: Maximum number of files to read; defaults to the global `n_files`.
#
# Returns:
#   A corpus built with Corpus(VectorSource(...)) in which every line of every
#   file that was read is a separate document.
get_texts <- function(dir, n_max = n_files) {
  files <- list.files(path = dir, full.names = TRUE)
  # Keep only the first `n_max` files up front rather than walking the whole
  # listing and skipping the files past the limit.
  files <- head(files, n_max)
  # Read every selected file; unlist() flattens the per-file line vectors in
  # one step instead of growing a vector inside a loop.
  contents <- unlist(lapply(files, readLines), use.names = FALSE)
  Corpus(VectorSource(contents))
}
Import the texts into the R environment.
# Get the ham texts: build a corpus from the files in the easy_ham directory
ham_texts <- get_texts("C:/Users/Ali/Desktop/lab/easy_ham/")
# Number of documents in the ham corpus
length(ham_texts)
## [1] 4621
# Get the spam texts: build a corpus from the files in the spam_2 directory
spam_texts <- get_texts("C:/Users/Ali/Desktop/lab/spam_2/")
# Number of documents in the spam corpus
length(spam_texts)
## [1] 8924
Apply filtering and formatting to the data using a list of parameters.
# General filtering: control options passed to TermDocumentMatrix() for both corpora
# NOTE(review): `minWordLength` and `stripWhitespace` may not be control options
# recognised by the installed tm version (newer releases document `wordLengths`
# instead) - confirm against the tm documentation for termFreq().
trm <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)
# Basic data filtering and formatting: build the ham term-document matrix
ham_trm <- TermDocumentMatrix(ham_texts,control=trm)
# Print a summary of the ham term-document matrix
ham_trm
## <<TermDocumentMatrix (terms: 3017, documents: 4621)>>
## Non-/sparse entries: 12425/13929132
## Sparsity : 100%
## Maximal term length: 67
## Weighting : term frequency (tf)
# Build the spam term-document matrix with the same control options
spam_trm <- TermDocumentMatrix(spam_texts,control=trm)
# Print a summary of the spam term-document matrix
spam_trm
## <<TermDocumentMatrix (terms: 7068, documents: 8924)>>
## Non-/sparse entries: 22634/63052198
## Sparsity : 100%
## Maximal term length: 251
## Weighting : term frequency (tf)
Turn the results into R data frames, perform cleaning, rename columns, format data types, and display the results in tables.
# Get the spam: convert the spam term-document matrix into a data frame with
# one row per (term, document) pair.
spamdata <- as.data.frame(as.table(spam_trm))
spamdata$spam_ham <- "SPAM"
colnames(spamdata) <- c("TERM", "SPAM_DOCS", "SPAM_FREQ", "TYPE_SPAM")
# The document column is not needed once frequencies are summed per term.
spamdata <- subset(spamdata, select = -SPAM_DOCS)
# Replace missing counts with a numeric zero so the column stays numeric
# (assigning the string '0' would convert the whole column to character).
spamdata$SPAM_FREQ[is.na(spamdata$SPAM_FREQ)] <- 0
# Total frequency of each term across all spam documents.
spamdata <- ddply(spamdata, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
# Display spam data in a table
kable(head(spamdata, n = n_rows))
| TERM | TYPE_SPAM | SPAM_FREQ |
|---|---|---|
| “ecomog” | SPAM | 1 |
| “on | SPAM | 2 |
| aaa | SPAM | 1 |
| aaaaabafeabaafaeefeebbbaafaabbc | SPAM | 2 |
| aaaaadeaeeeafbdaecaabaeabaadecbfa | SPAM | 2 |
| aaaabaaaeabaaeaaaadabeaabbafdfde | SPAM | 2 |
| aaaafedcbafedc | SPAM | 2 |
| aaabefaaaaabaffaadeaafaeaabb | SPAM | 2 |
| aaacetcolnetco | SPAM | 1 |
| aab | SPAM | 1 |
# Number of rows in the spam term table
spamdatacount <- nrow(spamdata)
spamdatacount
## [1] 7068
# Get the ham: convert the ham term-document matrix into a data frame with
# one row per (term, document) pair.
hamdata <- as.data.frame(as.table(ham_trm))
hamdata$spam_ham <- "HAM"
colnames(hamdata) <- c("TERM", "HAM_DOCS", "HAM_FREQ", "TYPE_HAM")
# The document column is not needed once frequencies are summed per term.
hamdata <- subset(hamdata, select = -HAM_DOCS)
# Replace missing counts with a numeric zero so the column stays numeric
# (assigning the string '0' would convert the whole column to character).
hamdata$HAM_FREQ[is.na(hamdata$HAM_FREQ)] <- 0
# Total frequency of each term across all ham documents.
hamdata <- ddply(hamdata, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
# Display ham data in a table
kable(head(hamdata, n = n_rows))
| TERM | TYPE_HAM | HAM_FREQ |
|---|---|---|
| aaf | HAM | 1 |
| abef | HAM | 1 |
| abefa | HAM | 1 |
| ability | HAM | 2 |
| able | HAM | 6 |
| acc | HAM | 1 |
| access | HAM | 4 |
| accessible | HAM | 1 |
| accomplish | HAM | 1 |
| account | HAM | 2 |
# Number of rows in the ham term table
hamdatacount <- nrow(hamdata)
hamdatacount
## [1] 3017
# Merge ham and spam into one data frame, keeping terms that occur in only
# one of the two tables (all = TRUE).
ham_spam_df <- merge(x = hamdata, y = spamdata, by = "TERM", all = TRUE)
# Terms missing from one table come back as NA after the merge: give them a
# numeric zero frequency (so the columns stay numeric) and the fixed label.
ham_spam_df$SPAM_FREQ[is.na(ham_spam_df$SPAM_FREQ)] <- 0
ham_spam_df$TYPE_SPAM[is.na(ham_spam_df$TYPE_SPAM)] <- "SPAM"
ham_spam_df$HAM_FREQ[is.na(ham_spam_df$HAM_FREQ)] <- 0
ham_spam_df$TYPE_HAM[is.na(ham_spam_df$TYPE_HAM)] <- "HAM"
# Spam weight of a term: positive when it is more frequent in spam than in
# ham, negative when it is more frequent in ham.
ham_spam_df$SPAM_WEIGHT <- as.numeric(ham_spam_df$SPAM_FREQ) - as.numeric(ham_spam_df$HAM_FREQ)
# Display the rows sorted by HAM_FREQ, descending
kable(head(ham_spam_df[order(-as.numeric(ham_spam_df$HAM_FREQ)), ], n = n_rows))
| | TERM | TYPE_HAM | HAM_FREQ | TYPE_SPAM | SPAM_FREQ | SPAM_WEIGHT |
|---|---|---|---|---|---|---|
| 164 | aug | HAM | 543 | SPAM | 28 | -515 |
| 2246 | received | HAM | 414 | SPAM | 172 | -242 |
| 2716 | thu | HAM | 376 | SPAM | 55 | -321 |
| 820 | esmtp | HAM | 197 | SPAM | 73 | -124 |
| 1627 | localhost | HAM | 178 | SPAM | 23 | -155 |
| 2136 | postfix | HAM | 113 | SPAM | 50 | -63 |
| 3013 | zzzzlocalhost | HAM | 100 | SPAM | 0 | -100 |
| 1600 | list | HAM | 91 | SPAM | 40 | -51 |
| 755 | edt | HAM | 86 | SPAM | 13 | -73 |
| 623 | deliveredto | HAM | 85 | SPAM | 44 | -41 |
# Display the rows sorted by SPAM_FREQ, descending
kable(head(ham_spam_df[order(-as.numeric(ham_spam_df$SPAM_FREQ)), ], n=n_rows))
| | TERM | TYPE_HAM | HAM_FREQ | TYPE_SPAM | SPAM_FREQ | SPAM_WEIGHT |
|---|---|---|---|---|---|---|
| 5131 | font | HAM | 0 | SPAM | 284 | 284 |
| 5007 | faceverdana | HAM | 0 | SPAM | 274 | 274 |
| 3334 | arial | HAM | 0 | SPAM | 272 | 272 |
| 5440 | helvetica | HAM | 0 | SPAM | 271 | 271 |
| 4146 | colorfont | HAM | 0 | SPAM | 270 | 270 |
| 5270 | geneva | HAM | 0 | SPAM | 270 | 270 |
| 7560 | sansseriffont | HAM | 0 | SPAM | 270 | 270 |
| 783 | HAM | 34 | SPAM | 178 | 144 | |
| 2246 | received | HAM | 414 | SPAM | 172 | -242 |
| 2917 | will | HAM | 27 | SPAM | 147 | 120 |
Create word cloud charts of the ham and spam texts.
# Word cloud of the ham corpus: at most 200 words, placed in decreasing
# frequency order, drawn in blue
wordcloud(
  ham_texts,
  max.words = 200,
  random.order = FALSE,
  colors = "blue"
)
# Word cloud of the spam corpus with the same settings, drawn in red
wordcloud(
  spam_texts,
  max.words = 200,
  random.order = FALSE,
  colors = "red"
)
Create a function to calculate the spam score of a message.
# Calculate and print the spam score of one message file (positive means more
# likely to be spam).
#
# The file is split into lower-cased words, every word is looked up in the
# global `ham_spam_df` term table, and the SPAM_WEIGHT values of the words
# found there are summed. Words that are not in the table contribute nothing.
#
# Args:
#   datapath: Path of the message file to score.
#
# Returns:
#   The score, printed and returned invisibly.
getspam_score <- function(datapath) {
  content <- readLines(datapath)
  a_string <- paste(content, collapse = " ")
  words <- tolower(strsplit(a_string, "\\W+")[[1]])
  # match() finds the table row of each word. Comparing the two vectors with
  # `==` would pair them element by element (with recycling) instead of
  # looking each word up. as.character() guards against TERM being a factor.
  term_rows <- match(words, as.character(ham_spam_df$TERM))
  # Unmatched words give NA rows, which na.rm = TRUE leaves out of the sum.
  tscore <- sum(ham_spam_df$SPAM_WEIGHT[term_rows], na.rm = TRUE)
  print(tscore)
  invisible(tscore)
}
Get the scores of some ham and spam messages.
# Test with ham: score three messages from the easy_ham directory
# (each call prints the score; the recorded output follows the call)
getspam_score("C:/Users/Ali/Desktop/lab/easy_ham/0044.f1db2c76854ee58bc73d0c85ca6a86d2")
## [1] 18
getspam_score("C:/Users/Ali/Desktop/lab/easy_ham/0076.f565b68778786f9b9736f779489331f0")
## [1] 0
getspam_score("C:/Users/Ali/Desktop/lab/easy_ham/0062.b675bdb7b9e2321dfe97e48037fe7782")
## [1] -187
# Test with spam: score three messages from the spam_2 directory
getspam_score("C:/Users/Ali/Desktop/lab/spam_2/00018.336cb9e7b0358594cf002e7bf669eaf5")
## [1] 3
getspam_score("C:/Users/Ali/Desktop/lab/spam_2/00036.5b5e714c8d5b1050a392e55c42070f2c")
## [1] 0
getspam_score("C:/Users/Ali/Desktop/lab/spam_2/00118.141d803810acd9d4fc23db103dddfcd9")
## [1] 16
In conclusion, sentiment data can be imported and analyzed with R. Sentiment analysis is a powerful tool used to assess and categorize texts and turn them into meaningful data for opinion mining. We can determine emotional tones, attitudes, and opinions based on a set of text data.
To analyze sentiment, we need to define a bag of words, get and reduce terms, filter data, convert inflected terms to their roots, extract words, and train the classifier.