Acquiring the Data

The data for this text-mining project is a corpus called HC Corpora.

The corpus consists of text files in four languages: German, English, Finnish, and Russian. The texts were obtained from blogs, news articles, and Twitter.

For further processing, only the English-language files were considered.

# Basic Information of the Dataset

filepath1 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
filepath2 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.news.txt"
filepath3 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

Preprocessing of the Dataset

A manageable fraction of the dataset was chosen as a sample for building the prediction app.

The data was cleaned by eliminating extra whitespace, converting to lower case, removing stopwords, and stemming (erasing word suffixes to retrieve their radicals). Profanity filtering was also performed by removing words that appear in a profanity list.
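
The profanity-filtering step is not shown in the cleaning code later in this report; a minimal sketch of how it could be applied once the tm corpus has been built, assuming a local one-word-per-line list (the file name profanity.txt and the object profanity_words are placeholders, not part of the original code):

# Hypothetical sketch: remove words found in a profanity list from the corpus
# ("profanity.txt" is an assumed local file with one banned word per line)
profanity_words <- readLines("profanity.txt", encoding = "UTF-8", skipNul = TRUE)
corpus <- tm_map(corpus, removeWords, profanity_words)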

# Read blogs data in binary mode
conn <- file(filepath1, open="rb"); blogs <- readLines(conn, encoding="UTF-8",skipNul = T); close(conn)
# Read news data in binary mode
conn <- file(filepath2, open="rb"); news <- readLines(conn, encoding="UTF-8",skipNul = T); close(conn)
# Read twitter data in binary mode
conn <- file(filepath3, open="rb"); twitter <- readLines(conn, encoding="UTF-8",skipNul = T); close(conn)
# Remove temporary variable
rm(conn)
# Basic Summary Table (stri_stats_* come from stringi, kable from knitr)
library(stringi)
library(knitr)
sumTabl <- data.frame(
            file=c("en_US.blogs","en_US.news","en_US.twitter"),
            fileinMB=c(file.info(filepath1)$size/1024^2,
                           file.info(filepath2)$size/1024^2,
                           file.info(filepath3)$size/1024^2),
            t(rbind(sapply(list(blogs,news,twitter),stri_stats_general),
            Words=sapply(list(blogs,news,twitter),stri_stats_latex)[4,]))
             )
kable(sumTabl)
file             fileinMB      Lines   LinesNEmpty       Chars   CharsNWhite      Words
--------------  ---------  ---------  ------------  ----------  ------------  ---------
en_US.blogs      200.4242     899288        899288   206824382     170389539   37570839
en_US.news       196.2775    1010242       1010242   203223154     169860866   34494539
en_US.twitter    159.3641    2360148       2360148   162096241     134082806   30451170

Creating the Sample Text Corpus

# Remove non-ASCII characters from each source
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")

# Draw a small random sample from each source for the prototype
set.seed(565)
sample_data <- c(sample(blogs, length(blogs) * 0.0004),
                 sample(news, length(news) * 0.0055),
                 sample(twitter, length(twitter) * 0.0001))
# Build a tm corpus from the sample (VCorpus/VectorSource are in the tm package)
library(tm)
corpus <- VCorpus(VectorSource(sample_data))
rm(blogs, news, twitter)
# Cleaning the Sample corpus
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
##        word freq
## said   said 1422
## year   year  663
## will   will  656
## one     one  556
## new     new  444
## time   time  429
## like   like  398
## get     get  391
## state state  380
## say     say  361
library(memisc)
library("wordcloud")
library("RColorBrewer")
par(bg="black")

# Plot the 600 most frequent words (the word and frequency vectors must have equal length)
wordcloud(d$word[1:600], d$freq[1:600], col=terrain.colors(600, alpha=0.9), random.order=FALSE, rot.per=0.3)
title(main = "Most Used Words in the Corpus", font.main = 1, col.main = "cornsilk3", cex.main = 1.0)

# Sentiment Analysis of the Corpus
library(syuzhet)
d<-get_nrc_sentiment(sample_data)
td<-data.frame(t(d))
td_new <- data.frame(rowSums(td[2:6151]))
# rowSums adds up each sentiment's counts across the sampled documents (one column per document)

# Transformation and cleaning
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2<-td_new[1:8,]
# Visualisation
library("ggplot2")
qplot(sentiment, data=td_new2, weight=count, geom="bar",fill=sentiment)+ggtitle("Corpus sentiments")   

Finally, the NGramTokenizer and TermDocumentMatrix functions were used to collect term patterns and the frequencies with which terms occur on their own and together with other specific words. Unigram, bigram, and trigram frequencies were estimated for use in building the prediction model.

library(RWeka)
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uni_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = uni_tokenizer))
bi_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = bi_tokenizer))
tri_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = tri_tokenizer))
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 50)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 10)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 5)

uni_corpus_freq <- rowSums(as.matrix(uni_matrix[uni_corpus,]))
uni_corpus_freq <- data.frame(word=names(uni_corpus_freq), frequency=uni_corpus_freq)
bi_corpus_freq <- rowSums(as.matrix(bi_matrix[bi_corpus,]))
bi_corpus_freq <- data.frame(word=names(bi_corpus_freq), frequency=bi_corpus_freq)
tri_corpus_freq <- rowSums(as.matrix(tri_matrix[tri_corpus,]))
tri_corpus_freq <- data.frame(word=names(tri_corpus_freq), frequency=tri_corpus_freq)
head(uni_corpus_freq)
##            word frequency
## abl         abl        60
## accord   accord       129
## account account        51
## activ     activ        55
## age         age        62
## agenc     agenc        63
# Plot the `num` most frequent n-grams in `data` (a word/frequency data frame)
plotNgrams <- function(data, title, num) {
  df2 <- data[order(-data$frequency), ][1:num, ]
  l <- df2$word[1:num]
  ggplot(df2, aes(x = seq_len(num), y = frequency)) +
    geom_bar(stat = "identity", fill = "red", colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, num + 1)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    scale_x_continuous(breaks = seq(1, num, by = 1), labels = l) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
plotNgrams(uni_corpus_freq, "Top Unigrams", 20)

plotNgrams(bi_corpus_freq, "Top Bigrams", 20)

plotNgrams(tri_corpus_freq, "Top Trigrams", 8)

Building the Prediction Algorithm

The predictive model will analyse the words associated with the terms keyed in by the user and suggest the next possible term(s), ranked by the relevance (n-gram frequency) of each candidate.
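
As an illustration of the intended approach, the sketch below shows one way the trigram and bigram frequency tables built above could drive a simple back-off lookup. The function predict_next_word and its matching logic are hypothetical, not part of the code in this report; the sketch only assumes the bi_corpus_freq and tri_corpus_freq data frames (word and frequency columns) already exist.

# Hypothetical sketch of a frequency-based back-off predictor
# (assumes bi_corpus_freq and tri_corpus_freq from the n-gram step above)
predict_next_word <- function(input, n = 3) {
  tokens <- unlist(strsplit(tolower(input), "\\s+"))
  last_two <- paste(tail(tokens, 2), collapse = " ")
  last_one <- tail(tokens, 1)
  # Try trigrams whose first two words match the end of the input
  hits <- tri_corpus_freq[grepl(paste0("^", last_two, " "), tri_corpus_freq$word), ]
  if (nrow(hits) == 0) {
    # Back off to bigrams whose first word matches the last input word
    hits <- bi_corpus_freq[grepl(paste0("^", last_one, " "), bi_corpus_freq$word), ]
  }
  if (nrow(hits) == 0) return(character(0))
  hits <- hits[order(-hits$frequency), ]
  # Return the final word of the top matching n-grams
  sapply(strsplit(as.character(head(hits$word, n)), " "), tail, 1)
}
predict_next_word("in the")

In the final app, a proper smoothing or back-off scheme (for example Katz back-off or Kneser-Ney) would likely replace this simple frequency ranking.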