Acquiring the Data

The data for this text-mining project is a corpus called HC Corpora.

The corpus consists of text files in four languages: German, English, Finnish, and Russian. The texts were obtained from blogs, news articles, and Twitter.

For further processing, only the English-language files were considered.

# Basic Information of the Dataset

filepath1 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
filepath2 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.news.txt"
filepath3 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

Preprocessing of the Dataset

A manageable fraction of the dataset was chosen as a sample for building the prediction app.

The data was cleaned by eliminating extra whitespace, converting to lower case, removing stopwords, and stemming (erasing word suffixes to retrieve their radicals). Profanity filtering was also performed by removing words that appear in a profanity list.
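
The profanity-filtering step is not shown in the cleaning code later in this report; a minimal sketch of how it could be applied once the tm corpus has been built, assuming a local one-word-per-line list (the file name profanity.txt and the object profanity_words are placeholders, not part of the original code):

# Hypothetical sketch: remove words found in a profanity list from the corpus
# ("profanity.txt" is an assumed local file with one banned word per line)
profanity_words <- readLines("profanity.txt", encoding = "UTF-8", skipNul = TRUE)
corpus <- tm_map(corpus, removeWords, profanity_words)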

# Read blogs data in binary mode
conn <- file(filepath1, open="rb"); blogs <- readLines(conn, encoding="UTF-8",skipNul = T); close(conn)
# Read news data in binary mode
conn <- file(filepath2, open="rb"); news <- readLines(conn, encoding="UTF-8",skipNul = T); close(conn)
# Read twitter data in binary mode
conn <- file(filepath3, open="rb"); twitter <- readLines(conn, encoding="UTF-8",skipNul = T); close(conn)
# Remove temporary variable
rm(conn)
# Basic Summary Table (stri_stats_* come from stringi, kable from knitr)
library(stringi)
library(knitr)
sumTabl <- data.frame(
            file=c("en_US.blogs","en_US.news","en_US.twitter"),
            fileinMB=c(file.info(filepath1)$size/1024^2,
                           file.info(filepath2)$size/1024^2,
                           file.info(filepath3)$size/1024^2),
            t(rbind(sapply(list(blogs,news,twitter),stri_stats_general),
            Words=sapply(list(blogs,news,twitter),stri_stats_latex)[4,]))
             )
kable(sumTabl)
file             fileinMB      Lines   LinesNEmpty       Chars   CharsNWhite      Words
--------------  ---------  ---------  ------------  ----------  ------------  ---------
en_US.blogs      200.4242     899288        899288   206824382     170389539   37570839
en_US.news       196.2775    1010242       1010242   203223154     169860866   34494539
en_US.twitter    159.3641    2360148       2360148   162096241     134082806   30451170

Creating the Sample Text Corpus

# Remove non-ASCII characters from each source
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")

# Draw a small random sample from each source for the prototype
set.seed(565)
sample_data <- c(sample(blogs, length(blogs) * 0.0004),
                 sample(news, length(news) * 0.0055),
                 sample(twitter, length(twitter) * 0.0001))
# Build a tm corpus from the sample (VCorpus/VectorSource are in the tm package)
library(tm)
corpus <- VCorpus(VectorSource(sample_data))
rm(blogs, news, twitter)
# Cleaning the Sample corpus
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
##        word freq
## said   said 1422
## year   year  663
## will   will  656
## one     one  556
## new     new  444
## time   time  429
## like   like  398
## get     get  391
## state state  380
## say     say  361
library(memisc)
library("wordcloud")
library("RColorBrewer")
par(bg="black")

# Plot the 600 most frequent words (the word and frequency vectors must have equal length)
wordcloud(d$word[1:600], d$freq[1:600], col=terrain.colors(600, alpha=0.9), random.order=FALSE, rot.per=0.3)
title(main = "Most Used Words in the Corpus", font.main = 1, col.main = "cornsilk3", cex.main = 1.0)

# Sentiment Analysis of the Corpus
library(syuzhet)
d<-get_nrc_sentiment(sample_data)
td<-data.frame(t(d))
td_new <- data.frame(rowSums(td[2:6151]))
# rowSums adds up each sentiment's counts across the sampled documents (one column per document)

# Transformation and cleaning
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2<-td_new[1:8,]
# Visualisation
library("ggplot2")
qplot(sentiment, data=td_new2, weight=count, geom="bar",fill=sentiment)+ggtitle("Corpus sentiments")   

Finally, the NGramTokenizer and TermDocumentMatrix functions were used to collect term patterns and the frequencies with which terms occur on their own and together with other specific words. Unigram, bigram, and trigram frequencies were estimated for use in building the prediction model.

library(RWeka)
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uni_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = uni_tokenizer))
bi_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = bi_tokenizer))
tri_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = tri_tokenizer))
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 50)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 10)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 5)

uni_corpus_freq <- rowSums(as.matrix(uni_matrix[uni_corpus,]))
uni_corpus_freq <- data.frame(word=names(uni_corpus_freq), frequency=uni_corpus_freq)
bi_corpus_freq <- rowSums(as.matrix(bi_matrix[bi_corpus,]))
bi_corpus_freq <- data.frame(word=names(bi_corpus_freq), frequency=bi_corpus_freq)
tri_corpus_freq <- rowSums(as.matrix(tri_matrix[tri_corpus,]))
tri_corpus_freq <- data.frame(word=names(tri_corpus_freq), frequency=tri_corpus_freq)
head(uni_corpus_freq)
##            word frequency
## abl         abl        60
## accord   accord       129
## account account        51
## activ     activ        55
## age         age        62
## agenc     agenc        63
# Plot the `num` most frequent n-grams in `data` (a word/frequency data frame)
plotNgrams <- function(data, title, num) {
  df2 <- data[order(-data$frequency), ][1:num, ]
  l <- df2$word[1:num]
  ggplot(df2, aes(x = seq_len(num), y = frequency)) +
    geom_bar(stat = "identity", fill = "red", colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, num + 1)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    scale_x_continuous(breaks = seq(1, num, by = 1), labels = l) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
plotNgrams(uni_corpus_freq, "Top Unigrams", 20)

plotNgrams(bi_corpus_freq, "Top Bigrams", 20)

plotNgrams(tri_corpus_freq, "Top Trigrams", 8)

Building the Prediction Algorithm

The predictive model will analyse the words associated with the terms keyed in by the user and suggest the next possible term(s), ranked by the relevance (n-gram frequency) of each candidate.
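
As an illustration of the intended approach, the sketch below shows one way the trigram and bigram frequency tables built above could drive a simple back-off lookup. The function predict_next_word and its matching logic are hypothetical, not part of the code in this report; the sketch only assumes the bi_corpus_freq and tri_corpus_freq data frames (word and frequency columns) already exist.

# Hypothetical sketch of a frequency-based back-off predictor
# (assumes bi_corpus_freq and tri_corpus_freq from the n-gram step above)
predict_next_word <- function(input, n = 3) {
  tokens <- unlist(strsplit(tolower(input), "\\s+"))
  last_two <- paste(tail(tokens, 2), collapse = " ")
  last_one <- tail(tokens, 1)
  # Try trigrams whose first two words match the end of the input
  hits <- tri_corpus_freq[grepl(paste0("^", last_two, " "), tri_corpus_freq$word), ]
  if (nrow(hits) == 0) {
    # Back off to bigrams whose first word matches the last input word
    hits <- bi_corpus_freq[grepl(paste0("^", last_one, " "), bi_corpus_freq$word), ]
  }
  if (nrow(hits) == 0) return(character(0))
  hits <- hits[order(-hits$frequency), ]
  # Return the final word of the top matching n-grams
  sapply(strsplit(as.character(head(hits$word, n)), " "), tail, 1)
}
predict_next_word("in the")

In the final app, a proper smoothing or back-off scheme (for example Katz back-off or Kneser-Ney) would likely replace this simple frequency ranking.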