# NOTE(review): hard-coded absolute working directory. Every relative path
# used later (e.g. "rawdata.RData", "profanity.txt", "unigram.RData") is
# resolved against it, so the script only runs unchanged on this machine.
setwd("D:/personal/R/swiftkey_capstone")
library(tm)
## Loading required package: NLP
library(slam)
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(stringi)
library(reshape)
library(eRm)
library(xtable)
library(knitr)

Summary

The goal of the capstone project is to predict successive words from text typed by the user. This project uses the data provided by SwiftKey. This report mainly contains the analysis of the text corpus data.

The data consists of text documents from sources like blogs, news and Twitter, in 4 different languages. For this analysis, I have considered only the English-language data.

Reading and Cleaning Data

Since the data is large, we can read the files once and save the variables to reduce the run time for consecutive executions.

# One-time import of the raw English text files, kept commented out after the
# first run. The news file is read through a connection opened in binary mode
# ("rb"); the three resulting vectors are cached in rawdata.RData.
#blogs <-  readLines("eng_data/en_US.blogs.txt",encoding="UTF-8", skipNul = TRUE)
#tweets <- readLines("eng_data/en_US.twitter.txt", encoding="UTF-8",skipNul = TRUE)
#con       <- file("eng_data/en_US.news.txt", open="rb")
#news    <- readLines(con, encoding="UTF-8")
#close(con)
#rm(con)
#save(blogs,tweets,news,file = "rawdata.RData")

# Restore the cached objects (presumably blogs, tweets and news, as written by
# the save() call above) instead of re-reading the text files.
load("rawdata.RData")

Exploratory Analysis of the datasets

First, we will explore each dataset individually and find some important features of each dataset.

### Blogs

# Length of the blogs vector (one element per line read from the blogs file).
summary(blogs)
##    Length     Class      Mode 
##    899288 character character
# Character and word statistics from stringi; the "Words" entry is the
# word count.
stri_stats_latex(blogs)
##     CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
##     162464653             9      42636700      37570839             3 
##        Envirs 
##             0

Thus, the blogs data contains 37570839 words and 899288 lines. Before finding the most frequent words in this dataset, we can write a generic function to clean the data.

# Profanity word list: first column (V1) of the header-less file, coerced to
# a plain character vector.
profanity <- as.character(
  read.csv(file = "profanity.txt", header = FALSE)[["V1"]]
)



# Corpus transformer for tm_map(): replaces every match of `pattern` in the
# document content with a single space.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern = pattern, replacement = " ", x = x)
  }
)

# Build a cleaned tm corpus from raw text.
#
# Arguments:
#   texts     - character vector, one document per element.
#   profanity - character vector of words to drop from every document.
# Returns:
#   The cleaned VCorpus (returned visibly as the function's last expression).
#
# The order of the steps matters: anything that relies on punctuation, case
# or whole (unstemmed) words has to run before the text is reduced to
# lower-case letters and stemmed.
cleanCorpus <- function(texts, profanity) {
  # Strip URLs while ":", "/" and "." are still present; the pattern stops at
  # the first whitespace so the rest of the line is kept.
  texts <- gsub("(f|ht)tps?://\\S+", " ", texts)
  # Retweet markers, matched as whole words and before lower-casing so that
  # "RT" is not confused with ordinary text.
  texts <- gsub("\\b(RT|via)\\b", " ", texts)

  corpus_data <- VCorpus(VectorSource(texts))
  corpus_data <- tm_map(corpus_data, content_transformer(stri_trans_tolower))
  # Stopwords such as "don't" contain apostrophes, so they must be matched
  # before non-letter characters are removed.
  corpus_data <- tm_map(corpus_data, removeWords, stopwords("english"))
  # Profanity is matched against whole words, i.e. before stemming alters
  # them; the list is lower-cased to agree with the lower-cased text.
  corpus_data <- tm_map(corpus_data, removeWords, tolower(profanity))
  # Keep only letters and spaces. This also removes digits and punctuation,
  # so separate removeNumbers / removePunctuation passes are not needed.
  corpus_data <- tm_map(
    corpus_data,
    content_transformer(function(x) gsub("[^a-z ]", "", x))
  )
  corpus_data <- tm_map(corpus_data, stemDocument)
  corpus_data <- tm_map(corpus_data, stripWhitespace)
  corpus_data
}

We can use this function to clean all the datasets.

# blogs_clean <- cleanCorpus(blogs,profanity = profanity)
# news_clean <- cleanCorpus(news,profanity = profanity)
# tweets_clean <- cleanCorpus(tweets,profanity = profanity)
# save(blogs_clean,news_clean,tweets_clean,file = "cleandata.RData")
# load("cleandata.RData")
# dtm_blogs <- DocumentTermMatrix(blogs_clean,control=list(bounds=list(global=c(10, Inf))))
# 
# freq_blogs <- sort(col_sums(dtm_blogs, na.rm=T), decreasing=TRUE)
# df_blogs <- data.frame(word=names(freq_blogs), freq=freq_blogs)

# dtm_news <- DocumentTermMatrix(news_clean,control=list(bounds=list(global=c(10, Inf))))
# freq_news <- sort(col_sums(dtm_news, na.rm=T), decreasing=TRUE)
# df_news <- data.frame(word=names(freq_news), freq=freq_news)
# save(dtm_news,df_news,file = "news_unigram.RData")
# 
# dtm_tweets <- DocumentTermMatrix(tweets_clean,control=list(bounds=list(global=c(10, Inf))))
# freq_tweets <- sort(col_sums(dtm_tweets, na.rm=T), decreasing=TRUE)
# df_tweets <- data.frame(word=names(freq_tweets), freq=freq_tweets)
# save(dtm_tweets,df_tweets,file = "tweets_unigram.RData")
# save(dtm_blogs,df_blogs,file = "blogs_unigram.RData")

# Cached unigram frequency tables (df_blogs, df_news, df_tweets), written once
# by the commented-out save() call below and reloaded on later runs.
#save(df_blogs,df_news,df_tweets,file = "unigram.RData")
load("unigram.RData")

The top 20 words from the blogs can be plotted as a bar chart and a word cloud.

# Top 20 blog words. head() returns only the rows that exist, whereas
# df[1:20, ] pads with NA rows when the table is shorter than 20.
# Assumes df_blogs is already sorted by decreasing frequency - TODO confirm.
blogs_top20 <- head(df_blogs, 20)
# The frequencies are pre-computed, so they are drawn as bars with
# stat = "identity"; a histogram geom is meant for binning raw observations.
ggplot(blogs_top20, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(aes(fill = freq), stat = "identity") +
  coord_flip() +
  ggtitle("Top 20 words in blogs data") +
  xlab("Words") +
  ylab("Frequency")

# Word cloud of up to 40 blog words.
wordcloud(
  df_blogs$word, freq = df_blogs$freq, scale = c(4, 0.5),
  random.order = FALSE, rot.per = .15,
  colors = brewer.pal(8, "Dark2"), max.words = 40
)

A similar analysis can be done for the news and Twitter data.

News

# Length of the news vector (one element per line read from the news file).
summary(news)
##    Length     Class      Mode 
##   1010242 character character
# Character and word statistics from stringi; the "Words" entry is the
# word count.
stri_stats_latex(news)
##     CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
##     162227130             2      40263955      34494539             1 
##        Envirs 
##             0
# Top 20 news words. head() returns only the rows that exist, whereas
# df[1:20, ] pads with NA rows when the table is shorter than 20.
# Assumes df_news is already sorted by decreasing frequency - TODO confirm.
news_top20 <- head(df_news, 20)
# The frequencies are pre-computed, so they are drawn as bars with
# stat = "identity"; a histogram geom is meant for binning raw observations.
ggplot(news_top20, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(aes(fill = freq), stat = "identity") +
  coord_flip() +
  ggtitle("Top 20 words in news data") +
  xlab("Words") +
  ylab("Frequency")

# Word cloud of up to 40 news words.
wordcloud(
  df_news$word, freq = df_news$freq, scale = c(4, 0.5),
  random.order = FALSE, rot.per = .15,
  colors = brewer.pal(8, "Dark2"), max.words = 40
)

Twitter

#### twitter
# Length of the tweets vector (one element per line read from the twitter
# file).
summary(tweets)
##    Length     Class      Mode 
##   2360148 character character
# Character and word statistics from stringi; the "Words" entry is the
# word count.
stri_stats_latex(tweets)
##     CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
##     125570778          3032      35958529      30451170           963 
##        Envirs 
##             0
# Top 20 twitter words. head() returns only the rows that exist, whereas
# df[1:20, ] pads with NA rows when the table is shorter than 20.
# Assumes df_tweets is already sorted by decreasing frequency - TODO confirm.
tweets_top20 <- head(df_tweets, 20)
# The frequencies are pre-computed, so they are drawn as bars with
# stat = "identity"; a histogram geom is meant for binning raw observations.
ggplot(tweets_top20, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(aes(fill = freq), stat = "identity") +
  coord_flip() +
  ggtitle("Top 20 words in twitter data") +
  xlab("Words") +
  ylab("Frequency")

# Word cloud of up to 40 twitter words.
wordcloud(
  df_tweets$word, freq = df_tweets$freq, scale = c(4, 0.5),
  random.order = FALSE, rot.per = .15,
  colors = brewer.pal(8, "Dark2"), max.words = 40
)

Summary

# Line counts per dataset. length() returns an integer, whereas summary(x)[1]
# on a character vector yields the length as a character string, which would
# store the counts as text in the data frame.
num_lines <- c(length(blogs), length(news), length(tweets))
# Word counts per dataset; the "Words" entry is selected by name rather than
# by position so the lookup does not depend on the order of the statistics.
num_words <- c(
  stri_stats_latex(blogs)[["Words"]],
  stri_stats_latex(news)[["Words"]],
  stri_stats_latex(tweets)[["Words"]]
)
# One row per dataset, rendered as a markdown table.
data_summary <- data.frame(num_lines, num_words)
colnames(data_summary) <- c("Number of Lines", "Number of words")
rownames(data_summary) <- c("Blogs", "News", "Tweets")
kable(data_summary, format = "markdown")
Number of Lines Number of words
Blogs 899288 37570839
News 1010242 34494539
Tweets 2360148 30451170

We can observe that the most frequent words are "one", "said" and "just" in the blogs, news and Twitter data, respectively.

Note: Although a lot of the R code is commented out, I ran that code once, saved the results as .RData files, and now load those files to speed up the process.