setwd("D:/personal/R/swiftkey_capstone")
library(tm)
## Loading required package: NLP
library(slam)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(stringi)
library(reshape)
library(eRm)
library(xtable)
library(knitr)
The goal of the capstone project is to predict successive words from text typed by the user. The project uses the data provided by SwiftKey, and this report contains an exploratory analysis of the text corpus.
The data consists of text documents from three sources (blogs, news and Twitter) in four different languages. For this analysis, only the English-language files are considered.
Since the data is large, we read the files once and save the resulting variables to reduce the run time of subsequent executions.
#blogs <- readLines("eng_data/en_US.blogs.txt",encoding="UTF-8", skipNul = TRUE)
#tweets <- readLines("eng_data/en_US.twitter.txt", encoding="UTF-8",skipNul = TRUE)
# the news file is opened in binary mode so that readLines does not stop at an embedded control character
#con <- file("eng_data/en_US.news.txt", open="rb")
#news <- readLines(con, encoding="UTF-8")
#close(con)
#rm(con)
#save(blogs,tweets,news,file = "rawdata.RData")
load("rawdata.RData")
First, we will explore each dataset individually and look at some basic features of each one.
### Blogs
summary(blogs)
## Length Class Mode
## 899288 character character
##count number of words
stri_stats_latex(blogs)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162464653 9 42636700 37570839 3
## Envirs
## 0
Thus, the blogs data contains 37,570,839 words across 899,288 lines. Before finding the most frequent words in this dataset, we can write a generic function to clean the data.
profanity <- as.character(read.csv("profanity.txt",header = FALSE)$V1)
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
cleanCorpus <- function(texts, profanity){
  corpus_data <- VCorpus(VectorSource(texts))
  #remove urls and retweet markers first, while the punctuation they contain is still intact
  corpus_data <- tm_map(corpus_data, toSpace, "(f|ht)tp(s?)://\\S+")
  corpus_data <- tm_map(corpus_data, toSpace, "RT |via ") # RT's and VIA
  corpus_data <- tm_map(corpus_data, content_transformer(stri_trans_tolower))
  corpus_data <- tm_map(corpus_data, removeWords, stopwords("english"))
  #remove profanity before stemming so the unstemmed entries in the list still match
  corpus_data <- tm_map(corpus_data, removeWords, profanity)
  corpus_data <- tm_map(corpus_data, removeNumbers)
  corpus_data <- tm_map(corpus_data, removePunctuation)
  corpus_data <- tm_map(corpus_data, toSpace, "[^a-z ]") #drop any remaining non-letter characters
  corpus_data <- tm_map(corpus_data, stemDocument)
  corpus_data <- tm_map(corpus_data, stripWhitespace)
  corpus_data
}
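As a quick illustrative check, the function can be applied to a single made-up sentence (the sample string and the `demo_corpus` name below are not part of the SwiftKey data); the URL, the retweet marker, the numbers and the stop words should disappear and the remaining words should be stemmed:
#illustrative check on a made-up sentence (not from the SwiftKey data)
demo_corpus <- cleanCorpus("Check out http://example.com NOW!!! RT 123 running dogs", profanity)
content(demo_corpus[[1]])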
We can use this function to clean all the datasets.
# blogs_clean <- cleanCorpus(blogs,profanity = profanity)
# news_clean <- cleanCorpus(news,profanity = profanity)
# tweets_clean <- cleanCorpus(tweets,profanity = profanity)
# save(blogs_clean,news_clean,tweets_clean,file = "cleandata.RData")
# load("cleandata.RData")
# dtm_blogs <- DocumentTermMatrix(blogs_clean,control=list(bounds=list(global=c(10, Inf))))
#
# freq_blogs <- sort(col_sums(dtm_blogs, na.rm=T), decreasing=TRUE)
# df_blogs <- data.frame(word=names(freq_blogs), freq=freq_blogs)
# dtm_news <- DocumentTermMatrix(news_clean,control=list(bounds=list(global=c(10, Inf))))
# freq_news <- sort(col_sums(dtm_news, na.rm=T), decreasing=TRUE)
# df_news <- data.frame(word=names(freq_news), freq=freq_news)
# save(dtm_news,df_news,file = "news_unigram.RData")
#
# dtm_tweets <- DocumentTermMatrix(tweets_clean,control=list(bounds=list(global=c(10, Inf))))
# freq_tweets <- sort(col_sums(dtm_tweets, na.rm=T), decreasing=TRUE)
# df_tweets <- data.frame(word=names(freq_tweets), freq=freq_tweets)
# save(dtm_tweets,df_tweets,file = "tweets_unigram.RData")
# save(dtm_blogs,df_blogs,file = "blogs_unigram.RData")
#save(df_blogs,df_news,df_tweets,file = "unigram.RData")
load("unigram.RData")
The top 20 words from the blogs can be plotted on a histogram and a wordcloud.
blogs_top20 <- df_blogs[1:20,]
ggplot(blogs_top20, aes(x = reorder(word, freq), y = freq)) + geom_col(aes(fill = freq)) + coord_flip() + ggtitle("Top 20 words in blogs data") + xlab("Words") + ylab("Frequency")
wordcloud(df_blogs$word,freq = df_blogs$freq, scale = c(4,0.5),random.order=FALSE, rot.per=.15,colors= brewer.pal(8,"Dark2"),max.words = 40)
A similar analysis can be done for the news and Twitter data.
### News
summary(news)
## Length Class Mode
## 1010242 character character
##count number of words
stri_stats_latex(news)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162227130 2 40263955 34494539 1
## Envirs
## 0
news_top20 <- df_news[1:20,]
ggplot(news_top20, aes(x = reorder(word, freq), y = freq)) + geom_col(aes(fill = freq)) + coord_flip() + ggtitle("Top 20 words in news data") + xlab("Words") + ylab("Frequency")
wordcloud(df_news$word,freq = df_news$freq, scale = c(4,0.5),random.order=FALSE, rot.per=.15,colors= brewer.pal(8,"Dark2"),max.words = 40)
### Twitter
summary(tweets)
## Length Class Mode
## 2360148 character character
##count number of words
stri_stats_latex(tweets)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 125570778 3032 35958529 30451170 963
## Envirs
## 0
tweets_top20 <- df_tweets[1:20,]
ggplot(tweets_top20, aes(x = reorder(word, freq), y = freq)) + geom_col(aes(fill = freq)) + coord_flip() + ggtitle("Top 20 words in twitter data") + xlab("Words") + ylab("Frequency")
wordcloud(df_tweets$word,freq = df_tweets$freq, scale = c(4,0.5),random.order=FALSE, rot.per=.15,colors= brewer.pal(8,"Dark2"),max.words = 40)
num_lines <- c(summary(blogs)[1],summary(news)[1],summary(tweets)[1])
num_words <- c(stri_stats_latex(blogs)[4],stri_stats_latex(news)[4],stri_stats_latex(tweets)[4])
data_summary <- data.frame(num_lines,num_words)
colnames(data_summary) <- c("Number of Lines","Number of words")
rownames(data_summary) <- c("Blogs","News","Tweets")
kable(data_summary, format = "markdown")
|        | Number of Lines | Number of words |
|---|---|---|
| Blogs | 899288 | 37570839 |
| News | 1010242 | 34494539 |
| Tweets | 2360148 | 30451170 |
We can observe that the most frequent (stemmed, non-stop-word) terms are "one", "said" and "just" in the blogs, news and Twitter data respectively.
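This can be read directly off the frequency tables loaded above (a small illustrative check; it only assumes `df_blogs`, `df_news` and `df_tweets` are sorted by decreasing frequency, as constructed earlier):
#top term in each source, taken from the already-sorted frequency tables
sapply(list(blogs = df_blogs, news = df_news, tweets = df_tweets),
       function(d) as.character(d$word[1]))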
Note: Although much of the R code above is shown commented out, it was run once and its results were saved to .RData files; the report simply loads those files to speed up knitting.
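An alternative to commenting and uncommenting the expensive steps is to guard each one with a `file.exists()` check so it only runs when its cache file is missing. A minimal sketch of this pattern for the blogs data (the cache file name `blogs_cache.RData` is hypothetical, not one of the files used above):
if (!file.exists("blogs_cache.RData")) {
  #expensive step: clean the corpus and build the term-frequency table, then cache it
  blogs_clean <- cleanCorpus(blogs, profanity = profanity)
  dtm_blogs <- DocumentTermMatrix(blogs_clean, control = list(bounds = list(global = c(10, Inf))))
  freq_blogs <- sort(col_sums(dtm_blogs, na.rm = TRUE), decreasing = TRUE)
  df_blogs <- data.frame(word = names(freq_blogs), freq = freq_blogs)
  save(df_blogs, file = "blogs_cache.RData")   #hypothetical cache file
} else {
  load("blogs_cache.RData")
}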