Introduction

This report describes the steps taken towards building an application capable of predicting the next word in a text.
The raw data, provided by SwiftKey and consisting of three large text files taken from blogs, news and Twitter, were downloaded and stored in the local working directory. The code performing the calculations is collected in the Appendix so that the results are easier to read.

Getting the Data and Basic Analysis

##            Filename       Size  Length
## 1    en_US.news.txt 196.28  MB 1010242
## 2 en_US.twitter.txt 159.36  MB 2360148
## 3   en_US.blogs.txt 200.42  MB  899288

Because the files are too large to process in full, a 1% sample of each is drawn and used for the rest of the analysis.

Cleaning Data

The data cleaning process consisted of:
- Removing non-alphabetic characters, which are not needed for our goals
- Removing punctuation and extra white space
- Removing profanity
- Removing English stopwords.

Exploratory Data Analysis

This section examines some properties of the sampled texts, such as the number of words, the most frequent words, and how words combine with each other into n-grams (for example, the phrase "of the day" contains the 2-grams "of the" and "the day").

Number of sampled lines and distinct words detected

##    Corpus Words Lines
## 1    News 30599 10102
## 2   Blogs 29685  8992
## 3 Twitter 25850 23601

Most frequent words in news

Most frequent words in blogs

Most frequent words in Twitter

2-grams and 3-grams from news

2-grams and 3-grams from blogs

2-grams and 3-grams from Twitter

Next Steps

The next steps will deal with building the prediction algorithm; most of the effort will probably go into minimizing the resources used and the waiting time for the user. A minimal sketch of a possible n-gram lookup is included at the end of the Appendix.

Appendix

# Packages used throughout this appendix
library(utf8)                 # utf8_encode
library(stringr)              # str_replace_all
library(tm)                   # VCorpus, tm_map, removeWords, stopwords, DocumentTermMatrix
library(quanteda)             # corpus, tokens, tokens_ngrams, dfm
library(quanteda.textstats)   # textstat_frequency
library(ggplot2)

# Read the three raw files
news <- readLines(con <- file("en_US/en_US.news.txt"), skipNul = TRUE)
close(con)
twitter <- readLines(con <- file("en_US/en_US.twitter.txt"), skipNul = TRUE)
close(con)
blogs <- readLines(con <- file("en_US/en_US.blogs.txt"), skipNul = TRUE)
close(con)
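
# File sizes (MB) and line counts of the three raw files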
filenames <- c("en_US/en_US.news.txt", "en_US/en_US.twitter.txt", "en_US/en_US.blogs.txt")
sz <- paste(round(file.info(filenames)$size/(1024^2),2), " MB")
le <- c(length(news), length(twitter), length(blogs))
df <- data.frame(Filename =c("en_US.news.txt","en_US.twitter.txt","en_US.blogs.txt"),
                Size =sz, Length = le)
print(df)

# Reproducible 1% sample of each corpus
set.seed(7373)
SampleNews <- sample(news, 0.01*length(news), replace = FALSE)
SampleBlogs <- sample(blogs, 0.01*length(blogs), replace = FALSE)
SampleTwitter <- sample(twitter, 0.01*length(twitter), replace = FALSE)

# Preliminary cleaning: keep only spaces and ASCII letters (A-Z, a-z)
preClean <- function(txt){
  cl <- utf8_encode(txt)
  preCleaned <- str_replace_all(cl, "[^\x20\x41-\x5a\x61-\x7a]", "")
  return(preCleaned)
}
# Deleting profanity words
delProf <- function(txt) {
  prof <- readLines(con <- file("badwords.txt"))
  close(con)
  profcleaned <- removeWords(txt, prof)
  return(profcleaned)
}
# Removing punctuation, numbers and stopwords, and collapsing white space

cleanedText <- function(txt) {
  sampled <- VCorpus(VectorSource(txt))
  sampled <- tm_map(sampled, content_transformer(tolower))
  sampled <- tm_map(sampled, removePunctuation)
  sampled <- tm_map(sampled, content_transformer(removeNumbers))
  sampled <- tm_map(sampled, removeWords, stopwords("english"))
  final <- tm_map(sampled, content_transformer(stripWhitespace))
  return(final)
}

# Apply pre-cleaning and profanity filtering to each sample
preCleanedNews <- preClean(SampleNews)
preCleanedBlogs <- preClean(SampleBlogs)
preCleanedTwitter <- preClean(SampleTwitter)
noProfSampleNews <- delProf(preCleanedNews)
noProfSampleBlogs <- delProf(preCleanedBlogs)
noProfSampleTwitter <- delProf(preCleanedTwitter)

cleanedNews <- cleanedText(noProfSampleNews)
cleanedBlogs <- cleanedText(noProfSampleBlogs)
cleanedTwitter <- cleanedText(noProfSampleTwitter)
# EDA: document-term matrices and counts per corpus
dtmNews <- DocumentTermMatrix(cleanedNews)
dtmBlogs <- DocumentTermMatrix(cleanedBlogs)
dtmTwitter <- DocumentTermMatrix(cleanedTwitter)

# Words = number of distinct terms in each DTM; Lines = number of sampled lines
df2 <- data.frame(
  Corpus = c("News", "Blogs", "Twitter"),
  Words = c(ncol(dtmNews), ncol(dtmBlogs), ncol(dtmTwitter)),
  Lines = c(nrow(dtmNews), nrow(dtmBlogs), nrow(dtmTwitter)))
print(df2)

# Top-10 word frequencies for the news sample
corpnews <- corpus(cleanedNews)
tkNews <- tokens(corpnews, remove_punct = TRUE, remove_symbols = TRUE)
dfmNews <- dfm(tkNews)
nstatFreq <- textstat_frequency(dfmNews, n = 50)
nstatFreq <- nstatFreq[1:10,]

ggplot(nstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency") 
# Top-10 word frequencies for the blogs sample
corpblogs <- corpus(cleanedBlogs)
tkBlogs <- tokens(corpblogs, remove_punct = TRUE, remove_symbols = TRUE)
dfmBlogs <- dfm(tkBlogs)
#topfeatures(dfmBlogs, 30)
bstatFreq <- textstat_frequency(dfmBlogs, n = 50)
bstatFreq <- bstatFreq[1:10,]
ggplot(bstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 word frequencies for the Twitter sample
corptwitter <- corpus(cleanedTwitter)
tkTwitter <- tokens(corptwitter, remove_punct = TRUE, remove_symbols = TRUE)
dfmTwitter <- dfm(tkTwitter)
#topfeatures(dfmTwitter, 30)
tstatFreq <- textstat_frequency(dfmTwitter, n = 50)
tstatFreq <- tstatFreq[1:10,]
ggplot(tstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 bigrams for the news sample
digramNews <- tokens_ngrams(tkNews, n = 2)
dfmDigramNews <- dfm(digramNews)
ndigramstatFreq <- textstat_frequency(dfmDigramNews, n = 50)
ndigramstatFreq <- ndigramstatFreq[1:10,]
ggplot(ndigramstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 trigrams for the news sample
trigramNews <- tokens_ngrams(tkNews, n = 3)
dfmTrigramNews <- dfm(trigramNews)
ntrigramstatFreq <- textstat_frequency(dfmTrigramNews, n = 50)
ntrigramstatFreq <- ntrigramstatFreq[1:10,]
ggplot(ntrigramstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 bigrams for the blogs sample
digramBlogs <- tokens_ngrams(tkBlogs, n = 2)
dfmDigramBlogs <- dfm(digramBlogs)
bdigramstatFreq <- textstat_frequency(dfmDigramBlogs, n = 50)
bdigramstatFreq <- bdigramstatFreq[1:10,]
ggplot(bdigramstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 trigrams for the blogs sample
trigramBlogs <- tokens_ngrams(tkBlogs, n = 3)
dfmTrigramBlogs <- dfm(trigramBlogs)
btrigramstatFreq <- textstat_frequency(dfmTrigramBlogs, n = 50)
btrigramstatFreq <- btrigramstatFreq[1:10,]
ggplot(btrigramstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 bigrams for the Twitter sample
digramTwitter <- tokens_ngrams(tkTwitter, n = 2)
dfmDigramTwitter <- dfm(digramTwitter)
tdigramstatFreq <- textstat_frequency(dfmDigramTwitter, n = 50)
tdigramstatFreq <- tdigramstatFreq[1:10,]
ggplot(tdigramstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
# Top-10 trigrams for the Twitter sample
trigramTwitter <- tokens_ngrams(tkTwitter, n = 3)
dfmTrigramTwitter <- dfm(trigramTwitter)
ttrigramstatFreq <- textstat_frequency(dfmTrigramTwitter, n = 50)
ttrigramstatFreq <- ttrigramstatFreq[1:10,]
ggplot(ttrigramstatFreq, aes(x = reorder(feature, frequency), y = frequency, fill = feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Frequency")
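
As a rough illustration of the direction sketched in Next Steps, the code below is a minimal, hypothetical next-word lookup built on the same kind of quanteda n-gram frequency tables used above, with a simple back-off from trigrams to bigrams to the most frequent words. The names buildNgramTable, predictNextWord and the training vector passed to them are illustrative and not part of the analysis above; a real implementation would also need pruning of rare n-grams and proper smoothing to keep memory usage and response times low.

# Sketch of a back-off next-word lookup (assumption: the input is a character
# vector of cleaned training sentences, e.g. one of the samples above)
buildNgramTable <- function(lines, n) {
  tk <- tokens_tolower(tokens(lines, remove_punct = TRUE, remove_symbols = TRUE))
  ng <- tokens_ngrams(tk, n = n, concatenator = " ")
  textstat_frequency(dfm(ng))   # data frame sorted by decreasing frequency
}

predictNextWord <- function(phrase, uni, bi, tri, k = 3) {
  w <- tolower(unlist(strsplit(trimws(phrase), "\\s+")))
  lastWord <- function(x) sub(".* ", "", x)
  # 1. try the trigram table, keyed on the last two words of the phrase
  if (length(w) >= 2) {
    key <- paste(tail(w, 2), collapse = " ")
    hits <- tri[startsWith(tri$feature, paste0(key, " ")), ]
    if (nrow(hits) > 0) return(lastWord(head(hits$feature, k)))
  }
  # 2. back off to the bigram table, keyed on the last word only
  if (length(w) >= 1) {
    key <- tail(w, 1)
    hits <- bi[startsWith(bi$feature, paste0(key, " ")), ]
    if (nrow(hits) > 0) return(lastWord(head(hits$feature, k)))
  }
  # 3. otherwise return the overall most frequent words
  head(uni$feature, k)
}

# Illustrative usage with the news sample (before stopword removal) as training data:
# uni <- buildNgramTable(noProfSampleNews, 1)
# bi  <- buildNgramTable(noProfSampleNews, 2)
# tri <- buildNgramTable(noProfSampleNews, 3)
# predictNextWord("thanks for the", uni, bi, tri)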