library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)

Overview

The Capstone Milestone Week 2 Report presents an initial exploratory analysis of the dataset and examines how it can be used to develop a predictive text app. The primary objective is to identify key features of the data that will support an effective prediction algorithm. To achieve this, the following steps will be completed: confirming that the data were downloaded and loaded successfully, generating a basic summary of statistics across the datasets, highlighting notable findings so far, and outlining the proposed approach for building the prediction algorithm and Shiny app so that feedback can be gathered.

Read the data

twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)

## The news file contains special characters, so it is safer to read it in binary mode
bin.news <- file("en_US.news.txt", open="rb")
news <- readLines(bin.news, encoding="UTF-8")
close(bin.news)
rm(bin.news)

Basic report of summary statistics about the data
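
The raw counts are not reproduced here, but a minimal sketch of how such a summary could be computed is shown below (stri_count_words from the stringi package is an assumption; any word counter would do):

library(stringi)

## Line, word, and character counts for each dataset
data.frame(
  file  = c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt"),
  lines = c(length(twitter), length(blogs), length(news)),
  words = c(sum(stri_count_words(twitter)),
            sum(stri_count_words(blogs)),
            sum(stri_count_words(news))),
  chars = c(sum(nchar(twitter)), sum(nchar(blogs)), sum(nchar(news)))
)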

Sample Data

The datasets are quite large and would take significant time to process and analyze in full. For this preliminary analysis, a random sample of 1,000 lines is taken from each of the Blogs, News, and Twitter datasets.

set.seed(5454568)
sampleTwitter <- twitter[sample(1:length(twitter),1000)]
sampleBlogs <- blogs[sample(1:length(blogs),1000)]
sampleNews <- news[sample(1:length(news),1000)]

## Combine data samples
sampleData <- c(sampleTwitter,sampleBlogs,sampleNews)

## Save sample data and remove data not needed to free memory
writeLines(sampleData, "sampleData.txt")
rm(twitter,news,blogs,sampleTwitter,sampleNews,sampleBlogs)

Read Sample Data & Clean Data

## Read Sample

sampleData <- readLines("sampleData.txt", encoding="UTF-8")
corpus <- VCorpus(VectorSource(sampleData))

## Convert to lowercase and remove punctuation, numbers, extra whitespace, and stopwords, then stem
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "\"|/|@|\\|")
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english")) ## remove stopwords before stemming so they still match
corpus <- tm_map(corpus, stemDocument)                      ## stemming requires the SnowballC package
corpus <- tm_map(corpus, PlainTextDocument)
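
To verify that the transformations behaved as expected, it helps to inspect a cleaned document (a quick sanity check, not part of the original report):

## Peek at the first cleaned document
as.character(corpus[[1]])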

Summary and overview of the data

#summary(corpus)
wordcloud(corpus, min.freq=5, max.words=101, random.order=TRUE,
          rot.per=0.5, colors=brewer.pal(8, "Set2"), use.r.layout=FALSE)
## Warning in wordcloud(corpus, min.freq = 5, max.words = 101, random.order =
## TRUE, : day could not be fit on page. It will not be plotted.
## Warning in wordcloud(corpus, min.freq = 5, max.words = 101, random.order =
## TRUE, : one could not be fit on page. It will not be plotted.
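
The "could not be fit on page" warnings can usually be avoided by shrinking the largest font size via wordcloud's scale argument (the default is scale = c(4, 0.5)); for example, passing scale = c(3, 0.5) to the call above.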

Interesting findings

corpus.dataframe <- data.frame(text = unlist(sapply(corpus, '[', 'content')), stringsAsFactors = FALSE)

## Tokenize the cleaned text into 1-, 2- and 3-grams and tabulate their frequencies
uniGramToken <- data.frame(table(NGramTokenizer(corpus.dataframe$text, Weka_control(min = 1, max = 1))))
biGramToken  <- data.frame(table(NGramTokenizer(corpus.dataframe$text, Weka_control(min = 2, max = 2))))
triGramToken <- data.frame(table(NGramTokenizer(corpus.dataframe$text, Weka_control(min = 3, max = 3))))

#order by decreasing frequency
unigram <- uniGramToken[order(uniGramToken$Freq, decreasing = TRUE),]
bigram  <- biGramToken[order(biGramToken$Freq, decreasing = TRUE),]
trigram <- triGramToken[order(triGramToken$Freq, decreasing = TRUE),]
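
A quick look at the top entries confirms the tables are sensible (illustrative; the exact rows depend on the random sample):

## Top five entries of each n-gram table
head(unigram, 5)
head(bigram, 5)
head(trigram, 5)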

Words most commonly used

par(mfrow = c(1, 1))
par(mar=c(5,4,2,0))
barplot(unigram[1:30,2], 
        names.arg=unigram[1:30,1], 
        col = "gold", 
        main="Most commonly used Words (Top 30)", 
        las=2, 
        ylab = "Frequency")

Two-word combinations most commonly used

par(mar=c(8.5,4,2,1))
barplot(bigram[1:30,2], 
        names.arg=bigram[1:30,1], 
        col = "maroon", 
        main="Most commonly used two word combinations (Top 30)", 
        las=2, 
        ylab = "Frequency")

Three-word combinations most commonly used

par(mar=c(8.5,4,2,1))
barplot(trigram[1:30,2], 
        names.arg=trigram[1:30,1], 
        col = "orange", 
        main="Most commonly used three word combinations (Top 30)", 
        las=2, 
        ylab = "Frequency")

Perspectives for creating a prediction algorithm and Shiny app

Based on the analysis, I plan to use the n-gram frequency tables to estimate the probability of the next word given the preceding words. For the Shiny app, the goal is a straightforward interface where the user enters a text string and the prediction model returns a list of suggested next words.
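
As a rough sketch of that idea, the hypothetical predictNext helper below does a naive lookup in the ordered n-gram tables built above, backing off from trigrams to bigrams, with no smoothing or backoff weighting:

## Naive next-word lookup with trigram-to-bigram backoff (no smoothing)
predictNext <- function(phrase, bigram, trigram, n = 5) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  last2 <- paste(tail(words, 2), collapse = " ")
  ## Trigrams whose first two words match the end of the phrase
  hits <- trigram[grepl(paste0("^", last2, " "), trigram$Var1), ]
  if (nrow(hits) == 0) {
    ## Back off to bigrams starting with the last word alone
    hits <- bigram[grepl(paste0("^", tail(words, 1), " "), bigram$Var1), ]
  }
  ## Tables are already sorted by frequency, so take the final word of the top matches
  head(sub("^.* ", "", as.character(hits$Var1)), n)
}

## Illustrative call; results depend on the sampled, cleaned text
predictNext("new york", bigram, trigram)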