Introduction

This is the exploratory analysis of the datasets that will be used to build our word predictor, to be delivered as a Shiny app for the specialisation capstone. The analysed datasets come from blog, news and Twitter feeds.

Load the data

# connect to the data
setwd("D:/GoogleDrive/onlinecourse/R/20180101swiftkey/Coursera-SwiftKey/final/en_US")
list.files("D:/GoogleDrive/onlinecourse/R/20180101swiftkey/Coursera-SwiftKey/final/en_US")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"
# open a connection to each file
connection_twitter <- file("en_US.twitter.txt", "r")
connection_blogs <- file("en_US.blogs.txt", "r")
connection_news <- file("en_US.news.txt", "r")

# load the files
blogs <- readLines(connection_blogs)
news <- readLines(connection_news)
twitter <- readLines(connection_twitter)
# close each connection (close() only closes its first argument)
close(connection_blogs)
close(connection_news)
close(connection_twitter)
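readLines() may warn about embedded nul characters or an incomplete final line, especially for the Twitter file. If that happens, the files can instead be read directly with explicit options; this is a minimal sketch assuming the same working directory and file names as above.

# read the raw files with UTF-8 encoding, skipping embedded nuls
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)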

Basic Data Summary

library(stringi)
stri_stats_general(blogs)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   208361438   171926076
stri_stats_general(news)
##       Lines LinesNEmpty       Chars CharsNWhite 
##       77259       77259    15683765    13117038
stri_stats_general(twitter)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162384825   134370864
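Beyond line and character counts, approximate word counts give a better sense of the size of each dataset. A quick sketch using stringi's stri_count_words() on the blogs, news and twitter vectors loaded above:

# approximate total number of words in each dataset
sum(stri_count_words(blogs))
sum(stri_count_words(news))
sum(stri_count_words(twitter))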

Create Data Samples of 10,000 Lines Each

set.seed(1234)  # fix the random seed so the samples are reproducible
sample_blogs <- sample(blogs, 10000, replace = FALSE)
sample_news <- sample(news, 10000, replace = FALSE)
sample_twitter <- sample(twitter, 10000, replace = FALSE)

Format Data

Check how the data looks

head(sample_blogs, 3)
## [1] "Here's the sofa that Audrey will sweetly grace above it. The oil painting sitting on the sofa now will be hung in the room at one point, still deciding the \"where\"."                                                                                                                                                                                                                                                                                                                                  
## [2] "Wow Day 11 already, the days are whipping by. I no sooner post a pic and time to do another one."                                                                                                                                                                                                                                                                                                                                                                                                        
## [3] "After doing your research, you may learn that the competition truly does offer a compatible product or service at a lower investment. Or, it may be a slightly inferior product. Or, they may not offer the excellent service that you do. Thereâ<U+0080><U+0099>s bound to be something you find thatâ<U+0080><U+0099>s not as good as what you offer. If you donâ<U+0080><U+0099>t find one of those three things as being different from your offering, you may need to rethink what you offer to improve in at least one of those three areas."
head(sample_news, 3)
## [1] "Wolfe said he had had very little contact with the Broncos pre-draft and was quite surprised they selected him."                                                                                  
## [2] "CC Sabathia and Josh Beckett are among the starting pitchers who declined Team USA invitations. Houston's Roy Oswalt and John Lackey of the Angels have reportedly accepted invitations to pitch."
## [3] "The boy's classmates joined in, with one advising how to cut a main artery, according to the witnesses."
head(sample_twitter, 3)
## [1] "Taking the lunchtime egg roll to yoga."                        
## [2] "#ObamaBookTitles C.H.A.N.G.E ( Come help a nigga get elected )"
## [3] "I'm sending you a ð<U+009F><U+0092>© load of pictures right now"

Structure the data

To structure the data, we use the qdap library to split every line into individual sentences. However, because sent_detect relies on sentence-ending punctuation and Twitter users often omit or misuse it, the function cannot be applied to the whole Twitter vector at once. For the Twitter sample we therefore process the text line by line.

# use qdap for blogs and news
library(qdap)
# create the combined data sample
data_sample <- c(sample_blogs, sample_news)
# split each line into sentences
data_sample <- sent_detect(data_sample)
# process twitter line by line
tw <- character(0)
for (i in seq_along(sample_twitter)) {
          tw_eve <- sent_detect(sample_twitter[i])
          tw <- c(tw, tw_eve)
}
# add twitter to the data sample
data_sample <- c(data_sample, tw)
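Growing the tw vector inside a for loop becomes slow for larger samples. An equivalent vectorised version, using lapply() with the same sent_detect call on the sample_twitter vector, would be:

# apply sent_detect to each tweet and flatten the result into one vector
tw <- unlist(lapply(sample_twitter, sent_detect), use.names = FALSE)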

Clean data and create Corpus

# remove non-ASCII characters (iconv is vectorised over the whole sample)
data_sample <- iconv(data_sample, "latin1", "ASCII", sub = "")
# clean text and create corpus
library(tm)
corpus_sample <- VCorpus(VectorSource(data_sample))
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))       
corpus_sample <- tm_map(corpus_sample, stripWhitespace)       
corpus_sample <- tm_map(corpus_sample, removePunctuation)  
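Depending on the goals of the predictor, further cleaning steps could be applied with tm. Two optional transformations are sketched below, commented out so that the pipeline above is unchanged: removing digits is usually harmless, whereas removing stopwords would hurt a next-word predictor (stopwords are exactly the words users type most often), so it is shown only for completeness.

# optional: drop digits from the corpus
# corpus_sample <- tm_map(corpus_sample, removeNumbers)
# optional (not recommended for a word predictor): drop English stopwords
# corpus_sample <- tm_map(corpus_sample, removeWords, stopwords("en"))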

Exploratory Analysis

We now determine which words and word combinations occur most frequently, using 1-gram, 2-gram and 3-gram models.

library(RWeka)
# convert the corpus back to a data frame
dataframe_sample <- data.frame(text = unlist(sapply(corpus_sample, '[', 'content')), stringsAsFactors = FALSE)

# build the n-gram tokens from the text column
token_one <- NGramTokenizer(dataframe_sample$text, Weka_control(min = 1, max = 1))
token_two <- NGramTokenizer(dataframe_sample$text, Weka_control(min = 2, max = 2))
token_three <- NGramTokenizer(dataframe_sample$text, Weka_control(min = 3, max = 3))

word_one <- data.frame(table(token_one))
word_two <- data.frame(table(token_two))
word_three <- data.frame(table(token_three))

word_one_order <- word_one[order(word_one$Freq, decreasing = TRUE), ]
word_two_order <- word_two[order(word_two$Freq, decreasing = TRUE), ]
word_three_order <- word_three[order(word_three$Freq, decreasing = TRUE), ]

names(word_one_order) <- c("Token", "Freq")
names(word_two_order) <- c("Token", "Freq")
names(word_three_order) <- c("Token", "Freq")

# load the libraries used for the visualisations
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(grid)

1-gram

par(mfrow = c(1, 1))
word_one_order$Token <- factor(word_one_order$Token, levels = unique(as.character(word_one_order$Token)))
ggplot(word_one_order[1:11, ], aes(Token, Freq, fill = Token)) +
          geom_bar(stat = "identity", width = .8) +
          geom_text(aes(label = Freq), vjust = -0.2, size = 3) +
          scale_fill_manual(values = brewer.pal(11, "BrBG")) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9)) +
          theme(axis.text.y = element_text(size = 7)) +
          theme(plot.margin = unit(c(0.3, 0, 0, 0), "cm")) +
          labs(title = "Word Frequency for 1-gram Model", x = "1-gram tokens", y = "Frequency")

wordcloud(word_one_order[1:11, ]$Token, word_one_order[1:11, ]$Freq, scale = c(5,1), rot.per = .3, colors = brewer.pal(11, "BrBG"), ordered.colors = TRUE, random.order = TRUE)

2-gram

word_two_order$Token <- factor(word_two_order$Token, levels = unique(as.character(word_two_order$Token)))
ggplot(word_two_order[1:11, ], aes(Token, Freq, fill = Token)) +
          geom_bar(stat = "identity", width = .8) +
          geom_text(aes(label = Freq), vjust = -0.2, size = 3) +
          scale_fill_manual(values = brewer.pal(11, "PiYG")) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9)) +
          theme(axis.text.y = element_text(size = 7)) +
          theme(plot.margin = unit(c(0.3, 0, 0, 0), "cm")) +
          labs(title = "Word Frequency for 2-gram Model", x = "2-gram tokens", y = "Frequency")

wordcloud(word_two_order[1:11, ]$Token, word_two_order[1:11, ]$Freq, scale = c(4,1), rot.per = .3, colors = brewer.pal(11, "PiYG"), ordered.colors = TRUE, random.order = TRUE)

3-gram

word_three_order$Token <- factor(word_three_order$Token, levels = unique(as.character(word_three_order$Token)))
ggplot(word_three_order[1:11, ], aes(Token, Freq, fill = Token)) +
          geom_bar(stat = "identity", width = .8) +
          geom_text(aes(label = Freq), vjust = -0.2, size = 3) +
          scale_fill_manual(values = brewer.pal(11, "RdYlGn")) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9)) +
          theme(axis.text.y = element_text(size = 7)) +
          theme(plot.margin = unit(c(0.3, 0, 0, 0), "cm")) +
          labs(title = "Word Frequency for 3-gram Model", x = "3-gram tokens", y = "Frequency")

wordcloud(word_three_order[1:11, ]$Token, word_three_order[1:11, ]$Freq, scale = c(3,1), rot.per = .3, colors = brewer.pal(11, "RdYlGn"), ordered.colors = TRUE, random.order = TRUE)
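As an additional exploratory check, the 1-gram frequency table can be used to estimate how many unique words are needed to cover a given share of all word occurrences in the sample; a minimal sketch using the word_one_order table built above.

# cumulative share of word occurrences covered by the most frequent tokens
coverage <- cumsum(word_one_order$Freq) / sum(word_one_order$Freq)
# number of unique words needed for 50% and 90% coverage
min(which(coverage >= 0.5))
min(which(coverage >= 0.9))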