This is the exploratory analysis of the data that will be used to build our word predictor with a Shiny app for the specialisation capstone. The analysed datasets come from blog, news and Twitter feeds.
#connection to data
setwd("D:/GoogleDrive/onlinecourse/R/20180101swiftkey/Coursera-SwiftKey/final/en_US")
list.files("D:/GoogleDrive/onlinecourse/R/20180101swiftkey/Coursera-SwiftKey/final/en_US")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
# address each file
connection_twitter <- file("en_US.twitter.txt", "r")
connection_blogs <- file("en_US.blogs.txt", "r")
connection_news <- file("en_US.news.txt", "r")
# load files
blogs <- readLines(connection_blogs)
news <- readLines(connection_news)
twitter <- readLines(connection_twitter)
close(connection_twitter)
close(connection_news)
close(connection_blogs)
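Note that readLines() may warn about an incomplete final line or embedded nul characters for some of these files. A more defensive read could look like this (a sketch, assuming the files are UTF-8 encoded; not required for the analysis above):
# hedged alternative: declare the encoding and skip embedded nuls
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)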
library(stringi)
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 208361438 171926076
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15683765 13117038
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162384825 134370864
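For a side-by-side overview, the same statistics can be collected into one table; the sketch below also adds approximate word counts via stri_count_words() (stringi is already loaded):
# summary table of the three sources (word counts are approximate)
summary_stats <- data.frame(
  source = c("blogs", "news", "twitter"),
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter))))
summary_stats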
# draw a random sample of 10,000 lines from each source
sample_blogs <- sample(1:length(blogs), 10000, replace = FALSE)
sample_blogs <- blogs[sample_blogs]
sample_news <- sample(1:length(news), 10000, replace = FALSE)
sample_news <- news[sample_news]
sample_twitter <- sample(1:length(twitter), 10000, replace = FALSE)
sample_twitter <- twitter[sample_twitter]
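Because sample() draws different lines on every run, the extracts shown below will vary between knits; fixing a seed before the sampling would make the report reproducible (a sketch, not applied above):
set.seed(1234) # hypothetical seed; any fixed value makes the sampling repeatable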
head(sample_blogs, 3)
## [1] "Here's the sofa that Audrey will sweetly grace above it. The oil painting sitting on the sofa now will be hung in the room at one point, still deciding the \"where\"."
## [2] "Wow Day 11 already, the days are whipping by. I no sooner post a pic and time to do another one."
## [3] "After doing your research, you may learn that the competition truly does offer a compatible product or service at a lower investment. Or, it may be a slightly inferior product. Or, they may not offer the excellent service that you do. There's bound to be something you find that's not as good as what you offer. If you don't find one of those three things as being different from your offering, you may need to rethink what you offer to improve in at least one of those three areas."
head(sample_news, 3)
## [1] "Wolfe said he had had very little contact with the Broncos pre-draft and was quite surprised they selected him."
## [2] "CC Sabathia and Josh Beckett are among the starting pitchers who declined Team USA invitations. Houston's Roy Oswalt and John Lackey of the Angels have reportedly accepted invitations to pitch."
## [3] "The boy's classmates joined in, with one advising how to cut a main artery, according to the witnesses."
head(sample_twitter, 3)
## [1] "Taking the lunchtime egg roll to yoga."
## [2] "#ObamaBookTitles C.H.A.N.G.E ( Come help a nigga get elected )"
## [3] "I'm sending you a 💩 load of pictures right now"
To structure the data, we use the qdap library to split each line into sentences. However, because sent_detect() relies on punctuation to find sentence boundaries and Twitter users often do not punctuate consistently, the whole Twitter sample cannot be fed to the function in one go. Therefore, for Twitter we process the tweets one line at a time.
# use qdap for blogs and news
library(qdap)
data_sample <- c(sample_blogs, sample_news)
#create data sample
data_sample <- sent_detect(data_sample)
# use loop for twitter
tw <- character(0)
for (i in seq_along(sample_twitter)) {
  tw_eve <- sent_detect(sample_twitter[i])
  tw <- c(tw, tw_eve)
}
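The same result can be obtained without an explicit loop, since sent_detect() can be applied to each tweet via lapply() and the pieces flattened (a sketch with equivalent output):
# hedged alternative to the loop above
tw <- unlist(lapply(sample_twitter, sent_detect), use.names = FALSE)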
# add twitter to data sample
data_sample <- c(data_sample, tw)
# Remove non-ASCII characters
for (i in seq_along(data_sample)) {
  row <- data_sample[i]
  row_1 <- iconv(row, "latin1", "ASCII", sub = "")
  data_sample[i] <- row_1
}
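Since iconv() is vectorised over its input, the loop above can also be written as a single call (a sketch with identical output):
# hedged alternative: one vectorised call instead of the loop
data_sample <- iconv(data_sample, "latin1", "ASCII", sub = "")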
# clean text and create corpus
library(tm)
corpus_sample <- VCorpus(VectorSource(data_sample))
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))
corpus_sample <- tm_map(corpus_sample, stripWhitespace)
corpus_sample <- tm_map(corpus_sample, removePunctuation)
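Further transformations such as removeNumbers() could be appended here if digits are not wanted in the n-grams; stop words are intentionally kept, since a word predictor needs to suggest them. A sketch of the optional step (not applied above):
# optional extra cleaning (assumption: numbers add little to prediction)
corpus_sample <- tm_map(corpus_sample, removeNumbers)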
We now determine which tokens are used the most in the 1-gram, 2-gram and 3-gram models.
library(RWeka)
## Convert corpus to dataframe
dataframe_sample <- data.frame(text = unlist(sapply(corpus_sample, '[', 'content')), stringsAsFactors = F)
## Process NGram model
token_one <- NGramTokenizer(dataframe_sample, Weka_control(min = 1, max = 1))
token_two <- NGramTokenizer(dataframe_sample, Weka_control(min = 2, max = 2))
token_three <- NGramTokenizer(dataframe_sample, Weka_control(min = 3, max = 3))
word_one <- data.frame(table(token_one))
word_two <- data.frame(table(token_two))
word_three <- data.frame(table(token_three))
word_one_order <- word_one[order(word_one$Freq, decreasing = TRUE), ]
word_two_order <- word_two[order(word_two$Freq, decreasing = TRUE), ]
word_three_order <- word_three[order(word_three$Freq, decreasing = TRUE), ]
names(word_one_order) <- c("Token", "Freq")
names(word_two_order) <- c("Token", "Freq")
names(word_three_order) <- c("Token", "Freq")
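These frequency tables are the raw material for the eventual predictor: given the preceding word(s), the most frequent completion can be looked up. A minimal sketch using the 2-gram table, assuming the tokens are space-separated; the helper predict_next() is hypothetical and not part of the final app:
# hypothetical helper: most frequent word following 'prev' in the 2-gram table
predict_next <- function(prev, bigrams = word_two_order) {
  tokens <- as.character(bigrams$Token)              # already sorted by decreasing Freq
  hits <- tokens[startsWith(tokens, paste0(prev, " "))]
  if (length(hits) == 0) return(NA_character_)
  substring(hits[1], nchar(prev) + 2)                # drop the "prev " prefix
}
predict_next("of")  # with this sample, most likely "the"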
# load libraries for visual
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(grid)
par(mfrow = c(1, 1))
word_one_order$Token <- factor(word_one_order$Token, levels = unique(as.character(word_one_order$Token)))
ggplot(word_one_order[1:11, ], aes(Token, Freq, fill = Token)) +
  geom_bar(stat = "identity", width = .8) +
  geom_text(aes(label = word_one_order[1:11, ]$Freq), vjust = -0.2, size = 3) +
  scale_fill_manual(values = brewer.pal(11, "BrBG")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(plot.margin = unit(c(0.3, 0, 0, 0), "cm")) +
  labs(title = "Word Frequency for 1-gram Model", x = "1-gram tokens", y = "Frequency")
wordcloud(word_one_order[1:11, ]$Token, word_one_order[1:11, ]$Freq, scale = c(5,1), rot.per = .3, colors = brewer.pal(11, "BrBG"), ordered.colors = TRUE, random.order = TRUE)
word_two_order$Token <- factor(word_two_order$Token, levels = unique(as.character(word_two_order$Token)))
ggplot(word_two_order[1:11, ], aes(Token, Freq, fill = Token)) +
  geom_bar(stat = "identity", width = .8) +
  geom_text(aes(label = word_two_order[1:11, ]$Freq), vjust = -0.2, size = 3) +
  scale_fill_manual(values = brewer.pal(11, "PiYG")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(plot.margin = unit(c(0.3, 0, 0, 0), "cm")) +
  labs(title = "Word Frequency for 2-gram Model", x = "2-gram tokens", y = "Frequency")
wordcloud(word_two_order[1:11, ]$Token, word_two_order[1:11, ]$Freq, scale = c(4,1), rot.per = .3, colors = brewer.pal(11, "PiYG"), ordered.colors = TRUE, random.order = TRUE)
word_three_order$Token <- factor(word_three_order$Token, levels = unique(as.character(word_three_order$Token)))
ggplot(word_three_order[1:11, ], aes(Token, Freq, fill = Token)) +
  geom_bar(stat = "identity", width = .8) +
  geom_text(aes(label = word_three_order[1:11, ]$Freq), vjust = -0.2, size = 3) +
  scale_fill_manual(values = brewer.pal(11, "RdYlGn")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(plot.margin = unit(c(0.3, 0, 0, 0), "cm")) +
  labs(title = "Word Frequency for 3-gram Model", x = "3-gram tokens", y = "Frequency")
wordcloud(word_three_order[1:11, ]$Token, word_three_order[1:11, ]$Freq, scale = c(3,1), rot.per = .3, colors = brewer.pal(11, "RdYlGn"), ordered.colors = TRUE, random.order = TRUE)