Load the sample Twitter file
twitter <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", open = "r")
tw <- readLines(twitter)
## Warning in readLines(twitter): line 167155 appears to contain an embedded
## nul
## Warning in readLines(twitter): line 268547 appears to contain an embedded
## nul
## Warning in readLines(twitter): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(twitter): line 1759032 appears to contain an embedded
## nul
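The warnings come from stray nul bytes embedded in the raw file; they are harmless here, but as a side note readLines() accepts a skipNul argument that drops them silently:
# Alternative read that silences the embedded-nul warnings
tw <- readLines(twitter, skipNul = TRUE)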
close(twitter)   # done with the connection
tws <- tw[1:200] # work with the first 200 tweets
twstok <- strsplit(gsub("[.,?:;&!()-]", "", tolower(tws)), " ")  # lowercase, strip punctuation, split on spaces
twstoku <- unlist(twstok)
Plot showing word frequencies
twsfreq <- table(twstoku)
twsfreq1 <- sort(twsfreq, decreasing=TRUE)
plot(head(twsfreq1, 20))
2-gram plot
library(NLP)  # ngrams() comes from the NLP package
two_gram <- ngrams(twstoku, 2L)
tgu <- sapply(two_gram, paste, collapse = " ")  # collapse each pair into "w1 w2"
two_gram_freq <- table(tgu)
two_gram_freq1 <- sort(two_gram_freq, decreasing = TRUE)
plot(head(two_gram_freq1, 20))
3-gram plot
three_gram <- ngrams(twstoku, 3L)
tgu <- sapply(three_gram, paste, collapse = " ")  # collapse each triple into "w1 w2 w3"
three_gram_freq <- table(tgu)
three_gram_freq1 <- sort(three_gram_freq, decreasing = TRUE)
plot(head(three_gram_freq1, 20))
To cover 50% of all word instances
i <- 1  # word rank
s <- 0  # cumulative frequency
while (s <= length(twstoku) * 0.5) {
  s <- s + twsfreq1[i]
  i <- i + 1
}
print(paste(s, i))  # cumulative count, and one past the last rank used
## [1] "1183 108"
To cover 90% of all word instances
i <- 1
s <- 0
while (s <= length(twstoku) * 0.9) {
  s <- s + twsfreq1[i]
  i <- i + 1
}
print(paste(s, i))
## [1] "2127 846"
To look for foreign-language words, I start with the least frequent tokens
twsfreq2 <- sort(twsfreq, decreasing=FALSE)
head(twsfreq2)
## twstoku
## 'cause "as "collaboration "d" "high
## 1 1 1 1 1
## "his
## 1
Using the alphabet of the language and finding the tokens that contain none of those letters
ml <- gsub("[a-z]", "", twstoku)  # strip every English letter from each token
ml[nchar(ml) > 0]                 # show what remains of tokens that were not pure letters
## [1] "'" "'" "'" "'" "5" "'" "'"
## [8] "👦" "$163" "#" "“" "\"" "\"" "”"
## [15] "#" "#" "'" "4" "'" "♥" "'"
## [22] "<3" "99%" "#" "\"" "\"" "~" "99"
## [29] "'" "#" "\"" "\"" "40" "'" "23"
## [36] "'" "#" "4" "/" "$99" "$" "/"
## [43] "\"" "\"" "'" "'" "\"" "\"" "'"
## [50] "'" "'" "'" "'" ">>" "#" ">>>"
## [57] "#" "'" "#8" "88" "45347" "#" "#"
## [64] "43" "<3" "'" "#" ">>>>>>" "'" "'"
## [71] "«" "»" "'" "#" "#" "''" "'"
## [78] "\"" "\"" "\"" "\"" "****" "#" "#"
## [85] "'" "'" "'" "#" "//" "'" "#"
## [92] "#" "'" "/" "/" "'" "/" "4"
## [99] "\"\"" "\"\"" "'" "'" "“" "”" "'"
## [106] "'" "#" "**" "$60" "\"" "\"" "20"
## [113] "'" "'" "#" "5000" "#9" "'" "'"
## [120] "30" "\"" "\"" "10" "#" "2" "2"
## [127] "’" "“" "”" "/" "5" "10" "'"
## [134] "2" "'" "'" "118" "#13" "'" "#"
## [141] "2" "/" "'" "'" "'" "<3" "+"
## [148] "$200+" "'" "\"" "\"" "'" "\"" "\""
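A complementary check (a sketch): list the full tokens that contain any non-ASCII character, rather than just the residue left after stripping English letters:
# Full tokens containing at least one non-ASCII character
grep("[^\x01-\x7F]", twstoku, value = TRUE)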
library(tm)
library(NLP)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
library(stringi)
library(manipulate)
library(openNLP)
## Warning: package 'openNLP' was built under R version 3.4.2
library(RColorBrewer)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2
blogs <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "r")
news <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.news.txt", "r")
tweets <- file("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "r")
blogdata <- readLines(blogs)
newsdata <- readLines(news)
## Warning in readLines(news): incomplete final line found on '../Week1/
## Coursera-SwiftKey/final/en_US/en_US.news.txt'
tweetdata <- readLines(tweets)
## Warning in readLines(tweets): line 167155 appears to contain an embedded
## nul
## Warning in readLines(tweets): line 268547 appears to contain an embedded
## nul
## Warning in readLines(tweets): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(tweets): line 1759032 appears to contain an embedded
## nul
data.Summary <- data.frame(
  Dataset  = c("Blogs", "News", "Tweets"),
  Filesize = c(file.size("../Week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"),
               file.size("../Week1/Coursera-SwiftKey/final/en_US/en_US.news.txt"),
               file.size("../Week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")),
  Lines    = c(length(blogdata), length(newsdata), length(tweetdata)),
  Words    = c(sum(sapply(strsplit(blogdata, " "), length)),
               sum(sapply(strsplit(newsdata, " "), length)),
               sum(sapply(strsplit(tweetdata, " "), length)))
)
close(blogs)
close(news)
close(tweets)
print(data.Summary)
## Dataset Filesize Lines Words
## 1 Blogs 210160014 899288 37334131
## 2 News 205811889 77259 2643969
## 3 Tweets 167105338 2360148 30373543
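As a cross-check on the whitespace-split word counts (a sketch using the already loaded stringi package, whose word-boundary rules will give slightly different totals):
# Word counts using proper word-boundary detection, for comparison
sum(stringi::stri_count_words(blogdata))
sum(stringi::stri_count_words(newsdata))
sum(stringi::stri_count_words(tweetdata))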
set.seed(1234)
# Draw a reproducible 10% sample from each source to keep the corpus manageable
blogs.Sample <- sample(blogdata, length(blogdata) * 0.1, replace = FALSE)
news.Sample <- sample(newsdata, length(newsdata) * 0.1, replace = FALSE)
tweets.Sample <- sample(tweetdata, length(tweetdata) * 0.1, replace = FALSE)
sample.Corpus <- c(blogs.Sample, news.Sample, tweets.Sample)
sample.Corpus <- VCorpus(VectorSource(sample.Corpus))
sample.Corpus <- tm_map(sample.Corpus, removeNumbers)
sample.Corpus <- tm_map(sample.Corpus, removePunctuation)
sample.Corpus <- tm_map(sample.Corpus, stripWhitespace)
sample.Corpus <- tm_map(sample.Corpus, content_transformer(tolower))
sample.Corpus <- tm_map(sample.Corpus, removeWords, stopwords("english"))
sample.Corpus <- tm_map(sample.Corpus, PlainTextDocument)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
unigram.tdm <- TermDocumentMatrix(sample.Corpus, control = list(tokenize = unigram))
bigram.tdm <- TermDocumentMatrix(sample.Corpus, control = list(tokenize = bigram))
unigram.tdm.temp <- removeSparseTerms(unigram.tdm, sparse = 0.99)
bigram.tdm.temp <- removeSparseTerms(bigram.tdm, sparse = 0.999)
unitdmf <- sort(rowSums(as.matrix(unigram.tdm.temp)), decreasing=TRUE)
bitdmf <- sort(rowSums(as.matrix(bigram.tdm.temp)), decreasing=TRUE)
barplot(head(unitdmf,5), main = "Most Frequent Unigrams - Top 5", col="deepskyblue1")
wordcloud(names(unitdmf), unitdmf, colors = brewer.pal(6, "Paired"))
barplot(head(bitdmf,5), main = "Most Frequent Bigrams - Top 5", col="deepskyblue1")
wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6, "Paired"))
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): cant wait could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): dont know could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): right now could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): last night could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): can get could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): happy birthday could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): thanks following could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): even though could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): thanks much could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(bitdmf), bitdmf, colors = brewer.pal(6,
## "Paired")): look like could not be fit on page. It will not be plotted.
Plan for creating a prediction algorithm and Shiny app
The analysis above summarises the most frequently used words in the corpus provided to us. We will use these findings to build a predictive model based on commonly used n-grams, ranking suggestions by popularity so that users can pick the next word from a list of the most common 1-, 2-, or 3-gram continuations.
Also, since the training data is large, we will use a sampling strategy to reduce its size.
Finally, we will deploy the model in a Shiny app: the user will enter a short phrase, and the app will suggest the most suitable next word using our predictive model.
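A minimal sketch of the planned bigram lookup, reusing the bitdmf frequencies computed above (predict_next is a hypothetical helper name, not part of any package):
# Hypothetical helper: most frequent continuations of `word` in the bigram table
predict_next <- function(word, bigram_freq = bitdmf, n = 3) {
  hits <- bigram_freq[startsWith(names(bigram_freq), paste0(word, " "))]
  if (length(hits) == 0) return(character(0))
  # second token of the n most frequent matching bigrams
  sapply(strsplit(names(head(sort(hits, decreasing = TRUE), n)), " "), `[`, 2)
}
predict_next("last")  # would suggest e.g. "night", given the bigrams above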