This project consists of creating an application that aids in writing text by suggesting the next word in a phrase or sentence.
The project uses the US English datasets of news, blog and Twitter text.
To understand the structure of the three files, the data are organized into lines of text, numbers of words and numbers of characters.
In this exploratory data analysis the variables of interest are compared across the three files.
The variables of interest (also gathered into a single per-file summary in the sketch after the file list below):
size of the three files
number of lines
number of total words
number of words per line
maximum words per line
number of characters
number of characters per word
Dataset files used:
“en_US.blogs.txt”,
“en_US.news.txt”,
“en_US.twitter.txt”.
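Each of these quantities is computed step by step in the analysis below. As a compact alternative, they could be gathered into one summary row per file; the following is only a sketch, the helper summarize_file() is a hypothetical name, and it assumes the three files are already in the working directory.
# Sketch: one summary row per file (size in MB, lines, words, characters)
summarize_file <- function(path) {
  lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
  words <- stringi::stri_count_words(lines)
  chars <- stringi::stri_stats_general(lines)[["Chars"]]
  tibble::tibble(file = basename(path),
                 size_MB = file.size(path)/1024/1024,
                 lines = length(lines),
                 words = sum(words),
                 words_per_line = mean(words),
                 max_words_per_line = max(words),
                 characters = chars,
                 chars_per_word = chars/sum(words))
}
# dplyr::bind_rows(lapply(c("en_US.blogs.txt", "en_US.news.txt",
#                           "en_US.twitter.txt"), summarize_file))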
suppressPackageStartupMessages({
library(tidytext)
library(ggplot2)
library(tidyverse)
library(stringr)
library(R.utils)
library(ngram)
library(dplyr)
library(stringi)
library(RWeka)
library(tm)
library(plotly)
library(tmap)
library(knitr)
library(wordcloud)
library(ggraph)
library(igraph)
library(readr)
})
The compressed data archive is available at “https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip”.
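If the archive has not been downloaded yet, it can be fetched and unpacked with base R. This is a minimal sketch; the destination file name and the final/en_US/ paths inside the archive are assumptions.
# Sketch: download the archive once and extract the English files
zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists("en_US.blogs.txt")) {
  download.file(zip_url, destfile = zip_file, mode = "wb")
  # assumed archive layout: final/en_US/en_US.*.txt
  unzip(zip_file, files = c("final/en_US/en_US.blogs.txt",
                            "final/en_US/en_US.news.txt",
                            "final/en_US/en_US.twitter.txt"),
        junkpaths = TRUE)
}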
# Load the Data
blogs_file <- "en_US.blogs.txt"
news_file <- "en_US.news.txt"
twitter_file <- "en_US.twitter.txt"
# File sizes in megabytes
sizeBlogs <- file.info(blogs_file)$size/1024/1024
sizeNews <- file.info(news_file)$size/1024/1024
sizeTwitter <- file.info(twitter_file)$size/1024/1024
#blogs
blogs <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
#twitter
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
#news
con <- file(news_file, open="rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
rm(con)
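# Word, line and character counts for each source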
wordsBlogs<-wordcount(blogs, sep = " ", count.function = sum)
wordsNews<-wordcount(news, sep = " ", count.function = sum)
wordsTwitter<-wordcount(twitter, sep = " ", count.function = sum)
nlinesBlogs <- countLines("en_US.blogs.txt")
nlinesTwitter <- countLines("en_US.twitter.txt")
nlinesNews <- countLines("en_US.news.txt")
statsBlogs <- stri_stats_general(blogs)
statsTwitter <- stri_stats_general(twitter)
statsNews <- stri_stats_general(news)
charactersBlogs <- statsBlogs[3]
charactersTwitter <- statsTwitter[3]
charactersNews <- statsNews[3]
WordsBlogs <- stri_count_words(blogs)
WordsTwitter <- stri_count_words(twitter)
WordsNews <- stri_count_words(news)
hist(WordsBlogs, main="Histogram of words in blogs", xlab="No. of words per blog post", col="lightblue", breaks=100, xlim=c(0, 500))
hist(WordsNews, main="Histogram of words in news", xlab="No. of words per news article", col="lightblue", breaks=100, xlim=c(0, 500))
hist(WordsTwitter, main="Histogram of words in Twitter", xlab="No. of words per tweet", col="lightblue", breaks=20, xlim=c(0, 50))
plot <- tibble(counts = c(wordsBlogs, wordsTwitter, wordsNews, nlinesBlogs, nlinesTwitter, nlinesNews),
class = as.factor(c(rep("words",3), rep("lines", 3))),
medium = as.factor(c(rep(c("blogs", "twitter", "news"),2 ))))
plot$names <- paste(plot$class, plot$medium)
WordsLinesRatio <- tibble(WordsLinesRatio = c(wordsBlogs/nlinesBlogs, wordsNews/nlinesNews, wordsTwitter/nlinesTwitter), MediaType = as.factor(c("Blogs","News","Twitter")))
ggplot(data = WordsLinesRatio, aes(x=MediaType, y= WordsLinesRatio, fill = MediaType)) + geom_bar(stat="identity") + ggtitle("Words/Lines Ratio") + ylab("Words/Lines")
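# Sample 20% of each source and store the samples as one-line-per-row tibbles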
samplesize = 0.2
blogs_sample <- sample(blogs, samplesize*length(blogs))
news_sample <- sample(news, samplesize*length(news))
twitter_sample <- sample(twitter, samplesize*length(twitter))
blog_file <- tibble(line = 1:length(blogs_sample), text=blogs_sample)
news_file <- tibble(line = 1:length(news_sample), text=news_sample)
twitter_file <- tibble(line = 1:length(twitter_sample), text=twitter_sample)
# Blogs
blog_token <- blog_file %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
blog_token
## # A tibble: 7,345,640 x 2
## line bigram
## <int> <chr>
## 1 1 thank you
## 2 1 you two
## 3 1 two ton
## 4 1 ton strap
## 5 2 the other
## 6 2 other day
## 7 2 day because
## 8 2 because of
## 9 2 of the
## 10 2 the stupid
## # … with 7,345,630 more rows
# News
news_token <- news_file %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
news_token
## # A tibble: 6,743,327 x 2
## line bigram
## <int> <chr>
## 1 1 rock ohio
## 2 1 ohio spokeswoman
## 3 1 spokeswoman jennifer
## 4 1 jennifer kulczycki
## 5 1 kulczycki declined
## 6 1 declined to
## 7 1 to comment
## 8 1 comment the
## 9 1 the company
## 10 1 company has
## # … with 6,743,317 more rows
# Twitter
twitter_token <- twitter_file %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
twitter_token
## # A tibble: 5,545,369 x 2
## line bigram
## <int> <chr>
## 1 1 dm me
## 2 1 me your
## 3 1 your email
## 4 1 email address
## 5 1 address and
## 6 1 and i'll
## 7 1 i'll send
## 8 1 send you
## 9 1 you details
## 10 2 so can
## # … with 5,545,359 more rows
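# Combine the three samples, tokenize into single words, drop stop words and compute term frequency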
sample_file <- rbind(blog_file, news_file, twitter_file)
sampletoken <- sample_file %>% unnest_tokens(word, text)
commontokens <- sampletoken %>% count(word, sort = TRUE)
commontokens_nostopwords <- commontokens %>% anti_join(stop_words)
## Joining, by = "word"
totalwords <- commontokens_nostopwords %>%
summarize(total=sum(n))
totalwords <- as.integer(totalwords)
totalwords <- rep(totalwords, dim(commontokens_nostopwords)[1])
commontokens_nostopwords <- cbind(commontokens_nostopwords, totalwords)
commontokens_nostopwords <- commontokens_nostopwords %>% mutate(tf=n/totalwords)
head(commontokens_nostopwords, 10)
## word n totalwords tf
## 1 time 44638 8681955 0.005141469
## 2 day 35393 8681955 0.004076616
## 3 love 32426 8681955 0.003734873
## 4 people 31583 8681955 0.003637775
## 5 2 21270 8681955 0.002449909
## 6 3 20916 8681955 0.002409135
## 7 1 18827 8681955 0.002168521
## 8 life 18359 8681955 0.002114616
## 9 rt 17798 8681955 0.002049999
## 10 home 16589 8681955 0.001910745
bigrams <- sample_file %>%
unnest_tokens(bigram, text, token="ngrams", n = 2)
bigrams <- bigrams %>% count(bigram, sort=TRUE)
bigrams_separated <- bigrams %>%
separate(bigram, c("word1", "word2"), sep=" ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
totalbigrams <- bigrams_filtered %>%
summarize(total=sum(n))
totalbigrams <- as.integer(totalbigrams)
totalbigrams <- rep(totalbigrams, dim(bigrams_filtered)[1])
bigrams <- cbind(bigrams_filtered, totalbigrams)
bigrams <- bigrams %>% mutate(tf=n/totalbigrams)
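# Network graph of bigrams that occur more than 300 times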
bigram_graph <- bigrams %>%
filter (n > 300) %>%
graph_from_data_frame()
set.seed(3226)
#png(filename="ggraph_bigram.png")
arrow <- grid::arrow(type="closed", length=unit(.10, "inches"))
ggraph(bigram_graph, layout="fr") +
geom_edge_link(aes(edge_alpha=n), edge_colour = "blue", show.legend = FALSE, arrow=arrow) +
geom_node_point(color="green", size=1.5) +
geom_node_text(aes(label=name), vjust=1, hjust=1) +
theme_void()
#dev.off()
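# Draw a 0.1% sample of each source for building the tm corpora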
set.seed(3693)
SampleBlog <- sample(blogs, length(blogs) * 0.001)
SampleNews <- sample(news, length(news) * 0.001)
SampleTwitter <- sample(twitter, length(twitter) * 0.001)
# combine the per-source samples into one vector for the combined corpus
data_sample <- c(SampleBlog, SampleNews, SampleTwitter)
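# Build a corpus from the combined sample and clean it: strip URLs and @handles, lowercase,
# remove English stop words, punctuation, numbers and extra whitespace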
corpus <- VCorpus(VectorSource(data_sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpusBlogs <- VCorpus(VectorSource(SampleBlog))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusBlogs <- tm_map(corpusBlogs, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpusBlogs <- tm_map(corpusBlogs, toSpace, "@[^\\s]+")
corpusBlogs <- tm_map(corpusBlogs, content_transformer(tolower))
corpusBlogs <- tm_map(corpusBlogs, removeWords, stopwords("en"))
corpusBlogs <- tm_map(corpusBlogs, removePunctuation)
corpusBlogs <- tm_map(corpusBlogs, removeNumbers)
corpusBlogs <- tm_map(corpusBlogs, stripWhitespace)
corpusNews <- VCorpus(VectorSource(SampleNews))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusNews <- tm_map(corpusNews, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpusNews <- tm_map(corpusNews, toSpace, "@[^\\s]+")
corpusNews <- tm_map(corpusNews, content_transformer(tolower))
corpusNews <- tm_map(corpusNews, removeWords, stopwords("en"))
corpusNews <- tm_map(corpusNews, removePunctuation)
corpusNews <- tm_map(corpusNews, removeNumbers)
corpusNews <- tm_map(corpusNews, stripWhitespace)
corpusTwitter <- VCorpus(VectorSource(SampleTwitter))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusTwitter <- tm_map(corpusTwitter, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpusTwitter <- tm_map(corpusTwitter, toSpace, "@[^\\s]+")
corpusTwitter <- tm_map(corpusTwitter, content_transformer(tolower))
corpusTwitter <- tm_map(corpusTwitter, removeWords, stopwords("en"))
corpusTwitter <- tm_map(corpusTwitter, removePunctuation)
corpusTwitter <- tm_map(corpusTwitter, removeNumbers)
corpusTwitter <- tm_map(corpusTwitter, stripWhitespace)
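# Restrict tokenization to a single core; parallel backends can be unstable with the RWeka tokenizers used below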
options(mc.cores=1)
# Unigram
tdmUniBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs), 0.9999)
FreqUniBlog <- rowSums(as.matrix(tdmUniBlog))
FreqUniBlog <- sort(FreqUniBlog, decreasing=TRUE)
FreqUniBlog <- data.frame(word=names(FreqUniBlog), freq=FreqUniBlog)
tdmUniNews <- removeSparseTerms(TermDocumentMatrix(corpusNews), 0.9999)
FreqUniNews <- rowSums(as.matrix(tdmUniNews))
FreqUniNews <- sort(FreqUniNews, decreasing=TRUE)
FreqUniNews <- data.frame(word=names(FreqUniNews), freq=FreqUniNews)
tdmUniTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter), 0.9999)
FreqUniTwitter <- rowSums(as.matrix(tdmUniTwitter))
FreqUniTwitter <- sort(FreqUniTwitter, decreasing=TRUE)
FreqUniTwitter <- data.frame(word=names(FreqUniTwitter), freq=FreqUniTwitter)
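# Bigram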
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBiBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs, control = list(tokenize = bigram)), 0.9999)
FreqBiBlog <- rowSums(as.matrix(tdmBiBlog))
FreqBiBlog <- sort(FreqBiBlog, decreasing=TRUE)
FreqBiBlog <- data.frame(word=names(FreqBiBlog), freq=FreqBiBlog)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBiNews <- removeSparseTerms(TermDocumentMatrix(corpusNews, control = list(tokenize = bigram)), 0.9999)
FreqBiNews <- rowSums(as.matrix(tdmBiNews))
FreqBiNews <- sort(FreqBiNews, decreasing=TRUE)
FreqBiNews <- data.frame(word=names(FreqBiNews), freq=FreqBiNews)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBiTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter, control = list(tokenize = bigram)), 0.9999)
FreqBiTwitter <- rowSums(as.matrix(tdmBiTwitter))
FreqBiTwitter <- sort(FreqBiTwitter, decreasing=TRUE)
FreqBiTwitter <- data.frame(word=names(FreqBiTwitter), freq=FreqBiTwitter)
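# Trigram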
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTriBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs, control = list(tokenize = trigram)), 0.9999)
FreqTriBlog <- rowSums(as.matrix(tdmTriBlog))
FreqTriBlog <- sort(FreqTriBlog, decreasing=TRUE)
FreqTriBlog <- data.frame(word=names(FreqTriBlog), freq=FreqTriBlog)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTriNews <- removeSparseTerms(TermDocumentMatrix(corpusNews, control = list(tokenize = trigram)), 0.9999)
FreqTriNews <- rowSums(as.matrix(tdmTriNews))
FreqTriNews <- sort(FreqTriNews, decreasing=TRUE)
FreqTriNews <- data.frame(word=names(FreqTriNews), freq=FreqTriNews)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTriTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter, control = list(tokenize = trigram)), 0.9999)
FreqTriTwitter <- rowSums(as.matrix(tdmTriTwitter))
FreqTriTwitter <- sort(FreqTriTwitter, decreasing=TRUE)
FreqTriTwitter <- data.frame(word=names(FreqTriTwitter), freq=FreqTriTwitter)
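# Quadgram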
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuadBlog <- removeSparseTerms(TermDocumentMatrix(corpusBlogs, control = list(tokenize = quadgram)), 0.9999)
FreqQuadBlog <- rowSums(as.matrix(tdmQuadBlog))
FreqQuadBlog <- sort(FreqQuadBlog, decreasing=TRUE)
FreqQuadBlog <- data.frame(word=names(FreqQuadBlog), freq=FreqQuadBlog)
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuadNews <- removeSparseTerms(TermDocumentMatrix(corpusNews, control = list(tokenize = quadgram)), 0.9999)
FreqQuadNews <- rowSums(as.matrix(tdmQuadNews))
FreqQuadNews <- sort(FreqQuadNews, decreasing=TRUE)
FreqQuadNews <- data.frame(word=names(FreqQuadNews), freq=FreqQuadNews)
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuadTwitter <- removeSparseTerms(TermDocumentMatrix(corpusTwitter, control = list(tokenize = quadgram)), 0.9999)
FreqQuadTwitter <- rowSums(as.matrix(tdmQuadTwitter))
FreqQuadTwitter <- sort(FreqQuadTwitter, decreasing=TRUE)
FreqQuadTwitter <- data.frame(word=names(FreqQuadTwitter), freq=FreqQuadTwitter)
ggplot(FreqUniBlog[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("green"))
ggplot(FreqUniNews[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("News") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue"))
ggplot(FreqUniTwitter[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("orange"))
ggplot(FreqBiBlog[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("green"))
ggsave("Frequency_bigram_blog.png")
## Saving 7 x 5 in image
ggplot(FreqBiNews[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("News") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue"))
ggsave("Frequency_bigram_news.png")
## Saving 7 x 5 in image
ggplot(FreqBiTwitter[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("orange"))
ggsave("Frequency_bigram_twitter.png")
## Saving 7 x 5 in image
ggplot(FreqTriBlog[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("green"))
ggplot(FreqTriNews[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("News") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue"))
ggplot(FreqTriTwitter[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("orange"))
ggplot(FreqQuadBlog[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Blogs") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("green"))
ggplot(FreqQuadNews[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("News") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue"))
ggplot(FreqQuadTwitter[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Twitter") +
theme(axis.text.x = element_text(angle = 60, size = 9, hjust = 1)) +
geom_bar(stat = "identity", fill = I("orange"))
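# N-gram frequencies for the combined sample corpus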
tdmUni <- removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)
FreqUni <- rowSums(as.matrix(tdmUni))
FreqUni <- sort(FreqUni, decreasing=TRUE)
FreqUni <- data.frame(word=names(FreqUni), freq=FreqUni)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdmBi <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999)
FreqBi <- rowSums(as.matrix(tdmBi))
FreqBi <- sort(FreqBi, decreasing=TRUE)
FreqBi <- data.frame(word=names(FreqBi), freq=FreqBi)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdmTri <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999)
FreqTri <- rowSums(as.matrix(tdmTri))
FreqTri <- sort(FreqTri, decreasing=TRUE)
FreqTri <- data.frame(word=names(FreqTri), freq=FreqTri)
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdmQuad <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = quadgram)), 0.9999)
FreqQuad <- rowSums(as.matrix(tdmQuad))
FreqQuad <- sort(FreqQuad, decreasing=TRUE)
FreqQuad <- data.frame(word=names(FreqQuad), freq=FreqQuad)
ggplot(FreqUni[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Unigram") +
theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
geom_bar(stat = "identity", fill = I("green3"))
ggsave("unigram.png")
## Saving 7 x 5 in image
ggplot(FreqBi[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Bigram") +
theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue4"))
ggsave("bigram.png")
## Saving 7 x 5 in image
ggplot(FreqTri[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Trigram") +
theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
geom_bar(stat = "identity", fill = I("red"))
ggsave("trigram.png")
## Saving 7 x 5 in image
ggplot(FreqQuad[1:40,], aes(reorder(word, freq), freq)) +
labs(x="Words", y = "Frequency") + ggtitle("Quadgram") +
theme(axis.text.x = element_text(angle = 60, size = 8, hjust = 1)) +
geom_bar(stat = "identity", fill = I("magenta2"))
ggsave("quadgram.png")
## Saving 7 x 5 in image
The initial exploratory analysis of the datasets showed that the blogs file has the smallest number of lines but the largest number of words, total characters and words per line.
The news articles have the largest number of characters per word, indicating that the language used in news text is more complex than the language in blogs and on Twitter.