In this part of the project to build a predictive Shiny app, we will extract and read in the data we will be working with, clean it, summarize its statistics, sample it, and tokenize it into n-grams; then we will explore the data through exploratory analysis with visual plots and answer some related questions.
Read every file
# load required libraries for this part
suppressMessages(library(stringi))
unzip("Coursera-SwiftKey.zip")
en_blog_path <- "Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
en_twitter_path <- "Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
en_news_path <- "Coursera-SwiftKey/final/en_US/en_US.news.txt"
# read in the data files
en_blog <- readLines(en_blog_path, encoding = "UTF-8", warn = FALSE)
en_twitter <- readLines(en_twitter_path, encoding = "UTF-8", warn=FALSE)
en_news <- readLines(en_news_path, encoding = "UTF-8", warn=FALSE)
Run some summary statistics on each of the three files.
# calculate the sizes in MB
blog_size <- file.info(en_blog_path)$size/1048576
twitter_size <- file.info(en_twitter_path)$size/1048576
news_size <- file.info(en_news_path)$size/1048576
# count number of lines
blog_lines <- length(en_blog)
twitter_lines <- length(en_twitter)
news_lines <- length(en_news)
# count total words
blog_words <- sum(stri_count_words(en_blog))
twitter_words <- sum(stri_count_words(en_twitter))
news_words <- sum(stri_count_words(en_news))
# average number of words per line
blog_avg_words <- mean(stri_count_words(en_blog))
twitter_avg_words <- mean(stri_count_words(en_twitter))
news_avg_words <- mean(stri_count_words(en_news))
# create a summary table to show the statistics
summary <- data.frame(file_name = c("en_Blogs", "en_Twitter", "en_News"),
                      Size_in_MB = c(blog_size, twitter_size, news_size),
                      Lines_count = c(blog_lines, twitter_lines, news_lines),
                      Words_count = c(blog_words, twitter_words, news_words),
                      Avg_Words_count = c(blog_avg_words, twitter_avg_words, news_avg_words))
summary
## file_name Size_in_MB Lines_count Words_count Avg_Words_count
## 1 en_Blogs 200.4242 899288 37546246 41.75108
## 2 en_Twitter 159.3641 2360148 30093369 12.75063
## 3 en_News 196.2775 77259 2674536 34.61779
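The three per-file computations above repeat the same pattern, so the summary could equally be built with a small helper such as the sketch below (corpus_stats is a hypothetical name, not code used in this report):
# hypothetical helper: one row of summary statistics per corpus file
corpus_stats <- function(path, text) {
  wc <- stri_count_words(text)
  data.frame(Size_in_MB = file.info(path)$size / 1048576,
             Lines_count = length(text),
             Words_count = sum(wc),
             Avg_Words_count = mean(wc))
}
# usage: rbind(corpus_stats(en_blog_path, en_blog), corpus_stats(en_news_path, en_news))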
First we will sample the data by taking 0.15% of each of the files; then we will clean the data by removing numbers, punctuation, extra whitespace, and stop words.
# load required libraries for this part
suppressMessages(library(qdap))
suppressMessages(library(tm))
# sample the data
set.seed(1900)
sample_blog <- sample(en_blog, round(length(en_blog) * 0.0015))
sample_news <- sample(en_news, round(length(en_news) * 0.0015))
sample_twit <- sample(en_twitter, round(length(en_twitter) * 0.0015))
sample <- c(sample_blog,sample_news,sample_twit)
samples_corpus <- sent_detect(sample)
# remove numbers, punctuation, and extra whitespace, lowercase the text,
# remove stop words, and store the clean data in a data frame named sample_df
samples_corpus <- tolower(samples_corpus)
samples_corpus <- removeNumbers(samples_corpus)
samples_corpus <- removePunctuation(samples_corpus)
samples_corpus <- stripWhitespace(samples_corpus)
samples_corpus <- rm_stopwords(samples_corpus, unlist = TRUE, stopwords = Top200Words)
sample_df <- data.frame(samples_corpus, stringsAsFactors = FALSE)
all_words <- samples_corpus
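Before tokenizing, a quick sanity check of the cleaned sample can be run (output omitted here):
head(all_words, 10)        # first few cleaned words
length(all_words)          # total word instances in the sample
length(unique(all_words))  # vocabulary size of the sample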
Run tokenization and create three n-gram sets: unigrams, bigrams, and trigrams.
suppressMessages(library(RWeka))
suppressMessages(library(dplyr))
# the default NGramTokenizer (min = 1, max = 3) returns the longest
# n-grams first, so the first 2-word token (index i) and the first
# 1-word token (index j) mark where the trigram and bigram blocks end
Ngrams <- NGramTokenizer(sample_df)
for (i in 1:length(Ngrams)) {
  if (length(WordTokenizer(Ngrams[i])) == 2) break
}
for (j in 1:length(Ngrams)) {
  if (length(WordTokenizer(Ngrams[j])) == 1) break
}
# unigrams are just the individual cleaned words
unigram <- as.data.frame(table(all_words))
unigram_df <-
unigram %>%
arrange(desc(Freq)) %>%
head(7000)
unigram_df_top <-
unigram %>%
arrange(desc(Freq)) %>%
head(15)
# tokens i..(j-1) are the bigrams
bigram <- as.data.frame(table(Ngrams[i:(j-1)]))
bigram_df <-
bigram %>%
arrange(desc(Freq)) %>%
head(20000)
bigram_df_top <-
bigram %>%
arrange(desc(Freq)) %>%
head(15)
# tokens 1..(i-1) are the trigrams
trigram <- as.data.frame(table(Ngrams[1:(i-1)]))
trigram_df <-
trigram %>%
arrange(desc(Freq)) %>%
head(20000)
trigram_df_top <-
trigram %>%
arrange(desc(Freq)) %>%
head(15)
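Before plotting, we can peek at the most frequent entries in each table (output omitted here):
head(unigram_df_top, 5)
head(bigram_df_top, 5)
head(trigram_df_top, 5)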
We will show a wordcloud of the word frequencies.
suppressMessages(library(ggplot2))
library(wordcloud)
# visualize the top 100 words in a wordcloud of all sampled datasets
wordcloud(all_words, scale= c(3,.4), max.words=100, random.order=FALSE,
rot.per=0.35, use.r.layout=TRUE, colors=brewer.pal(8,"Dark2"))
We will show a histogram of the frequencies of the 2-grams and 3-grams, respectively.
ggplot(data= bigram_df_top, aes(x = Var1, y = Freq)) +
geom_bar(stat = "Identity",fill="orange") +
ggtitle("Top 15 bigram terms")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(data= trigram_df_top, aes(x = Var1, y = Freq)) +
geom_bar(stat = "Identity", fill="red") +
ggtitle("Top 15 trigram terms")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
How many unique words do we need in a frequency-sorted dictionary to cover 50% of all word instances in the corpus, and how many to cover 90%?
# sort the unique words by frequency, then accumulate their share of all word instances
word_freq <- sort(table(all_words), decreasing = TRUE)
coverage <- cumsum(word_freq) / sum(word_freq)
# first position where the running coverage reaches each threshold
which(coverage >= 0.5)[1]
which(coverage >= 0.9)[1]
We would need to use a specialised lexicon for this and compare our corpus against it, flagging words that do not appear in the lexicon.
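A minimal sketch of that comparison, assuming the GradyAugmented English word list shipped with the qdapDictionaries package (loaded alongside qdap) can stand in for such a lexicon:
# flag sampled words that do not appear in the English word list
# (assumption: GradyAugmented from qdapDictionaries approximates a
# specialised English lexicon)
library(qdapDictionaries)
non_english <- all_words[!tolower(all_words) %in% GradyAugmented]
# share of word instances not matched by the lexicon
length(non_english) / length(all_words)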
We could also cluster users together: if people are similar in their use of specific words, then the words learned from one user can be used to predict another user's words, and vice versa, since nowadays groups of people tend to be similar in their everyday life, including texting.
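A rough sketch of that idea, clustering users by their word-usage profiles with k-means (the matrix below is toy, hypothetical data; the real input would be per-user word counts):
# toy users-by-words frequency matrix (hypothetical data)
user_word_freq <- matrix(
  c(10, 2, 0, 1,
    12, 1, 1, 0,
     0, 8, 9, 3,
     1, 9, 7, 2),
  nrow = 4, byrow = TRUE,
  dimnames = list(paste0("user", 1:4), c("lol", "hence", "thus", "omg"))
)
# normalise rows so users are compared by usage profile, not text volume
profiles <- user_word_freq / rowSums(user_word_freq)
set.seed(1900)
kmeans(profiles, centers = 2)$cluster
# n-gram counts could then be pooled within each cluster before prediction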
The next step is to build a prediction model that predicts the next word from the n-grams, and to wrap it in a Shiny app interface.
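As a sketch of how the n-gram tables built above could drive that prediction, here is a hypothetical predict_next helper using simple longest-match-first backoff (illustrative only; the input is not regex-escaped and the final model would need smoothing):
# look up the last two words in the trigram table, back off to the
# bigram table, and finally fall back to the most frequent unigram
predict_next <- function(phrase) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  hit <- trigram_df[grepl(paste0("^", paste(words, collapse = " "), " "),
                          trigram_df$Var1), ]
  if (nrow(hit) == 0) {
    hit <- bigram_df[grepl(paste0("^", tail(words, 1), " "),
                           bigram_df$Var1), ]
  }
  if (nrow(hit) == 0) return(as.character(unigram_df_top$all_words[1]))
  # tables are already sorted by Freq, so the first match is the best one
  tail(unlist(strsplit(as.character(hit$Var1[1]), " ")), 1)
}
predict_next("happy new")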