Executive summary

In this part of the project to build a predictive Shiny app interface, we will extract and read in the data we will be working with, clean it, summarize its statistics, sample it, and tokenize it to create n-grams. Then we will explore the data with exploratory analysis using visual plots and answer some related questions.

Reading in the data

Read in each of the three files

# load required libraries for this part
suppressMessages(library(stringi))

unzip("Coursera-SwiftKey.zip")

en_blog_path <- "Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
en_twitter_path <-  "Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
en_news_path <- "Coursera-SwiftKey/final/en_US/en_US.news.txt"

# read in the data files
en_blog <- readLines(en_blog_path, encoding = "UTF-8", warn = FALSE)
en_twitter <- readLines(en_twitter_path, encoding = "UTF-8", warn = FALSE)
en_news <- readLines(en_news_path, encoding = "UTF-8", warn = FALSE)

Statistical Analysis of the data

Run some statistical summaries on the data for each of the three files

# calculate the sizes in MB
blog_size <- file.info(en_blog_path)$size/1048576
twitter_size <- file.info(en_twitter_path)$size/1048576
news_size <- file.info(en_news_path)$size/1048576

# count number of lines
blog_lines <- length(en_blog)
twitter_lines <- length(en_twitter)
news_lines <- length(en_news)

# count total words
blog_words <- sum(stri_count_words(en_blog))
twitter_words <- sum(stri_count_words(en_twitter))
news_words <- sum(stri_count_words(en_news))

# calculate the average number of words per line
blog_avg_words <- mean(stri_count_words(en_blog))
twitter_avg_words <- mean(stri_count_words(en_twitter))
news_avg_words <- mean(stri_count_words(en_news))

# create a summary table to show the statistics
summary <- data.frame(file_name=c("en_Blogs","en_Twitter", "en_News"),
                         Size_in_MB=c(blog_size,twitter_size,news_size),
                         Lines_count=c(blog_lines,twitter_lines,news_lines),
                         Words_count=c(blog_words,twitter_words,news_words),
                         Avg_Words_count=c(blog_avg_words, twitter_avg_words, news_avg_words )
)

summary
##    file_name Size_in_MB Lines_count Words_count Avg_Words_count
## 1   en_Blogs   200.4242      899288    37546246        41.75108
## 2 en_Twitter   159.3641     2360148    30093369        12.75063
## 3    en_News   196.2775       77259     2674536        34.61779
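
The line and word counts for the news file look low relative to its size on disk. One likely cause is an embedded control character that stops readLines in text mode on some platforms; a hedged workaround is to re-read that file through a binary connection and skip embedded nulls:

# re-read the news file through a binary connection so embedded control
# characters do not truncate the read
con <- file(en_news_path, open = "rb")
en_news <- readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(con)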

Cleaning the data

First we will sample the data by taking 0.15% of the lines in each of the files; then we will clean the data by removing numbers, punctuation, extra whitespace, and stop words.

# load required libraries for this part
suppressMessages(library(qdap))
suppressMessages(library(tm))

# sample the data
set.seed(1900)
sample_blog <- sample(en_blog, round(length(en_blog) * 0.0015))
sample_news <- sample(en_news, round(length(en_news) * 0.0015))
sample_twit <- sample(en_twitter, round(length(en_twitter) * 0.0015))
sample <- c(sample_blog,sample_news,sample_twit)


# split the combined sample into sentences
samples_corpus <- sent_detect(sample)

# remove numbers, punctuation and extra whitespace, drop the most common stop
# words, and store the cleaned tokens in a data frame named sample_df
# samples_corpus <- samples_corpus[samples_corpus != ""]  # optionally drop empty strings
samples_corpus <- removeNumbers(samples_corpus)
samples_corpus <- removePunctuation(samples_corpus)
samples_corpus <- stripWhitespace(samples_corpus)
samples_corpus <- rm_stopwords(samples_corpus, unlist = TRUE, stopwords = Top200Words)
sample_df <- data.frame(samples_corpus, stringsAsFactors = FALSE)

# keep the cleaned word vector for the frequency analysis below
all_words <- samples_corpus

Tokenization

Run tokenization and create three sets of n-grams: unigrams, bigrams, and trigrams

suppressMessages(library(RWeka))
suppressMessages(library(dplyr))

# tokenize the cleaned sample; the default NGramTokenizer control emits 1- to 3-grams
Ngrams <- NGramTokenizer(sample_df)

# i = index of the first 2-word token (start of the bigrams),
# j = index of the first 1-word token (start of the unigrams)
for (i in 1:length(Ngrams)) {
  if (length(WordTokenizer(Ngrams[i])) == 2) break
}
for (j in 1:length(Ngrams)) {
  if (length(WordTokenizer(Ngrams[j])) == 1) break
}

# unigram frequency table from the cleaned word vector
unigram <- as.data.frame(table(all_words))

unigram_df <- 
  unigram %>% 
  arrange(desc(Freq)) %>% 
  head(7000) 

unigram_df_top <- 
  unigram %>% 
  arrange(desc(Freq)) %>% 
  head(15) 

# the tokens between the first bigram and the first unigram are the bigrams
bigram <- as.data.frame(table(Ngrams[i:(j-1)]))

bigram_df <- 
  bigram %>% 
  arrange(desc(Freq)) %>% 
  head(20000) 

bigram_df_top <- 
  bigram %>% 
  arrange(desc(Freq)) %>% 
  head(15) 

# the tokens before the first bigram are the trigrams
trigram <- as.data.frame(table(Ngrams[1:(i-1)]))

trigram_df <- 
  trigram %>% 
  arrange(desc(Freq)) %>% 
  head(20000)

trigram_df_top <- 
  trigram %>% 
  arrange(desc(Freq)) %>% 
  head(15) 
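
An equivalent way to obtain the bigrams and trigrams, without relying on the order in which NGramTokenizer emits its tokens, is to request one n-gram size at a time via Weka_control. A minimal sketch, assuming the same sample_df input (the _alt names are just placeholders):

# build each n-gram table explicitly with a fixed gram size
bigrams_only <- NGramTokenizer(sample_df, Weka_control(min = 2, max = 2))
trigrams_only <- NGramTokenizer(sample_df, Weka_control(min = 3, max = 3))

bigram_alt <- as.data.frame(table(bigrams_only))
trigram_alt <- as.data.frame(table(trigrams_only))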

Exploratory Analysis

Some words are more frequent than others - what are the distributions of word frequencies?

We will show a word cloud of the word frequencies.

suppressMessages(library(ggplot2))
suppressMessages(library(wordcloud))

# visualize the top 100 words across all sampled datasets in a word cloud
wordcloud(all_words, scale = c(3, .4), max.words = 100, random.order = FALSE, 
          rot.per = 0.35, use.r.layout = TRUE, colors = brewer.pal(8, "Dark2"))
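
The same kind of bar chart used below for bigrams and trigrams can also be drawn for the top unigrams computed earlier (unigram_df_top). Because that table was built from the all_words vector, its term column is named all_words rather than Var1; a sketch:

# bar chart of the 15 most frequent unigrams
ggplot(data = unigram_df_top, aes(x = all_words, y = Freq)) + 
  geom_bar(stat = "identity", fill = "steelblue") + 
  ggtitle("Top 15 unigram terms") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))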

What are the frequencies of 2-grams and 3-grams in the dataset?

We will show bar charts of the frequencies of the 2-grams and 3-grams, respectively.

ggplot(data = bigram_df_top, aes(x = Var1, y = Freq)) + 
  geom_bar(stat = "identity", fill = "orange") + 
  ggtitle("Top 15 bigram terms") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(data = trigram_df_top, aes(x = Var1, y = Freq)) + 
  geom_bar(stat = "identity", fill = "red") + 
  ggtitle("Top 15 trigram terms") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?

To cover 50% of all word instances, the number of unique words needed is:

all_words_table <- as.numeric(table(all_words))
all_words_table <- cumsum(all_words_table)/sum(all_words_table)
sum(all_words_table >= 0.5 )
## [1] 5536

To cover 90% of all word instances, the number of unique words needed is:

all_words_table <- as.numeric(table(all_words))
all_words_table <- cumsum(all_words_table)/sum(all_words_table)
sum(all_words_table >= 0.9 )
## [1] 1058
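
A frequency-sorted version of this calculation sorts the word counts in decreasing order before taking the cumulative sum, and then finds the first point where the cumulative share reaches the target. A minimal sketch, reusing the all_words vector from the cleaning step:

# sort word counts from most to least frequent and compute cumulative coverage
word_freq <- sort(table(all_words), decreasing = TRUE)
coverage <- cumsum(word_freq) / sum(word_freq)

# number of unique words needed to reach 50% and 90% coverage
which(coverage >= 0.5)[1]
which(coverage >= 0.9)[1]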

How do you evaluate how many of the words come from foreign languages?

We could compare our corpus against a specialised English lexicon (a dictionary word list) and treat any word that does not appear in it as a candidate foreign word.
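
As an illustration, a rough sketch of such a check using the GradyAugmented English word list that ships with qdapDictionaries (installed alongside qdap); the non_english name is just a placeholder:

# flag cleaned tokens that do not appear in an English word list
english_words <- qdapDictionaries::GradyAugmented
non_english <- all_words[!tolower(all_words) %in% english_words]

# rough share of out-of-dictionary tokens (also catches misspellings and names)
length(non_english) / length(all_words)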

Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?

One approach is to cluster users together: if people are similar in the words they write, then the words learned from one of them can be used to predict another's words, and vice versa, because groups of people nowadays tend to be similar in their everyday lives, including how they text.

Next steps

The next step is to build a prediction model that uses the n-grams to predict the next word, and to wrap it in a Shiny app interface.
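
As a rough illustration of the idea (not the final model), a back-off style lookup over the trigram table built above could look like the sketch below; the predict_next helper and the simple startsWith matching are placeholders for illustration:

# sketch: given the last two words typed, return the most frequent completion
# from the trigram table; a full model would back off to bigrams and unigrams
predict_next <- function(last_two, trigrams = trigram_df) {
  grams <- as.character(trigrams$Var1)
  hits <- trigrams[startsWith(grams, paste0(last_two, " ")), ]
  if (nrow(hits) == 0) return(NA_character_)  # back off in the full model
  # trigram_df is already sorted by decreasing frequency, so take the first hit
  tail(strsplit(as.character(hits$Var1[1]), " ")[[1]], 1)
}

predict_next("happy new")  # e.g. might return "year" if that trigram was sampled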