Data Science Capstone

The goal of this task is to get familiar with the data sets and do the necessary cleaning. Tasks to accomplish:

Tokenization - identifying appropriate tokens such as words, punctuation, and numbers, and writing a function that takes a file as input and returns a tokenized version of it (a sketch follows this list).
Profanity filtering - removing profanity and other words we do not want to predict.
Frequency analysis - counting which words appear most frequently in the documents.
Coverage analysis - looking for ways to increase coverage: identifying words that may not be in the corpora, or using a smaller dictionary that still covers the same number of phrases.
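
The tokenization task asks for a function that takes a file and returns its tokens. A minimal sketch, assuming the tokenizers package and an optional user-supplied list of words to drop; tokenize_file is a name introduced here for illustration and is not used later in the report.

library(tokenizers)

# Sketch: read a file and return its word tokens, optionally dropping
# unwanted words (e.g. a profanity list supplied by the user).
tokenize_file <- function(path, drop_words = character(0)) {
  lines  <- readLines(path, skipNul = TRUE)   # skipNul avoids embedded-nul warnings
  tokens <- unlist(tokenize_words(lines))     # lower-cased words, punctuation stripped
  tokens[!tokens %in% drop_words]
}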

Data Collection

 setwd("C:/Users/sanjayx/Desktop/coursera/swiftkey/Coursera-SwiftKey/final/en_US")



# There are more than 2 million lines, as shown by length(readLines(ftwitter)); we randomly sample about 5% of them.
set.seed(2019)

ftwitter<-file("en_US.twitter.txt", "r")
twitter<-readLines(ftwitter)
## Warning in readLines(ftwitter): line 167155 appears to contain an embedded
## nul
## Warning in readLines(ftwitter): line 268547 appears to contain an embedded
## nul
## Warning in readLines(ftwitter): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(ftwitter): line 1759032 appears to contain an embedded
## nul
stwitter <- twitter[rbinom(length(twitter) * 0.05, length(twitter), 0.5)]
writeLines(stwitter, con="twitter1.txt")
close(ftwitter)

fnews<-file("en_US.news.txt", "r")
news<-readLines(fnews)
## Warning in readLines(fnews): incomplete final line found on
## 'en_US.news.txt'
snews <- news[rbinom(length(news) * 0.05, length(news), 0.5)]
writeLines(snews, con="news1.txt")
close(fnews)


fblogs<-file("en_US.blogs.txt", "r")
blogs<-readLines(fblogs)
sblogs <- blogs[rbinom(length(blogs) * 0.05, length(blogs), 0.5)]
writeLines(sblogs, con="blogs1.txt")
close(fblogs)

data <- c(sblogs, snews, stwitter)   # combine the three samples into one corpus
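
Note that rbinom(n, size, 0.5) returns draws clustered around size/2, so the indices used above concentrate near the middle of each file and may repeat. A sketch of an alternative, still rbinom-based, that instead keeps each line independently with probability 0.05; sample_lines is a name introduced here for illustration only.

# Sketch: keep each line independently with 5% probability (Bernoulli mask).
sample_lines <- function(lines, p = 0.05) {
  lines[rbinom(length(lines), 1, p) == 1]
}
# e.g. stwitter <- sample_lines(twitter); snews <- sample_lines(news); sblogs <- sample_lines(blogs)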

file_name<- c("en_us.twitter.txt","en_us.news.txt","en_us.blogs.txt")
file_size_in_MB<- c(file.size("en_us.twitter.txt")/1048576,file.size("en_us.news.txt")/1048576,file.size("en_us.blogs.txt")/1048576)
line_count<-c(length(twitter),length(news),length(blogs))
word_count<-c(sum(sapply(strsplit(twitter, " "), length)),sum(sapply(strsplit(news, " "), length)),sum(sapply(strsplit(blogs, " "), length)))

summary_data<-data.frame(file_name,file_size_in_MB,line_count, word_count)
print(summary_data)
##           file_name file_size_in_MB line_count word_count
## 1 en_us.twitter.txt        159.3641    2360148   30373543
## 2    en_us.news.txt        196.2775      77259    2643969
## 3   en_us.blogs.txt        200.4242     899288   37334131
#Tokenization - identifying appropriate tokens such as words, punctuation, and numbers
#library(tokenizers)
#token_words<-tokenize_words(data)
# Profanity filtering - removing profanity and other words you do not want to predict.
#library(stopwords)
#clean_words<-tokenize_words(data, stopwords = stopwords::stopwords("en"))
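
Profanity filtering needs its own word list; stop words are just common function words. A minimal sketch that would plug into the tidytext pipeline used below, assuming a user-supplied file profanity.txt (hypothetical name, one word per line).

library(tibble)
# Sketch: build a profanity lookup table from a hypothetical word list.
profanity_words <- tibble(word = readLines("profanity.txt"))
# In the cleaning pipeline below, add:  ... %>% anti_join(profanity_words, by = "word")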

Data Cleaning

We can plot the most frequent words using the wordcloud package.

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.2
library(dplyr)
library(tidytext)   
## Warning: package 'tidytext' was built under R version 3.5.2
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(ggplot2)
#words.freq<-table(unlist(clean_words));
#words<-cbind(names(words.freq),as.integer(words.freq))
#words.sorted.freq.list<-sort(words.freq, decreasing=TRUE)
#d <- data.frame(word=names(words.sorted.freq.list),freq=words.sorted.freq.list)
#wordcloud(d$word,d$freq.Freq,max.words=50)

url_words <- tibble(
  word = c("https","http"))


d <- data_frame(txt=data)

d$txt <- gsub("[^\x20-\x7E]", "", d$txt) #remove non-ascii

tidy_dataset <- d %>%
   unnest_tokens(output = word, input = txt) %>% 
  filter(!str_detect(word, "^[0-9]*$"))  %>%  # remove numbers
  anti_join(get_stopwords())  %>%  # remove snowball stop words
  anti_join(url_words)   %>%   # remove some urls
  mutate(word = SnowballC::wordStem(word))    # apply a stemming procedure

d<-data_frame(txt=unlist(tidy_dataset))

N-Gram Analysis

# 1-gram

d1<-d %>% 
  unnest_tokens(word,txt) %>%
        count(word, sort = TRUE)

# plot top 10


d1 %>% top_n(10) %>% ggplot(aes(word,n))+
          geom_bar(stat="identity" , fill = "red") +
          labs(x = "Word", y = "Frequency") +
          coord_flip() +
           ggtitle("Top 10 frequent word for 1-Gram" )

# 2-gram
d2<-d %>% 
  unnest_tokens(bigram,txt, token = "ngrams", n = 2) %>%
        count(bigram, sort = TRUE)

d2 %>% top_n(10) %>% ggplot(aes(bigram,n))+
          geom_bar(stat="identity" , fill = "blue") +
          labs(x = "Word", y = "Frequency") +
          coord_flip() +
           ggtitle("Top 10 frequent word for 2-Gram" )

#3-gram
d3<-d %>%
  unnest_tokens(trigram,txt, token = "ngrams", n = 3)  %>%
        count(trigram, sort = TRUE)

d3 %>% top_n(10) %>% ggplot(aes(trigram,n))+
          geom_bar(stat="identity" , fill = "green") +
          labs(x = "Word", y = "Frequency") +
          coord_flip() +
           ggtitle("Top 10 frequent word for 3-Gram" )

Coverage Analysis

# Count roughly how many of the most frequent words are needed to cover
# `percentage` percent of all word occurrences in `data`
# (expects a frequency table sorted by descending count, e.g. d1).
getThreshold <- function(data, percentage) {
  total <- sum(data[, 2])
  coverage <- percentage * total / 100
  current_value <- 0
  for (i in 1:nrow(data)) {
    if (current_value > coverage) {
      return(i)
    }
    current_value <- current_value + data[i, 2]
  }
  return(nrow(data))
}
getThreshold(d1,50)
## [1] 330
getThreshold(d1,90)
## [1] 2168
x <- seq(10, 100, by = 10)   # coverage levels: 10%, 20%, ..., 100%
y <- 0
for (i in 1:10) {
  y[i] <- getThreshold(d1, x[i])   # words needed at each coverage level
}
qplot(x, y, geom = c("line"))
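
For reference, the same thresholds can be computed without a loop via a cumulative sum. coverage_threshold is a name introduced here for illustration; it may differ from getThreshold by one position because of where the loop tests the running total.

# Sketch: vectorized coverage threshold using cumsum on the count column n.
coverage_threshold <- function(data, percentage) {
  which(cumsum(data$n) >= percentage / 100 * sum(data$n))[1]
}
# coverage_threshold(d1, 50); coverage_threshold(d1, 90)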

Conclusion

To predict the next word, we will use n-gram analysis: it gives the likelihood of a word following the previous word(s), for example "next week" or "holiday season". We used rbinom-based random sampling to reduce the memory footprint, and the frequency coverage analysis of the samples lets us estimate how large a sample (and dictionary) is needed.
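
As an illustration of that prediction step (a sketch only, not part of the analysis above), the bigram counts in d2 can be split into a previous/next word pair and filtered on the previous word. predict_next and the tidyr dependency are introduced here for illustration.

library(dplyr)
library(tidyr)

# Sketch: most frequent next words following a given previous word,
# based on the bigram counts in d2.
predict_next <- function(bigrams, previous, top = 3) {
  bigrams %>%
    separate(bigram, into = c("word1", "word2"), sep = " ") %>%
    filter(word1 == previous) %>%
    arrange(desc(n)) %>%
    head(top) %>%
    pull(word2)
}
# predict_next(d2, "next")   # hypothetical call; output depends on the sample and stemming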