The goal of this task is to get familiar with the datasets and do the necessary cleaning. Tasks to accomplish:
Tokenization - identifying appropriate tokens such as words, punctuation, and numbers, and writing a function that takes a file as input and returns a tokenized version of it (a minimal sketch follows this list).
Profanity filtering - removing profanity and other words we do not want to predict.
Frequency analysis - counting which words appear most often in the sampled documents.
Coverage analysis - estimating how many distinct words are needed to cover a given share of all word occurrences, and looking at ways to increase coverage, for example handling words that are not in the corpora or using a smaller dictionary to cover the same number of phrases.
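A minimal sketch of such a tokenization function (an illustrative helper, assuming the tokenizers package; tokenize_file is not part of the pipeline below):
library(tokenizers)
# Read a text file and return its word tokens, one list element per input line.
tokenize_file <- function(path) {
  text <- readLines(path, skipNul = TRUE, warn = FALSE)
  tokenize_words(text)
}
# Example usage:
# tokens <- tokenize_file("en_US.twitter.txt")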
setwd("C:/Users/sanjayx/Desktop/coursera/swiftkey/Coursera-SwiftKey/final/en_US")
# There are more than 2 million lines, as shown by length(readLines(ftwitter)); we randomly sample about 5% of them to keep the memory footprint small.
set.seed(2019)
ftwitter<-file("en_US.twitter.txt", "r")
twitter<-readLines(ftwitter)
## Warning in readLines(ftwitter): line 167155 appears to contain an embedded
## nul
## Warning in readLines(ftwitter): line 268547 appears to contain an embedded
## nul
## Warning in readLines(ftwitter): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(ftwitter): line 1759032 appears to contain an embedded
## nul
stwitter <- twitter[rbinom(length(twitter), 1, 0.05) == 1]  # keep each line with probability 0.05
writeLines(stwitter, con="twitter1.txt")
close(ftwitter)
fnews<-file("en_US.news.txt", "r")
news<-readLines(fnews)
## Warning in readLines(fnews): incomplete final line found on
## 'en_US.news.txt'
snews <- news[rbinom(length(news), 1, 0.05) == 1]
writeLines(snews, con="news1.txt")
close(fnews)
fblogs<-file("en_US.blogs.txt", "r")
blogs<-readLines(fblogs)
sblogs <- blogs[rbinom(length(blogs), 1, 0.05) == 1]
writeLines(sblogs, con="blogs1.txt")
close(fblogs)
data <- c(sblogs, snews, stwitter)  # combine the three samples into one character vector
file_name <- c("en_US.twitter.txt", "en_US.news.txt", "en_US.blogs.txt")
file_size_in_MB <- file.size(file_name) / 1048576
line_count<-c(length(twitter),length(news),length(blogs))
word_count<-c(sum(sapply(strsplit(twitter, " "), length)),sum(sapply(strsplit(news, " "), length)),sum(sapply(strsplit(blogs, " "), length)))
summary_data<-data.frame(file_name,file_size_in_MB,line_count, word_count)
print(summary_data)
##           file_name file_size_in_MB line_count word_count
## 1 en_US.twitter.txt        159.3641    2360148   30373543
## 2    en_US.news.txt        196.2775      77259    2643969
## 3   en_US.blogs.txt        200.4242     899288   37334131
# Tokenization - identifying appropriate tokens such as words, punctuation, and numbers
#library(tokenizers)
#token_words<-tokenize_words(data)
# Profanity filtering - removing profanity and other words we do not want to predict. The commented call below only drops English stop words; a separate profanity word list is sketched after this block.
#library(stopwords)
#clean_words<-tokenize_words(data, stopwords = stopwords::stopwords("en"))
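A minimal sketch of the profanity-filtering step, assuming a user-supplied banned-word list; the file name profanity.txt and the helper filter_profanity are placeholders, not part of the pipeline below:
library(dplyr)
# Drop every token that appears in the banned-word list (one word per line in profanity.txt).
filter_profanity <- function(tokens_df, profanity_file = "profanity.txt") {
  banned <- tibble(word = readLines(profanity_file, warn = FALSE))
  anti_join(tokens_df, banned, by = "word")
}
# Example usage, once a tokenized data frame with a `word` column exists:
# clean_tokens <- filter_profanity(tidy_dataset)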
We can plot the most frequent words using the wordcloud package; a working version based on the unigram counts d1 is sketched further below.
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.2
library(dplyr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.2
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.2
library(ggplot2)
#words.freq<-table(unlist(clean_words));
#words<-cbind(names(words.freq),as.integer(words.freq))
#words.sorted.freq.list<-sort(words.freq, decreasing=TRUE)
#d <- data.frame(word=names(words.sorted.freq.list),freq=words.sorted.freq.list)
#wordcloud(d$word,d$freq.Freq,max.words=50)
url_words <- tibble(word = c("https", "http"))
d <- tibble(txt = data)
d$txt <- gsub("[^\x20-\x7E]", "", d$txt) #remove non-ascii
tidy_dataset <- d %>%
unnest_tokens(output = word, input = txt) %>%
filter(!str_detect(word, "^[0-9]*$")) %>% # remove numbers
anti_join(get_stopwords()) %>% # remove snowball stop words
anti_join(url_words) %>% # remove URL tokens ("https", "http")
mutate(word = SnowballC::wordStem(word)) # apply a stemming procedure
d <- tibble(txt = unlist(tidy_dataset))
# 1-gram
d1<-d %>%
unnest_tokens(word,txt) %>%
count(word, sort = TRUE)
# plot top 10
d1 %>% top_n(10) %>% ggplot(aes(word,n))+
geom_bar(stat="identity" , fill = "red") +
labs(x = "Word", y = "Frequency") +
coord_flip() +
ggtitle("Top 10 frequent word for 1-Gram" )
# 2-gram
d2<-d %>%
unnest_tokens(bigram,txt, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE)
d2 %>% top_n(10) %>% ggplot(aes(bigram,n))+
geom_bar(stat="identity" , fill = "blue") +
labs(x = "Word", y = "Frequency") +
coord_flip() +
ggtitle("Top 10 frequent word for 2-Gram" )
# 3-gram
d3<-d %>%
unnest_tokens(trigram,txt, token = "ngrams", n = 3) %>%
count(trigram, sort = TRUE)
d3 %>% top_n(10) %>% ggplot(aes(trigram,n))+
geom_bar(stat="identity" , fill = "green") +
labs(x = "Word", y = "Frequency") +
coord_flip() +
ggtitle("Top 10 frequent word for 3-Gram" )
# Return how many of the most frequent words are needed to cover the given
# percentage of all word occurrences. `data` is a frequency table with one row
# per word, counts in the second column, sorted by decreasing count.
getThreshold <- function(data, percentage) {
  counts <- data[[2]]                      # word counts as a plain vector
  coverage <- percentage * sum(counts) / 100
  current_value <- 0
  for (i in seq_along(counts)) {
    if (current_value > coverage) {
      return(i)
    }
    current_value <- current_value + counts[i]
  }
  return(length(counts))
}
getThreshold(d1,50)
## [1] 330
getThreshold(d1,90)
## [1] 2168
# Number of distinct words needed to reach 10%, 20%, ..., 100% coverage.
x <- seq(10, 100, by = 10)
y <- numeric(length(x))
for (i in seq_along(x)) {
  y[i] <- getThreshold(d1, x[i])
}
qplot(x,y,geom=c("line"))
To predict the next word, we will use n-gram analysis, which gives the likelihood of a word following the previous one(s) (for example, "next week" or "holiday season"). We use rbinom-based random sampling to reduce the memory footprint, and the frequency-coverage analysis of the samples lets us estimate how large a sample and dictionary we need. A minimal prediction sketch follows.
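A minimal sketch of such a next-word lookup, using the bigram counts d2 computed above; predict_next is an illustrative helper, not the final model, and because the corpus was stemmed the input word is stemmed as well:
library(dplyr)
library(tidyr)
# Return the most frequent word observed after `prev_word` in the bigram table.
predict_next <- function(prev_word, bigram_counts) {
  bigram_counts %>%
    separate(bigram, into = c("w1", "w2"), sep = " ") %>%
    filter(w1 == SnowballC::wordStem(prev_word)) %>%
    arrange(desc(n)) %>%
    slice(1) %>%
    pull(w2)
}
# Example usage (the result depends on the sampled corpus):
# predict_next("next", d2)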