In this project, after setting the working directory, the data are downloaded as a zip file from the course URL and unzipped. Because of the large size of the data, a 0.2% sample of each file is selected for analysis. After the sample is cleaned and the raw files are summarized statistically, the data are tokenized into N-grams, the N-grams are sorted by frequency, and plots are produced from the sorted counts.
The goal of this milestone report is to strengthen skills in handling big data: downloading and cleaning the data, summarizing and sorting it by the requested analytic fields, and preparing for the design of a prediction algorithm and a Shiny app.
setwd("C:/Users/msc/Desktop/F/Data Science/Course 10/Milestone Report")
getwd()
[1] "C:/Users/msc/Desktop/F/Data Science/Course 10/Milestone Report"
library(RWeka)
library(dplyr)
library(stringi)
library(tm)
library(ggplot2)
library(NLP)
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
Stat_summary <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Size = sapply(list(blogs, news, twitter), function(x) format(object.size(x), "MB")),
  No_Entries = sapply(list(blogs, news, twitter), function(x) length(x)),
  Total_Chars = sapply(list(blogs, news, twitter), function(x) sum(nchar(x))),
  Max_Chars = sapply(list(blogs, news, twitter), function(x) max(nchar(x)))
)
Stat_summary
     File     Size No_Entries Total_Chars Max_Chars
1   Blogs 248.5 Mb     899288   206824505     40833
2    News  19.2 Mb      77259    15639408      5760
3 Twitter 301.4 Mb    2360148   162096031       140
stri_stats_general(blogs)
Lines LinesNEmpty Chars CharsNWhite
899288 899165 206043906 169609063
stri_stats_general(news)
Lines LinesNEmpty Chars CharsNWhite
77259 77259 15615538 13048828
stri_stats_general(twitter)
Lines LinesNEmpty Chars CharsNWhite
2360148 2360148 161961345 133947948
lenBlog <- length(blogs)
lenBlog
[1] 899288
lenNews <- length(news)
lenNews
[1] 77259
lenTwit <- length(twitter)
lenTwit
[1] 2360148
Removal of non-English characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
Because of the large data size, only 0.2% of each file is taken as the sample.
set.seed(245)
dataSample <- c(sample(blogs, length(blogs) * 0.002),
                sample(news, length(news) * 0.002),
                sample(twitter, length(twitter) * 0.002))
summary(dataSample)
Length Class Mode
6672 character character
Creating a sample corpus from the 0.2% data sample
corpus_sample <- VCorpus(VectorSource(dataSample))
corpus_sample <- tm_map(corpus_sample, content_transformer(removePunctuation))
corpus_sample <- tm_map(corpus_sample, content_transformer(removeNumbers))
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))
Tokenize the sample data and build term-document matrices so that N-gram frequencies can be computed and sorted
uni_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_token <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
uni_matrix <- TermDocumentMatrix(corpus_sample, control = list(tokenize = uni_token))
bi_matrix <- TermDocumentMatrix(corpus_sample, control = list(tokenize = bi_token))
tri_matrix <- TermDocumentMatrix(corpus_sample, control = list(tokenize = tri_token))
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 20)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 20)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 20)
uni_corpus_Fqc <- rowSums(as.matrix(uni_matrix[uni_corpus, ]))
uni_corpus_Fqc <- data.frame(word = names(uni_corpus_Fqc), frequency = uni_corpus_Fqc)
bi_corpus_Fqc <- rowSums(as.matrix(bi_matrix[bi_corpus, ]))
bi_corpus_Fqc <- data.frame(word = names(bi_corpus_Fqc), frequency = bi_corpus_Fqc)
tri_corpus_Fqc <- rowSums(as.matrix(tri_matrix[tri_corpus, ]))
tri_corpus_Fqc <- data.frame(word = names(tri_corpus_Fqc), frequency = tri_corpus_Fqc)
uni_corpus_Fqc_Seq <- arrange(uni_corpus_Fqc, desc(frequency))
uni_corpus_Fqc_Seq <- head(uni_corpus_Fqc_Seq, n = 10)
bi_corpus_Fqc_Seq <- arrange(bi_corpus_Fqc, desc(frequency))
bi_corpus_Fqc_Seq <- head(bi_corpus_Fqc_Seq, n = 10)
tri_corpus_Fqc_Seq <- arrange(tri_corpus_Fqc, desc(frequency))
tri_corpus_Fqc_Seq <- head(tri_corpus_Fqc_Seq, n = 10)
g1 <- ggplot(data = head(uni_corpus_Fqc_Seq,10), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "grey") +
ggtitle(paste("Unigrams")) +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g2 <- ggplot(data = head(bi_corpus_Fqc_Seq,10), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "pink") +
ggtitle(paste("Bigrams")) +
xlab("Bigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g3 <- ggplot(data = head(tri_corpus_Fqc_Seq,10), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "purple") +
ggtitle(paste("Trigrams")) +
xlab("Trigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
words_blogs <- stri_count_words(blogs)
summary(words_blogs)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   0.00    9.00   28.00   41.71   60.00 6725.00
words_news <- stri_count_words(news)
summary(words_news)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 19.0 32.0 34.6 46.0 1123.0
words_twitter <- stri_count_words(twitter)
summary(words_twitter)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 7.00 12.00 12.75 18.00 47.00
gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
The blogs file is about 248.5 Mb, the news file 19.2 Mb, and the Twitter file 301.4 Mb, but the number of entries differs widely: roughly 900,000 blog entries, about 77,000 news entries, and more than 2,000,000 tweets, reflecting Twitter's 140-character limit. Blogs and news have similar words-per-entry distributions, while Twitter has far more but much shorter entries because of that character limit. Words such as "the" and "and" are the most frequent unigrams, "of the" and "and the" are the most frequent bigrams, and "thanks for the" and "one of the" are the most frequent trigrams.
Building the corpus and the N-gram frequency tables plays a significant role in evaluating word frequencies and in predicting the following word from uni-, bi-, and tri-grams. The Shiny app is intended to run this analysis interactively without the user having to reduce the sample size.
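As an illustration of how these N-gram tables could feed the prediction algorithm, the following is a minimal sketch of a back-off lookup built on the uni_corpus_Fqc, bi_corpus_Fqc, and tri_corpus_Fqc data frames created above (which only contain terms with frequency >= 20). The function name predict_next_word and the back-off order are assumptions made for illustration, not the implementation that will be used in the final Shiny app.
# Minimal sketch of a back-off next-word prediction (assumed design, not the final app):
# try trigrams that start with the last two words typed, fall back to bigrams that
# start with the last word, and return the most frequent completions.
predict_next_word <- function(phrase, n = 3) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  words <- words[words != ""]
  # Trigram lookup: match "w1 w2 ?" using the last two words of the phrase.
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- tri_corpus_Fqc[startsWith(as.character(tri_corpus_Fqc$word),
                                      paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      hits <- hits[order(-hits$frequency), ]
      return(sapply(strsplit(as.character(head(hits$word, n)), " "), tail, 1))
    }
  }
  # Back off to the bigram table: match "w ?" using the last word only.
  if (length(words) >= 1) {
    hits <- bi_corpus_Fqc[startsWith(as.character(bi_corpus_Fqc$word),
                                     paste0(tail(words, 1), " ")), ]
    if (nrow(hits) > 0) {
      hits <- hits[order(-hits$frequency), ]
      return(sapply(strsplit(as.character(head(hits$word, n)), " "), tail, 1))
    }
  }
  # Final fallback: the most frequent unigrams overall.
  as.character(head(arrange(uni_corpus_Fqc, desc(frequency))$word, n))
}
predict_next_word("one of")   # expected to suggest "the" given the trigram counts above
A fuller implementation would work from larger, unfiltered N-gram tables and would typically add a smoothing or weighting scheme (for example Katz back-off) rather than relying on raw counts alone.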