This milestone report is submitted for the Week 2 assignment of the Coursera Data Science Capstone.
I loaded the following libraries for the exploratory data analysis:
library(ggplot2)      # plotting
library(R.utils)      # file utilities
library(quanteda)     # tokenization and n-grams
library(RColorBrewer) # word cloud colours
library(plyr)         # data ordering
library(dplyr)        # data manipulation
library(magrittr)     # pipe operator
library(stringr)      # string helpers
library(stringi)      # fast string operations
library(tm)           # corpus creation and cleaning
library(rJava)        # Java bridge required by RWeka
library(RWekajars)    # Weka jar files
library(RWeka)        # n-gram tokenizers
library(SnowballC)    # stemming
library(wordcloud)    # word clouds
The dataset used for the analysis is available at this URL: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
For demonstration purposes, we use the English-language text data.
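If the archive has not been downloaded yet, one way to fetch and unpack it is sketched below (an optional step, not part of the run above; it assumes the zip extracts into the final/ directory used next):
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("final")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")   # unpacks the final/<locale> folders
}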
setwd("./final/en_US")
con <- file("en_US.news.txt", open="r")
En_US_NEWS_text <- readLines(con); close(con)
con <- file("en_US.blogs.txt", open="r")
En_US_blogs_text <- readLines(con); close(con)
con <- file("en_US.twitter.txt", open="r")
En_Twit_text <- readLines(con); close(con)
list.files("./final/en_US")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
The following helper computes summary statistics (size, line count, longest line, word count) for each text file:
setwd("./final/en_US")
file_stat <- function(text_file, lines) {
  f_size <- file.info(text_file)$size / 1024^2                 # file size in MB
  nchars <- sapply(lines, nchar)                               # characters per line
  longest_line <- which.max(nchars)                            # line number of the longest line
  word_count <- sum(sapply(strsplit(lines, "\\s+"), length))   # whitespace-delimited words
  return(c(text_file, format(round(as.double(f_size), 2), nsmall=2),
           length(lines), longest_line, word_count))
}
En_US_news_stat<- file_stat("en_US.news.txt", En_US_NEWS_text)
En_US_blogs_stat <- file_stat("en_US.blogs.txt", En_US_blogs_text)
En_Twit_text_stat<- file_stat("en_US.twitter.txt", En_Twit_text)
test_summary <- c(En_US_news_stat, En_US_blogs_stat, En_Twit_text_stat)
df <- data.frame(matrix(unlist(test_summary), nrow=3, byrow=T))
colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Longest_Line_Index", "Words_Count")
print(df)
##           Text_file Size(MB) Line_Count Longest_Line_Index Words_Count
## 1    en_US.news.txt   196.28      77259              14556     2643972
## 2   en_US.blogs.txt   200.42     899288             483415    37334441
## 3 en_US.twitter.txt   159.36    2360148            1484357    30373792
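As a quick cross-check of the word counts, stringi (already loaded) can count words per line without the strsplit() loop; its ICU-based word breaks differ slightly from a whitespace split, so the totals should be close but not identical:
sum(stri_count_words(En_US_blogs_text))   # expected to be close to the Words_Count above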
Next, we create a corpus from the sampled text, clean it, and count the most frequent words:
make_Corpus <- function(test_file) {
  gen_corp <- paste(test_file, collapse=" ")   # collapse lines into one document
  gen_corp <- VectorSource(gen_corp)
  gen_corp <- Corpus(gen_corp)
  return(gen_corp)
}
clean_corp <- function(corp_data) {
corp_data <- tm_map(corp_data, removeNumbers)
corp_data <- tm_map(corp_data, content_transformer(tolower))
corp_data <- tm_map(corp_data, removeWords, stopwords("english"))
corp_data <- tm_map(corp_data, removePunctuation)
corp_data <- tm_map(corp_data, stripWhitespace)
return (corp_data)
}
high_freq_words <- function (corp_data) {
term_sparse <- DocumentTermMatrix(corp_data)
term_matrix <- as.matrix(term_sparse) ## convert our term-document-matrix into a normal matrix
freq_words <- colSums(term_matrix)
freq_words <- as.data.frame(sort(freq_words, decreasing=TRUE))
freq_words$word <- rownames(freq_words)
colnames(freq_words) <- c("Frequency","word")
return (freq_words)
}
Next, we plot bar charts of the most frequent words, starting with a 10% sample of the news data:
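Because sample() draws a random subset, fixing a seed first keeps the charts reproducible across runs (the seed value below is arbitrary, not from the original run):
set.seed(1234)   # arbitrary seed chosen only for reproducibility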
En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.1*length(En_US_NEWS_text)), replace = F)
US_news_corpus <- make_Corpus(En_US_NEWS_text1)
US_news_corpus <- clean_corp(US_news_corpus)
US_news_most_used_word <- high_freq_words(US_news_corpus)
US_news_most_used_word1<- US_news_most_used_word[1:15,]
p<-ggplot(data=US_news_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : US News") +theme(legend.title=element_blank()) + coord_flip()
## en_US.blogs.txt High frequency words
En_US_blogs_text1<-sample(En_US_blogs_text, round(0.1*length(En_US_blogs_text)), replace = F)
US_blogs_corpus <- make_Corpus(En_US_blogs_text1)
US_blogs_corpus <- clean_corp(US_blogs_corpus)
US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
US_blogs_most_used_word1<- US_blogs_most_used_word[1:15,]
p<-ggplot(data=US_blogs_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : US blogs") +theme(legend.title=element_blank()) + coord_flip()
## en_US.twitter.txt High frequency words
En_Twit_text1<-sample(En_Twit_text, round(0.1*length(En_Twit_text)), replace = F)
twitter_corpus <- make_Corpus(En_Twit_text1)
twitter_corpus <- clean_corp(twitter_corpus)
twitter_most_used_word <- high_freq_words(twitter_corpus)
twitter_most_used_word1<- twitter_most_used_word[1:15,]
p<-ggplot(data=twitter_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : Twitter") +theme(legend.title=element_blank()) + coord_flip()
## US News Word Cloud
wordcloud(US_news_most_used_word$word[1:100], US_news_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## US Twitter Word Cloud
wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(twitter_most_used_word$word[1:100],
## twitter_most_used_word$Frequency[1:100], : thanks could not be fit on page. It
## will not be plotted.
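The warning only means the largest word did not fit at the default size; narrowing the scale range usually resolves it (the values below are a guess, not from the original run):
wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
          scale = c(3, 0.5), colors = brewer.pal(8, "Dark2"))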
## US Blogs Word Cloud
wordcloud(US_blogs_most_used_word$word[1:100], US_blogs_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
For the text analysis we build bag-of-words matrices of unigrams, bigrams, and trigrams. These n-gram models improve the predictive power of the analysis and will form the basis of the word-prediction model.
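To illustrate how such counts can drive prediction, here is a minimal sketch (predict_next is a hypothetical helper, not part of the pipeline below) that looks up the most frequent continuations of a word in a bigram document-feature matrix such as the US_News_bigram.dfm built next:
predict_next <- function(bigram_dfm, word, top_n = 3) {
  counts  <- colSums(bigram_dfm)                       # frequency of each bigram feature
  parts   <- strsplit(names(counts), "_", fixed = TRUE)
  firsts  <- vapply(parts, `[`, character(1), 1)       # first token of each bigram
  seconds <- vapply(parts, `[`, character(1), 2)       # candidate next word
  keep    <- firsts == word
  seconds[keep][order(counts[keep], decreasing = TRUE)][seq_len(min(top_n, sum(keep)))]
}
# e.g. predict_next(US_News_bigram.dfm, "new") would be expected to rank "york" highly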
## en_US.news.txt N-gram frequencies
En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.01*length(En_US_NEWS_text)), replace = F)
US_News_tokens<- tokens(En_US_NEWS_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_News_tokens <- tokens_tolower(US_News_tokens)
US_News_tokens <- tokens_select(US_News_tokens, stopwords(),selection ="remove")
US_News_unigram <- tokens_ngrams(US_News_tokens, n=1) ## unigram
US_News_unigram.dfm <- dfm(US_News_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_News_bigram <- tokens_ngrams(US_News_tokens, n=2) ## bigram
US_News_bigram.dfm <- dfm(US_News_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_News_trigram <- tokens_ngrams(US_News_tokens, n=3) ## trigram
US_News_trigram.dfm <- dfm(US_News_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(US_News_unigram.dfm, 20) # 20 top US News Unigram words
## said â will one new s can also people year time
## 215 116 88 67 55 52 50 48 46 45 44
## two just like years last state get back much
## 43 42 38 38 38 37 34 32 30
topfeatures(US_News_bigram.dfm, 20) # 20 top US News Bigram words
## itâ_s new_york last_year last_week years_ago
## 11 10 9 9 8
## didnâ_t united_states â_said also_said three_years
## 8 6 6 6 6
## police_said can_make san_francisco san_diego next_year
## 6 5 5 5 5
## high_school supreme_court two_years years_later white_house
## 5 4 4 4 4
topfeatures(US_News_trigram.dfm, 20) # 20 top US News Trigram words
## third_fourth_lines capital_improvement_program
## 3 2
## want_take_votes take_votes_controversial
## 2 2
## votes_controversial_issues three_years_ago
## 2 2
## port_authority_police points_per_game
## 2 2
## â_œiâ_ve four_children_ages
## 2 2
## circuit_court_appeals make_feel_like
## 2 2
## feel_like_youâ like_youâ_re
## 2 2
## youâ_re_floating re_floating_top
## 2 2
## metro_east_lutheran pulse_locker_room
## 2 2
## said_â_œwe school_activities_program
## 2 2
## en_US.blogs.txt N-gram frequencies
En_US_blogs_text1<-sample(En_US_blogs_text, round(0.02*length(En_US_blogs_text)), replace = F)
US_blogs_tokens<- tokens(En_US_blogs_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords(),selection ="remove")
US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n=1) ## unigram
US_blogs_unigram.dfm <- dfm(US_blogs_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n=2) ## bigram
US_blogs_bigram.dfm <- dfm(US_blogs_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n=3) ## trigram
US_blogs_trigram.dfm <- dfm(US_blogs_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(US_blogs_unigram.dfm, 20) # 20 top US blogs Unigram words
## â s one will like just can t time get people
## 4632 3440 2406 2205 1994 1989 1930 1897 1750 1395 1242
## now know also iâ back even first day new
## 1192 1183 1144 1091 1052 1025 1013 1005 1004
topfeatures(US_blogs_bigram.dfm, 20) # 20 top US blogs Bigram words
## itâ_s iâ_m donâ_t didnâ_t iâ_ve thatâ_s canâ_t doesnâ_t
## 789 568 547 278 276 252 181 155
## youâ_re thereâ_s â_œthe iâ_ll wasnâ_t iâ_d â_œi t_know
## 143 134 134 129 121 116 112 107
## â_â right_now couldnâ_t years_ago
## 103 101 98 98
topfeatures(US_blogs_trigram.dfm, 20) # 20 top US blogs Trigram words
## donâ_t_know iâ_m_sure donâ_t_want donâ_t_think itâ_s_just
## 77 55 39 38 29
## â_â_â didnâ_t_know itâ_s_like iâ_m_going didnâ_t_want
## 27 26 24 23 22
## â_œiâ_m â_œitâ_s new_york_city donâ_t_like know_itâ_s
## 22 21 19 19 18
## itâ_s_time iâ_ve_never now_iâ_m think_itâ_s youâ_re_going
## 17 17 17 17 17
## en_US.twitter.txt N-gram frequencies
En_Twit_text1<-sample(En_Twit_text, round(0.02*length(En_Twit_text)), replace = F)
twitter_tokens<- tokens(En_Twit_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
twitter_tokens <- tokens_tolower(twitter_tokens)
twitter_tokens <- tokens_select(twitter_tokens, stopwords(),selection ="remove")
twitter_unigram <- tokens_ngrams(twitter_tokens, n=1) ## unigram
twitter_unigram.dfm <- dfm(twitter_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
twitter_bigram <- tokens_ngrams(twitter_tokens, n=2) ## bigram
twitter_bigram.dfm <- dfm(twitter_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
twitter_trigram <- tokens_ngrams(twitter_tokens, n=3) ## trigram
twitter_trigram.dfm <- dfm(twitter_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(twitter_unigram.dfm, 20) # 20 top Unigram words
## just like get love good will can day rt thanks now
## 3119 2446 2252 2143 2055 1947 1820 1779 1751 1733 1648
## one know time u great today go new lol
## 1627 1582 1579 1567 1529 1478 1454 1433 1353
topfeatures(twitter_bigram.dfm, 20) # 20 top Bigram words
## â_œ right_now last_night looking_forward good_morning
## 388 330 237 184 165
## feel_like happy_birthday just_got good_luck follow_back
## 161 161 158 128 126
## looks_like thanks_follow let_know can_get next_week
## 125 119 107 106 94
## make_sure great_day please_follow social_media just_saw
## 92 87 82 79 78
topfeatures(twitter_trigram.dfm, 20) # 20 top Trigram words
## happy_mother's_day let_us_know happy_mothers_day
## 41 36 29
## cinco_de_mayo happy_new_year rt_â_œ
## 26 25 24
## â_â_â ðÿ_ðÿ_ðÿ keep_good_work
## 22 19 17
## new_york_city just_got_back looking_forward_seeing
## 15 14 13
## never_gets_old come_see_us will_follow_back
## 13 12 12
## just_got_home thanks_following_us yes_yes_yes
## 12 11 11
## just_got_done thanks_everyone_came
## 10 10
Through this analysis I learned to use several libraries that were new to me, and I revisited skills from earlier courses in the specialization.
My next step is to implement a Shiny app for word prediction.
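One possible skeleton for that app is sketched below (hypothetical; it assumes a prediction helper like the predict_next sketch above and a precomputed bigram object):
library(shiny)
ui <- fluidPage(
  titlePanel("Next-word prediction (prototype)"),
  textInput("phrase", "Type a phrase:"),
  textOutput("suggestion")
)
server <- function(input, output) {
  output$suggestion <- renderText({
    words <- strsplit(tolower(input$phrase), "\\s+")[[1]]
    if (length(words) == 0) return("")
    paste(predict_next(US_blogs_bigram.dfm, tail(words, 1)), collapse = ", ")
  })
}
# shinyApp(ui, server)   # left commented so the report itself still knits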
Please share any suggestions for improving this exploratory analysis.