The goal of this project is just to display that you've gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs http://rpubs.com/ that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
The motivation for this project is to: 1. Demonstrate that you've downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
library(wordcloud)
library(dplyr)
library(ggplot2)
library(tm)
library(stopwords)
library(quanteda)
# Directory that holds the en_US corpus files. It is also made the working
# directory because the summary statistics look the files up by bare name.
data_dir <- "D:/Coursera/DataScience Spe/Capston Project/en_US"
setwd(data_dir)

# Read every line of a UTF-8 text file into a character vector.
#
# path: path of the text file to read.
# Returns a character vector with one element per line. Embedded NUL
# characters are skipped, and the connection is closed even if the read fails.
read_corpus_lines <- function(path) {
  con <- file(path)
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

##The data we will be using in this case, is the En_US text data
d_blogs <- read_corpus_lines(file.path(data_dir, "en_US.blogs.txt"))
d_news <- read_corpus_lines(file.path(data_dir, "en_US.news.txt"))
d_tw <- read_corpus_lines(file.path(data_dir, "en_US.twitter.txt"))
Next, we briefly present basic summary statistics for the data we are analyzing: file size, line count, the index of the longest line, and word count.
# Summarise one text data set.
#
# lines: character vector holding the lines of the data set.
# t_fl:  path of the file the lines were read from; used to look up its size.
#
# Returns a character vector of five elements: the file name as given, the
# file size in MB formatted with two decimals, the number of lines, the index
# of the line with the most characters (NA when there is none) and the total
# count of whitespace-separated words.
d_stat <- function(lines, t_fl) {
  size_mb <- file.info(t_fl)$size / (1024 * 1024)
  # nchar() is vectorised, so no per-line lapply() is needed.
  longest_line <- which.max(nchar(lines))
  # which.max() returns integer(0) for empty input; keep the result at a
  # fixed length of five by substituting NA.
  if (length(longest_line) == 0) {
    longest_line <- NA_integer_
  }
  # lengths() always yields an integer vector, even for empty input, whereas
  # sapply() yields an empty list there and makes sum() fail.
  word_count <- sum(lengths(strsplit(lines, "\\s+")))
  c(
    t_fl,
    format(round(size_mb, 2), nsmall = 2),
    length(lines),
    longest_line,
    word_count
  )
}
# One summary vector per source file, as produced by d_stat().
d_blogs_stat <- d_stat(lines = d_blogs, t_fl = "en_US.blogs.txt")
d_news_stat <- d_stat(lines = d_news, t_fl = "en_US.news.txt")
d_tw_stat <- d_stat(lines = d_tw, t_fl = "en_US.twitter.txt")

# Lay the three vectors out row by row, one row per source file, then label
# the columns and print the table.
s_dataf <- c(d_blogs_stat, d_news_stat, d_tw_stat)
stat_cells <- matrix(unlist(s_dataf), nrow = 3, byrow = TRUE)
s_table <- data.frame(stat_cells)
colnames(s_table) <- c("Name", "Size(MB)", "Line Count", "Line", "Words Count")
s_table
## Name Size(MB) Line Count Line Words Count
## 1 en_US.blogs.txt 200.42 899288 483415 37334131
## 2 en_US.news.txt 196.28 77259 14556 2643969
## 3 en_US.twitter.txt 159.36 2360148 26 30373583
For testing purposes, we subset each data set to a small random sample: exactly 8% of its lines.
# Fix the RNG seed so the samples, and every count derived from them, are the
# same on each run of the report.
set.seed(1234)
# Fraction of each data set kept for the exploratory analysis.
test_fraction <- 0.080
# floor() makes the truncation to a whole number of lines explicit.
d_blogs_test <- sample(
  d_blogs, floor(length(d_blogs) * test_fraction),
  replace = FALSE
)
d_news_test <- sample(
  d_news, floor(length(d_news) * test_fraction),
  replace = FALSE
)
d_tw_test <- sample(
  d_tw, floor(length(d_tw) * test_fraction),
  replace = FALSE
)
For the exploratory analysis, it is important to create and clean a corpus and to establish our goal. In this case, we write and run three functions to identify the 10 most frequently used words in each data set.
# Collapse a character vector into one space-separated string and wrap that
# string in a tm corpus.
#
# t_fl: character vector of text lines.
# Returns the corpus invisibly.
corpus_data <- function(t_fl) {
  merged_text <- paste(t_fl, collapse = " ")
  invisible(Corpus(VectorSource(merged_text)))
}
# Normalise the text of a tm corpus.
#
# cp_d: a tm corpus.
# Returns the corpus with numbers removed, text lower-cased, punctuation
# removed and runs of whitespace collapsed.
cleaning <- function(cp_d) {
  cp_d <- tm_map(cp_d, removeNumbers)
  # tolower() is a base function, not a tm transformation; wrapping it in
  # content_transformer() keeps each document a text document instead of
  # replacing it with a bare character vector.
  cp_d <- tm_map(cp_d, content_transformer(tolower))
  cp_d <- tm_map(cp_d, removePunctuation)
  cp_d <- tm_map(cp_d, stripWhitespace)
  # Re-wrap every document as a PlainTextDocument before the term matrix is
  # built from the corpus.
  cp_d <- tm_map(cp_d, PlainTextDocument)
  cp_d
}
# Build a word-frequency table from a tm corpus.
#
# cp_d: a tm corpus.
# Returns a data frame sorted by decreasing frequency with the columns
# FQ (total count of the term) and Word (the term); row names are the terms.
hg_fq_w <- function(cp_d) {
  term_counts <- colSums(as.matrix(DocumentTermMatrix(cp_d)))
  ranked <- sort(term_counts, decreasing = TRUE)
  freq_table <- data.frame(FQ = unname(ranked), row.names = names(ranked))
  freq_table$Word <- rownames(freq_table)
  freq_table
}
# For each source: build the corpus, clean it, then rank its words.
###en_US.blogs.txt
corp_blogs <- cleaning(corpus_data(d_blogs_test))
blogs_hg_fq_w <- hg_fq_w(corp_blogs)
###en_US.news.txt
corp_news <- cleaning(corpus_data(d_news_test))
news_hg_fq_w <- hg_fq_w(corp_news)
###en_US.twitter.txt
corp_tw <- cleaning(corpus_data(d_tw_test))
tw_hg_fq_w <- hg_fq_w(corp_tw)
###en_US.blogs.txt
# head() returns at most 10 rows and never pads with NA rows when the table
# holds fewer terms.
head(blogs_hg_fq_w, n = 10)
## FQ Word
## the 147330 the
## and 86527 and
## that 36589 that
## for 28845 for
## you 23527 you
## with 22790 with
## was 21950 was
## this 20554 this
## have 17478 have
## but 16073 but
###en_US.news.txt
# head() returns at most 10 rows and never pads with NA rows when the table
# holds fewer terms.
head(news_hg_fq_w, n = 10)
## FQ Word
## the 12085 the
## and 5389 and
## for 2123 for
## that 2103 that
## with 1581 with
## said 1546 said
## was 1339 was
## his 1004 his
## from 946 from
## but 943 but
###en_US.twitter.txt
# head() returns at most 10 rows and never pads with NA rows when the table
# holds fewer terms.
head(tw_hg_fq_w, n = 10)
## FQ Word
## the 75151 the
## you 43620 you
## and 34806 and
## for 30484 for
## that 18793 that
## with 14082 with
## your 13669 your
## have 13451 have
## this 12888 this
## are 12514 are
As part of our analysis, it is important to show our findings graphically so that the reader can understand the content more easily.
# Draw a horizontal bar chart of the ten most frequent words.
#
# freq_table: data frame with the columns FQ (count) and Word, sorted by
#             decreasing FQ.
# title:      chart title.
# Returns the ggplot object.
plot_top_words <- function(freq_table, title) {
  top_words <- head(freq_table, n = 10)
  # reorder() sorts the bars by frequency; a plain character Word column
  # would be laid out alphabetically.
  ggplot(
    data = top_words,
    aes(x = FQ, y = reorder(Word, FQ), colour = Word)
  ) +
    geom_col(fill = "white") +
    labs(title = title, y = "Word")
}

# Draw a word cloud of the 100 most frequent words.
#
# freq_table:   data frame with the columns FQ and Word, sorted by
#               decreasing FQ.
# palette_name: name of the 8-colour ColorBrewer palette to use.
plot_word_cloud <- function(freq_table, palette_name) {
  # head() avoids NA entries when fewer than 100 words are available.
  top_words <- head(freq_table, n = 100)
  wordcloud(
    top_words$Word, top_words$FQ,
    colors = brewer.pal(8, palette_name)
  )
}

###en_US.blogs.txt
chart_blogs <- plot_top_words(
  blogs_hg_fq_w, "Top 10 US Blogs most Frequent Words"
)
chart_blogs
plot_word_cloud(blogs_hg_fq_w, "Set1")
###en_US.news.txt
chart_news <- plot_top_words(
  news_hg_fq_w, "Top 10 US News most Frequent Words"
)
chart_news
plot_word_cloud(news_hg_fq_w, "Set2")
###en_US.twitter.txt
chart_tw <- plot_top_words(
  tw_hg_fq_w, "Top 10 US Twitter most Frequent Words"
)
chart_tw
plot_word_cloud(tw_hg_fq_w, "Set3")
Next, we show how to build a basic n-gram model using the exploratory analysis we performed. Building this basic n-gram model will help us predict the next word based on the previous 1, 2, or 3 words.
###Generating tokens
# Split text into word tokens, dropping numbers, punctuation, separators and
# symbols, then lower-case the tokens and remove stop words.
make_word_tokens <- function(text_lines) {
  word_tokens <- tokens(
    text_lines,
    what = "word",
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove_separators = TRUE,
    remove_symbols = TRUE
  )
  tokens_select(
    tokens_tolower(word_tokens),
    stopwords(),
    selection = "remove"
  )
}

d_blogs_token <- make_word_tokens(d_blogs_test)
d_news_token <- make_word_tokens(d_news_test)
d_tw_token <- make_word_tokens(d_tw_test)
# The token objects are already lower-cased and free of stop words and
# punctuation, so dfm() only has to count features. The `remove` and
# `remove_punct` arguments of dfm() are deprecated in quanteda 3 and are
# therefore not passed.
###Building Uni-gram model
###en_US.blogs.txt
d_blog_uni <- tokens_ngrams(d_blogs_token, n = 1)
df_blog_uni <- dfm(d_blog_uni, tolower = TRUE)
###en_US.news.txt
d_news_uni <- tokens_ngrams(d_news_token, n = 1)
df_news_uni <- dfm(d_news_uni, tolower = TRUE)
###en_US.twitter.txt
d_tw_uni <- tokens_ngrams(d_tw_token, n = 1)
df_tw_uni <- dfm(d_tw_uni, tolower = TRUE)
###Building Bi-gram model
###en_US.blogs.txt
d_blog_bi <- tokens_ngrams(d_blogs_token, n = 2)
df_blog_bi <- dfm(d_blog_bi, tolower = TRUE)
###en_US.news.txt
d_news_bi <- tokens_ngrams(d_news_token, n = 2)
df_news_bi <- dfm(d_news_bi, tolower = TRUE)
###en_US.twitter.txt
d_tw_bi <- tokens_ngrams(d_tw_token, n = 2)
df_tw_bi <- dfm(d_tw_bi, tolower = TRUE)
###Building Tri-gram model
###en_US.blogs.txt
d_blog_tri <- tokens_ngrams(d_blogs_token, n = 3)
df_blog_tri <- dfm(d_blog_tri, tolower = TRUE)
###en_US.news.txt
d_news_tri <- tokens_ngrams(d_news_token, n = 3)
df_news_tri <- dfm(d_news_tri, tolower = TRUE)
###en_US.twitter.txt
d_tw_tri <- tokens_ngrams(d_tw_token, n = 3)
df_tw_tri <- dfm(d_tw_tri, tolower = TRUE)
###Top 15 Unigrams
# Blogs: the 15 most frequent single words.
topfeatures(df_blog_uni, n = 15)
## one like just can time get people know now also new
## 10071 8007 7911 7764 7147 5626 4809 4803 4685 4382 4372
## even make day good
## 4185 4069 4026 4001
# News: the 15 most frequent single words.
topfeatures(df_news_uni, n = 15)
## said one new can also year first last state two years
## 1549 476 423 383 372 370 347 333 325 319 316
## people time just like
## 313 302 292 283
# Twitter: the 15 most frequent single words.
topfeatures(df_tw_uni, n = 15)
## just like get love good can rt thanks day now one
## 12134 9759 8915 8570 8050 7165 7159 7126 7091 6625 6578
## know u time great
## 6340 6139 6006 5931
###Top 15 Bigrams
# Blogs: the 15 most frequent two-word sequences.
topfeatures(df_blog_bi, n = 15)
## years_ago new_york right_now even_though feel_like last_year
## 446 398 391 367 360 335
## can_see first_time make_sure last_week can_get last_night
## 328 326 308 280 263 258
## every_day one_day high_school
## 251 251 244
# News: the 15 most frequent two-word sequences.
topfeatures(df_news_bi, n = 15)
## last_year new_york high_school st_louis new_jersey
## 94 83 59 53 48
## years_ago united_states san_francisco los_angeles last_week
## 47 39 35 32 32
## even_though two_years last_month first_time right_now
## 31 30 28 26 25
# Twitter: the 15 most frequent two-word sequences.
topfeatures(df_tw_bi, n = 15)
## right_now last_night looking_forward good_morning happy_birthday
## 1351 909 776 667 647
## feel_like just_got good_luck follow_back thanks_follow
## 568 556 535 495 492
## looks_like let_know can_get next_week please_follow
## 488 457 444 393 337
###Top 15 Trigrams
# Blogs: the 15 most frequent three-word sequences.
topfeatures(df_blog_tri, n = 15)
## new_york_city new_york_times amazon_services_llc
## 67 49 40
## services_llc_amazon llc_amazon_eu couple_weeks_ago
## 40 40 39
## two_years_ago world_war_ii many_years_ago
## 36 26 25
## new_york_n.y let_us_know happy_new_year
## 25 25 24
## preheat_oven_degrees several_years_ago two_weeks_ago
## 23 23 23
# News: the 15 most frequent three-word sequences.
topfeatures(df_news_tri, n = 15)
## new_york_city pates_fountain_parks classic_pates_fountain
## 12 11 8
## st_louis_county gov_chris_christie president_barack_obama
## 7 7 7
## 12u_14u_16u new_york_times assistant_u.s_attorney
## 7 6 6
## run_run_run past_two_years u.s_supreme_court
## 6 5 5
## 10u_12u_14u 14u_16u_18u per_serving_calories
## 5 5 5
# Twitter: the 15 most frequent three-word sequences.
topfeatures(df_tw_tri, n = 15)
## let_us_know happy_new_year happy_mothers_day
## 161 151 125
## happy_mother's_day looking_forward_seeing cinco_de_mayo
## 113 83 75
## just_got_back good_morning_everyone cant_wait_see
## 60 45 44
## thanks_following_us just_got_home please_follow_back
## 44 43 42
## come_see_us follow_back_please keep_good_work
## 41 40 39
In researching how to build this model, I found several packages that could help me determine the most efficient way to create the n-gram prediction models. Among them, I encountered the quanteda package, which really helped me with the analysis of the textual data. You can find its general description and functions at the link below: https://tutorials.quanteda.io/
The next step is further exploration of n-gram models in order to obtain optimized code that can run in the Shiny app, taking its limitations into account.