The objectives for this project are to:
1. Demonstrate that I have downloaded the data and successfully loaded it (see the download sketch after this list)
2. Create a basic report of summary statistics about the data sets
3. Report any interesting findings so far
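A minimal sketch of the download step referenced in item 1 is shown below; the URL is the commonly used Coursera-SwiftKey archive link and the destination paths are assumptions on my part, not taken from this report.
# Hedged sketch: fetch and unpack the corpus if it is not already present
# (URL and paths are assumed, not stated in this report)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("Data/final")) {
  if (!dir.exists("Data")) dir.create("Data")
  download.file(url, destfile = "Data/Coursera-SwiftKey.zip", mode = "wb")
  unzip("Data/Coursera-SwiftKey.zip", exdir = "Data")
}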
# Read the three English corpora (UTF-8, skipping embedded nulls)
twitter <- readLines("Data/final/en_US/en_US.twitter.txt",
encoding = "UTF-8", skipNul = TRUE)
news <- readLines("Data/final/en_US/en_US.news.txt",
encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("Data/final/en_US/en_US.blogs.txt",
encoding = "UTF-8", skipNul = TRUE)
Here I take the loaded data and count the number of lines, words, and characters in each file.
library(stringi)
tw_sum_wrd <- summary(stri_count_words(twitter))#number of words
tw_sum_chr <- summary(nchar(twitter))#number of characters
tw_lines <- length(twitter)#number of lines
nw_sum_wrd <- summary(stri_count_words(news))#number of words
nw_sum_chr <- summary(nchar(news))#number of characters
nw_lines <- length(news)#number of lines
bg_sum_wrd <- summary(stri_count_words(blogs))#number of words
bg_sum_chr <- summary(nchar(blogs))#number of characters
bg_lines <- length(blogs)#number of lines
library(tidyverse)
tibble(
"Mean Words" = c(tw_sum_wrd[4],nw_sum_wrd[4],bg_sum_wrd[4]),
"Max Words" = c(tw_sum_wrd[6],nw_sum_wrd[6],bg_sum_wrd[6]),
"Mean Characters"=c(tw_sum_chr[4],nw_sum_chr[4],bg_sum_chr[4]),
"No Lines"=c(tw_lines,nw_lines,bg_lines),
"Source"= c('twitter','news','blog')
)
## # A tibble: 3 x 5
## `Mean Words` `Max Words` `Mean Characters` `No Lines` Source
## <dbl> <dbl> <dbl> <int> <chr>
## 1 12.8 47 68.7 2360148 twitter
## 2 34.4 1796 201. 1010242 news
## 3 41.8 6726 230. 899288 blog
The summary table shows that tweets are by far the shortest texts (about 13 words on average, 47 at most), while blog entries are the longest (about 42 words on average), even though the Twitter file contains the most lines. Here I subsample 0.1% of the data and display the most frequent words in each subsample.
set.seed(123)
twitter_smpl<- sample(twitter, tw_lines*.001)
news_smpl <- sample(news, nw_lines*.001)
blog_smpl <- sample(blogs, bg_lines*.001)
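As a quick sanity check (my own addition, not in the original report), the subsample sizes can be confirmed before preprocessing; they should match the counts noted in the comments further below.
# Sizes of the 0.1% subsamples (roughly 2360, 1010 and 899 lines)
c(twitter = length(twitter_smpl), news = length(news_smpl), blogs = length(blog_smpl))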
The preprocessing idea and workflow were taken from this paper. The key steps are tokenization, lowercasing, stemming, and stop-word removal:
library(quanteda)
# length(blog_smpl)#899 lines in sample
bg_smpl_proc <- tokens(blog_smpl)
bg_smpl_proc <- tokens_tolower(bg_smpl_proc)
bg_smpl_proc <- tokens_wordstem(bg_smpl_proc)
sw <- stopwords("english")
# head(sw)
bg_smpl_proc_clean <- tokens_remove(bg_smpl_proc, sw)
dtm <- dfm(bg_smpl_proc_clean,remove_punct=TRUE)
doc_freq <- docfreq(dtm) # document frequency per term (column)
# length(news_smpl) #1010 - lines in sample
nw_smpl_proc <- tokens(news_smpl)
nw_smpl_proc <- tokens_tolower(nw_smpl_proc)
nw_smpl_proc <- tokens_wordstem(nw_smpl_proc)
# sw <- stopwords("english")
# head(sw)
nw_smpl_proc_clean <- tokens_remove(nw_smpl_proc, sw)
dtm_news <- dfm(nw_smpl_proc_clean,remove_punct=TRUE)
doc_freq_news <- docfreq(dtm_news) # document frequency per term (column)
# length(twitter_smpl)#2360 lines in twitter sample
tw_smpl_proc <- tokens(twitter_smpl)
tw_smpl_proc <- tokens_tolower(tw_smpl_proc)
tw_smpl_proc <- tokens_wordstem(tw_smpl_proc)
# sw <- stopwords("english")
# head(sw)
tw_smpl_proc_clean <- tokens_remove(tw_smpl_proc, sw)
dtm_tw <- dfm(tw_smpl_proc_clean,remove_punct=TRUE)
doc_freq_tw <- docfreq(dtm_tw) # document frequency per term (column)
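Since the same tokenize / lowercase / stem / stop-word-removal pipeline is repeated three times above, it could be collapsed into a single helper. Below is a minimal sketch; the function name preprocess_sample() is my own, and it moves remove_punct to the tokens() call, which newer quanteda versions recommend instead of passing it to dfm().
preprocess_sample <- function(txt, sw = stopwords("english")) {
  toks <- tokens(txt, remove_punct = TRUE)  # tokenize, dropping punctuation
  toks <- tokens_tolower(toks)              # lowercase
  toks <- tokens_wordstem(toks)             # stem
  toks <- tokens_remove(toks, sw)           # remove stop words
  docfreq(dfm(toks))                        # document frequency per term
}
# e.g. doc_freq_tw <- preprocess_sample(twitter_smpl)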
After pre-processing, we can display the 30 most frequent words in each subsample. The blog subsample is plotted below; the same code can be reused for the other two files (see the sketch after the plot).
library(plotly)
doc_freq_df <- data.frame("freq"=doc_freq)
doc_freq_df$word <- row.names(doc_freq_df)
doc_freq_df <- arrange(doc_freq_df,desc(freq))
doc_freq_df$word <- factor(doc_freq_df$word, levels = unique(doc_freq_df$word)[order(doc_freq_df$freq, decreasing = TRUE)])
plot_ly(head(doc_freq_df,30)) %>%
add_trace(x=~word,y=~freq, type="bar",
text = ~paste0(
"Word: ",word,
"<br>Count: ",freq
)) %>%
layout(title = 'Most frequent words in the 0.1% subsample of the blog file',
xaxis = list(title = "Words"),
yaxis = list(title = "Document frequency"))
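The same plotting code can be reused for the other two subsamples by swapping in doc_freq_news or doc_freq_tw; a minimal sketch for the news subsample (the data-frame name is my own):
doc_freq_news_df <- data.frame(freq = doc_freq_news, word = names(doc_freq_news))
doc_freq_news_df <- arrange(doc_freq_news_df, desc(freq))
doc_freq_news_df$word <- factor(doc_freq_news_df$word, levels = doc_freq_news_df$word)
plot_ly(head(doc_freq_news_df, 30)) %>%
  add_trace(x = ~word, y = ~freq, type = "bar") %>%
  layout(title = 'Most frequent words in the 0.1% subsample of the news file',
         xaxis = list(title = "Words"),
         yaxis = list(title = "Document frequency"))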
The next steps are to use these subsamples to build, test, and tune prediction algorithms (see the n-gram sketch below), which will later be used in a Shiny application.
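As an illustration of that next step (my own sketch, not part of the current analysis), bigram counts can be built from one of the subsamples with quanteda's tokens_ngrams(); for prediction, stop words are typically kept.
bg_toks <- tokens(blog_smpl, remove_punct = TRUE)  # retokenize, keeping stop words
bg_toks <- tokens_tolower(bg_toks)
bg_bigrams <- tokens_ngrams(bg_toks, n = 2)        # bigrams joined with "_"
topfeatures(dfm(bg_bigrams), 10)                   # ten most frequent bigrams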