Executive Summary

The objectives of this project are to:
1. Demonstrate that I’ve downloaded the data and successfully loaded it in
2. Create a basic report of summary statistics about the data sets
3. Report any interesting findings so far

Loading data

twitter <- readLines("Data/final/en_US/en_US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE)
news <- readLines("Data/final/en_US/en_US.news.txt",
                  encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("Data/final/en_US/en_US.blogs.txt",
                   encoding = "UTF-8", skipNul = TRUE)

Basic statistics

Here I take the loaded data and count the number of lines, words, and characters in each file.

library(stringi)
tw_sum_wrd <- summary(stri_count_words(twitter))#number of words
tw_sum_chr <- summary(nchar(twitter))#number of characters
tw_lines <- length(twitter)#number of lines

nw_sum_wrd <- summary(stri_count_words(news))#number of words
nw_sum_chr <- summary(nchar(news))#number of characters
nw_lines <- length(news)#number of lines

bg_sum_wrd <- summary(stri_count_words(blogs))#number of words
bg_sum_chr <- summary(nchar(blogs))#number of characters
bg_lines <- length(blogs)#number of lines
library(tidyverse)
tibble(
  # summary() positions: [4] = mean, [6] = max
  "Mean Words" = c(tw_sum_wrd[4], nw_sum_wrd[4], bg_sum_wrd[4]),
  "Max Words" = c(tw_sum_wrd[6], nw_sum_wrd[6], bg_sum_wrd[6]),
  "Mean Characters" = c(tw_sum_chr[4], nw_sum_chr[4], bg_sum_chr[4]),
  "No Lines" = c(tw_lines, nw_lines, bg_lines),
  "Source" = c('twitter', 'news', 'blog')
)
## # A tibble: 3 x 5
##   `Mean Words` `Max Words` `Mean Characters` `No Lines` Source 
##          <dbl>       <dbl>             <dbl>      <int> <chr>  
## 1         12.8          47              68.7    2360148 twitter
## 2         34.4        1796             201.     1010242 news   
## 3         41.8        6726             230.      899288 blog

Subsample data, pre-processing

Here I subsample 0.1% of the data and display the most popular words in each subsample.

set.seed(123)

twitter_smpl <- sample(twitter, tw_lines * .001)  # 0.1% of the twitter lines
news_smpl <- sample(news, nw_lines * .001)        # 0.1% of the news lines
blog_smpl <- sample(blogs, bg_lines * .001)       # 0.1% of the blog lines

The idea and workflow for the preprocessing have been taken from this paper.
Key steps are:

  • tokenization
  • lowercasing
  • stemming
  • removing stop-words
library(quanteda)
# length(blog_smpl) # 899 lines in the blog sample
bg_smpl_proc <- tokens(blog_smpl, remove_punct = TRUE)  # tokenization
bg_smpl_proc <- tokens_tolower(bg_smpl_proc)            # lowercasing
bg_smpl_proc <- tokens_wordstem(bg_smpl_proc)           # stemming
sw <- stopwords("english")
bg_smpl_proc_clean <- tokens_remove(bg_smpl_proc, sw)   # removing stop-words
dtm <- dfm(bg_smpl_proc_clean)
doc_freq <- docfreq(dtm)  # document frequency per term (column)

# length(news_smpl) # 1010 lines in the news sample
nw_smpl_proc <- tokens(news_smpl, remove_punct = TRUE)
nw_smpl_proc <- tokens_tolower(nw_smpl_proc)
nw_smpl_proc <- tokens_wordstem(nw_smpl_proc)
nw_smpl_proc_clean <- tokens_remove(nw_smpl_proc, sw)
dtm_news <- dfm(nw_smpl_proc_clean)
doc_freq_news <- docfreq(dtm_news)  # document frequency per term (column)


# length(twitter_smpl) # 2360 lines in the twitter sample
tw_smpl_proc <- tokens(twitter_smpl, remove_punct = TRUE)
tw_smpl_proc <- tokens_tolower(tw_smpl_proc)
tw_smpl_proc <- tokens_wordstem(tw_smpl_proc)
tw_smpl_proc_clean <- tokens_remove(tw_smpl_proc, sw)
dtm_tw <- dfm(tw_smpl_proc_clean)
doc_freq_tw <- docfreq(dtm_tw)  # document frequency per term (column)
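
Since the same four steps are applied to each of the three samples, the pipeline could also be wrapped in a small helper function to avoid repetition. A minimal sketch (the function name preprocess_sample is my own, not part of the code above):

preprocess_sample <- function(txt, sw = stopwords("english")) {
  # tokenize, lowercase, stem, drop stop-words, then build a dfm
  toks <- tokens(txt, remove_punct = TRUE)
  toks <- tokens_tolower(toks)
  toks <- tokens_wordstem(toks)
  toks <- tokens_remove(toks, sw)
  dfm(toks)
}

# e.g. doc_freq_tw <- docfreq(preprocess_sample(twitter_smpl))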

After pre-processing we can display the 30 most frequent words in a subsample; the interactive plot below shows the blog subsample.

library(plotly)

doc_freq_df <- data.frame(freq = doc_freq)
doc_freq_df$word <- row.names(doc_freq_df)
doc_freq_df <- arrange(doc_freq_df, desc(freq))

# keep the bars in decreasing-frequency order on the x axis
doc_freq_df$word <- factor(doc_freq_df$word, levels = doc_freq_df$word)

plot_ly(head(doc_freq_df, 30)) %>%
  add_trace(x = ~word, y = ~freq, type = "bar",
            text = ~paste0(
              "Word: ", word,
              "<br>Document frequency: ", freq
            )) %>%
  layout(title = 'Top 30 words in the blog subsample (0.1%)',
         xaxis = list(title = "Word"),
         yaxis = list(title = "Document frequency (lines containing the word)"))
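
For a quick, non-graphical check of the other two subsamples, quanteda’s topfeatures() can list the most frequent terms of a dfm directly; a short sketch, using the document-frequency scheme to match the plot above:

# most common terms in the news and twitter subsamples,
# counted as document frequency to match the blog plot
topfeatures(dtm_news, n = 30, scheme = "docfreq")
topfeatures(dtm_tw, n = 30, scheme = "docfreq")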

Further steps are to build and test prediction algorithms on the subsampled data; these will later be used in a Shiny application.
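
As a rough, purely illustrative sketch of one possible starting point for such a prediction step (not something done in this report), word pairs can be counted from the already-tokenized blog subsample with quanteda’s tokens_ngrams():

# illustrative only: bigram counts from the blog subsample,
# one possible input for a prediction algorithm
bg_bigrams <- tokens_ngrams(bg_smpl_proc, n = 2)
topfeatures(dfm(bg_bigrams), 10)  # ten most frequent bigrams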