Overview

This report documents initial exploratory data analysis of the data sets that will be used to create a prediction algorithm for the Data Science Capstone.

The motivation for this project is to demonstrate that the data sets have been downloaded and loaded, to summarise their basic features, to report some initial findings from exploratory analysis, and to outline the plan for the prediction algorithm and Shiny app.

Data retrieval

#read in data
blogs <- readLines("../final/en_US/en_US.blogs.txt") 
twitter <- suppressWarnings(readLines("../final/en_US/en_US.twitter.txt"))
news <- suppressWarnings(readLines("../final/en_US/en_US.news.txt"))
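For completeness, the packages used throughout this report (and listed in the session info at the end) would normally be attached in a setup chunk before any of the code below; a minimal version of such a chunk might look like this.

#attach the packages used in this report
library(dplyr)        #data manipulation
library(tidyr)        #separate() for splitting n-grams
library(stringr)      #pattern matching with str_detect()
library(tidytext)     #unnest_tokens() and the stop_words data set
library(ggplot2)      #bar charts of word and trigram frequencies
library(wordcloud)    #word cloud of frequent words
library(RColorBrewer) #colour palette for the word cloud
library(igraph)       #graph_from_data_frame() for the bigram network
library(ggraph)       #network visualisation of bigrams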

Data summary

summary <- data.frame(length(blogs), length(twitter), length(news))
summary <- rbind(summary, c(format(object.size(blogs), units = "Mb"),
                            format(object.size(twitter), units = "Mb"),
                            format(object.size(news), units = "Mb")))
colnames(summary) <- c("Blogs", "Twitter", "News")
rownames(summary) <- c("Lines", "Memory")
library(kableExtra)
summary %>% kable() %>% kable_styling(full_width = FALSE, position = "left")
        Blogs     Twitter   News
Lines   899288    2360148   1010242
Memory  248.5 Mb  301.4 Mb  249.6 Mb

The table above summarises the size of each data set.
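Beyond line counts and memory use, a rough word count per source could also be added to this summary; a sketch (not run as part of this report) counting whitespace-separated tokens with stringr:

#approximate word counts per source
sapply(list(Blogs = blogs, Twitter = twitter, News = news),
       function(x) sum(stringr::str_count(x, "\\S+")))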

Subset data for exploratory analysis

set.seed(123456)
random_twitter <- ceiling(runif(n=25000, min=0, max=length(twitter)))
random_news <- ceiling(runif(n=25000, min=0, max=length(news)))
random_blogs <- ceiling(runif(n=25000, min=0, max=length(blogs)))
#subset
twitter <- twitter[random_twitter]
news <- news[random_news]
blogs <- blogs[random_blogs]
#remove all non-English characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
#convert to tidy format
twitter_df <- data_frame(source = "twitter", text = twitter)
blogs_df <- data_frame(source = "blogs", text = blogs)
news_df <- data_frame(source = "news", text = news)
samples_df <- bind_rows(list(twitter_df, blogs_df, news_df)) %>% 
  group_by(source) %>% 
  mutate(line = row_number()) %>% 
  ungroup()         
tidy_samples <- samples_df %>%
  unnest_tokens(word, text)
tidy_samples 
## # A tibble: 2,211,714 x 3
##    source   line word  
##    <chr>   <int> <chr> 
##  1 twitter     1 why   
##  2 twitter     1 do    
##  3 twitter     1 native
##  4 twitter     1 apps  
##  5 twitter     1 get   
##  6 twitter     1 more  
##  7 twitter     1 usage 
##  8 twitter     1 than  
##  9 twitter     1 mobile
## 10 twitter     1 web   
## # ... with 2,211,704 more rows

25,000 lines were sampled from each data set in order to perform an exploratory data analysis. All non-English characters were then removed. The sampled data sets were converted to tidy format before being combined into a single data frame, “samples_df”. Each line of text was then separated into tokens for further analysis.
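Note that drawing indices with ceiling(runif(...)) can select the same line more than once. If exactly 25,000 distinct lines per source were wanted, the sampling step above could instead use sample() without replacement on the full vectors (a sketch, not the approach used here):

#sample 25,000 distinct line indices from each source
random_twitter <- sample(length(twitter), size = 25000)
random_news    <- sample(length(news), size = 25000)
random_blogs   <- sample(length(blogs), size = 25000)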

Remove profanity and stop words

The next step was to remove any profanity, stop words (such as “the”), numbers and retweet markers from the sampled data. Below is an example of the data frame after these filtering steps.

# list of bad words downloaded and used to filter 
url <- "http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
download.file(url, destfile = "../Resources/bad-words.txt")
badWordList <- readLines("../Resources/bad-words.txt")
data("stop_words")
tidy_samples <- tidy_samples %>% 
  anti_join(stop_words) %>% #remove stop words 
  anti_join(data_frame(word = badWordList)) %>% #remove bad words 
  filter(!str_detect(word, "^\\d")) %>%  #remove tokens beginning with a digit
  filter(!str_detect(word, "rt")) #remove retweet markers (note: this also drops any word containing "rt")
tidy_samples
## # A tibble: 849,819 x 3
##    source   line word    
##    <chr>   <int> <chr>   
##  1 twitter     1 native  
##  2 twitter     1 apps    
##  3 twitter     1 usage   
##  4 twitter     1 mobile  
##  5 twitter     1 web     
##  6 twitter     2 pettitte
##  7 twitter     2 berkman 
##  8 twitter     2 jeter   
##  9 twitter     2 clutch  
## 10 twitter     2 hitters 
## # ... with 849,809 more rows

Wordcloud showing top 100 most frequent words in sampled data set

tidy_samples %>% 
  count(word) %>% 
  with(wordcloud(word, n, max.words = 100, colors=brewer.pal(6,"Dark2"))) 

What are the top 20 most frequent words from each data source?

tidy_samples_count <- tidy_samples %>% 
  group_by(source) %>% 
  count(word, sort = TRUE)
tidy_samples_count %>%
  group_by(source) %>% 
  top_n(20) %>% 
  ungroup %>%
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = source)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Word Count") +
  facet_wrap(~source, ncol = 3, scales = "free") +
  coord_flip() + 
  ggtitle("Twenty most frequent words in each source of text",
          subtitle = "Sorted in order of overall frequency in the sampled dataset") +
  theme_minimal()

The word “love” is frequent in the blogs and twitter data but is absent from the top 20 list for the news data. “lol” appears only in the twitter top 20.
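To check directly which of these frequent words are shared between sources, the per-source counts could be spread into one row per word (a sketch using tidyr::spread, consistent with the package versions in the session info):

#one row per word, one column per source; NA where the word does not occur in that source
tidy_samples_count %>%
  ungroup() %>%
  spread(source, n) %>%
  arrange(desc(twitter))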

Bigrams

count_bigrams <- function(dataset) { #function to count bigrams
  dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word1 %in% badWordList,
           !word2 %in% badWordList,
           !str_detect(word1, "^\\d"),
           !str_detect(word2, "^\\d")) %>%
    count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams) { #function to visualise bigrams 
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  bigrams %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
#apply count_bigrams to our samples_df
sample_bigrams <- samples_df %>% 
  count_bigrams()
# filter out rare combinations and rt/amp and visualise
sample_bigrams %>%
  filter(n > 30,
         !str_detect(word1, "rt"),
         !str_detect(word2, "rt"),
         !str_detect(word1, "amp"),
         !str_detect(word2, "amp")) %>% 
  visualize_bigrams()

The above plot shows common bigrams encountered in the sampled data set. For example, in the centre, “parking” is often followed by “lot” and “head” is often followed by “coach”. “san” can be followed by “francisco” or “diego”. Similarly, “president” can be preceded by “vice” or followed by “barack” or “obama”.
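The same information can also be inspected as a table rather than a network, for example by listing the most frequent word pairs directly (output omitted here):

#most frequent word pairs in the sampled data
sample_bigrams %>%
  top_n(20, n)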

Trigrams: consecutive sequences of three words

sample_trigrams <- samples_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word,
         !word1 %in% badWordList,
         !word2 %in% badWordList,
         !word3 %in% badWordList,
         !str_detect(word1, "^\\d"),
         !str_detect(word2, "^\\d"),
         !str_detect(word3, "^\\d"),
         !str_detect(word1, "rt"),
         !str_detect(word2, "rt"),
         !str_detect(word3, "rt"),
         !str_detect(word1, "amp"),
         !str_detect(word2, "amp"),
         !str_detect(word3, "amp")) %>%
  count(word1, word2, word3, sort = TRUE)

sample_trigrams %>% 
  top_n(20) %>% 
  mutate(x = paste(word1, word2, word3, sep = " ")) %>% 
  mutate(x = reorder(x, n)) %>% 
  ggplot(aes(x, n)) + xlab("Trigram") + ylab("Frequency") +
  geom_col(show.legend = FALSE, fill = "purple") + coord_flip() + theme_minimal() +
  ggtitle("Twenty most frequent trigrams in the sampled data set")

The plot above shows the most frequently observed trigrams in the sampled data set.

Predicting the next word based on observed bigrams and trigrams

return_third_word <- function(woord1, woord2){
  woord <- sample_trigrams %>%
    filter_(~word1 == woord1, ~word2 == woord2) %>%
    sample_n(1, weight = n) %>% #select a random row from the filtered df, weighted towards the most frequent trigrams
    .[["word3"]]
  woord
}
return_third_word("president", "barack")
## [1] "obama"
return_third_word("jones", "industrial")
## [1] "average"
return_third_word("st", "louis") #market, district, pinnacle
## [1] "county"
return_third_word("st", "patrick's")
## [1] "day"
return_third_word("dow", "jones") #global, industrial, fell, most often industrial
## [1] "global"
return_second_word <- function(woord1){
  woord <- sample_bigrams %>%
    filter_(~word1 == woord1) %>%
    sample_n(1, weight = n) %>%
    .[["word2"]]
  woord
}
return_second_word("happy")
## [1] "hour"
sample_bigrams %>% 
  filter_(~word1 == "happy") #174 distinct bigrams where happy is word1
## # A tibble: 174 x 3
##    word1 word2            n
##    <chr> <chr>        <int>
##  1 happy birthday       107
##  2 happy hour            36
##  3 happy mother's        25
##  4 happy mothers         24
##  5 happy friday          22
##  6 happy easter          15
##  7 happy monday          13
##  8 happy thanksgiving    12
##  9 happy bday            10
## 10 happy sunday           8
## # ... with 164 more rows
#on repeated execution of return_second_word("happy"), "birthday" and other terms that are frequent in the training set appear more often
return_second_word("police")
## [1] "plan"
return_second_word("olive")
## [1] "oil"
return_second_word("hip")
## [1] "hop"
return_second_word("bottom")
## [1] "line"
return_second_word("credit")
## [1] "crisis"
return_second_word("san")
## [1] "francisco"

Summary

The bigrams and trigrams observed in the training data will be used to inform the output of the return_second_word and return_third_word functions described above. A Shiny app will be developed that predicts the next word as a user inputs text.
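One way to combine the two functions is a simple back-off: try the trigram model first and fall back to the bigram model when the two-word context has not been observed. A minimal sketch of that idea, assuming the functions and data frames defined above:

predict_next_word <- function(woord1, woord2){
  #check whether the two-word context appears among the observed trigrams
  matches <- sample_trigrams %>%
    filter_(~word1 == woord1, ~word2 == woord2)
  if (nrow(matches) > 0) {
    return_third_word(woord1, woord2) #use the trigram model
  } else {
    return_second_word(woord2) #back off to the bigram model
  }
}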

Session Info

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_IE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_IE.UTF-8        LC_COLLATE=en_IE.UTF-8    
##  [5] LC_MONETARY=en_IE.UTF-8    LC_MESSAGES=en_IE.UTF-8   
##  [7] LC_PAPER=en_IE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_IE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2     kableExtra_0.9.0   igraph_1.2.2      
##  [4] stringr_1.3.1      tidyr_0.8.1        ggraph_1.0.2      
##  [7] wordcloud_2.6      RColorBrewer_1.1-2 ggplot2_3.0.0     
## [10] tidytext_0.2.0     dplyr_0.7.6       
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.4  purrr_0.2.5       lattice_0.20-35  
##  [4] colorspace_1.3-2  htmltools_0.3.6   SnowballC_0.5.1  
##  [7] viridisLite_0.3.0 yaml_2.2.0        utf8_1.1.4       
## [10] rlang_0.2.2       pillar_1.3.0      glue_1.3.0       
## [13] withr_2.1.2       tweenr_1.0.0      bindr_0.1.1      
## [16] plyr_1.8.4        munsell_0.5.0     gtable_0.2.0     
## [19] rvest_0.3.2       evaluate_0.11     labeling_0.3     
## [22] knitr_1.20        fansi_0.4.0       highr_0.7        
## [25] broom_0.5.0       tokenizers_0.2.1  Rcpp_0.12.19     
## [28] readr_1.1.1       scales_1.0.0      backports_1.1.2  
## [31] farver_1.0        gridExtra_2.3     hms_0.4.2        
## [34] ggforce_0.1.3     digest_0.6.17     stringi_1.2.4    
## [37] ggrepel_0.8.0     grid_3.4.4        cli_1.0.1        
## [40] tools_3.4.4       magrittr_1.5      lazyeval_0.2.1   
## [43] tibble_1.4.2      janeaustenr_0.1.5 crayon_1.3.4     
## [46] pkgconfig_2.0.2   MASS_7.3-49       Matrix_1.2-12    
## [49] xml2_1.2.0        httr_1.3.1        rstudioapi_0.8   
## [52] assertthat_0.2.0  rmarkdown_1.10.11 htmldeps_0.1.1   
## [55] viridis_0.5.1     R6_2.3.0          units_0.6-1      
## [58] nlme_3.1-131      compiler_3.4.4