Summary

The goal of this report is to show that I have become comfortable working with the data and that I am on track to create my prediction algorithm.
I plan to use the tidytext library for processing the text, because it seems to work much faster than the tm library.
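As a quick, non-rigorous illustration of that comparison, the snippet below times a simple word count in both packages. It is only a sketch: the texts vector is a placeholder, the toy input makes the absolute timings meaningless, and the two pipelines are not exactly equivalent, but the same pattern can be run on the real samples.

library(tm)
library(dplyr)
library(tidytext)

texts <- c("this is a tiny example", "just to compare the two approaches")

# tm: build a corpus and a document-term matrix
system.time({
  dtm <- DocumentTermMatrix(VCorpus(VectorSource(texts)))
})

# tidytext: tokenise into a tidy data frame and count the words
system.time({
  counts <- data.frame(text = texts, stringsAsFactors = FALSE) %>%
    unnest_tokens(word, text) %>%
    count(word, sort = TRUE)
})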

Getting data

library(dplyr)
library(ggplot2)
library(tidytext)
library(tidyr)
library(igraph)
library(ggraph)
library(widyr)
# Paths to the three English corpora
fNameBlogs <- "./final/en_US/en_US.blogs.txt"
fNameTwitter <- "./final/en_US/en_US.twitter.txt"
fNameNews <- "./final/en_US/en_US.news.txt"

# Read each source as UTF-8 text, one line per element
fBlogs <- file(fNameBlogs,"r",blocking = F)
blogs <- readLines(fBlogs,encoding="UTF-8")
close(fBlogs)

fTwitter <- file(fNameTwitter,"r",blocking = F)
twitter <- readLines(fTwitter,encoding="UTF-8")
close(fTwitter)

fNews <- file(fNameNews,"r",blocking = F)
news <- readLines(fNews,encoding="UTF-8")
close(fNews)

The files are quite large:

  1. blogs has a size of 255.4 Mb and 899288 lines.
  2. news has a size of 19.8 Mb and 77258 lines.
  3. twitter has a size of 19.8 Mb and 2360148 lines.
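For reference, a summary like the one above can be produced as follows (this is only one way to do it; object.size reports the in-memory size of each character vector, which may differ from the size of the file on disk):

format(object.size(blogs), units = "Mb")    # in-memory size of the blogs vector
length(blogs)                               # number of lines read
format(object.size(news), units = "Mb")
length(news)
format(object.size(twitter), units = "Mb")
length(twitter)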

So I will use random samples to speed up the exploration process. A 5% sample of each source would be enough; for this report I simply take 2000 random lines from each source (the 5% sampling code is kept below, commented out).

set.seed(1983)
# rblogs <- data.frame(text = sample(blogs,size = length(blogs) %/% 100 *5,replace = F),
#                      source = "blogs")
# rnews <- data.frame(text = sample(news,size = length(news) %/% 100 *5,replace = F),
#                     source = "news")
# rtwitter <- data.frame(text = sample(twitter,size = length(twitter) %/% 100 *5,replace = F),
#                        source = "twitter")
rblogs <- data.frame(text = sample(blogs,size = 2000,replace = F),
                     source = "blogs")
rnews <- data.frame(text = sample(news,size = 2000,replace = F),
                    source = "news")
rtwitter <- data.frame(text = sample(twitter,size = 2000,replace = F),
                       source = "twitter")

samples <- rblogs %>%
           bind_rows(rnews) %>%
           bind_rows(rtwitter)

The most common words

Let’s check which words are the most common in each source.

samples %>%
  unnest_tokens(word, text) %>%
  group_by(source) %>%
  count(source, word, sort = T) %>% 
  top_n(5) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x=reorder(word,n),y=n,fill=source)) +
  geom_col(show.legend=F) +
  facet_wrap(~source,scales = "free") + 
  coord_flip() + ylab("Words") + xlab("Number of occurences") +
  ggtitle("Most frequent words by source")

Let’s remove stop words.

samplewords <- samples %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  group_by(source) %>%
  count(source, word, sort = T) %>% 
  top_n(5) %>%
  arrange(desc(n))

samplewords %>%
  ggplot(aes(x=reorder(word,n),y=n,fill=source)) +
  geom_col(show.legend=F) +
  facet_wrap(~source,scales = "free") + 
  coord_flip() + ylab("Words") + xlab("Number of occurences")+
  ggtitle("Most frequent words by source",subtitle = "with stop words removed")

Two-word combinations (bigrams)

samplebigrams <- samples %>%
  unnest_tokens(bigram,text,token = "ngrams", n=2) %>%
  separate(bigram,c("word1","word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  group_by(source) %>%
  count(word1,word2,sort = T) %>%
  top_n(5) %>%
  mutate(bigram = paste(word1,word2,sep=" ")) %>%
  arrange(desc(n))

samplebigrams %>%
  ggplot(aes(x=reorder(bigram,n),y=n,fill=source)) +
  geom_col(show.legend=F) +
  facet_wrap(~source,scales = "free") + 
  coord_flip() + ylab("Bigrams") + xlab("Number of occurences")+
  ggtitle("Most frequent bigrams by source",subtitle = "with stop words removed")

Let’s check which relations between words are the most common by plotting the bigrams as a network.

bigram_graph <-  samples %>%
  unnest_tokens(bigram,text,token = "ngrams", n=2) %>%
  separate(bigram,c("word1","word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  count(word1,word2,sort = T) %>%
  filter(n>3) %>%
  graph_from_data_frame()
ggraph(bigram_graph,layout = "fr") +
  geom_edge_link() +
  geom_node_point(color="lightblue",size=5)+
  geom_node_text(aes(label=name),repel = T)+
  theme_void()

Next steps

  1. Pay closer attention to cleaning the text: times, figures and acronyms still need to be removed (see the sketch after this list).
  2. Build a model that predicts the next word from 2-grams and 3-grams (a rough sketch of the lookup is also shown below).
  3. Create an application.
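For the cleaning step, a minimal sketch of the kind of rules I have in mind is shown below. The clean_text helper and the regular expressions are illustrative assumptions, not the final cleaning rules.

# Hypothetical cleaning helper: drops times, figures and all-caps acronyms,
# then collapses the leftover whitespace.
clean_text <- function(x) {
  x <- gsub("\\b\\d{1,2}:\\d{2}\\b", " ", x)   # times such as 10:30
  x <- gsub("\\d+", " ", x)                    # figures
  x <- gsub("\\b[A-Z]{2,}\\b", " ", x)         # acronyms written in all caps
  x <- gsub("\\s+", " ", x)                    # collapse repeated whitespace
  trimws(x)
}

clean_text("Meet at 10:30 near the FBI office, bring 2 forms of ID")

For the prediction model, the idea is to look up the most frequent continuations of the preceding word(s) in the n-gram counts. The sketch below does this with bigrams only (stop words are kept, since they are valid predictions); the trigram case would follow the same pattern with one more key column. The bigram_counts table and the predict_next helper are assumptions for illustration, not the final model.

# Bigram counts over the whole sample, without removing stop words
bigram_counts <- samples %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

# Return the k most frequent words observed after prev_word
predict_next <- function(prev_word, k = 3) {
  bigram_counts %>%
    filter(word1 == prev_word) %>%
    top_n(k, n) %>%
    pull(word2)
}

predict_next("happy")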