The goal of this project is simply to show that I have become comfortable working with the data and that I am on track to create my prediction algorithm.
I plan to use the tidytext library for text processing because it appears to work much faster than the tm library (a rough timing sketch follows the library block below).
library(dplyr)
library(ggplot2)
library(tidytext)
library(tidyr)
library(igraph)
library(ggraph)
library(widyr)
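As a quick, hypothetical illustration of the tidytext-vs-tm speed claim above (not part of the analysis below; the toy corpus is made up), one could time tokenization with both packages:
library(tm)
docs <- rep("the quick brown fox jumps over the lazy dog", 10000)
# tidytext: tokenize a data frame of text into one word per row
system.time(
  data.frame(text = docs, stringsAsFactors = FALSE) %>%
    unnest_tokens(word, text)
)
# tm: build a document-term matrix from a corpus
system.time(
  DocumentTermMatrix(VCorpus(VectorSource(docs)))
)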
fNameBlogs <- "./final/en_US/en_US.blogs.txt"
fNameTwitter <- "./final/en_US/en_US.twitter.txt"
fNameNews <- "./final/en_US/en_US.news.txt"
fBlogs <- file(fNameBlogs, "r", blocking = F)
blogs <- readLines(fBlogs, encoding = "UTF-8")
close(fBlogs)
fTwitter <- file(fNameTwitter, "r", blocking = F)
twitter <- readLines(fTwitter, encoding = "UTF-8")
close(fTwitter)
fNews <- file(fNameNews, "r", blocking = F)
news <- readLines(fNews, encoding = "UTF-8")
close(fNews)
The files are quite large:
1. blogs: 255.4 MB, 899,288 lines
2. news: 19.8 MB, 77,258 lines
3. twitter: 19.8 MB, 2,360,148 lines
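For reference, the sizes and line counts above can be reproduced with file.size() and length(); for blogs, for example:
round(file.size(fNameBlogs) / 1024^2, 1)  # file size in MB
length(blogs)                             # number of lines read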
So I will use random samples to speed up the exploration process. A 5% sample of each source should be enough; to keep this report quick to render, I take a fixed sample of 2,000 lines per source here and leave the 5% sampling code commented out below.
set.seed(1983)
# rblogs <- data.frame(text = sample(blogs, size = length(blogs) %/% 100 * 5, replace = F),
#                      source = "blogs")
# rnews <- data.frame(text = sample(news, size = length(news) %/% 100 * 5, replace = F),
#                     source = "news")
# rtwitter <- data.frame(text = sample(twitter, size = length(twitter) %/% 100 * 5, replace = F),
#                        source = "twitter")
# stringsAsFactors = FALSE keeps the text column as character rather than factor
rblogs <- data.frame(text = sample(blogs, size = 2000, replace = F),
                     source = "blogs", stringsAsFactors = FALSE)
rnews <- data.frame(text = sample(news, size = 2000, replace = F),
                    source = "news", stringsAsFactors = FALSE)
rtwitter <- data.frame(text = sample(twitter, size = 2000, replace = F),
                       source = "twitter", stringsAsFactors = FALSE)
samples <- rblogs %>%
  bind_rows(rnews) %>%
  bind_rows(rtwitter)
Let’s check which words are the most common in each source:
samples %>%
  unnest_tokens(word, text) %>%
  group_by(source) %>%
  count(source, word, sort = T) %>%
  top_n(5) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = source)) +
  geom_col(show.legend = F) +
  facet_wrap(~source, scales = "free") +
  coord_flip() + xlab("Words") + ylab("Number of occurrences") +
  ggtitle("Most frequent words by source")
Let’s remove stop words and look at the frequencies again:
samplewords <- samples %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  group_by(source) %>%
  count(source, word, sort = T) %>%
  top_n(5) %>%
  arrange(desc(n))
samplewords %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = source)) +
  geom_col(show.legend = F) +
  facet_wrap(~source, scales = "free") +
  coord_flip() + xlab("Words") + ylab("Number of occurrences") +
  ggtitle("Most frequent words by source", subtitle = "with stop words removed")
samplebigrams <- samples %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  group_by(source) %>%
  count(word1, word2, sort = T) %>%
  top_n(5) %>%
  mutate(bigram = paste(word1, word2, sep = " ")) %>%
  arrange(desc(n))
samplebigrams %>%
  ggplot(aes(x = reorder(bigram, n), y = n, fill = source)) +
  geom_col(show.legend = F) +
  facet_wrap(~source, scales = "free") +
  coord_flip() + xlab("Bigrams") + ylab("Number of occurrences") +
  ggtitle("Most frequent bigrams by source", subtitle = "with stop words removed")
Let’s check which relationships between words are the most common by plotting a network of frequent bigrams:
bigram_graph <- samples %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  count(word1, word2, sort = T) %>%
  filter(n > 3) %>%
  graph_from_data_frame()
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = T) +
  theme_void()
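The widyr package loaded at the top has not been used yet; as a minimal sketch of a complementary view (assuming co-occurrence within the same sampled line is a sensible unit), widyr::pairwise_count() counts how often two words appear on the same line:
word_pairs <- samples %>%
  mutate(line = row_number()) %>%          # give each sampled line an id
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%   # drop stop words as above
  pairwise_count(word, line, sort = TRUE)  # count co-occurring word pairs per line
head(word_pairs)
Unlike the bigram counts above, this captures word pairs that co-occur anywhere within a line, not only adjacent ones.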