# Used packages
library(igraph)
library(ggraph)
library(tidytext)
library(tidyverse)
This report was created as part of the Coursera Data Science Specialization capstone project. The aim of the capstone project is to create a model that predicts the follow-up word. The purpose of this report is to give an overview of basic information about the data and to present an exploratory analysis of the SwiftKey data set.
Downloading the data if necessary
# Fetch and unpack the SwiftKey corpus, skipping every step whose result
# is already present on disk so repeated runs do no extra work.
if (!dir.exists("Data")) dir.create("Data")

if (!file.exists("Data/project_data.zip")) {
  download.file(
    url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
    destfile = "Data/project_data.zip"
  )
}

# The archive is only extracted when the English (US) folder is missing.
if (!dir.exists("Data/final/en_US")) {
  unzip(zipfile = "Data/project_data.zip", exdir = "Data")
}
For this example only the English-US texts are used
# Read the three English (US) corpora, one document per line.
# The archive is unzipped with exdir = "Data", so the files live under
# Data/final/en_US (the same folder the unzip guard checks for), not under
# final/en_US relative to the working directory.
# skipNul = TRUE drops embedded NUL bytes instead of truncating the line.
en_US_blogs <- readLines("Data/final/en_US/en_US.blogs.txt",
                         encoding = "UTF-8", skipNul = TRUE)
en_US_news <- readLines("Data/final/en_US/en_US.news.txt",
                        encoding = "UTF-8", skipNul = TRUE)
en_US_twitter <- readLines("Data/final/en_US/en_US.twitter.txt",
                           encoding = "UTF-8", skipNul = TRUE)
Some info on those files:
# Summary table with one row per corpus: number of documents (lines),
# number of word tokens, and the size of the character vector.
# Note: "File size" is object.size() of the vector held in memory, not the
# size of the file on disk.
# data.frame() (check.names = TRUE) turns the quoted column names into
# No.of.documents / No.of.words / File.size.
# local() keeps the helper list out of the global environment.
info <- local({
  corpora <- list(en_US_blogs, en_US_news, en_US_twitter)
  data.frame(
    Source = c("Blogs", "News", "Twitter"),
    "No of documents" = lengths(corpora),
    # Tokenise each corpus into single words and count the resulting rows.
    "No of words" = vapply(
      corpora,
      function(lines) nrow(unnest_tokens(tibble(text = lines), word, text)),
      integer(1)
    ),
    "File size" = vapply(
      corpora,
      function(lines) format(object.size(lines), units = "auto"),
      character(1)
    )
  )
})
info
## Source No.of.documents No.of.words File.size
## 1 Blogs 899288 37546246 255.4 Mb
## 2 News 77259 2674536 19.8 Mb
## 3 Twitter 2360148 30093410 319 Mb
Getting a sample data set from those files. The sample data set is a random set of 10% of the original files
# Fraction of each corpus that goes into the working sample.
samplesize <- .1

# Draw a random sample (without replacement) of `samplesize` of the documents
# from every corpus and stack the results into one tibble with the columns
# Source, doc (running index within the source) and text.
# The corpora are sampled in the order blogs, news, twitter.
samples <- bind_rows(
  lapply(
    list(
      Us_blog = en_US_blogs,
      Us_news = en_US_news,
      Us_twitter = en_US_twitter
    ),
    function(lines) {
      # Compute the sample size once so doc and text always have equal length.
      n_docs <- floor(length(lines) * samplesize)
      tibble(doc = seq_len(n_docs), text = sample(lines, n_docs))
    }
  ),
  .id = "Source"
)

# Drop every character that cannot be represented in ASCII.
# iconv() is vectorised, so it is called once on the whole column; going
# through sapply() would make one call per document and attach each full
# document text as a name on the result.
samples$text <- iconv(samples$text, from = "UTF-8", to = "ASCII", sub = "")
In a first step a word tokenizer is used to analyse the word frequencies in each of the sets and to check the data for patterns typical of speech.
In this case the tidytext package is used to set up the data set.
Data is tokenized and stop-words are removed.
Within each source the frequency of each word is counted and percentage within the source is calculated
# Word frequencies per source.
# The text is tokenised first and each token is cleaned afterwards.
# Calling str_extract() on the whole document would keep only the first
# match, i.e. only the first word of every document would be counted.
# unnest_tokens() lower-cases the tokens by default.
# The result stays grouped by Source, so Share is the share of a word among
# all remaining words of its own source.
plotdata <-
  samples %>%
  unnest_tokens(ngram, text, token = "ngrams", n = 1) %>%
  # Keep the first run of letters/apostrophes in each token; tokens without
  # any (e.g. pure numbers) become NA and are dropped.
  mutate(ngram = str_extract(ngram, "[a-z']+")) %>%
  filter(!is.na(ngram)) %>%
  anti_join(tibble(ngram = tm::stopwords())) %>%
  group_by(Source) %>%
  count(ngram, sort = TRUE) %>%
  mutate(Share = n / sum(n)) %>%
  arrange(desc(Share))
## Joining, by = "ngram"
From this tidy data set we can draw a graph
# Horizontal bar chart of the ten highest-share words within each source,
# drawn as one facet row per source.
plotdata %>%
  ungroup() %>%
  # Order the words by share across all sources before picking the top ten.
  mutate(ngram = fct_reorder(ngram, Share, .desc = TRUE)) %>%
  group_by(Source) %>%
  top_n(10, Share) %>%
  ggplot(aes(x = ngram, y = Share, fill = Source)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  facet_grid(rows = vars(Source), scales = "free") +
  labs(
    title = "Top words per text source",
    x = "Words",
    y = "Share of word of all words from source"
  )
Looking at the density plot for all three sources.
It is easy to see that (as expected) only a very small number of words occur frequently; the majority of words have only a few occurrences.
# Overlaid density curves of the word shares, one curve per source.
# top_n() respects the grouping plotdata carries, so with the
# group_by(Source) grouping from its construction the 500 highest-share
# words are taken per source.
plotdata %>%
  top_n(500, Share) %>%
  ggplot(aes(x = Share, fill = Source)) +
  geom_density(alpha = .4) +
  labs(title = "Density of top 500 words")
Bi-grams (pairs of two consecutive words) are more relevant for the following task. This descriptive analysis should show how different words co-occur throughout the data set.
The first step is again to take the sample data set, tokenize it, and count absolute and relative frequencies.
Again this is done with the tidytext package
# Bi-gram frequencies per source.
# Each bi-gram is split into its two words so both can be cleaned and
# checked against the stop-word list, then glued back together for counting.
# The result stays grouped by Source, so Share is relative to the source.
plotdata <- samples %>%
  unnest_tokens(ngram, text, token = "ngrams", n = 2) %>%
  separate(ngram, into = c("word1", "word2"), sep = " ") %>%
  mutate(
    word1 = str_extract(word1, "[a-z']+"),
    word2 = str_extract(word2, "[a-z']+")
  ) %>%
  # Drop pairs with a missing word or with a stop word on either side.
  filter(
    !is.na(word1),
    !is.na(word2),
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(ngram, word1, word2, sep = " ") %>%
  group_by(Source) %>%
  count(ngram, sort = TRUE) %>%
  mutate(Share = n / sum(n)) %>%
  arrange(desc(Share))
Again looking at the top 10 Bi-grams by source
# Horizontal bar chart of the ten highest-share bi-grams within each source,
# drawn as one facet row per source.
plotdata %>%
  ungroup() %>%
  # Order the bi-grams by share across all sources before picking the top ten.
  mutate(ngram = fct_reorder(ngram, Share, .desc = TRUE)) %>%
  group_by(Source) %>%
  top_n(10, Share) %>%
  ggplot(aes(x = ngram, y = Share, fill = Source)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  facet_grid(rows = vars(Source), scales = "free") +
  labs(
    title = "Top bi-grams per text source",
    x = "Words",
    y = "Share of word of all words from source"
  )
Bi-grams have an advantage over single words: their co-occurrence can be analysed.
The following plot shows how the most frequent word pairs are related. It shows where one word occurs in multiple combinations (for example, ‘San’ occurs with ‘Francisco’ and ‘Diego’) as well as the direction of each relation.
# Directed word network of the frequent bi-grams: every bi-gram counted more
# than 50 times becomes an edge from its first to its second word, and the
# edge opacity follows the count.
plotdata %>%
  ungroup() %>%
  filter(n > 50) %>%
  separate(ngram, into = c("word1", "word2"), sep = " ") %>%
  select(word1, word2, n) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = grid::arrow(type = "closed", length = unit(.15, "inches")),
    # Stop each edge slightly before the node so the arrow head stays visible.
    end_cap = circle(.07, "inches")
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  labs(title = "BI-Gram word relation ship") +
  theme(plot.title = element_text(hjust = 0.5))
The insights from this first analysis and the obtained data set will be used to dive deeper into the modeling of follow-up words and to build a Shiny application that makes this model usable.