DATA SCIENCE CAPSTONE: MILESTONE REPORT

Below is some exploratory work on SwiftKey's three data sets, comprising blogs, news articles, and tweets. The purpose of this project is to give a sense of the structure of the corpus; with these insights, one can make more informed decisions when designing an NLP model on this data set.
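The loading step is omitted here; a minimal sketch of how the raw vectors might be read, assuming the standard en_US.* files from the SwiftKey download, would be:

## Read each file as a character vector, one element per line
blogsraw <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsraw <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitterraw <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)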

summary(blogsraw)
##    Length     Class      Mode 
##    899288 character character
summary(newsraw)
##    Length     Class      Mode 
##     77259 character character
summary(twitterraw)
##    Length     Class      Mode 
##   2360148 character character

Data Processing

The general process below involves tokenizing the raw corpus, that is, breaking the text down word by word, in order to develop a more useful data set. As an aside, I also build data frames that exclude stop words: common words such as "the" and "of" (from tidytext's stop_words lexicon) that are unlikely to be useful for our purposes.

These tokenized sets are then organized in bigrams and trigrams for further visualization.
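To make the tokenizing step concrete, here is a toy example (not part of the analysis) of what unnest_tokens() produces:

library(dplyr); library(tidytext)
toy <- data_frame(text = c("The quick brown fox", "jumps over the lazy dog"))
toy %>% unnest_tokens(word, text)
## returns one lowercase word per row: "the", "quick", "brown", "fox", "jumps", ...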

## Load the packages used throughout
library(dplyr); library(tidytext); library(tidyr)

## Convert to tidy dataset
## (twitter is assumed to be the full raw vector; any sampling step is not shown)
twitter <- twitterraw
twitter_df <- data_frame(text = twitter)

## Tokenize, word frequencies
twitterwords <- twitter_df %>% unnest_tokens(word, text) %>% count(word, sort = TRUE)

## Eliminate stop words
twitterwordsstop <- twitter_df %>% unnest_tokens(word, text) %>% anti_join(stop_words) %>% count(word, sort = TRUE)
## Joining, by = "word"
## Same for news
news <- newsraw
news_df <- data_frame(text = news)
newswords <- news_df %>% unnest_tokens(word, text) %>% count(word, sort = TRUE)

newswordsstop <- news_df %>% unnest_tokens(word, text) %>% anti_join(stop_words) %>% count(word, sort = TRUE)
## Joining, by = "word"
## Same for blogs
blogs <- blogsraw
blogs_df <- data_frame(text = blogs)
blogswords <- blogs_df %>% unnest_tokens(word, text) %>% count(word, sort = TRUE)

blogswordsstop <- blogs_df %>% unnest_tokens(word, text) %>% anti_join(stop_words) %>% count(word, sort = TRUE)
## Joining, by = "word"
## n-gram frequencies
twitter2gram <- twitter_df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
twitter3gram <- twitter_df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)

twitter2gramfreq <- twitter2gram %>% count(bigram, sort = TRUE)
twitter3gramfreq <- twitter3gram %>% count(trigram, sort = TRUE)

blogs2gram <- blogs_df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
blogs3gram <- blogs_df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)

blogs2gramfreq <- blogs2gram %>% count(bigram, sort = TRUE)
blogs3gramfreq <- blogs3gram %>% count(trigram, sort = TRUE)

news2gram <- news_df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
news3gram <- news_df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)

news2gramfreq <- news2gram %>% count(bigram, sort = TRUE)
news3gramfreq <- news3gram %>% count(trigram, sort = TRUE)
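As an aside, the three per-source blocks above repeat the same pipeline; a hypothetical helper (a sketch only, and note it names the count column ngram rather than bigram or trigram) could express the pattern once:

## Tokenize a tidy data frame into n-grams and count them
ngram_freq <- function(df, n) {
    df %>%
        unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
        count(ngram, sort = TRUE)
}
## e.g. ngram_freq(twitter_df, 2) would reproduce twitter2gramfreq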


## Combine n-grams across sources and weight by tf-idf
df2 <- dplyr::bind_rows(list(twitter = twitter2gramfreq, blogs = blogs2gramfreq, news = news2gramfreq), .id = "source") %>%
    bind_tf_idf(bigram, source, n) %>% arrange(desc(n))
df3 <- dplyr::bind_rows(list(twitter = twitter3gramfreq, blogs = blogs3gramfreq, news = news3gramfreq), .id = "source") %>%
    bind_tf_idf(trigram, source, n) %>% arrange(desc(n))
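bind_tf_idf weights each n-gram by its frequency within a source times the log inverse of the share of sources containing it, so high tf_idf values flag n-grams distinctive to one source. An illustrative query (output not shown):

## Bigrams most characteristic of a single source
df2 %>% arrange(desc(tf_idf)) %>% head(10)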
        
        
                
## De-register the parallel cluster
## (the corresponding parallel::makeCluster() setup is not shown in this report)
stopCluster(cluster)

Then I put together some graphics showing the most common words, bigrams, and trigrams in each data set.

Graphics
library(ggplot2); library(ggraph); library(igraph)
## Warning: package 'ggraph' was built under R version 3.5.2
## Warning: package 'igraph' was built under R version 3.5.2
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
twitterwordsstop %>% top_n(10) %>% 
    ggplot(aes(x = reorder(word,n), y = n)) +
    geom_bar(stat = "identity", width = 0.5) + 
    xlab(NULL) +
    coord_flip() +
    ylab("Word Frequency") +
    ggtitle("Most Common Twitter Words") +
    theme(legend.position = "none")
## Selecting by n

blogswordsstop %>% top_n(10) %>% 
    ggplot(aes(x = reorder(word,n), y = n)) +
    geom_bar(stat = "identity", width = 0.5) + 
    xlab(NULL) +
    coord_flip() +
    ylab("Word Frequency") +
    ggtitle("Most Common Blog Words") +
    theme(legend.position = "none")
## Selecting by n

newswordsstop %>% top_n(10) %>% 
    ggplot(aes(x = reorder(word,n), y = n)) +
    geom_bar(stat = "identity", width = 0.5) + 
    xlab(NULL) +
    coord_flip() +
    ylab("Word Frequency") +
    ggtitle("Most Common News Words") +
    theme(legend.position = "none")
## Selecting by n

df2 %>% top_n(10,n) %>% 
    ggplot(aes(x = reorder(bigram,n), y = n)) +
    geom_bar(stat = "identity", width = 0.5) + 
    xlab(NULL) +
    coord_flip() +
    ylab("Word Frequency") +
    ggtitle("Most Common Bigrams") +
    theme(legend.position = "none")

df3 %>% top_n(10,n) %>% 
    ggplot(aes(x = reorder(trigram,n), y = n)) +
    geom_bar(stat = "identity", width = 0.5) + 
    xlab(NULL) +
    coord_flip() +
    ylab("Word Frequency") +
    ggtitle("Most Common Trigrams") +
    theme(legend.position = "none")

set.seed(2018)
## Network of bigrams occurring more than 1,000 times; separate() splits
## each bigram into its two words, which become the edge endpoints
bigram_graph <- df2 %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    select(-c(source)) %>%
    filter(n > 1000) %>%
    graph_from_data_frame()

ggraph(bigram_graph, layout = "fr") +
    geom_edge_link() +
    geom_node_point() +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1)

## Network of trigrams occurring more than 100 times; graph_from_data_frame()
## treats the first two columns (word1 -> word2) as edge endpoints, so word3
## is carried along only as an edge attribute
trigram_graph <- df3 %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
    select(-c(source)) %>%
    filter(n > 100) %>%
    graph_from_data_frame()

ggraph(trigram_graph, layout = "fr") +
    geom_edge_link() +
    geom_node_point() +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1)