Prepare Libraries

Load the libraries required for the analysis.
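
If any of them are not yet installed, install.packages() can fetch them first; a minimal sketch (the package list simply mirrors the library() calls below):

# Install any packages that are missing before loading them
pkgs <- c("caret", "gbm", "AppliedPredictiveModeling", "rattle", "randomForest",
          "tm", "stringi", "dplyr", "ggplot2", "tidytext", "janeaustenr")
missing <- pkgs[!pkgs %in% installed.packages()[, "Package"]]
if (length(missing) > 0) install.packages(missing)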

library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
library(gbm)
## Warning: package 'gbm' was built under R version 3.6.3
## Loaded gbm 2.1.8
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 3.6.3
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 3.6.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.6.3
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(stringi)
## Warning: package 'stringi' was built under R version 3.6.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 3.6.3
print("Success")
## [1] "Success"

Get and explore data

Read each file into R line by line and display the line counts:

df_blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
df_news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
df_twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

paste("Number of lines in blogs: ", length(df_blogs))
## [1] "Number of lines in blogs:  899288"
paste("Number of lines in news: ", length(df_news))
## [1] "Number of lines in news:  77259"
paste("Number of lines in twitter: ", length(df_twitter))
## [1] "Number of lines in twitter:  2360148"

Combine the datasets into one set to simplify cleaning and analysis. Because each file is so large, a 5% random sample of lines is drawn from each file for further analysis:

# Sample and Combine data into one file
set.seed(02122021)

twit_sample <- sample(df_twitter, length(df_twitter)*.05)
news_sample <- sample(df_news, length(df_news)*.05)
blog_sample <- sample(df_blogs, length(df_blogs)*.05)

combined_sample <- c(twit_sample, blog_sample, news_sample)
paste("Number of lines in combined sample: ", length(combined_sample))
## [1] "Number of lines in combined sample:  166833"

The raw text is not clean: it contains a great deal of irrelevant punctuation, numbers, and filler words.

Use the tm (Text Mining) package to clean the sampled text: remove numbers, English stop words, and punctuation (preserving intra-word contractions and dashes), then strip extra whitespace. Lowercasing is handled later by unnest_tokens() during tokenization.

textcleaning <- function(x) {
  textclean <- x %>%
    removeNumbers() %>%
    removeWords(stopwords("english")) %>%
    removePunctuation(preserve_intra_word_contractions = TRUE, preserve_intra_word_dashes = TRUE) %>%
    stripWhitespace()
  return(textclean)
}

twitter_clean <- textcleaning(twit_sample)
news_clean <- textcleaning(news_sample)
blog_clean <- textcleaning(blog_sample)
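
As a sanity check, the cleaning function can be applied to a made-up sentence (the input below is ours, not from the corpus); numbers, common stop words, and stray punctuation should be removed, while the intra-word apostrophe in o'clock is preserved:

# Quick sanity check on an invented sentence
textcleaning("At 5 o'clock, the 3 dogs didn't bark!!")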

Create tibbles with the cleaned data for analysis.

twitter_df <- tibble(line = 1:length(twitter_clean), words = twitter_clean)
blog_df <- tibble(line = 1:length(blog_clean), words = blog_clean)
news_df <- tibble(line = 1:length(news_clean), words = news_clean)

# Combine into one tibble
combined_df <- bind_rows("twitter" = twitter_df, "blogs" = blog_df, "news" = news_df, .id = "group")
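
A quick check confirms how many lines each source contributes to the combined tibble:

# Lines contributed by each source
combined_df %>% count(group)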

We tokenize the data frame to extract n-grams for analysis. For this project, n-grams of one to three words are most relevant for prediction.

data(stop_words)
tbn1 <- combined_df %>% unnest_tokens(word, words, token = "ngrams", n = 1)
tbn1 <- tbn1 %>% anti_join(stop_words) # remove the most common English words from the unigrams
## Joining, by = "word"
tbn2 <- combined_df %>% unnest_tokens(word, words, token = "ngrams", n = 2)
tbn3 <- combined_df %>% unnest_tokens(word, words, token = "ngrams", n = 3)
sample_n(tbn1, 5)
## # A tibble: 5 x 3
##   group     line word 
##   <chr>    <int> <chr>
## 1 blogs     7177 eye  
## 2 twitter  21212 day  
## 3 blogs    32284 rings
## 4 blogs    18027 bring
## 5 twitter 103307 demos

Calculate the count and term frequency of each n-gram within each source: tf = n / total, where n is the n-gram's count in a source and total is the number of n-grams in that source. For example, a bigram occurring twice among 831,714 Twitter bigrams has tf = 2/831714 ≈ 0.0000024.

totaltf <- function(x) {
  ntf <- x %>%
    count(group, word, sort = TRUE)
  total <- ntf %>%
    group_by(group) %>%
    summarize(total = sum(n))
  ntf <- left_join(ntf, total)
  ntf <- mutate(ntf, tf = n / total)
  return(ntf)
}

TBN1 <- totaltf(tbn1)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
TBN2 <- totaltf(tbn2)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
TBN3 <- totaltf(tbn3)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
#join all three tibbles in one object
TBN <- bind_rows("ngram1" = TBN1, "ngram2" = TBN2, "ngram3" = TBN3, .id = "ngram")
TBN <- TBN[!is.na(TBN$word), ] # remove NA n-grams produced by lines too short to tokenize
sample_n(TBN, 10)
## # A tibble: 10 x 6
##    ngram  group   word                       n   total          tf
##    <chr>  <chr>   <chr>                  <int>   <int>       <dbl>
##  1 ngram2 blogs   days sleeping              1 1032409 0.000000969
##  2 ngram2 news    dinners also               1   77771 0.0000129  
##  3 ngram3 twitter love you monique           1  723833 0.00000138 
##  4 ngram3 blogs   create image tan           1  991111 0.00000101 
##  5 ngram3 twitter i broke yoyo               1  723833 0.00000138 
##  6 ngram2 twitter listen avd                 1  831714 0.00000120 
##  7 ngram3 blogs   grooved elements often     1  991111 0.00000101 
##  8 ngram2 twitter facebook my                2  831714 0.00000240 
##  9 ngram3 blogs   artists parents liked      1  991111 0.00000101 
## 10 ngram2 blogs   go bump                    2 1032409 0.00000194

Now we select the five n-grams with the highest term frequency within each source and n-gram size, and plot them.

TBN_High <- TBN %>%
  group_by(group, ngram) %>%
  top_n(5)
## Selecting by tf
ngram_plot <- ggplot(TBN_High, aes(x = word, y = tf)) +
  geom_bar(stat = "identity", width = 0.6) +
  labs(title = "1, 2, and 3 ngrams by source", x = "Words", y = "Term Frequency") +
  facet_grid(~ ngram + group, scales = "free", space = "free_y") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
  theme(axis.text.y = element_text(angle = -90, hjust = 0))
ngram_plot
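
If the figure is needed outside this report it can be written to disk; the filename and dimensions below are assumptions:

# Save the faceted n-gram plot (filename and size are illustrative)
ggsave("ngram_frequencies.png", ngram_plot, width = 12, height = 6)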

The plots show the distribution of the five most frequent n-grams for each source. Now it's time to plan how to create a prediction model.
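
One way to turn the trigram table into a first prediction step is to index each trigram by its two-word prefix and keep the most frequent continuation. A minimal sketch, assuming the tidyr package is available for separate(); the query prefix "happy new" is hypothetical:

library(tidyr)

# Split each trigram into a two-word prefix and a predicted third word,
# then keep the most frequent continuation per prefix. Counts here are
# per source; a production model would aggregate across sources first.
tri_lookup <- TBN3 %>%
  filter(!is.na(word)) %>%
  separate(word, into = c("w1", "w2", "w3"), sep = " ") %>%
  mutate(prefix = paste(w1, w2)) %>%
  arrange(desc(n)) %>%
  group_by(prefix) %>%
  slice(1) %>%
  ungroup()

# Hypothetical usage: most likely word following "happy new"
tri_lookup %>% filter(prefix == "happy new") %>% pull(w3)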