Load the basic libraries needed for the analysis.
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
library(gbm)
## Warning: package 'gbm' was built under R version 3.6.3
## Loaded gbm 2.1.8
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 3.6.3
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 3.6.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.6.3
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringi)
## Warning: package 'stringi' was built under R version 3.6.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 3.6.3
print("Success")
## [1] "Success"
Read each data file into R line by line and display the line counts:
df_blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
df_news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
df_twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
paste("Number of lines in blogs: ", length(df_blogs))
## [1] "Number of lines in blogs: 899288"
paste("Number of lines in news: ", length(df_news))
## [1] "Number of lines in news: 77259"
paste("Number of lines in twitter: ", length(df_twitter))
## [1] "Number of lines in twitter: 2360148"
Because each file is so large, a 5% sample of the lines is drawn from each source; the samples are then combined into a single dataset to simplify cleaning and analysis:
# Sample and Combine data into one file
set.seed(02122021)
twit_sample <- sample(df_twitter, length(df_twitter)*.05)
news_sample <- sample(df_news, length(df_news)*.05)
blog_sample <- sample(df_blogs, length(df_blogs)*.05)
combined_sample <- c(twit_sample, blog_sample, news_sample)
paste("Number of lines in combined sample: ", length(combined_sample))
## [1] "Number of lines in combined sample: 166833"
The raw data is not clean: it contains a great deal of punctuation, numbers, and common words that carry little information for prediction.
Use the tm (text mining) package to clean the data for analysis: remove numbers, English stop words, and punctuation (preserving intra-word contractions and dashes), and strip excess whitespace. Lowercasing happens later during tokenization, and non-ASCII characters can be stripped as an optional extra step (see the sketch after the cleaned samples below).
textcleaning <- function(x) {
  textclean <- x %>%
    removeNumbers() %>%                       # drop digits
    removeWords(stopwords("english")) %>%     # drop common English stop words
    removePunctuation(preserve_intra_word_contractions = TRUE,
                      preserve_intra_word_dashes = TRUE) %>%  # keep "don't", "well-known"
    stripWhitespace()                         # collapse repeated spaces
  return(textclean)
}
twitter_clean <- textcleaning(twit_sample)
news_clean <- textcleaning(news_sample)
blog_clean <- textcleaning(blog_sample)
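The source files are UTF-8 and may still contain non-ASCII characters (emoji, curly quotes, accented letters). A minimal sketch of how these could be stripped before tokenization, using base R's iconv(); the helper name strip_nonascii is illustrative and the step is optional:
# Optional extra cleaning step: drop any character that cannot be
# represented in ASCII by converting it to an empty string.
strip_nonascii <- function(x) {
  iconv(x, from = "UTF-8", to = "ASCII", sub = "")
}
# Example: twitter_clean <- strip_nonascii(twitter_clean)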
Create tibbles with the cleaned data for analysis.
twitter_df <- tibble(line = 1:length(twitter_clean), words = twitter_clean)
blog_df <- tibble(line = 1:length(blog_clean), words = blog_clean)
news_df <- tibble(line = 1:length(news_clean), words = news_clean)
# Combine into one tibble
combined_df <- bind_rows("twitter" = twitter_df, "blogs" = blog_df, "news" = news_df, .id = "group")
The data frame must be tokenized so that individual terms can be analyzed. For this project, n-grams of one to three words are the most relevant units for prediction.
data(stop_words)
tbn1 <- combined_df %>% unnest_tokens(word, words, token = "ngrams", n = 1)
tbn1 <- tbn1 %>% anti_join(stop_words) # remove the most common English words from the unigrams
## Joining, by = "word"
tbn2 <- combined_df %>% unnest_tokens(word, words, token = "ngrams", n = 2)
tbn3 <- combined_df %>% unnest_tokens(word, words, token = "ngrams", n = 3)
sample_n(tbn1, 5)
## # A tibble: 5 x 3
## group line word
## <chr> <int> <chr>
## 1 blogs 7177 eye
## 2 twitter 21212 day
## 3 blogs 32284 rings
## 4 blogs 18027 bring
## 5 twitter 103307 demos
Calculate the count of each n-gram within each source and its term frequency, tf = n / total, where total is the number of n-grams in that source.
totaltf <- function(x) {
  # Count each n-gram within each source, most frequent first
  ntf <- x %>%
    count(group, word, sort = TRUE)
  # Total number of n-grams per source
  total <- ntf %>%
    group_by(group) %>%
    summarize(total = sum(n))
  # Term frequency = count / total for that source
  ntf <- left_join(ntf, total)
  ntf <- mutate(ntf, tf = n / total)
  return(ntf)
}
TBN1 <- totaltf(tbn1)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
TBN2 <- totaltf(tbn2)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
TBN3 <- totaltf(tbn3)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "group"
# Join all three tibbles into one object
TBN <- bind_rows("ngram1" = TBN1, "ngram2" = TBN2, "ngram3" = TBN3, .id = "ngram")
TBN <- TBN[!is.na(TBN$word), ] # remove NA n-grams produced by lines shorter than n
sample_n(TBN, 10)
## # A tibble: 10 x 6
## ngram group word n total tf
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 ngram2 blogs days sleeping 1 1032409 0.000000969
## 2 ngram2 news dinners also 1 77771 0.0000129
## 3 ngram3 twitter love you monique 1 723833 0.00000138
## 4 ngram3 blogs create image tan 1 991111 0.00000101
## 5 ngram3 twitter i broke yoyo 1 723833 0.00000138
## 6 ngram2 twitter listen avd 1 831714 0.00000120
## 7 ngram3 blogs grooved elements often 1 991111 0.00000101
## 8 ngram2 twitter facebook my 2 831714 0.00000240
## 9 ngram3 blogs artists parents liked 1 991111 0.00000101
## 10 ngram2 blogs go bump 2 1032409 0.00000194
Now select the five highest term-frequency values within each n-gram and source combination and plot them.
TBN_High <- TBN %>%
  group_by(group, ngram) %>%
  top_n(5)   # keep the 5 highest tf values within each group and n-gram
## Selecting by tf
ngram_plot <- ggplot(TBN_High, aes(x = word, y = tf)) +
  geom_bar(stat = "identity", width = 0.6) +
  labs(title = "1, 2, and 3 ngrams by source", x = "Words", y = "Term Frequency") +
  facet_grid(~ ngram + group, scales = "free", space = "free_y") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
  theme(axis.text.y = element_text(angle = -90, hjust = 0))
ngram_plot
The plots show the distribution of the five most frequent n-grams for each source. The next step is to plan how to build the prediction model.
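One possible direction is to use the n-gram counts above to look up the most likely next word given the preceding word(s). The sketch below is only an illustration built on the bigram table TBN2 (columns group, word, n, total, tf); the helper predict_next and the use of tidyr::separate() are assumptions, not part of the analysis above.
library(tidyr) # provides separate(); not loaded earlier
# Split each bigram into its first word and the word that follows it,
# then pool the counts across the three sources.
bigram_counts <- TBN2 %>%
  filter(!is.na(word)) %>%
  separate(word, into = c("w1", "w2"), sep = " ") %>%
  group_by(w1, w2) %>%
  summarize(n = sum(n), .groups = "drop")
# Illustrative helper: return the k most frequent continuations of `input`.
predict_next <- function(input, k = 3) {
  bigram_counts %>%
    filter(w1 == tolower(input)) %>%
    arrange(desc(n)) %>%
    head(k) %>%
    pull(w2)
}
# Example usage (results depend on the sampled corpus):
# predict_next("happy")
A fuller model would extend this lookup to the trigram table and back off to shorter n-grams when a longer context is not found.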