This report presents an exploratory analysis of text data for a Natural Language Processing (NLP) next-word prediction project.

Load the required packages

Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre1.8.0_261\\")  # RWeka requires Java
install.packages("readtext")
install.packages("tm")
install.packages("wordcloud")     # word-cloud generator
install.packages("RColorBrewer")
install.packages("RWeka")
install.packages("dplyr")
install.packages("SnowballC")
install.packages("stringi")
install.packages("kableExtra")

library("SnowballC")
library(RWeka)
library("readtext")
library(tm)
## Loading required package: NLP
library(RColorBrewer)
library(wordcloud)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringi)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Read in the data

The Twitter, blogs, and news data sets are very large, so we will work with a 0.2% sample of each.

set.seed(4321)
blogs <- readLines("en_US.blogs.txt")
blogs_sample <- sample(blogs, length(blogs) * 0.002)
twit <- readLines("en_US.twitter.txt")
## Warning in readLines("en_US.twitter.txt"): line 167155 appears to contain an
## embedded nul
## Warning in readLines("en_US.twitter.txt"): line 268547 appears to contain an
## embedded nul
## Warning in readLines("en_US.twitter.txt"): line 1274086 appears to contain an
## embedded nul
## Warning in readLines("en_US.twitter.txt"): line 1759032 appears to contain an
## embedded nul
twit_sample <- sample(twit, length(twit)*.002)
news <- readLines("en_US.news.txt")
## Warning in readLines("en_US.news.txt"): incomplete final line found on
## 'en_US.news.txt'
news_sample <- sample(news, length(news) * 0.002)
all <- c(blogs_sample, twit_sample, news_sample)   # combine the three 0.2% samples
all_sample <- sample(all, length(all)*0.002)       # take a further 0.2% subsample of the combined data
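
The embedded-nul and incomplete-final-line warnings above are harmless for this analysis, but they can be suppressed at read time. A minimal sketch, assuming the same file names, using skipNul = TRUE and warn = FALSE:

# Re-read the raw files while skipping embedded nul characters;
# warn = FALSE also silences the "incomplete final line" message for the news file.
twit <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)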

Explore the data sets and determine the size, number of lines, and number of words of each.

b <- file.info("en_US.blogs.txt")$size     # file sizes in bytes
t <- file.info("en_US.twitter.txt")$size
n <- file.info("en_US.news.txt")$size
b <- round(b/1024/1024)                    # convert to MB
t <- round(t/1024/1024)
n <- round(n/1024/1024)
a <- b + t + n                             # total size in MB

lb <- length(blogs)
lt <- length(twit)
ln <- length(news)
la <- lb + lt + ln
wb <- sum(stri_count_words(blogs))
wt <- sum(stri_count_words(twit))
wn <- sum(stri_count_words(news))
wa <- wb + wt + wn
df <- data.frame("Size_in_Mb" = c(b,t,n,a), "Number_of_Lines" = c(lb,lt,ln,la), "Number_of_Words" = c(wb, wt, wn, wa))
row.names(df) <- c("Blogs", "Twitter", "News", "All")
kable(df, format = "html",  
      caption = "Table Size, # of Lines, # of Words for the 4 Datasets",
      digits = c(0, 0, 2, 3), align = "ccrr") %>%
  kable_styling(full_width = FALSE, position = "left")
Table Size, # of Lines, # of Words for the 4 Datasets

          Size_in_Mb   Number_of_Lines   Number_of_Words
Blogs            200            899288          38154238
Twitter          159           2360148          30218125
News             196             77259           2693898
All              555           3336695          71066261

Question: How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?

all_sample_corpus <- VCorpus(VectorSource(all)) # Build a corpus from the combined sample "all"
all_sample_corpus <- tm_map(all_sample_corpus, stripWhitespace) # Eliminate extra white spaces
all_sample_corpus <- tm_map(all_sample_corpus, removePunctuation) # Remove punctuations
all_sample_corpus <- tm_map(all_sample_corpus, content_transformer(tolower)) # Convert the text to lower case
all_sample_corpus <- tm_map(all_sample_corpus, removeNumbers) # Remove numbers


corpus_sample <- TermDocumentMatrix(all_sample_corpus)
corpus_sample_matrix <- as.matrix(corpus_sample)
corpus_sample_matrix <- sort(rowSums(corpus_sample_matrix),decreasing=TRUE)
corpus_df <- data.frame(word = names(corpus_sample_matrix),freq=corpus_sample_matrix)
head(corpus_df, 10)
##      word freq
## the   the 5821
## and   and 3145
## you   you 1726
## for   for 1562
## that that 1429
## with with  951
## this this  842
## was   was  825
## have have  799
## are   are  728
p<-cumsum(corpus_df$freq)/sum(corpus_df$freq)
which(p>=0.5)[1]
## [1] 263
which(p>=0.90)[1]
## [1] 7050
which(p>=0.99)[1]
## [1] 16579

We see that 263 words are needed for 50% coverage, 7,050 for 90%, and 16,579 for 99%.
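
As a quick visual check of these coverage figures, the cumulative-frequency vector p computed above can be plotted directly; this is a minimal sketch, and the reference lines are only for orientation:

# Plot cumulative coverage of word instances against dictionary size
plot(p, type = "l",
     xlab = "Number of unique words (frequency-sorted)",
     ylab = "Cumulative proportion of word instances",
     main = "Dictionary coverage")
abline(h = c(0.5, 0.90, 0.99), lty = 2, col = "grey40")   # coverage targets
abline(v = c(which(p >= 0.5)[1], which(p >= 0.90)[1], which(p >= 0.99)[1]),
       lty = 3, col = "red3")                             # words needed for each target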

Function for creating a Corpus object and cleaning the data

process_corpus <- function(corpus) { 
  corpus <- VCorpus(VectorSource(corpus))
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%                                        # Remove numbers
    tm_map(stripWhitespace) %>%                                      # Eliminate extra white space
    tm_map(removePunctuation, preserve_intra_word_dashes = TRUE) %>% # Remove punctuation
    tm_map(content_transformer(tolower))                             # Convert the text to lower case
    ## tm_map(removeWords, stopwords("en"))                          # Optionally remove stopwords
  corpus
}
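
A quick usage check of the cleaning function on the blogs sample (a hedged example; the object name blogs_clean is just for illustration, and the output is not shown here):

blogs_clean <- process_corpus(blogs_sample)   # returns a cleaned VCorpus
writeLines(as.character(blogs_clean[[1]]))    # inspect the first cleaned document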

Function for calculating n-grams

ngrams<-function(corpus, n, N=NA){
  corpus<-process_corpus(corpus)
  token<-function(corpus){
    NGramTokenizer(corpus, Weka_control(min = n, max = n))
  }
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = token))
  rsum <- rowSums(as.matrix(tdm))
  ngram<-data.frame(ngram=names(rsum),freq=rsum)
  ngram<-ngram[order(-ngram$freq),]
  if(!is.na(N)){
    ngram<-head(ngram,N)
  }
  return(ngram)
}

Plot Unigrams for all datasets

blogs_df <- ngrams(blogs_sample, 1, 50)
twit_df <- ngrams(twit_sample, 1, 50)
news_df <- ngrams(news_sample, 1, 50)
all_df <- ngrams(all_sample, 1, 50)

par(mfrow=c(2, 2))
barplot(blogs_df[1:10,]$freq, las = 3, names.arg = blogs_df[1:10,]$ngram,
        col ="red2", main ="Frequency of words in blogs",
        ylab = "Number of Occurrences")
barplot(twit_df[1:10,]$freq, las = 3, names.arg = twit_df[1:10,]$ngram,
        col ="chartreuse1", main ="Frequency of words in twitter",
        ylab = "Number of Occurrences")

barplot(news_df[1:10,]$freq, las = 3, names.arg = news_df[1:10,]$ngram,
        col ="blue3", main ="Frequency of words in the news",
        ylab = "Number of Occurrences")
barplot(all_df[1:10,]$freq, las = 3, names.arg = all_df[1:10,]$ngram,
        col ="chocolate2", main ="Frequency of words in all the Data",
        ylab = "Number of Occurrences")

par(mfrow=c(1,1))
wordcloud(words = blogs_df$ngram, freq = blogs_df$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Set1"))

wordcloud(words = twit_df$ngram, freq = twit_df$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Spectral"))

wordcloud(words = news_df$ngram, freq = news_df$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "RdYlGn"))

wordcloud(words = all_df$ngram, freq = all_df$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Paired"))

Plot Bigrams for all datasets

blogs_df <- ngrams(blogs_sample, 2, 50)
twit_df <- ngrams(twit_sample, 2, 50)
news_df <- ngrams(news_sample, 2, 50)
all_df <- ngrams(all_sample, 2, 50)

par(mfrow=c(2, 2))
barplot(blogs_df[1:10,]$freq, las = 3, names.arg = blogs_df[1:10,]$ngram,
        col ="red2", main ="Frequency of bigrams in blogs",
        ylab = "Number of Occurrences")
barplot(twit_df[1:10,]$freq, las = 3, names.arg = twit_df[1:10,]$ngram,
        col ="chartreuse1", main ="Frequency of bigrams in twitter",
        ylab = "Number of Occurrences")

barplot(news_df[1:10,]$freq, las = 3, names.arg = news_df[1:10,]$ngram,
        col ="blue3", main ="Frequency of bigrams in the news",
        ylab = "Number of Occurrences")
barplot(all_df[1:10,]$freq, las = 3, names.arg = all_df[1:10,]$ngram,
        col ="chocolate2", main ="Frequency of bigrams in all the Data",
        ylab = "Number of Occurrences")

par(mfrow=c(1,1))
wordcloud(words = blogs_df$ngram, freq = blogs_df$freq, min.freq = 1,
          max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Set1"))

wordcloud(words = twit_df$ngram, freq = twit_df$freq, min.freq = 1,
          max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Spectral"))

wordcloud(words = news_df$ngram, freq = news_df$freq, min.freq = 1,
          max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "RdYlGn"))

wordcloud(words = all_df$ngram, scale=c(1,1), freq = all_df$freq, min.freq = 1,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Paired"))

Plot Trigrams for all datasets

blogs_df <- ngrams(blogs_sample, 3, 50)
twit_df <- ngrams(twit_sample, 3, 50)
news_df <- ngrams(news_sample, 3, 50)
all_df <- ngrams(all_sample, 3, 100)

par(mfrow=c(2, 2))
barplot(blogs_df[1:10,]$freq, las = 3, names.arg = blogs_df[1:10,]$ngram,
        col ="red2", main ="Frequency of trigrams in blogs",
        ylab = "Number of Occurrences")
barplot(twit_df[1:10,]$freq, las = 3, names.arg = twit_df[1:10,]$ngram,
        col ="chartreuse1", main ="Frequency of trigrams in twitter",
        ylab = "Number of Occurrences")

barplot(news_df[1:10,]$freq, las = 3, names.arg = news_df[1:10,]$ngram,
        col ="blue3", main ="Frequency of trigrams in the news",
        ylab = "Number of Occurrences")
barplot(all_df[1:10,]$freq, las = 3, names.arg = all_df[1:10,]$ngram,
        col ="chocolate2", main ="Frequency of trigrams in all the Data",
        ylab = "Number of Occurrences")

par(mfrow=c(1,1))
wordcloud(words = blogs_df$ngram, freq = blogs_df$freq, min.freq = 1,
          max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Set1"))

wordcloud(words = twit_df$ngram, freq = twit_df$freq, min.freq = 1,
          max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Spectral"))

wordcloud(words = news_df$ngram, freq = news_df$freq, min.freq = 1,
          max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "RdYlGn"))

wordcloud(words = all_df$ngram, freq = all_df$freq, min.freq = 3,
          max.words=100, scale=c(.5,.5), random.order=FALSE, rot.per=.35, 
          colors=brewer.pal(8, "Paired"))

Future Considerations

1. The data sets are still very large (especially the "all" set), so a smaller sample may be needed in the future.
2. Should profanities be removed? I decided not to remove them because
   - none of them appeared among the top unigrams, bigrams, or trigrams, and
   - they might be useful for prediction.
3. Foreign words were also not prominent. In the future we might need an English dictionary to filter out foreign words.
4. Build a training set and a test set to use for cross-validation (a minimal split is sketched below).
5. Build a prediction model that predicts the next word based on n-grams.
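
For item 4, a minimal sketch of a line-level training/test split on the combined sample; the 80/20 proportion is an assumption, not something fixed in this report:

# Split the combined sample into training and test sets for later cross-validation
set.seed(4321)
train_idx <- sample(seq_along(all), size = floor(0.8 * length(all)))  # 80% of lines (assumed split)
train_set <- all[train_idx]
test_set  <- all[-train_idx]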