The objective of this project is to perform exploratory analysis of text data.
The data used in the project consists of 3 files of US English natural-language text:
* blogs
* news
* twitter
library(tm)        # text-mining framework (Corpus, tm_map, removeWords)
library(corpus)
library(quanteda)
library(stringi)   # fast string utilities (word counts)
library(ngram)     # n-gram construction and phrase tables
library(SnowballC) # stemming backend for stemDocument
library(ggplot2)   # plots of the top n-grams
library(lexicon)   # provides the profanity_banned word list
We subset the data to a smaller sample for the analysis because the raw files are very large, and a small random sample is enough to draw inferences for this exploratory analysis.
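As a quick check of how large the raw files are before sampling, the file sizes can be inspected directly (a minimal sketch; it assumes the files sit under ./final/en_US/ as in the readLines() calls below).
files <- c("./final/en_US/en_US.blogs.txt",
           "./final/en_US/en_US.news.txt",
           "./final/en_US/en_US.twitter.txt")
round(file.size(files)/1024^2, 1) # sizes in megabytes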
Loading the data
twitter1<-readLines("./final/en_US/en_US.twitter.txt", skipNul = TRUE)
blogs1<- readLines("./final/en_US/en_US.blogs.txt",skipNul = TRUE)
news1<- readLines("./final/en_US/en_US.news.txt",skipNul = TRUE)
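The news file contains a control character that can make readLines() stop early on some platforms; if the reported line count for news looks low, a common workaround is to read it through a binary connection (a sketch, not applied above).
con <- file("./final/en_US/en_US.news.txt", open = "rb")
news1 <- readLines(con, skipNul = TRUE)
close(con)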
Summary statistics of data
Blogs.word<-stri_count_words(blogs1)
News.word<-stri_count_words(news1)
Twitter.word<-stri_count_words(twitter1)
data.frame(Data_Source = c("Blogs", "News", "Twitter"),
num_of_lines = c(length(blogs1), length(news1), length(twitter1)),
num_of_words = c(sum(Blogs.word), sum(News.word), sum(Twitter.word)))
## Data_Source num_of_lines num_of_words
## 1 Blogs 899288 38154238
## 2 News 77259 2693898
## 3 Twitter 2360148 30218166
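Another summary that is often useful at this stage is the length of the longest line in each source, which stri_length() from stringi makes easy to compute (a sketch, not run above).
data.frame(Data_Source = c("Blogs", "News", "Twitter"),
           max_line_chars = c(max(stri_length(blogs1)),
                              max(stri_length(news1)),
                              max(stri_length(twitter1))))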
Subsetting the data
set.seed(111)
twitter<- sample(twitter1, round(length(twitter1)*0.01)) # 1% random sample of lines
blogs<- sample(blogs1, round(length(blogs1)*0.01))
news<- sample(news1, round(length(news1)*0.01))
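An equivalent way to draw roughly 1% of the lines is to flag each line with an independent coin flip via rbinom(); the sample size then varies slightly from run to run (a sketch, not used above; twitter_alt is a hypothetical name).
keep <- rbinom(length(twitter1), size = 1, prob = 0.01) == 1
twitter_alt <- twitter1[keep]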
Combining the 3 documents
data<- c(twitter,blogs,news)
data<- Corpus(VectorSource(list(data))) # wrapping in list() treats the combined sample as a single document
Cleaning the data and removing profanity words
data<- tm_map(data, content_transformer(tolower)) # lower-case all text
data<- tm_map(data, removePunctuation)
data<- tm_map(data, stripWhitespace)
data<- tm_map(data, removeWords, profanity_banned) # remove profanity words (list from the lexicon package)
data<- tm_map(data, removeWords, stopwords("en")) # remove English stop words
data<- tm_map(data, stemDocument) # stem words to their roots
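Two further cleaning steps that are often applied at this point, though not used above, are removing numbers and stripping URLs (a sketch; the URL pattern is only illustrative).
data<- tm_map(data, removeNumbers)
data<- tm_map(data, content_transformer(function(x) gsub("http\\S+", "", x)))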
Writing the corpus to a new file
writeCorpus(data, filenames="Corpusdata.txt")
data1 <- readLines("Corpusdata.txt")
Creating n-grams and data frames
#unigram
unigram<-ngram(data1, n=1)
unigram.df<-data.frame(get.phrasetable(unigram)[,1:2])
#bigram
bigram<-ngram(data1, n=2)
bigram.df<-data.frame(get.phrasetable(bigram)[,1:2])
#trigram
trigram<-ngram(data1, n=3)
trigram.df<-data.frame(get.phrasetable(trigram)[,1:2])
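The raw phrase tables can also be inspected directly before plotting; get.phrasetable() additionally returns a proportion column that is dropped in the data frames above (a quick usage example).
head(get.phrasetable(trigram), 10) # ten most frequent tri-grams with their counts and proportions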
Unigram
ggplot(unigram.df[1:20,], aes(reorder(ngrams, -freq), freq, colour = freq, fill = freq)) +
  geom_col() + theme_light() +
  labs(title = "Top 20 Uni-gram phrases", y = "Frequency", x = "Uni-grams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Bigram
ggplot(bigram.df[1:20,], aes(reorder(ngrams, -freq), freq, colour = freq, fill = freq)) +
  geom_col() + theme_light() +
  labs(title = "Top 20 Bi-gram phrases", y = "Frequency", x = "Bi-grams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Trigram
ggplot(trigram.df[1:20,], aes(reorder(ngrams, -freq), freq, colour = freq, fill = freq)) +
  geom_col() + theme_light() +
  labs(title = "Top 20 Tri-gram phrases", y = "Frequency", x = "Tri-grams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Based on the analysis performed above, next-word prediction for a sentence can be done well with a tri-gram NLP prediction algorithm: given the previous two words, the algorithm returns the word with the highest occurrence probability after them.
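As an illustration of that idea, a minimal lookup over the tri-gram table built above could work as follows (a sketch; predict_next is a hypothetical helper, and it assumes trigram.df keeps the ngrams and freq columns from get.phrasetable()).
predict_next <- function(w1, w2, trigrams = trigram.df) {
  parts <- strsplit(trimws(trigrams$ngrams), "\\s+") # split each tri-gram into its three words
  first <- sapply(parts, `[`, 1)
  second <- sapply(parts, `[`, 2)
  third <- sapply(parts, `[`, 3)
  hits <- which(first == w1 & second == w2) # tri-grams starting with the two given words
  if (length(hits) == 0) return(NA_character_)
  third[hits[which.max(trigrams$freq[hits])]] # most frequent continuation
}
predict_next("one", "of") # e.g. might return "the" if "one of the" is the most frequent match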