Data science capstone project

The objective of this project is to perform exploratory analysis of text data.

The data used in the project consists of three files of US English natural-language text:

* blogs
* news
* twitter

Loading required packages for analysis

library(tm)         # text-mining framework: Corpus, tm_map, removeWords
library(corpus)
library(quanteda)
library(stringi)    # fast string utilities: stri_count_words
library(ngram)      # n-gram construction: ngram, get.phrasetable
library(SnowballC)  # stemming backend used by stemDocument
library(ggplot2)    # plotting
library(lexicon)    # profanity_banned word list

Loading data in R

Because the raw files are too large to work with comfortably, we subset the data to a smaller sample; a random sample of about 1% of the lines is enough for this exploratory analysis.

Loading the data

twitter1<-readLines("./final/en_US/en_US.twitter.txt", skipNul = TRUE)
blogs1<- readLines("./final/en_US/en_US.blogs.txt",skipNul = TRUE)
news1<- readLines("./final/en_US/en_US.news.txt",skipNul = TRUE)
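Note that on some platforms readLines() stops early on en_US.news.txt because of an embedded control character; the comparatively small news line count in the summary below may be a symptom of this. A possible workaround (an assumption, not part of the run above) is to read that file through a binary connection:

con <- file("./final/en_US/en_US.news.txt", open = "rb")  # binary mode avoids the early end-of-file
news1 <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)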

Summary statistics of data

Blogs.word<-stri_count_words(blogs1)
News.word<-stri_count_words(news1)
Twitter.word<-stri_count_words(twitter1)
data.frame(Data_Source = c("Blogs", "News", "Twitter"),
           num_of_lines = c(length(blogs1), length(news1), length(twitter1)),
           num_of_words = c(sum(Blogs.word), sum(News.word), sum(Twitter.word)))
##   Data_Source num_of_lines num_of_words
## 1       Blogs       899288     38154238
## 2        News        77259      2693898
## 3     Twitter      2360148     30218166
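Since the motivation for sampling is file size, the summary could also report the size of each raw file; a small sketch using the same paths as above:

files <- c(Blogs   = "./final/en_US/en_US.blogs.txt",
           News    = "./final/en_US/en_US.news.txt",
           Twitter = "./final/en_US/en_US.twitter.txt")
data.frame(Data_Source = names(files),
           size_MB = round(file.size(files) / 1024^2, 1))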

Subsetting the data

set.seed(111)
## draw a random 1% sample of lines from each source
twitter <- sample(twitter1, floor(length(twitter1) * 0.01))
blogs <- sample(blogs1, floor(length(blogs1) * 0.01))
news <- sample(news1, floor(length(news1) * 0.01))
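A quick check (not part of the original output) that each sample is roughly 1% of its source:

length(twitter) / length(twitter1)
length(blogs) / length(blogs1)
length(news) / length(news1)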

Combining the three documents

data<- c(twitter,blogs,news)
data<- Corpus(VectorSource(list(data)))  ## wrap in list() so the corpus holds a single document

Cleaning the data and removing profanity words

data<- tm_map(data, content_transformer(tolower))   ## lower-case (content_transformer keeps the corpus structure)
data<- tm_map(data, removePunctuation)
data<- tm_map(data, stripWhitespace)
data<- tm_map(data, removeWords, profanity_banned)  ## remove profanity words (list from the lexicon package)
data<- tm_map(data, removeWords, stopwords("en"))   ## remove English stop words
data<- tm_map(data, stemDocument)                   ## stem the remaining words
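The quanteda package loaded earlier offers an alternative way to do the same cleaning. The following is only a sketch for comparison (the report itself stays with tm); sample_lines simply re-creates the raw sampled lines from above:

## quanteda-based cleaning, sketched for comparison with the tm pipeline above
sample_lines <- c(twitter, blogs, news)
toks <- tokens(sample_lines, remove_punct = TRUE)                   # tokenize, dropping punctuation
toks <- tokens_tolower(toks)                                        # lower-case
toks <- tokens_remove(toks, c(stopwords("en"), profanity_banned))   # remove stop words and profanity
toks <- tokens_wordstem(toks)                                       # stem the remaining tokens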

Writing the corpus to a new file

writeCorpus(data, filenames="Corpusdata.txt")
data1 <- readLines("Corpusdata.txt")

Creating n-grams and data frames

#unigram
unigram<-ngram(data1, n=1)
unigram.df<-data.frame(get.phrasetable(unigram)[,1:2])
#bigram
bigram<-ngram(data1, n=2)
bigram.df<-data.frame(get.phrasetable(bigram)[,1:2])
#trigram
trigram<-ngram(data1, n=3)
trigram.df<-data.frame(get.phrasetable(trigram)[,1:2])
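get.phrasetable() returns the phrases sorted by decreasing frequency, so the tables can be inspected directly before plotting:

head(unigram.df)
head(bigram.df)
head(trigram.df)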

Creating plots of the top 20 n-grams of the data

Unigram

ggplot(unigram.df[1:20,], aes(ngrams, freq, colour = freq, fill = freq)) +
  geom_col() +
  theme_light() +
  labs(title = "Top 20 Uni-gram phrases", y = "Frequency", x = "Uni-grams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Bigram

ggplot(bigram.df[1:20,], aes(ngrams, freq, colour = freq, fill = freq)) +
  geom_col() +
  theme_light() +
  labs(title = "Top 20 Bi-gram phrases", y = "Frequency", x = "Bi-grams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Trigram

ggplot(trigram.df[1:20,], aes(ngrams, freq, colour = freq, fill = freq)) +
  geom_col() +
  theme_light() +
  labs(title = "Top 20 Tri-gram phrases", y = "Frequency", x = "Tri-grams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Based on the exploratory analysis above, prediction of the next word in a sentence can be built on a tri-gram NLP model: the algorithm would suggest the most probable word given the previous two words, based on its occurrence frequency in the corpus.
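As a rough illustration of that idea (a sketch only, not the final algorithm; the helper predict_next_word and its handling of the phrase-table format are assumptions), the trigram table built above can be queried for the most frequent completion of a two-word prefix, backing off to the bigram table when no trigram matches:

## hypothetical helper: look up the most frequent n-gram completion of a prefix
## assumes trigram.df / bigram.df have `ngrams` (space-separated phrase) and `freq`
## columns sorted by decreasing frequency, as get.phrasetable() returns them
predict_next_word <- function(prev_two, tri = trigram.df, bi = bigram.df) {
  prev_two <- tolower(trimws(prev_two))
  tri_phrases <- trimws(as.character(tri$ngrams))
  hit <- tri_phrases[startsWith(tri_phrases, paste0(prev_two, " "))][1]
  if (!is.na(hit)) {
    return(tail(strsplit(hit, " ")[[1]], 1))      # third word of the best matching trigram
  }
  ## back off: keep only the last word of the prefix and use the bigram table
  last_word <- tail(strsplit(prev_two, " ")[[1]], 1)
  bi_phrases <- trimws(as.character(bi$ngrams))
  hit <- bi_phrases[startsWith(bi_phrases, paste0(last_word, " "))][1]
  if (!is.na(hit)) {
    return(tail(strsplit(hit, " ")[[1]], 1))      # second word of the best matching bigram
  }
  NA_character_
}

predict_next_word("thank you")   # illustrative call only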