This document presents exploratory data analysis of the text data that can be downloaded from here. The main objectives of this assignment are to build a sample corpus, construct 2-gram and 3-gram term-document matrices (TDMs), and carry out exploratory analysis on the corpus.
The following packages are required:
library(NLP)
library(ggplot2)
library(tm)
library(textmineR)
library(wordcloud)
library(stringr)
library(RWeka)
library(SnowballC)
library(data.table)
Reading text files:
dataBlogEn <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
dataNewsEn <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
dataTwitterEn <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
Number of lines in the English blogs file: 899288.
Number of lines in the English news file: 77259.
Number of lines in the English Twitter file: 2360148.
Number of words in the English blogs file: 37873924.
Number of words in the English news file: 2662071.
Number of words in the English Twitter file: 30555635.
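For reference, these counts can be reproduced along the following lines (a minimal sketch, shown for the blog file; length() counts lines and stringr::str_count() counts whitespace-delimited words):
#Counting lines and words (blog file shown; news and Twitter are analogous)
length(dataBlogEn)
sum(str_count(dataBlogEn, "\\S+"))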
Because the full files are too large for my computer to handle, 2000 lines are taken from each file and combined for use in this report.
sampleBlog <- readLines("en_US.blogs.txt", 2000)
sampleNews <- readLines("en_US.news.txt", 2000)
sampleTwitter <- readLines("en_US.twitter.txt", 2000)
bindData <- c(sampleBlog, sampleNews, sampleTwitter)
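Note that readLines(con, 2000) takes the first 2000 lines of each file rather than a random sample. A random sample could instead be drawn from the full vectors already read in (a sketch; the seed value is arbitrary):
#Alternative: random sampling instead of the first 2000 lines
set.seed(123)
sampleBlog <- sample(dataBlogEn, 2000)
sampleNews <- sample(dataNewsEn, 2000)
sampleTwitter <- sample(dataTwitterEn, 2000)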
All numbers, non-word characters, punctuation, extra spaces, and stop words are removed from the text, all letters are converted to lowercase, and one-letter words are dropped.
#Removing all non word and number characters
bindData <- gsub('\\W', ' ', bindData)
#Removing all numbers
bindData <- gsub('\\d', ' ', bindData)
#Converting all letters into lowercase
bindData <- tolower(bindData)
#Removing stop words
bindData <- removeWords(bindData, stopwords(kind = "en"))
#Removing all one-letter words
bindData <- gsub("\\b\\w\\b", " ", bindData)
#Collapsing runs of whitespace into a single space
bindData <- gsub("\\s{2,}", " ", bindData)
The cleaned text is turned into a corpus:
dataCorpus <- Corpus(VectorSource(bindData))
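Since the assignment calls for 2-gram and 3-gram TDMs, one way to build them is to pass an RWeka tokenizer to tm's TermDocumentMatrix (a minimal sketch; it uses a VCorpus because the default SimpleCorpus ignores custom tokenizers, and nGramTDM is an illustrative name):
#Bigram/trigram term-document matrix
biTriTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 3))
nGramTDM <- TermDocumentMatrix(VCorpus(VectorSource(bindData)),
                               control = list(tokenize = biTriTokenizer))
inspect(nGramTDM[1:5, 1:5])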
Unigram, bigram, and trigram frequency tables are built using the RWeka package. NGramTokenizer expects character input, so it is applied to the cleaned character vector bindData rather than to the corpus object:
oneGramToken <- NGramTokenizer(bindData, Weka_control(min = 1, max = 1))
oneGram <- data.frame(table(oneGramToken))
oneGram <- oneGram[order(oneGram$Freq, decreasing = TRUE),]
colnames(oneGram) <- c("Word", "Freq")
oneGram <- head(oneGram, 12)
oneGramFreqPlot <- ggplot(oneGram, aes(x=reorder(Word, Freq),y=Freq)) +
geom_bar(stat="identity", fill= "white", color='black') +
ggtitle("Unigrams Frequency") +
xlab("Words") + ylab("Frequency") +
theme_bw()+
theme(axis.text.x=element_text(angle=45, hjust=1))
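Since the wordcloud package is already loaded, the full unigram frequency table (before truncation to the top 12) can also be visualized as a word cloud (a sketch; oneGramFull is a hypothetical name for the untruncated table):
#Word cloud of the most frequent unigrams
oneGramFull <- data.frame(table(oneGramToken))
colnames(oneGramFull) <- c("Word", "Freq")
wordcloud(words = as.character(oneGramFull$Word), freq = oneGramFull$Freq,
          max.words = 100, random.order = FALSE)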
twoGramToken <- NGramTokenizer(bindData, Weka_control(min = 2, max = 2))
twoGram <- data.frame(table(twoGramToken))
twoGram <- twoGram[order(twoGram$Freq, decreasing = TRUE),]
colnames(twoGram) <- c("Word", "Freq")
twoGram <- head(twoGram, 12)
twoGramFreqPlot <- ggplot(twoGram, aes(x=reorder(Word, Freq),y=Freq)) +
geom_bar(stat="identity", fill= "white", color='black') +
ggtitle("Bigrams Frequency") +
xlab("Words") + ylab("Frequency") +
theme_bw()+
theme(axis.text.x=element_text(angle=45, hjust=1))
threeGramToken <- NGramTokenizer(bindData, Weka_control(min = 3, max = 3))
threeGram <- data.frame(table(threeGramToken))
threeGram <- threeGram[order(threeGram$Freq, decreasing = TRUE),]
colnames(threeGram) <- c("Word", "Freq")
threeGram <- head(threeGram, 12)
threeGramFreqPlot <- ggplot(threeGram, aes(x=reorder(Word, Freq),y=Freq)) +
geom_bar(stat="identity", fill= "white", color='black') +
ggtitle("Trigrams Frequency") +
xlab("Words") + ylab("Frequency") +
theme_bw()+
theme(axis.text.x=element_text(angle=45, hjust=1))
oneGramFreqPlot
twoGramFreqPlot
threeGramFreqPlot
With this basic exploratory data analysis complete, the next steps are to build a predictive text model and to develop a Shiny app.