=============== SONJA OFFWOOD
To start the analysis, we load the data and perform some exploratory analysis on it.
# NOTE(review): hard-coded absolute path; kept so that any later code relying
# on the working directory keeps working. Consider a project-relative path.
setwd("C:/Users/SJD/OneDrive - FRG/Own/Data Science/John Hopkins Specialisation/Capstone project/final/en_US")

# Read every line of a text file into a character vector marked as UTF-8.
#
# The connection is opened in binary mode ("rb"). In text mode on Windows an
# embedded 0x1A (Ctrl-Z) byte is treated as end-of-file, so readLines() stops
# early and the result is silently truncated. readLines() still accepts LF,
# CRLF and CR line endings in binary mode.
#
# path: file name, resolved against the current working directory.
# Returns a character vector with one element per line.
read_text_lines <- function(path) {
  con <- file(path, "rb")
  # Close the connection even when readLines() fails.
  on.exit(close(con), add = TRUE)
  # skipNul drops embedded NUL bytes; warn = FALSE silences only the
  # "missing final EOL" / "embedded nul" warnings instead of all warnings.
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}

doc_twitter <- read_text_lines("en_US.twitter.txt")
doc_blogs <- read_text_lines("en_US.blogs.txt")
doc_news <- read_text_lines("en_US.news.txt")
We want to see how many lines each text source has, and then the number of words and characters in each of the text sources. It is also useful to know the maximum number of words in a single line of each source.
# Number of lines in each source.
len_twitter <- length(doc_twitter)
len_blogs <- length(doc_blogs)
len_news <- length(doc_news)

# Words per line, where a "word" is anything between single spaces.
# Each source is tokenised once and the result reused for totals and maxima.
words_twitter <- lengths(strsplit(doc_twitter, " "))
words_blogs <- lengths(strsplit(doc_blogs, " "))
words_news <- lengths(strsplit(doc_news, " "))

# Total words per source.
wordcount_twitter <- sum(words_twitter)
wordcount_blogs <- sum(words_blogs)
wordcount_news <- sum(words_news)

# Total characters per source; nchar() counts characters directly instead of
# splitting every line into a list of single-character strings.
char_twitter <- sum(nchar(doc_twitter, type = "chars"))
char_blogs <- sum(nchar(doc_blogs, type = "chars"))
char_news <- sum(nchar(doc_news, type = "chars"))

# Longest line (in words) per source.
max_twitter <- max(words_twitter)
max_blogs <- max(words_blogs)
max_news <- max(words_news)

# One row per source, one column per statistic.
text_summary <- data.frame(
  fileName = c("Twitter", "Blogs", "News"),
  line_count = c(len_twitter, len_blogs, len_news),
  word_count = c(wordcount_twitter, wordcount_blogs, wordcount_news),
  char_count = c(char_twitter, char_blogs, char_news),
  max_length = c(max_twitter, max_blogs, max_news)
)
text_summary
## fileName line_count word_count char_count max_length
## 1 Twitter 2360148 30373583 162096241 47
## 2 Blogs 899288 37334131 206824505 6630
## 3 News 77259 2643969 15639408 1031
# Drop the intermediate per-source statistics now that they are stored in
# text_summary.
rm(
  len_twitter, len_blogs, len_news,
  words_twitter, words_blogs, words_news,
  char_twitter, char_blogs, char_news,
  max_twitter, max_blogs, max_news
)
Because of the large amount of data available, we need to sample a smaller selection of the existing text to run further analysis. To do this, we take a fixed proportion p of the lines from each of the three data sets (p = 0.001, i.e. 0.1%, in the code below) and combine them into the final set to be used.
# Fixed seed so the random sample is reproducible.
set.seed(1234)

# Fraction of lines kept from each source.
p <- 0.001

# Sample each source in turn (Twitter, Blogs, News) and concatenate the
# samples into one character vector.
full_data <- unlist(
  lapply(
    list(doc_twitter, doc_blogs, doc_news),
    function(lines) sample(lines, p * length(lines))
  ),
  use.names = FALSE
)

# The full texts are no longer needed.
rm(doc_twitter, doc_blogs, doc_news)
To be able to perform some data transformations, we need to load the tm library.
library(NLP)
library(tm)
After the data is saved into a Corpus, the following preprocessing steps are performed on the data: 1. The punctuation is removed from the text 2. All numbers are removed 3. Convert all text to lower case letters 4. Remove stopwords from the text 5. Strip white spaces from the text
# Build the corpus from the sampled lines and apply the cleaning steps in
# order: strip punctuation, strip numbers, lower-case, drop English stop
# words, collapse whitespace.
# NOTE(review): punctuation is removed before stop words, so contractions
# such as "don't" become "dont" and presumably no longer match
# stopwords("english") -- confirm whether this is intended.
my_corpus <- Reduce(
  function(corpus, step) step(corpus),
  list(
    function(corpus) tm_map(corpus, removePunctuation),
    function(corpus) tm_map(corpus, removeNumbers),
    function(corpus) tm_map(corpus, content_transformer(tolower)),
    function(corpus) tm_map(corpus, removeWords, stopwords("english")),
    function(corpus) tm_map(corpus, stripWhitespace)
  ),
  VCorpus(VectorSource(full_data))
)
We first load the libraries required for this section of the analysis.
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
In order to understand more about the word frequencies, we create a document term matrix which is then used for further analysis. We start off the analysis by plotting a wordcloud of the most frequent words.
# Document-term matrix of the cleaned corpus.
dtm <- DocumentTermMatrix(my_corpus)

# Total count of each term across all documents (column sums of the dense
# matrix), named by term.
words_frequency <- colSums(as.matrix(dtm))

# Word cloud of the terms, capped at 200 words and drawn in decreasing
# frequency order; warnings from wordcloud() are suppressed.
suppressWarnings(
  wordcloud(
    words = names(words_frequency),
    freq = words_frequency,
    min.freq = 1,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.4,
    colors = brewer.pal(3, "Dark2")
  )
)
We next want to find the 20 most frequent words used in the dataset after the transformations performed.
# Positions of the terms from most to least frequent.
ord <- order(words_frequency, decreasing = TRUE)

# The 20 most frequent terms with their counts.
top20 <- head(words_frequency[ord], 20)

# Data frame for plotting; the factor levels follow the frequency order so
# the bars are drawn in that order rather than alphabetically.
top20_data <- data.frame(
  words = factor(names(top20), levels = names(top20)),
  freq = top20
)

# Bar chart of the top-20 term frequencies.
g <- ggplot(data = top20_data, aes(x = words, y = freq)) +
  geom_bar(stat = "identity", color = "blue", fill = "blue") +
  labs(x = "word", y = "frequency")
g
To make this relevant to the Capstone project, we will want to have an understanding of which words generally follow each other. For this reason we want to look at the ngrams of the data and not only at the individual words. To do a similar analysis as what we did above, we will need to create tokenizers to pass into the document term matrix function.
library(RWeka)
# Build a tokenizer function that passes min = max = n to RWeka's
# NGramTokenizer, i.e. requests n-grams of exactly n words.
make_ngram_tokenizer <- function(n) {
  force(n)
  function(x) {
    NGramTokenizer(x, Weka_control(min = n, max = n))
  }
}

# Tokenizers for 2-, 3- and 4-word sequences.
BigramTokenizer <- make_ngram_tokenizer(2)
TrigramTokenizer <- make_ngram_tokenizer(3)
FourgramTokenizer <- make_ngram_tokenizer(4)
# Document-term matrices whose terms are 2-, 3- and 4-grams respectively,
# built from the same cleaned corpus with the matching tokenizer.
bigram <- DocumentTermMatrix(
  my_corpus,
  control = list(tokenize = BigramTokenizer)
)
trigram <- DocumentTermMatrix(
  my_corpus,
  control = list(tokenize = TrigramTokenizer)
)
fourgram <- DocumentTermMatrix(
  my_corpus,
  control = list(tokenize = FourgramTokenizer)
)
Below we have a look at the top 20 bigrams on our dataset.
# Total count of each bigram across all documents.
bigram_frequency <- colSums(as.matrix(bigram))

# Positions of the bigrams from most to least frequent.
bigram_ord <- order(bigram_frequency, decreasing = TRUE)

# The 20 most frequent bigrams with their counts.
bigram_top20 <- head(bigram_frequency[bigram_ord], 20)

# Data frame for plotting; factor levels follow the frequency order so the
# bars are not re-sorted alphabetically.
bigram_top20_data <- data.frame(
  words = factor(names(bigram_top20), levels = names(bigram_top20)),
  freq = bigram_top20
)

# Bar chart of the top-20 bigrams; x labels are rotated 90 degrees so the
# two-word labels do not overlap.
g <- ggplot(data = bigram_top20_data, aes(x = words, y = freq)) +
  geom_bar(stat = "identity", color = "red", fill = "red") +
  labs(x = "word", y = "frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
g
The current plan for the model is to return the word which has the highest frequency in the test set following the n words typed by the user. For this it is important not to remove the stop words when transforming the data, as these will be critical in the prediction. The steps envisaged are set out below, assuming the user types three words: 1. Find the word with the highest frequency following the three typed words in the 4-gram dictionary 2. If no match is found, find the word with the highest frequency following the last two words typed in the 3-gram dictionary 3. If still no match is found, find the word with the highest frequency following the last word typed in the 2-gram dictionary 4. To the extent there is still no match, return a dummy word