The first step in this project is to understand the distribution of, and relationships between, the words, tokens, and phrases in the text. This report builds a word frequency table, constructs 1-gram, 2-gram, and 3-gram term-document matrices, and conducts an exploratory analysis of the words.
The training data is downloaded from the link below:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
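If the files are not already on disk, the download and extraction can also be scripted; a minimal sketch in R (the local file and folder names here are illustrative):
# Download and unzip the training data if it is not already on disk
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) {
    download.file(zip_url, destfile = zip_file, mode = "wb")
}
unzip(zip_file, exdir = "Coursera_SwiftKey")  # extracts the final/en_US folder used below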
The zip file contains the following files:
1. en_US.blogs.txt
2. en_US.news.txt
3. en_US.twitter.txt
First, load the packages that will be used in this report:
library(tm)
library(NLP)
library(ggplot2)
library(RWeka)
library(ngram)
library(slam)
Read in the original data files and check the number of lines and file size of each:
## Read in Original Datasets
setwd("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/")
twitter <- readLines(con <- file("en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
news <- readLines(con <- file("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
blogs <- readLines(con <- file("C:/Users/i58197/Desktop/Coursera_SwiftKey/final/en_US/en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)
## Check number of lines
twitter_len <- length(twitter)
news_len <- length(news)
blogs_len <- length(blogs)
## Check file size
twitter_size <- file.size("en_US.twitter.txt")/1024^2
news_size <- file.size("en_US.news.txt")/1024^2
blogs_size <- file.size("en_US.blogs.txt")/1024^2
summary_df <- data.frame(c(twitter_len, news_len, blogs_len),
                         c(twitter_size, news_size, blogs_size),
                         row.names = c("twitter", "news", "blogs"))
colnames(summary_df) <- c("Number of Lines", "File Size (MB)")
summary_df
##         Number of Lines File Size (MB)
## twitter         2360148       159.3641
## news              77259       196.2775
## blogs            899288       200.4242
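As an additional rough measure of scale, approximate word counts can be estimated with a simple whitespace split (an estimate only, not the tokenization used later):
# Approximate word counts per dataset via whitespace splitting (rough estimate)
twitter_words <- sum(lengths(strsplit(twitter, "\\s+")))
news_words <- sum(lengths(strsplit(news, "\\s+")))
blogs_words <- sum(lengths(strsplit(blogs, "\\s+")))
c(twitter = twitter_words, news = news_words, blogs = blogs_words)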
The amount of data in the original datasets is large, so a 10% subsample of each is used in the following analysis. The samples are saved in separate txt files for repeated use later.
# Chosen sample size
sample_size <- 0.1
# Set seed so the subsamples are reproducible
set.seed(1234)
# Creating subsets
twitter_index <- sample(seq_len(length(twitter)), floor(length(twitter) * sample_size))
news_index <- sample(seq_len(length(news)), floor(length(news) * sample_size))
blogs_index <- sample(seq_len(length(blogs)), floor(length(blogs) * sample_size))
twitter_sub <- twitter[twitter_index]
writeLines(twitter_sub, con = "twitter_sub.txt")
news_sub <- news[news_index]
writeLines(news_sub, con = "news_sub.txt")
blogs_sub <- blogs[blogs_index]
writeLines(blogs_sub, con = "blogs_sub.txt")
Create a corpus and clean it by removing profanity and special symbols such as punctuation and numbers. The profanity word list is downloaded from: https://www.cs.cmu.edu/~biglou/resources/
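The list is read from a local file in the code below; a minimal sketch for fetching it first (the file name on that page is assumed to be bad-words.txt):
# Fetch the profanity list if it is not already saved locally (file name assumed)
if (!file.exists("C:/Users/i58197/Desktop/bad_words.txt")) {
    download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt",
                  destfile = "C:/Users/i58197/Desktop/bad_words.txt", mode = "wb")
}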
## Read in sub-sample txt files for twitter, news and blogs
corpus.folder <- "C:/Users/i58197/Desktop/Sub/"
corpus <- VCorpus(DirSource(corpus.folder, encoding = "UTF-8"))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Remove extra whitespace
corpus <- tm_map(corpus, stripWhitespace)
# Remove profanity words
profanity <- readLines("C:/Users/i58197/Desktop/bad_words.txt")
corpus <- tm_map(corpus, removeWords, profanity)
# Remove non-ASCII characters
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")
corpus <- tm_map(corpus, content_transformer(removeNonASCII))
# Convert to lowercase
corpus <- tm_map(corpus, content_transformer(tolower))
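A quick sanity check that the cleaning steps took effect, peeking at the first cleaned document:
# Peek at the first two lines of the first cleaned document
head(content(corpus[[1]]), 2)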
Now we can analyze word frequencies for 1-grams, 2-grams, and 3-grams:
## 1-gram
corpus_tdm_1 <- TermDocumentMatrix(corpus)
wordMatrix_1 <- as.data.frame(as.matrix(corpus_tdm_1))
wordSort_1 <- sort(rowSums(wordMatrix_1), decreasing = TRUE)
result_1 <- data.frame(word = names(wordSort_1), freq = wordSort_1)
top_30_result_1 <- result_1[1:30, ]
top_30_result_1
## word freq
## the the 294960
## and and 159416
## you you 85658
## for for 77885
## that that 72505
## with with 47765
## this this 43159
## was was 41486
## have have 39716
## are are 36174
## but but 33937
## not not 30597
## your your 27441
## all all 26739
## just just 25329
## from from 24361
## its its 24160
## out out 22898
## like like 22739
## what what 22676
## they they 21894
## will will 21552
## one one 21263
## about about 21241
## when when 19342
## can can 19306
## get get 18763
## time time 16712
## more more 16256
## there there 15861
ggplot(data=top_30_result_1, aes(x=word, y=freq, fill=freq)) + geom_bar(stat="identity") + guides(fill=FALSE) + theme(axis.text.x=element_text(angle=90))
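Because result_1 is sorted by frequency, it can also be used to gauge how concentrated the vocabulary is, for example how many unique words are needed to cover 50% and 90% of all word instances in the sample (a rough sketch):
# Cumulative coverage: unique words needed to cover a given share of all word instances
coverage <- cumsum(result_1$freq) / sum(result_1$freq)
words_50 <- which(coverage >= 0.5)[1]   # words covering 50% of instances
words_90 <- which(coverage >= 0.9)[1]   # words covering 90% of instances
c(cover_50 = words_50, cover_90 = words_90)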
## 2-gram
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus_tdm_2 <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
wordMatrix_2 <- as.data.frame(as.matrix(corpus_tdm_2))
wordSort_2 <- sort(rowSums(wordMatrix_2), decreasing = TRUE)
result_2 <- data.frame(word = names(wordSort_2), freq = wordSort_2)
top_30_result_2 <- result_2[1:30, ]
top_30_result_2
## word freq
## of the of the 25910
## in the in the 24764
## for the for the 13833
## to the to the 13831
## on the on the 12967
## to be to be 11841
## at the at the 8874
## i have i have 7938
## and the and the 7828
## i was i was 7669
## is a is a 7566
## in a in a 7269
## i am i am 7193
## and i and i 7088
## it was it was 6911
## it is it is 6766
## for a for a 6594
## with the with the 6575
## if you if you 6519
## going to going to 6060
## have a have a 5969
## is the is the 5617
## will be will be 5523
## to get to get 5502
## from the from the 5287
## i dont i dont 5132
## that i that i 5097
## want to want to 5082
## one of one of 4992
## with a with a 4930
ggplot(data=top_30_result_2, aes(x=word, y=freq, fill=freq)) + geom_bar(stat="identity") + guides(fill=FALSE) + theme(axis.text.x=element_text(angle=90))
## 3-gram
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus_tdm_3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
wordMatrix_3 <- as.data.frame(as.matrix(corpus_tdm_3))
wordSort_3 <- sort(rowSums(wordMatrix_3), decreasing = TRUE)
result_3 <- data.frame(word = names(wordSort_3), freq = wordSort_3)
top_30_result_3 <- result_3[1:30, ]
top_30_result_3
## word freq
## thanks for the thanks for the 2330
## one of the one of the 2129
## a lot of a lot of 1933
## i want to i want to 1323
## to be a to be a 1291
## going to be going to be 1245
## it was a it was a 1041
## cant wait to cant wait to 1040
## i have a i have a 1036
## i have to i have to 1021
## looking forward to looking forward to 1018
## thank you for thank you for 995
## i dont know i dont know 969
## be able to be able to 961
## out of the out of the 952
## the end of the end of 942
## i love you i love you 941
## i need to i need to 913
## im going to im going to 912
## the rest of the rest of 881
## some of the some of the 869
## as well as as well as 812
## for the follow for the follow 806
## one of my one of my 805
## you want to you want to 786
## this is a this is a 764
## is going to is going to 736
## a couple of a couple of 729
## to go to to go to 720
## the fact that the fact that 707
ggplot(data=top_30_result_3, aes(x=word, y=freq, fill=freq)) + geom_bar(stat="identity") + guides(fill=FALSE) + theme(axis.text.x=element_text(angle=90))
For the final text modelling and prediction application, the next steps are:
1. N-gram modelling of the full text datasets (a naive lookup sketch based on the tables above follows this list).
2. Optimize the model for low memory utilization and faster processing.
3. Implement the model as a Shiny app.
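As a rough illustration of step 1 (not the final model), a naive lookup that returns the most frequent completions of a two-word prefix from the trigram frequency table built above; predict_next is a hypothetical helper name:
# Naive next-word lookup from the trigram frequency table (illustration only)
predict_next <- function(prefix, trigram_freq = result_3, n = 3) {
    prefix <- tolower(trimws(prefix))
    # keep trigrams whose first two words match the prefix
    hits <- trigram_freq[grepl(paste0("^", prefix, " "), trigram_freq$word), ]
    if (nrow(hits) == 0) return(character(0))
    # the predicted word is the last token of each of the top matching trigrams
    sub(".* ", "", head(as.character(hits$word), n))
}
# Example: most frequent words following "thanks for"
predict_next("thanks for")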