This milestone report is the first in a set of tasks in building a predictive model based on understanding the distribution and relationships among words, tokens and phrases in texts.
The text datasets are in four different languages, English, German, Finish and Russia. For this report the English text dataset will be explored and are taken from en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt made available by Coursera-SwiftKey.
The plan taken in this task to access, deploy and clean the text datasets and will undergo the following procesess:
Since the report requires writing code chunks in the R markdown documents, set echo = TRUE and results = hold as global options so that the codes can be review for analysis.
library(knitr)
opts_chunk$set(echo = TRUE, results = 'hold')
# load required packages
suppressMessages(require("tm"))
suppressMessages(require("R.utils"))
suppressMessages(require("ggplot2"))
setwd("C:/Users/Andria/Data-Science-Capstone")
1. Collecting file size information on each text dataset
file.info("final/en_US/en_US.blogs.txt")$size/1024^2
file.info("final/en_US/en_US.twitter.txt")$size/1024^2
file.info("final/en_US/en_US.news.txt")$size/1024^2
## [1] 200.4242
## [1] 159.3641
## [1] 196.2775
2. Collecting data on the number of lines in the blogs dataset
conblogs <- file("final/en_US/en_US.blogs.txt", open = "rb")
blogs <- readLines(conblogs, encoding = "UTF=8")
close(conblogs)
rm(conblogs)
# number of lines in en_US.blogs.txt dataset
summary(blogs)
## Length Class Mode
## 899288 character character
3. Collecting data on the number of lines in the twitter dataset
contwit <- file("final/en_US/en_US.twitter.txt", open = "rb")
twitter <- readLines(contwit, encoding = "UTF=8")
## Warning in readLines(contwit, encoding = "UTF=8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(contwit, encoding = "UTF=8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(contwit, encoding = "UTF=8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(contwit, encoding = "UTF=8"): line 1759032 appears to
## contain an embedded nul
close(contwit)
rm(contwit)
# number of lines in en_US.twitter.txt dataset
summary(twitter)
## Length Class Mode
## 2360148 character character
4. Collecting data on the number of lines in the news dataset
# Reading the news dataset
conews <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(conews, encoding = "UTF=8")
close(conews)
rm(conews)
# number of lines in en_US.news.txt dataset
summary(news)
## Length Class Mode
## 1010242 character character
Since the dataset is very large only a sampled dataset will we used in our analysis
blogs <- readLines("final/en_US/en_US.news.txt", 20000)
news <- readLines("final/en_US/en_US.news.txt", 20000)
twitter <- readLines("final/en_US/en_US.twitter.txt", 20000)
Available from: https://gist.github.com/jamiew/1112488
profanity <- readLines("final/banned_words.txt", skipNul = T)
## Warning in readLines("final/banned_words.txt", skipNul = T): incomplete
## final line found on 'final/banned_words.txt'
library(RWeka)
sampledData <- sample(paste(blogs, news, twitter), size = 10000, replace = TRUE)
rm(blogs, news, twitter)
corpus <- Corpus(VectorSource(sampledData))
rm(sampledData)
# Cleaning the sampledData
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, function(x) gsub('[^[:alnum:] ]', "", x))
corpus <- tm_map(corpus, function(x) gsub('[])(;:#%$^*\\~{}[&+=@/"`|<>_]+', "", x))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removeWords, profanity)
# remove profanity
rm(profanity)
cleanset into Unigrams, Bigrams and TrigramsuniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
uniGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = uniGramTokenizer))
biGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = triGramTokenizer))
freqTerms <- findFreqTerms(uniGramMatrix, lowfreq = 1000)
termFrequency <- rowSums(as.matrix(uniGramMatrix[freqTerms,]))
termFrequency <- data.frame(unigram=names(termFrequency), frequency=termFrequency)
termFrequency
## unigram frequency
## also also 1315
## can can 1490
## first first 1113
## get get 1314
## good good 1028
## just just 1684
## last last 1216
## like like 1573
## new new 1574
## now now 1124
## one one 2037
## people people 1203
## said said 5021
## state state 1058
## time time 1311
## two two 1214
## will will 2410
## year year 1305
## years years 1050
g <- ggplot(termFrequency, aes(x=reorder(unigram, frequency), y=frequency)) +
geom_bar(stat = "identity", fill = "blue") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Unigram") + ylab("Frequency") +
labs(title = "Top Unigrams by Frequency")
print(g)
freqTerms <- findFreqTerms(biGramMatrix, lowfreq = 50)
termFrequency <- rowSums(as.matrix(biGramMatrix[freqTerms,]))
termFrequency <- data.frame(bigram=names(termFrequency), frequency=termFrequency)
termFrequency
## bigram frequency
## can get can get 62
## cant wait cant wait 68
## dont know dont know 103
## dont think dont think 57
## dont want dont want 65
## even though even though 64
## every day every day 50
## feel like feel like 98
## first time first time 82
## four years four years 57
## general manager general manager 66
## health care health care 74
## high school high school 169
## im going im going 68
## last month last month 97
## last night last night 70
## last season last season 60
## last week last week 126
## last year last year 275
## little bit little bit 55
## looking forward looking forward 57
## looks like looks like 57
## los angeles los angeles 90
## make sure make sure 96
## many people many people 56
## new jersey new jersey 118
## new york new york 214
## next week next week 58
## next year next year 54
## officials said officials said 80
## percent percent percent percent 50
## police said police said 61
## right now right now 146
## said im said im 51
## said will said will 58
## san diego san diego 69
## san francisco san francisco 79
## st louis st louis 178
## supreme court supreme court 57
## three years three years 64
## two weeks two weeks 55
## two years two years 93
## united states united states 85
## white house white house 59
## years ago years ago 138
g <- ggplot(termFrequency, aes(x=reorder(bigram, frequency), y=frequency)) +
geom_bar(stat = "identity", fill = "purple") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Bigram") + ylab("Frequency") +
labs(title = "Top Bigrams by Frequency")
print(g)
freqTerms <- findFreqTerms(triGramMatrix, lowfreq = 15)
termFrequency <- rowSums(as.matrix(triGramMatrix[freqTerms,]))
termFrequency <- data.frame(trigram=names(termFrequency), frequency=termFrequency)
termFrequency
## trigram frequency
## âhe hasnât really âhe hasnât really 16
## cant wait see cant wait see 15
## east st louis east st louis 16
## four years ago four years ago 16
## long way go long way go 16
## new york city new york city 23
## new york times new york times 16
## past two years past two years 18
## president barack obama president barack obama 26
## st louis county st louis county 24
## three years ago three years ago 16
## two years ago two years ago 26
## us district court us district court 16
## us supreme court us supreme court 16
g <- ggplot(termFrequency, aes(x=reorder(trigram, frequency), y=frequency)) +
geom_bar(stat = "identity", fill = "red") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Trigram") + ylab("Frequency") +
labs(title = "Top Trigrams by Frequency")
print(g)