The goal of this project is to demonstrate that we have become familiar with the data and that we are on track to create a prediction algorithm. This report will be published on RPubs (http://rpubs.com/) and explains the exploratory analysis as well as the goals for the eventual app and algorithm.
# Preload the necessary R libraries
library(dplyr)
library(doParallel)
library(stringi)
library(SnowballC)
library(tm)
# To avoid rJava issues when loading it or RWeka, clear JAVA_HOME (or point
# options(java.home = ...) at your Java installation) before loading the library:
if (Sys.getenv("JAVA_HOME") != "")
  Sys.setenv(JAVA_HOME = "")
#options(java.home="C:\\Program Files\\Java\\jre1.8.0_171\\")
#library(rJava)
library(RWeka)
library(ggplot2)
The data comes from HC Corpora and covers four languages, but only English will be used. The English dataset consists of three files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
## Downloading the data
if(!file.exists("Coursera-SwiftKey.zip")) {
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
}
# Read the blogs and twitter files using readLines
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
# Read the news file in binary mode, as it contains special characters
con <- file("final/en_US/en_US.news.txt", open="rb")
news <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
Calculate some summary statistics for each file: size in megabytes, number of entries (lines), total characters, total words, and the length of the longest entry.
# Get file sizes
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
pop_summary <- data.frame('File' = c("Blogs","News","Twitter"),
"FileSizeinMB" = c(blogs_size, news_size, twitter_size),
'NumberofLines' = sapply(list(blogs, news, twitter), function(x){length(x)}),
'TotalCharacters' = sapply(list(blogs, news, twitter), function(x){sum(nchar(x))}),
TotalWords = sapply(list(blogs,news,twitter),stri_stats_latex)[4,],
'MaxCharacters' = sapply(list(blogs, news, twitter), function(x){max(unlist(lapply(x, function(y) nchar(y))))})
)
pop_summary
## File FileSizeinMB NumberofLines TotalCharacters TotalWords MaxCharacters
## 1 Blogs 200.4242 899288 206824505 37570839 40833
## 2 News 196.2775 1010242 203223159 34494539 11384
## 3 Twitter 159.3641 2360148 162096031 30451128 140
The data is too big for the initial tests and analysis, so we will sample it to make handling easier. Building the models does not require loading the whole data sets, so we will use a 5% subset for all the calculations.
set.seed(10)
# Remove all non-English characters as they cause issues
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
# Binomial sampling of the data: keep each line with the given probability.
# The function is named sample_lines to avoid masking base::sample.
sample_lines <- function(population, percentage) {
return(population[as.logical(rbinom(length(population), 1, percentage))])
}
# Set sample percentage
percent <- 0.05 # If memory issues arise, reduce this further
samp_blogs <- sample_lines(blogs, percent)
samp_news <- sample_lines(news, percent)
samp_twitter <- sample_lines(twitter, percent)
dir.create("sample", showWarnings = FALSE)
#write(samp_blogs, "sample/sample.blogs.txt")
#write(samp_news, "sample/sample.news.txt")
#write(samp_twitter, "sample/sample.twitter.txt")
samp_data <- c(samp_blogs,samp_news,samp_twitter)
write(samp_data, "sample/sampleData.txt")
Calculate the same summary statistics for the sampled data.
samp_summary <- data.frame(
File = c("blogs","news","twitter"),
t(rbind(sapply(list(samp_blogs,samp_news,samp_twitter),stri_stats_general),
TotalWords = sapply(list(samp_blogs,samp_news,samp_twitter),stri_stats_latex)[4,]))
)
samp_summary
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 blogs 44820 44813 10305669 8481988 1862344
## 2 news 50480 50480 10178559 8504440 1726424
## 3 twitter 117802 117802 8081879 6683738 1517599
# remove temporary variables
rm(blogs, news, twitter, samp_blogs, samp_news, samp_twitter, samp_data, pop_summary, samp_summary)
The data can be cleaned using techniques such as removing whitespace, numbers, URLs, punctuation, and profanity.
directory <- file.path(".", "sample")
#sample_data <- Corpus(DirSource(directory))
sample_data <- VCorpus(DirSource(directory)) # load the data as a corpus
sample_data <- tm_map(sample_data, content_transformer(tolower))
# Removing profanity words using one of the available dictionaries of 1384 words,
# after first removing from it some words which we do not consider profanity.
profanity_words = readLines("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
profanity_words = profanity_words[!profanity_words %in% c("refugee","reject","remains","screw","welfare","sweetness","shoot","sick","shooting","servant","sex","radical","racial","racist","republican","public","molestation","mexican","looser","lesbian","liberal","kill","killing","killer","heroin","fraud","fire","fight","fairy","^die","death","desire","deposit","crash","^crim","crack","^color","cigarette","church","^christ","canadian","cancer","^catholic","cemetery","buried","burn","breast","^bomb","^beast","attack","australian","balls","baptist","^addict","abuse","abortion","amateur","asian","aroused","angry","arab","bible")]
sample_data <- tm_map(sample_data,removeWords, profanity_words)
# Removing URLs
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
sample_data <- tm_map(sample_data, content_transformer(removeURL))
#sample_data[[1]]$content
# Replacing special chars with space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
sample_data <- tm_map(sample_data, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
sample_data <- tm_map(sample_data, toSpace, "@[^\\s]+")
sample_data <- tm_map(sample_data, content_transformer(tolower)) # ensure lowercase (already applied above)
#sample_data <- tm_map(sample_data, removeWords, stopwords("en"))#remove english stop words
sample_data <- tm_map(sample_data, removePunctuation) # remove punctuation
sample_data <- tm_map(sample_data, removeNumbers) # remove numbers
sample_data <- tm_map(sample_data, stripWhitespace) # remove extra whitespaces
#sample_data <- tm_map(sample_data, stemDocument) # initiate stemming
sample_data <- tm_map(sample_data, PlainTextDocument)
sample_corpus <- data.frame(text=unlist(sapply(sample_data,'[',"content")),stringsAsFactors = FALSE)
head(sample_corpus)
## text
## character(0).content1 its a sickening feeling
## character(0).content2 even if you dont like the so called screwball comedy that some critic also called sex comedy without sex whose trouble in paradise gives a perfect example you could enjoy two things from this movie the typical art deco interior design in mme colet house and the beautiful gowns designed by travis banton one of the most famous costume designer that show at its best this style
## character(0).content3 cat is looking for more pictures of cute animals with their tongues sticking out email cuteanimaltongues at gmail dot com with yours
## character(0).content4 sunday the festivities continued i went back to shadowbox to see back to the garden again told ya i love that place then after that was over i had to work at the store it was survivor finale night and we do a special survivor crop each season so i had to go host that
## character(0).content5 they are both chunky knits and were a complete bargainthe green was and the multi colour knit was the charity shops have now started putting out their winter stocks so using these knits as inspiration why dont you go and hunt down a stylish cosy bargain for much less than the high street or designer versions
## character(0).content6 its official i made the spellbinders team its been an amazing year and i am so glad that it doesnt have to end i love this company their products their values and the people who make spellbinders what it is
After the above transformations, the first document looks like this:
inspect(sample_data[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 18682810
review_dtm <- DocumentTermMatrix(sample_data)
review_dtm
## <<DocumentTermMatrix (documents: 1, terms: 104463)>>
## Non-/sparse entries: 104463/0
## Sparsity : 0%
## Maximal term length: 110
## Weighting : term frequency (tf)
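Because the sample was written to a single file, the corpus holds one large document. As a quick sanity check, tm's findFreqTerms() can list high-frequency terms straight from the matrix; a minimal sketch (the 1000-occurrence threshold is an arbitrary choice for illustration):
# Terms appearing at least 1000 times in the sample (threshold is illustrative)
head(findFreqTerms(review_dtm, lowfreq = 1000), 20)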
Unigram analysis shows which words are the most frequent and what their frequency is. A unigram is an individual word.
unigramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
#unigrams <- TermDocumentMatrix(sample_data, control = list(tokenize = unigramTokenizer))
unigrams <- DocumentTermMatrix(sample_data, control = list(tokenize = unigramTokenizer))
Bigram analysis shows which word pairs are the most frequent and what their frequency is. A bigram is a two-word combination.
BigramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
bigrams <- DocumentTermMatrix(sample_data, control = list(tokenize = BigramTokenizer))
Trigram analysis shows which word triples are the most frequent and what their frequency is. A trigram is a three-word combination.
trigramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
#trigrams <- TermDocumentMatrix(sample_data, control = list(tokenize = trigramTokenizer))
trigrams <- DocumentTermMatrix(sample_data, control = list(tokenize = trigramTokenizer))
Quadgram analysis shows which four-word sequences are the most frequent and what their frequency is. A quadgram is a four-word combination.
quadgramTokenizer <- function(x) {
NGramTokenizer(x, Weka_control(min = 4, max = 4))
}
#quadgrams <- TermDocumentMatrix(sample_data, control = list(tokenize = quadgramTokenizer))
quadgrams <- DocumentTermMatrix(sample_data, control = list(tokenize = quadgramTokenizer))
Now we can perform exploratory analysis on the tidy data. For each document-term matrix, we list the most common unigrams, bigrams, trigrams and quadgrams. It is interesting and helpful to find the most frequently occurring words in the data.
unigrams_frequency <- sort(colSums(as.matrix(unigrams)),decreasing = TRUE)
unigrams_freq_df <- data.frame(word = names(unigrams_frequency), frequency = unigrams_frequency)
head(unigrams_freq_df, 10)
## word frequency
## the the 183506
## and and 95477
## that that 39230
## for for 34120
## with with 26331
## was was 24765
## you you 19432
## this this 18600
## have have 17514
## but but 17120
unigrams_freq_df %>%
filter(frequency > 3000) %>%
ggplot(aes(reorder(word,-frequency), frequency)) +
geom_bar(stat = "identity", colour= "lightblue", fill= "darkblue") +
ggtitle("Unigrams with frequencies > 3000") +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
bigrams_frequency <- sort(colSums(as.matrix(bigrams)),decreasing = TRUE)
bigrams_freq_df <- data.frame(word = names(bigrams_frequency), frequency = bigrams_frequency)
head(bigrams_freq_df, 10)
## word frequency
## of the of the 18089
## in the in the 15841
## to the to the 8080
## on the on the 7148
## for the for the 6068
## to be to be 5567
## and the and the 5467
## at the at the 5038
## in a in a 4636
## with the with the 4303
Here we create a generic function to plot the top 50 frequencies; it will be reused for the bigrams, trigrams and quadgrams.
hist_plot <- function(data, label) {
ggplot(data[1:50,], aes(reorder(word, -frequency), frequency)) +
labs(x = label, y = "Frequency") +
theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
geom_bar(stat = "identity",colour= "lightblue", fill= "darkblue")
}
hist_plot(bigrams_freq_df, "50 Most Common Bigrams")
trigrams_frequency <- sort(colSums(as.matrix(trigrams)),decreasing = TRUE)
trigrams_freq_df <- data.frame(word = names(trigrams_frequency), frequency = trigrams_frequency)
head(trigrams_freq_df, 10)
## word frequency
## one of the one of the 1420
## a lot of a lot of 1175
## as well as as well as 678
## to be a to be a 596
## out of the out of the 580
## some of the some of the 576
## the end of the end of 572
## it was a it was a 546
## part of the part of the 545
## be able to be able to 500
hist_plot(trigrams_freq_df, "50 Most Common Trigrams")
quadgrams_frequency <- sort(colSums(as.matrix(quadgrams)),decreasing = TRUE)
quadgrams_freq_df <- data.frame(word = names(quadgrams_frequency), frequency = quadgrams_frequency)
head(quadgrams_freq_df, 10)
## word frequency
## the end of the the end of the 311
## at the end of at the end of 257
## the rest of the the rest of the 252
## for the first time for the first time 214
## at the same time at the same time 191
## one of the most one of the most 172
## is one of the is one of the 170
## when it comes to when it comes to 163
## to be able to to be able to 146
## in the middle of in the middle of 141
hist_plot(quadgrams_freq_df, "50 Most Common Quadgrams")
Building the n-grams takes some time, even with a 5% sample. Caching speeds up subsequent runs (cache = TRUE).
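A simple alternative to knitr's chunk caching is to save the term matrices to disk and reload them on later runs; a minimal sketch, with illustrative file and variable names:
# Cache the n-gram matrices on disk to avoid rebuilding them on every run
if (file.exists("cache/ngrams.rds")) {
ngram_list <- readRDS("cache/ngrams.rds")
} else {
ngram_list <- list(uni = unigrams, bi = bigrams, tri = trigrams, quad = quadgrams)
dir.create("cache", showWarnings = FALSE)
saveRDS(ngram_list, "cache/ngrams.rds")
}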
The longer the n-grams, the lower their abundance: the most frequent bigram occurs 18089 times, the most frequent trigram 1420 times, and the most frequent quadgram only 311 times.
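These top frequencies can be read straight off the sorted frequency tables, for example:
# Highest frequency for each n-gram order (tables are sorted decreasing)
sapply(list(bigrams = bigrams_freq_df,
trigrams = trigrams_freq_df,
quadgrams = quadgrams_freq_df),
function(df) df$frequency[1])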
For the Shiny application, the plan is to create an app with a simple interface where the user can enter a string of text. Our prediction model will then suggest a list of likely next words.
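As a preview, here is a minimal backoff-style sketch of such a lookup against the frequency tables built above; the function name and details are illustrative, not the final algorithm:
# Illustrative sketch only: look up the last words of the input in the
# quadgram/trigram/bigram tables and back off to shorter n-grams on a miss.
predict_next <- function(input, n = 3) {
words <- unlist(strsplit(tolower(input), "\\s+"))
tables <- list(quadgrams_freq_df, trigrams_freq_df, bigrams_freq_df)
context <- c(3, 2, 1) # number of context words each table needs
for (i in seq_along(tables)) {
k <- context[i]
if (length(words) < k) next
prefix <- paste0(paste(tail(words, k), collapse = " "), " ")
ngrams <- as.character(tables[[i]]$word)
hits <- ngrams[startsWith(ngrams, prefix)]
if (length(hits) > 0) # tables are sorted, so hits are ordered by frequency
return(head(unique(sapply(strsplit(hits, " "), tail, 1)), n))
}
head(as.character(unigrams_freq_df$word), n) # fall back to top unigrams
}
predict_next("one of") # e.g. suggests "the", given the trigram counts above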