library(tm)
library(RWeka)
library(dplyr)
library(RColorBrewer)
library(wordcloud)
library(caret)
library(ggplot2)
library("stringi")
library(gridExtra)

Milestone Submission
# Project overview
This milestone report loads and cleans the English SwiftKey/HC Corpora text data, explores word and N-gram frequencies, and outlines the next steps towards a Shiny application that predicts the next word a user will type.
Data Loading and Cleaning
Data Loading
- This data is provided as part of the Data Science Capstone course on Coursera and can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip (a download-and-unzip sketch follows this list).
- The archive contains files named LOCALE.blogs.txt, where LOCALE is each of the four locales en_US, de_DE, ru_RU and fi_FI. The data comes from a corpus called HC Corpora; see the About the Corpora reading for more details. The files have been language filtered but may still contain some foreign text.
- The first step of this project is to tokenize the text and apply profanity filtering (i.e., removing bad words), as the data may contain offensive and profane language.
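The paths used in the code below assume the archive has been unzipped into the working directory. A minimal download-and-unzip sketch (the destination file name and extraction directory are assumptions chosen to match those paths):

# Download and unzip the dataset if it is not already present.
# The file name and exdir below are assumptions matching the paths used later.
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!dir.exists("./Coursera-SwiftKey/final")) {
  download.file(zip_url, destfile = zip_file, mode = "wb")
  unzip(zip_file, exdir = "./Coursera-SwiftKey")
}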
twitter_url="./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
blogs_url= "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_url="./Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter_file = readLines(twitter_url, encoding="UTF-8")
blogs_file = readLines(blogs_url, encoding="UTF-8")
news_file = readLines(news_url, encoding="UTF-8")

### Have a look at the data
data.frame(file = c("Twitter", "Blogs", "News"),
           size_MB = c(file.info(twitter_url)$size/1024^2,
                       file.info(blogs_url)$size/1024^2,
                       file.info(news_url)$size/1024^2),
           lines = c(length(twitter_file),
                     length(blogs_file),
                     length(news_file)),
           longest_line = c(summary(nchar(twitter_file))[6],
                            summary(nchar(blogs_file))[6],
                            summary(nchar(news_file))[6]),
           word_count = c(sum(stri_count_words(twitter_file)),
                          sum(stri_count_words(blogs_file)),
                          sum(stri_count_words(news_file))))

     file  size_MB   lines longest_line word_count
1 Twitter 159.3641 2360148          140   30093372
2   Blogs 200.4242  899288        40833   37546250
3    News 196.2775   77259         5760    2674536
Data cleaning
- To keep the following model-building steps manageable, the prediction model is built on a small subset of the data (a random-sampling alternative is sketched below).
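Reading only the first few hundred lines of each file is fast but not necessarily representative. As a hedged alternative (not used in the rest of this report), a random sample of the full files read above could be drawn instead, for example:

# Hedged alternative (not used below): draw a random 1% sample of each file
# for a more representative subset; set.seed keeps the sample reproducible.
set.seed(1234)
sample_lines <- function(lines, fraction = 0.01) {
  lines[sample(length(lines), size = ceiling(length(lines) * fraction))]
}
twitter_sample <- sample_lines(twitter_file)
blogs_sample   <- sample_lines(blogs_file)
news_sample    <- sample_lines(news_file)

The subset used in the rest of this report simply reads the first 500 lines of each file: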
twitter_subset=readLines(twitter_url,500)
blogs_subset=readLines(blogs_url,500)
news_subset=readLines(news_url,500)
# Combine the three subsets into a single character vector
all_subset=c(twitter_subset,blogs_subset,news_subset)

- The following code removes all of the non-alphabetic elements and converts the text to lowercase.
# Remove special characters from the text
all_subset=iconv(all_subset,"UTF-8", "ASCII", sub = "")
# Remove numbers
all_subset=removeNumbers(all_subset)
# Remove extra whitespace
all_subset=stripWhitespace(all_subset)
# Remove punctuation
all_subset=removePunctuation(all_subset)
# Convert to lowercase
all_subset = tolower(all_subset)

- The following code removes profane words from the data. The "bad-words" list provided by Carnegie Mellon's School of Computer Science is used to filter profanity in the working dataset.
profanity_url="http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
download.file(profanity_url,destfile = "profanity.txt")
profanity=readLines("profanity.txt",encoding="UTF-8")
all_subset=removeWords(all_subset, profanity)

Exploratory data analysis
In this section, the analysis aims to:

1. Understand the distribution of words and the relationships between words in the corpora.
2. Understand the frequencies of words and word pairs.

To achieve this, the analysis uses N-grams to examine the frequencies of single words and of groups of words that appear in the dataset. The NGramTokenizer() function from the RWeka package is used in the following code.

## Tokenizing the sample
# Build a frequency table of the n-grams of size y found in the text x,
# sorted by decreasing frequency.
tokens <- function(x, y){
  x <- NGramTokenizer(x, Weka_control(min = y, max = y))
  x <- data.frame(table(x))
  x <- x[order(x$Freq, decreasing = TRUE), ]
  x
}
unigrams_token <- tokens(all_subset, 1)
bigrams_token <- tokens(all_subset, 2)
trigrams_token <- tokens(all_subset, 3)
all_data_unigrams <- c(twitter_file, news_file, blogs_file) %>% tokens(1)

## Plot the frequencies of the top 15 words in each N-gram
p1 = unigrams_token[1:15,] %>%
  ggplot(aes(x = reorder(x, Freq), y = Freq)) +
  geom_bar(stat = 'identity',
           colour = "lightgrey",
           fill = "lightcoral") +
  geom_text(label = unigrams_token$Freq[1:15],
            size = 2.5,
            hjust = 3) +
  xlab("1-gram") +
  ylab("Frequency") +
  ggtitle("Top 15 unigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

p2 = bigrams_token[1:15,] %>%
  ggplot(aes(x = reorder(x, Freq), y = Freq)) +
  geom_bar(stat = 'identity',
           colour = "lightgrey",
           fill = "lightblue") +
  geom_text(label = bigrams_token$Freq[1:15],
            size = 2.5,
            hjust = 3) +
  xlab("2-gram") +
  ylab("Frequency") +
  ggtitle("Top 15 bigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

p3 = trigrams_token[1:15,] %>%
  ggplot(aes(x = reorder(x, Freq), y = Freq)) +
  geom_bar(stat = 'identity',
           colour = "lightgrey",
           fill = "springgreen4") +
  geom_text(label = trigrams_token$Freq[1:15],
            size = 2.5,
            hjust = 3) +
  xlab("3-gram") +
  ylab("Frequency") +
  ggtitle("Top 15 trigrams") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

grid.arrange(p1, p2, p3, nrow = 2)

## The 100 most frequent words
# Tokenizing the words in the dataset
words <- WordTokenizer(all_subset)
wordcloud(words, scale=c(5,0.1), max.words=100, random.order=FALSE,
          rot.per=0.5, use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation drops documents
Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x, tm::stopwords())): transformation drops documents
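As an additional exploratory check that is not part of the original analysis, the full-data unigram table built above (all_data_unigrams) can be used to estimate how many unique words are needed to cover a given share of all word instances; a minimal sketch:

# Hedged sketch: number of unique words needed to cover a given share of
# all word instances, based on the full-data unigram frequency table.
coverage <- function(freq_table, threshold) {
  freq <- sort(freq_table$Freq, decreasing = TRUE)
  which(cumsum(freq) / sum(freq) >= threshold)[1]
}
coverage(all_data_unigrams, 0.5)  # unique words covering 50% of instances
coverage(all_data_unigrams, 0.9)  # unique words covering 90% of instances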
Further steps
Now that we have the n-gram frequency tables and the observations required, we can proceed to build a Shiny application that predicts the next word using an N-gram model.
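As a preview of that step, the following sketch is an illustrative assumption rather than the application's actual code: a simple backoff lookup over the trigram and bigram frequency tables built above, returning the most frequent continuation of the last one or two words typed.

# Hedged sketch of a simple backoff next-word lookup using the n-gram
# frequency tables built above; illustrative only, not the final app code.
predict_next_word <- function(phrase, trigrams, bigrams) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  # Try trigrams first: match on the last two words of the input
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- trigrams[grepl(paste0("^", prefix, " "), trigrams$x), ]
    if (nrow(hits) > 0) {
      return(sub(paste0("^", prefix, " "), "", as.character(hits$x[1])))
    }
  }
  # Back off to bigrams: match on the last word only
  prefix <- words[n]
  hits <- bigrams[grepl(paste0("^", prefix, " "), bigrams$x), ]
  if (nrow(hits) > 0) {
    return(sub(paste0("^", prefix, " "), "", as.character(hits$x[1])))
  }
  NA_character_  # no prediction found
}

predict_next_word("thanks for", trigrams_token, bigrams_token)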