blog_data <- readLines("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt", skipNul = TRUE, warn = FALSE)
news_data <- readLines("./Coursera-SwiftKey/final/en_US/en_US.news.txt", skipNul = TRUE, warn = FALSE)
twitter_data <- readLines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt", skipNul = TRUE, warn = FALSE)
Motivation of this Report

1. Demonstrate that the data has been downloaded successfully.
2. Create a basic report of summary statistics about the datasets.
3. Report any interesting findings amassed so far.
4. Get feedback on the plans for creating a prediction algorithm and Shiny app.
About the SwiftKey Dataset

For this report, we will only do exploratory analysis on the English dataset, which consists of:
1. blog posts (en_US.blogs.txt)
2. news articles (en_US.news.txt)
3. Twitter tweets (en_US.twitter.txt)
Load the English dataset

Note: the datasets were already downloaded beforehand, and the root folder of the datasets (Coursera-SwiftKey) resides in the same directory as this markdown file.
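If the files are not already present, something like the following could fetch and unpack them first. This is a minimal sketch; the download URL is an assumption based on the Coursera Data Science Capstone course materials rather than something taken from this report.
# a hedged sketch: download and unzip the dataset if the folder is missing
# (the URL is an assumption based on the Coursera Data Science Capstone course)
if (!dir.exists("./Coursera-SwiftKey")) {
  zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(zip_url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip", exdir = "./Coursera-SwiftKey")
}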
blog_data_size<-file.info("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt")$size/2^20
news_data_size<-file.info("./Coursera-SwiftKey/final/en_US/en_US.news.txt")$size/2^20
twitter_data_size<-file.info("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt")$size/2^20
# fit the size data into a data frame to view the sizes more clearly
dataset_EN<-c(blog_data_size,news_data_size,twitter_data_size)
df_size<-data.frame(dataset_EN)
names(df_size)[1] <-"MB"
row.names(df_size) <- c("Blog Posts", "News", "Twitter Tweets")
round(df_size,2)
## MB
## Blog Posts 200.42
## News 196.28
## Twitter Tweets 159.36
summary(blog_data)
## Length Class Mode
## 899288 character character
head(blog_data,3)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan â\200œgodsâ\200\235."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
summary(news_data)
## Length Class Mode
## 77259 character character
head(news_data,3)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
summary(twitter_data)
## Length Class Mode
## 2360148 character character
head(twitter_data,3)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
blog_data_lc<-length(blog_data)
news_data_lc<-length(news_data)
twitter_data_lc<-length(twitter_data)
dataset_lc <-c(blog_data_lc,news_data_lc, twitter_data_lc)
dataset_lc_df <-data.frame(dataset_lc)
names(dataset_lc_df)[1] <-"Line Count"
# fit the line counts into a data frame to view them more clearly
row.names(dataset_lc_df) <- c("Blog Posts", "News", "Twitter Tweets")
dataset_lc_df
## Line Count
## Blog Posts 899288
## News 77259
## Twitter Tweets 2360148
# load the ngram library to count the number of words in each dataset
library(ngram)
blog_data_wc <-wordcount(blog_data)
news_data_wc <-wordcount(news_data)
twitter_data_wc <-wordcount(twitter_data)
dataset_wc <-c(blog_data_wc,news_data_wc, twitter_data_wc)
dataset_wc_df <-data.frame(dataset_wc)
names(dataset_wc_df)[1] <-"Word Count"
row.names(dataset_wc_df) <- c("Blog Posts", "News", "Twitter Tweets")
dataset_wc_df
## Word Count
## Blog Posts 37334131
## News 2643969
## Twitter Tweets 30373583
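For readability, the size, line-count, and word-count figures above could also be collected into a single summary table; a minimal sketch, assuming the variables computed above are still in memory:
# combine size, line count, and word count into one summary data frame
dataset_summary <- data.frame(
  Size_MB = round(c(blog_data_size, news_data_size, twitter_data_size), 2),
  Lines = c(blog_data_lc, news_data_lc, twitter_data_lc),
  Words = c(blog_data_wc, news_data_wc, twitter_data_wc),
  row.names = c("Blog Posts", "News", "Twitter Tweets")
)
dataset_summary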
SAMPLE_SIZE <- 0.01
blog_index <- sample(seq_len(blog_data_lc),blog_data_lc*SAMPLE_SIZE)
news_index <- sample(seq_len(news_data_lc),news_data_lc*SAMPLE_SIZE)
twitter_index <- sample(seq_len(twitter_data_lc),twitter_data_lc*SAMPLE_SIZE)
blog_data_subset <- blog_data[blog_index]
news_data_subset <- news_data[news_index]
twitter_data_subset <- twitter_data[twitter_index]
rm(list= ls()[!(ls() %in% c('blog_data_subset','news_data_subset','twitter_data_subset'))])
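Note that the sampling above is not seeded, so the 1% subsets (and the frequency counts that follow) will vary slightly from run to run. A minimal sketch of making it reproducible, with an arbitrarily chosen seed, would be to call set.seed before the sample() calls:
# a hedged sketch: seed the RNG before the sample() calls above for reproducibility
# (1234 is an arbitrary, assumed seed value)
set.seed(1234)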
library(tm)
## Loading required package: NLP
library(NLP)
main_df <- VCorpus(VectorSource(c(blog_data_subset, news_data_subset, twitter_data_subset)),readerControl=list(reader=readPlain,language="en"))
main_df <- Corpus(VectorSource(sapply(main_df, function(row) iconv(row, "latin1", "ASCII", sub=""))))
main_df <- tm_map(main_df, content_transformer(tolower))
main_df <- tm_map(main_df, stripWhitespace)
main_df <- tm_map(main_df, removePunctuation)
main_df <- tm_map(main_df, removeNumbers)
main_df <- tm_map(main_df, PlainTextDocument)
## Warning in tm_map.SimpleCorpus(main_df, PlainTextDocument): transformation drops
## documents
main_df <- Corpus(VectorSource(main_df))
library(RWeka)
uni <- function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
uni_table <- TermDocumentMatrix(main_df, control=list(tokenize=uni))
uni_corpus <- findFreqTerms(uni_table,lowfreq = 2000)
uni_corpus_count <- rowSums(as.matrix(uni_table[uni_corpus,]))
uni_corpus_df <- data.frame(Word=names(uni_corpus_count),frequency=uni_corpus_count)
# get counts of words in descending order
uni_corpus_sorted <- uni_corpus_df[order(-uni_corpus_df$frequency),]
# get top 10 words with highest frequency
head(uni_corpus_sorted,10)
## Word frequency
## the the 28289
## and and 15232
## you you 7764
## for for 7685
## that that 7034
## with with 4736
## was was 4177
## have have 3927
## this this 3781
## are are 3363
rm(list= ls()[!(ls() %in% c('main_df'))])
main_df <- tm_map(main_df, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(main_df, removeWords, stopwords("english")):
## transformation drops documents
library(RWeka)
uni <- function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
uni_table <- TermDocumentMatrix(main_df, control=list(tokenize=uni))
uni_corpus <- findFreqTerms(uni_table,lowfreq = 500)
uni_corpus_count <- rowSums(as.matrix(uni_table[uni_corpus,]))
uni_corpus_df <- data.frame(Word=names(uni_corpus_count),frequency=uni_corpus_count)
# get counts of words in descending order
uni_corpus_sorted <- uni_corpus_df[order(-uni_corpus_df$frequency),]
# get top 15 words with highest frequency
head(uni_corpus_sorted,15)
## Word frequency
## just just 2229
## like like 2223
## will will 2160
## one one 1953
## get get 1936
## can can 1823
## dont dont 1438
## time time 1428
## know know 1366
## love love 1366
## new new 1292
## day day 1213
## good good 1205
## see see 1155
## now now 1094
library(slam)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
main_corpus <-TermDocumentMatrix(main_df,control=list(minWordLength=1))
wordFrequency <-rowapply_simple_triplet_matrix(main_corpus,sum)
wordFrequency <-wordFrequency[order(wordFrequency,decreasing=T)]
word_top15 <-as.data.frame(wordFrequency[1:15])
word_top15 <-data.frame(Words = row.names(word_top15),word_top15)
names(word_top15)[2] = "Frequency"
word_top15_plot <- ggplot(data = word_top15, aes(x = reorder(Words, -Frequency), y = Frequency, fill = Frequency)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Frequency), vjust = -0.30, size = 3) +
  theme(axis.text.x = element_text(angle = 45))
word_top15_plot +
  ggtitle("Top 15 Most Frequently Used Words") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "Frequency", x = "Words")
Interesting Findings

As the bar plot above shows, the word 'just' appears the most times in the corpus. After removing all the stopwords, the top 15 most common words in the corpus are as shown. These words are unsurprisingly common, since we use them frequently in daily life. We should also perform bigram and trigram analyses to get a better sense of the most frequently used two-word and three-word phrases, as sketched below. Such findings can then be used to identify trends in the data and to build a predictive model of English text.
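A bigram version could reuse the same RWeka tokenizer pattern as the unigram code above; the sketch below assumes main_df is still the cleaned, stopword-free corpus, and the lowfreq threshold of 100 is an arbitrary, assumed cut-off. A trigram tokenizer would be identical except for Weka_control(min = 3, max = 3).
library(RWeka)
# tokenize into two-word phrases instead of single words
bi <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bi_table <- TermDocumentMatrix(main_df, control = list(tokenize = bi))
# keep bigrams that appear at least 100 times (assumed threshold)
bi_corpus <- findFreqTerms(bi_table, lowfreq = 100)
bi_corpus_count <- rowSums(as.matrix(bi_table[bi_corpus, ]))
bi_corpus_df <- data.frame(Phrase = names(bi_corpus_count), frequency = bi_corpus_count)
# top 10 most frequent bigrams
head(bi_corpus_df[order(-bi_corpus_df$frequency), ], 10)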
Project Plan

We can use the model that we have analysed and trained to generate the next possible word given an input sentence from the user. With term frequency-inverse document frequency (TF-IDF) analysis and the uni-, bi-, and tri-gram frequencies, predicting the next word given an input sentence should be possible; this loosely mimics what a Recurrent Neural Network (RNN) does in the deep learning literature. It is also a scaled-down version of what SwiftKey does: text prediction. We will then incorporate this idea and deploy it as a Shiny web application (e.g. on shinyapps.io) that serves the user: the user enters an incomplete sentence and receives a one-word prediction based on that input, as sketched below.
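As an illustration of the lookup idea only, a simple highest-frequency continuation over a precomputed n-gram table might look like the following. The data frame ngram_freq (with columns prefix, next_word, and frequency) is a hypothetical object that would be built from the bigram/trigram counts; it is not produced by the code in this report.
# a hedged sketch of next-word prediction over a hypothetical n-gram frequency table
# ngram_freq is assumed to have columns: prefix, next_word, frequency
predict_next_word <- function(sentence, ngram_freq, n = 2) {
  # clean the input roughly the same way the corpus was cleaned
  words <- unlist(strsplit(tolower(gsub("[[:punct:][:digit:]]", "", sentence)), "\\s+"))
  # use the last (n - 1) words as the lookup prefix
  prefix <- paste(tail(words, n - 1), collapse = " ")
  candidates <- ngram_freq[ngram_freq$prefix == prefix, ]
  if (nrow(candidates) == 0) return(NA_character_)
  # return the continuation with the highest observed frequency
  as.character(candidates$next_word[which.max(candidates$frequency)])
}
# example call (hypothetical): predict_next_word("thanks for the", ngram_freq)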