The goal of this project is to show the knowledge I have gained about working with the data and how I plan to build my prediction algorithm.
The motivation for this project is to:

1. Demonstrate that I've downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on my plans for creating a prediction algorithm and Shiny app.
setwd("~/R/Capstone/final/en_US")
con <- file("en_US.twitter.txt", "r")
twitter<-readLines(con)
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
close(con)
con <- file("en_US.blogs.txt","r")
blogs<-readLines(con)
close(con)
con <- file("en_US.news.txt","r")
news<-readLines(con)
## Warning in readLines(con): incomplete final line found on 'en_US.news.txt'
close(con)
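The warnings above are not fatal: the embedded nuls affect only four lines of the Twitter file, and the "incomplete final line" simply means the news file does not end with a newline. As a small alternative sketch (not what was run for the results below), readLines() has a skipNul argument that silences the nul warnings:

con <- file("en_US.twitter.txt", "r")
twitter <- readLines(con, skipNul = TRUE)  # skipNul = TRUE drops embedded nul bytes quietly
close(con)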
library(stringi)
## Warning: package 'stringi' was built under R version 3.4.1
twitter_words<-stri_count_words(twitter)
blogs_words<-stri_count_words(blogs)
news_words<-stri_count_words(news)
#line counts for each file
nline_twitter<-length(twitter_words)
nline_blogs<-length(blogs_words)
nline_news<-length(news_words)
line<-c(nline_twitter,nline_blogs,nline_news)
#Word counts for each file
twitter_words<-sum(twitter_words)
blogs_words<-sum(blogs_words)
news_words<-sum(news_words)
words<-c(twitter_words,blogs_words,news_words)
#Average word counts per line for each file
ave_twitter_words<-twitter_words/nline_twitter
ave_blogs_words<-blogs_words/nline_blogs
ave_news_words<-news_words/nline_news
ave_words<-c(ave_twitter_words,ave_blogs_words,ave_news_words)
#Summary Table
table<-cbind(words,line,ave_words)
rownames(table) <- c("twitter", "blogs", "news")
table
##              words    line ave_words
## twitter   30279349 2360148  12.82943
## blogs     38872243  899288  43.22558
## news       2710587   77259  35.08442
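Before sampling, it is also worth knowing how much memory the full files occupy. A quick illustrative check (not part of the original summary):

format(object.size(twitter), units = "Mb")  # in-memory size of the Twitter lines
format(object.size(blogs), units = "Mb")
format(object.size(news), units = "Mb")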
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
#We sample 1% of each file to keep the corpus and n-gram tables manageable
subTwitter <- sample(twitter, length(twitter)*0.01)
subBlogs <- sample(blogs, length(blogs)*0.01)
subNews <- sample(news, length(news)*0.01)
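To make the 1% subsets reproducible across knits, a seed could be set before the three sample() calls above; a minimal sketch (the seed value is arbitrary):

set.seed(1234)  # call this before sample() so the same subsets are drawn each time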
#The subsetted data are combined to build the corpus
combined_data <- c(subBlogs, subNews, subTwitter)
#Convert to ASCII; sub = "" drops non-convertible characters instead of turning whole lines into NA
combined_data <- iconv(combined_data, 'UTF-8', 'ASCII', sub = "")
corpus <- Corpus(VectorSource(as.data.frame(combined_data, stringsAsFactors = FALSE)))
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
})
corpus <- tm_map(corpus, toSpace, "-")
corpus <- tm_map(corpus, toSpace, ":")
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, toSpace, "'")
corpus <- tm_map(corpus, toSpace, " -")
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
bigram <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
trigram <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3))
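Before plotting, a quick sanity check (my own addition) is to compare the total number of tokens with the number of distinct n-grams of each order:

length(unigram); length(unique(unigram))  # total unigram tokens vs. distinct unigrams
length(bigram); length(unique(bigram))
length(trigram); length(unique(trigram))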
#Unigram frequency distribution
unigram.df <- data.frame(table(unigram))
unigram.df <- unigram.df[order(unigram.df$Freq, decreasing = TRUE),]
plot1<-ggplot(unigram.df[1:15,], aes(x=reorder(unigram, -Freq), y=Freq)) +
geom_bar(stat = "identity")+
xlab("Unigram") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=90, hjust=1))+
labs(title = "Most common words")
plot1
bigram.df <- data.frame(table(bigram))
bigram.df <- bigram.df[order(bigram.df$Freq, decreasing = TRUE),]
plot2<-ggplot(bigram.df[1:15,], aes(x=reorder(bigram, -Freq), y=Freq)) +
geom_bar(stat = "identity")+
xlab("Bigram") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=90, hjust=1))+
labs(title = "Most common word pairs")
plot2
trigram.df <- data.frame(table(trigram))
trigram.df <- trigram.df[order(trigram.df$Freq, decreasing = TRUE),]
plot3<-ggplot(trigram.df[1:15,], aes(x=reorder(trigram, -Freq), y=Freq)) +
geom_bar(stat = "identity")+
xlab("Trigram") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=90, hjust=1))+
labs(title = "Most common word triplets")
plot3
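Looking ahead to the prediction algorithm, the sorted n-gram tables above can already drive a naive next-word lookup. The sketch below is only an illustration (the function name and the simple back-off rule are my own, not the final design): it searches the trigram table for the two most recently typed words and backs off to the bigram table when nothing matches.

predict_next <- function(last_two, tri = trigram.df, bi = bigram.df) {
  # most frequent trigram starting with the last two words (tables are sorted by Freq)
  hit <- tri[grepl(paste0("^", last_two, " "), tri$trigram), ]
  if (nrow(hit) > 0) return(tail(strsplit(as.character(hit$trigram[1]), " ")[[1]], 1))
  # back off: most frequent bigram starting with the last word only
  last_one <- tail(strsplit(last_two, " ")[[1]], 1)
  hit <- bi[grepl(paste0("^", last_one, " "), bi$bigram), ]
  if (nrow(hit) > 0) return(tail(strsplit(as.character(hit$bigram[1]), " ")[[1]], 1))
  NA_character_  # no match in either table
}
predict_next("one of")  # hypothetical usage; returns the most frequent third word

For the Shiny app, a lookup of this kind would sit behind a text input, with the n-gram tables precomputed and saved so the app never has to rebuild the corpus.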