The goals of this project are:

1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on the plans for creating a prediction algorithm and Shiny app.
library("tm")
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library(NLP)
setwd("~/Documents/Capstone/en_US")
en_US_blogs <- readLines("en_US.blogs.txt", encoding = 'UTF-8',warn = FALSE)
en_US_news_txt <- readLines("en_US.news.txt", encoding = 'UTF-8',warn = FALSE)
en_US_twitter_txt <- readLines("en_US.twitter.txt", encoding = 'UTF-8',warn = FALSE)
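# wrap each character vector in a tm corpus so tm's transformations can be applied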
en_US_blogs_1 <- Corpus(VectorSource(en_US_blogs))
en_US_twitter_1<- Corpus(VectorSource(en_US_twitter_txt))
en_US_news_txt_1<- Corpus(VectorSource(en_US_news_txt))
First, we explore the files by looking at their line and word counts.
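A quick way to get these counts is to work directly on the character vectors returned by readLines(); a minimal sketch (splitting on whitespace gives only an approximate word count, and the object names are illustrative):

# one element per line, so length() gives the line count
line_counts <- c(blogs   = length(en_US_blogs),
                 news    = length(en_US_news_txt),
                 twitter = length(en_US_twitter_txt))

# approximate word count: split each line on whitespace and sum the pieces
count_words <- function(x) sum(lengths(strsplit(x, "\\s+")))
word_counts <- c(blogs   = count_words(en_US_blogs),
                 news    = count_words(en_US_news_txt),
                 twitter = count_words(en_US_twitter_txt))

line_counts
word_counts

The term-document matrices below summarise each file further: the number of distinct terms, the number of documents (one per line), and the maximal term length: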
dtm_blog <- TermDocumentMatrix(en_US_blogs_1)
dtm_blog
## <<TermDocumentMatrix (terms: 962432, documents: 899288)>>
## Non-/sparse entries: 24699977/865478848439
## Sparsity : 100%
## Maximal term length: 487
## Weighting : term frequency (tf)
dtm_twitter <- TermDocumentMatrix(en_US_twitter_1)
dtm_twitter
## <<TermDocumentMatrix (terms: 1077761, documents: 2360148)>>
## Non-/sparse entries: 22711283/2543652757345
## Sparsity : 100%
## Maximal term length: 140
## Weighting : term frequency (tf)
dtm_news <- TermDocumentMatrix(en_US_news_txt_1)
dtm_news
## <<TermDocumentMatrix (terms: 788817, documents: 1010242)>>
## Non-/sparse entries: 25240301/796870823413
## Sparsity : 100%
## Maximal term length: 123
## Weighting : term frequency (tf)
The data is huge, so we clean it: strip stray characters, stop words, punctuation, numbers, and extra whitespace. The "transformation drops documents" warnings below are produced by tm's SimpleCorpus backend and are shown as emitted:
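# helper transformer: replace every match of a regular-expression pattern with a space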
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
en_US_blogs_1 <- tm_map(en_US_blogs_1, toSpace, "/")
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, toSpace, "/"): transformation
## drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, toSpace, "@")
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, toSpace, "@"): transformation
## drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, toSpace, "\\|"): transformation
## drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, removeWords,
## stopwords("english")): transformation drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, removePunctuation)
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, removePunctuation): transformation
## drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, stripWhitespace)
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, stripWhitespace): transformation
## drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, removeNumbers)
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, removeNumbers): transformation
## drops documents
en_US_blogs_1 <- tm_map(en_US_blogs_1, removeWords, c("”", "“" ,"’s" ))
## Warning in tm_map.SimpleCorpus(en_US_blogs_1, removeWords, c("”", "“", :
## transformation drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,toSpace, "/")
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, toSpace, "/"): transformation
## drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,toSpace, "@")
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, toSpace, "@"): transformation
## drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, toSpace, "\\|"): transformation
## drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, removeWords,
## stopwords("english")): transformation drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,removePunctuation)
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, removePunctuation):
## transformation drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,stripWhitespace)
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, stripWhitespace): transformation
## drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,removeNumbers)
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, removeNumbers): transformation
## drops documents
en_US_twitter_1<- tm_map(en_US_twitter_1,removeWords, c("”", "“" ,"’s" ))
## Warning in tm_map.SimpleCorpus(en_US_twitter_1, removeWords, c("”", "“", :
## transformation drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,toSpace, "/")
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, toSpace, "/"): transformation
## drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,toSpace, "@")
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, toSpace, "@"): transformation
## drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, toSpace, "\\|"): transformation
## drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, removeWords,
## stopwords("english")): transformation drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,removePunctuation)
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, removePunctuation):
## transformation drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,stripWhitespace)
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, stripWhitespace):
## transformation drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,removeNumbers)
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, removeNumbers): transformation
## drops documents
en_US_news_txt_1<- tm_map(en_US_news_txt_1,removeWords, c("”", "“" ,"’s" ))
## Warning in tm_map.SimpleCorpus(en_US_news_txt_1, removeWords, c("”", "“", :
## transformation drops documents
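The same sequence of transformations is applied to all three corpora, so it could be collected into a small helper to avoid the repetition. A minimal sketch (the name clean_corpus is illustrative, not part of the code above):

# apply the full cleaning pipeline to any tm corpus
clean_corpus <- function(corpus) {
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  corpus <- tm_map(corpus, toSpace, "/")
  corpus <- tm_map(corpus, toSpace, "@")
  corpus <- tm_map(corpus, toSpace, "\\|")
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeNumbers)
  tm_map(corpus, removeWords, c("”", "“", "’s"))
}

# e.g. en_US_blogs_1 <- clean_corpus(en_US_blogs_1)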
Here we calculate the most frequent words in each file, working on random samples so that the term-document matrices fit in memory:
set.seed(1000)
news_sample <- sample(en_US_news_txt_1,size=10000)
Sys.setenv('R_MAX_VSIZE'=32000000000)
dtm_3 <- TermDocumentMatrix(news_sample)
m_3 <- as.matrix(dtm_3)
v_3 <- sort(rowSums(m_3),decreasing=TRUE)
d_3 <- data.frame(word = names(v_3),freq=v_3)
head(d_3, 10)
set.seed(1000)
twitter_sample <- sample(en_US_twitter_1,size=20000)
Sys.setenv('R_MAX_VSIZE'=32000000000)
dtm_2 <- TermDocumentMatrix(twitter_sample)
m_2 <- as.matrix(dtm_2)
v_2 <- sort(rowSums(m_2),decreasing=TRUE)
d_2 <- data.frame(word = names(v_2),freq=v_2)
head(d_2, 10)
set.seed(1000)
Sys.setenv('R_MAX_VSIZE'=32000000000)
blogs_sample <- sample(en_US_blogs_1,size=20000)
dtm <- TermDocumentMatrix(blogs_sample)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
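A note on memory: as.matrix() converts the whole term-document matrix to a dense matrix. The same frequency table can be built without that step using the slam package, which tm already uses for its sparse matrices; a minimal sketch for the blogs sample (the object names are illustrative):

library(slam)
# row_sums() works directly on the sparse term-document matrix
v_slam <- sort(row_sums(dtm), decreasing = TRUE)
d_slam <- data.frame(word = names(v_slam), freq = v_slam)
head(d_slam, 10)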
Here we create a word cloud for the blogs sample and a bar plot of the ten most frequent words in the Twitter sample:
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
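The same call can be repeated for the other samples; for example, using the Twitter frequency table:

# word cloud for the Twitter sample
wordcloud(words = d_2$word, freq = d_2$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))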
barplot(d_2[1:10,]$freq, las = 2, names.arg = d_2[1:10,]$word,
        col = "lightblue", main = "Most frequent words (Twitter sample)",
        ylab = "Word frequencies")
Finally, the data sets are large, but they already let us identify the most frequently used English words and get a first sense of how people write. In the next steps we will explore the data further and use these findings to build the prediction algorithm and Shiny app.
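As a sketch of what the next step could look like, a term-document matrix of bigrams can be built by passing a custom tokenizer based on NLP's ngrams() to TermDocumentMatrix(). tm applies custom tokenizers to VCorpus objects (they can be ignored for the SimpleCorpus objects used above), so a small VCorpus is built from the raw blog lines here; the object names are illustrative:

# bigram tokenizer based on the NLP package (attached with tm)
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

# custom tokenizers need a VCorpus, so sample the raw lines and rebuild
set.seed(1000)
blogs_vcorpus <- VCorpus(VectorSource(sample(en_US_blogs, 5000)))
tdm_bigram <- TermDocumentMatrix(blogs_vcorpus,
                                 control = list(tokenize = BigramTokenizer))

# ten most frequent bigrams in the sample
head(sort(slam::row_sums(tdm_bigram), decreasing = TRUE), 10)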