rm(list=ls())
library(httr)
library(tm)
library(SnowballC)
library(ggplot2)
library(wordcloud)
library(stringi)
library(stringr)
There are three datasets: “en_US.blogs.txt”, “en_US.news.txt” and “en_US.twitter.txt”. First, we sample each dataset and then apply some basic cleaning. Preprocessing consists of reducing each dataset of statements to a dataset of terms. A few tables and plots give an idea of what the data look like. Based on the term matrices extracted from the raw datasets, a prospective data product on topic identification can be developed.
lista=c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
create_table = function(lista){
  tab = NULL
  for(i in lista){
    data = readLines(i)
    # characters, entries (lines) and words per file, with a space as thousands separator
    chr = format(sum(nchar(data)), digits=9, decimal.mark=",", big.mark=" ")
    rec = format(length(data), digits=9, decimal.mark=",", big.mark=" ")
    wds = format(sum(str_count(data, "\\S+")), digits=9, decimal.mark=",", big.mark=" ")
    tab = rbind(tab, cbind(i, chr, rec, wds))
  }
  return(tab)
}
tab=create_table(lista)
colnames(tab) = c("Dataset", "Chars", "Entries", "Words")
(tab = as.data.frame( tab))
##             Dataset       Chars   Entries      Words
## 1   en_US.blogs.txt 208 361 438   899 288 37 334 441
## 2    en_US.news.txt  15 683 765    77 259  2 643 972
## 3 en_US.twitter.txt 162 384 825 2 360 148 30 373 792
Since the number of entries is huge, we explore each dataset using a sample of only 5,000 records.
We define functions for sampling a dataset, for cleaning its text, and for converting it into a data frame of term frequencies.
create_docs_sample = function(file, num){
  # draw `num` random line numbers from the file
  docs_raw = readLines(file)
  sample_num = sample(length(docs_raw), num)
  sample_num = sort(sample_num)
  # create a connection to loop through the file line by line
  connexion = file(file, "r")
  # needed variables
  docs_sample = character(0)
  i=1; j=1
  while(i <= length(docs_raw)){
    line_read = readLines(connexion, 1)
    if(is.element(i, sample_num)){
      # selected line is kept
      docs_sample[j] = line_read[1]
      j = j+1
    }
    i = i+1
  }
  close(connexion)
  return(docs_sample)
}
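Note that readLines() already loads the whole file into memory, so the line-by-line loop is not strictly necessary. A minimal alternative sketch, assuming the file fits in memory (the name create_docs_sample_fast is only illustrative):
# Illustrative variant: sample directly from the vector returned by readLines()
create_docs_sample_fast = function(file, num){
  docs_raw = readLines(file)
  docs_raw[sort(sample(length(docs_raw), num))]
}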
clean_text_data = function(data_dirty){
  txtclean = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", data_dirty) # remove retweet markers
  txtclean = gsub("(RT|via)", "", txtclean)                      # remove remaining RT/via
  txtclean = gsub("@\\w+", "", txtclean)                         # remove @mentions
  txtclean = gsub("[[:punct:]]", "", txtclean)                   # remove punctuation
  txtclean = gsub("[[:digit:]]", "", txtclean)                   # remove numbers
  txtclean = gsub("http\\w+", "", txtclean)                      # remove links
  txtclean = gsub("\\", "", txtclean, fixed=T)                   # remove backslashes
  txtclean = stri_trans_tolower(txtclean)                        # convert to lower case
  txtclean = trimws(txtclean)                                    # trim leading/trailing whitespace
  return(txtclean)
}
clean_corpus = function(corpus){
  docs_corpus = tm_map(corpus, removeWords, stopwords("english"))             # remove English stopwords
  docs_corpus <- tm_map(docs_corpus, function(x) chartr('áéíóú','aeiou', x))  # strip accents from vowels
  docs_corpus <- tm_map(docs_corpus, function(x) chartr('ñ','n', x))          # replace ñ with n
  docs_corpus <- tm_map(docs_corpus, stripWhitespace)                         # collapse extra whitespace
  return(docs_corpus)
}
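SnowballC is loaded above but stemming is not applied in this report. If it were desired, an optional extra cleaning step could be sketched as follows (an assumption, not part of the processing actually used below):
# Optional sketch (not applied here): reduce terms to their stems via tm's stemDocument,
# which relies on the SnowballC stemmer.
stem_corpus = function(corpus){
  tm_map(corpus, stemDocument, language = "english")
}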
create_terms_df = function(docs_corpus, freq = 10){
  # term-document matrix, then total frequency of each term across all documents
  terms_list <- TermDocumentMatrix(docs_corpus)
  terms_matrix <- as.matrix(terms_list)
  wf <- sort(rowSums(terms_matrix), decreasing=TRUE)
  terms_df_all <- data.frame(word = names(wf), freq = wf)
  # keep only terms occurring at least `freq` times, sorted by frequency
  terms_df <- subset(terms_df_all, terms_df_all$freq >= freq)
  terms_df <- terms_df[order(terms_df$freq, decreasing = TRUE), ]
  return(terms_df)
}
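For convenience, the three helpers could be chained into a single call; a minimal sketch (the wrapper name explore_file is hypothetical; below we instead run the steps explicitly for each dataset):
# Hypothetical wrapper chaining sampling, cleaning and term extraction
explore_file = function(file, num = 5000, freq = 10){
  docs   = clean_text_data(create_docs_sample(file, num))
  corpus = clean_corpus(Corpus(VectorSource(docs)))
  create_terms_df(corpus, freq)
}
# e.g. terms_blogs_df = explore_file("en_US.blogs.txt")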
set.seed(777)
blogs_sample = create_docs_sample("en_US.blogs.txt", 5000)
head(blogs_sample, 1)
## [1] "Earlier this week, the German small business foundation filed a criminal lawsuit against the Bundesbank, accusing the board of disguising the true scale of risk born by German citizens. If they knew all that Geli wants to sign them up for, the entire nation would have a fit of the vapours."
blogs_sample = clean_text_data(blogs_sample)
head(blogs_sample, 1)
## [1] "earlier this week the german small business foundation filed a criminal lawsuit against the bundesbank accusing the board of disguising the true scale of risk born by german citizens if they knew all that geli wants to sign them up for the entire nation would have a fit of the vapours"
blogs_corpus = Corpus(VectorSource(blogs_sample))
blogs_corpus[1]
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
blogs_corpus = clean_corpus(blogs_corpus)
blogs_corpus[1]
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
terms_blogs_list <- TermDocumentMatrix(blogs_corpus)
terms_non_sparse <- removeSparseTerms(terms_blogs_list, 0.97)
inspect(terms_non_sparse)
## <<TermDocumentMatrix (terms: 49, documents: 5000)>>
## Non-/sparse entries: 12150/232850
## Sparsity : 95%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1662 1930 2074 2272 2598 3005 3241 334 3945 610
## also 1 1 0 0 0 1 2 1 1 0
## can 3 2 0 0 2 11 4 1 1 0
## get 1 1 0 1 0 2 0 1 1 2
## just 3 0 2 0 1 1 3 2 0 2
## like 1 0 1 1 1 0 3 0 1 1
## now 1 0 0 1 1 0 0 0 2 0
## one 3 1 0 0 0 1 2 2 0 0
## people 0 1 1 0 0 2 1 0 0 0
## time 1 0 0 2 1 0 0 0 0 3
## will 0 0 0 3 0 1 0 2 2 0
terms_blogs_df = create_terms_df(blogs_corpus)
ggplot(head(terms_blogs_df, 20), aes(x = word, y = freq)) + geom_bar(stat = "identity") +
  xlab("Terms") + ylab("Frequency") + coord_flip() +
  theme(axis.text = element_text(size = 7))
wordcloud(terms_blogs_df$word, terms_blogs_df$freq, random.order=FALSE, colors=brewer.pal(12, "Paired"))
set.seed(777)
news_sample = create_docs_sample("en_US.news.txt", 5000)
## Warning in readLines(file): incomplete final line found on 'en_US.news.txt'
## Warning in readLines(connexion, 1): incomplete final line found on
## 'en_US.news.txt'
head(news_sample, 1)
## [1] "\"Itâ\200\231s just another in a long line of failed attempts to subsidize Atlantic City,\" said Americans for Prosperity New Jersey Director Steve Lonegan, a conservative who lost to Christie in the 2009 GOP primary. \"The Revel Casino hit the jackpot here at government expense.\""
news_sample = clean_text_data(news_sample)
head(news_sample, 1)
## [1] "itâ\200\231s just another in a long line of failed attempts to subsidize atlantic city said americans for prosperity new jersey director steve lonegan a conservative who lost to christie in the gop primary the revel casino hit the jackpot here at government expense"
news_corpus = Corpus(VectorSource(news_sample))
news_corpus[1]
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
news_corpus = clean_corpus(news_corpus)
news_corpus[1]
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
terms_news_list <- TermDocumentMatrix(news_corpus)
terms_non_sparse <- removeSparseTerms(terms_news_list, 0.97)
inspect(terms_non_sparse)
## <<TermDocumentMatrix (terms: 23, documents: 5000)>>
## Non-/sparse entries: 6320/108680
## Sparsity : 95%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 2471 2633 318 4353 4618 472 519 725 812 873
## also 0 0 1 0 0 0 1 0 0 0
## can 0 1 1 0 1 0 0 0 0 0
## first 1 0 0 0 1 0 1 0 0 1
## new 0 0 0 1 0 3 0 0 1 0
## one 1 0 0 0 0 0 0 1 1 0
## said 0 2 0 0 0 1 2 0 1 1
## time 2 0 1 0 1 0 0 1 2 1
## two 0 1 0 0 0 0 0 0 0 0
## will 3 1 0 3 5 1 0 4 0 0
## year 1 1 1 1 0 0 1 0 0 1
terms_news_df = create_terms_df(news_corpus)
ggplot(head(terms_news_df, 20), aes(x = word, y = freq)) + geom_bar(stat = "identity") +
  xlab("Terms") + ylab("Frequency") + coord_flip() +
  theme(axis.text = element_text(size = 7))
wordcloud(terms_news_df$word, terms_news_df$freq, random.order=FALSE, colors=brewer.pal(12, "Paired"))
set.seed(777)
tweets_sample = create_docs_sample("en_US.twitter.txt", 5000)
head(tweets_sample, 1)
## [1] "is our new Twitter pg. Follow 4 exciting deals, ATX challenges, & insights on #fashion, epicurean, entertainment & the arts"
tweets_sample = clean_text_data(tweets_sample)
head(tweets_sample, 1)
## [1] "is our new twitter pg follow exciting deals atx challenges insights on fashion epicurean entertainment the arts"
tweets_corpus = Corpus(VectorSource(tweets_sample))
tweets_corpus[1]
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
tweets_corpus = clean_corpus(tweets_corpus)
tweets_corpus[1]
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
terms_tweets_list <- TermDocumentMatrix(tweets_corpus)
terms_non_sparse <- removeSparseTerms(terms_tweets_list, 0.97)
inspect(terms_non_sparse)
## <<TermDocumentMatrix (terms: 13, documents: 5000)>>
## Non-/sparse entries: 2556/62444
## Sparsity : 96%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1124 114 1263 1304 1545 160 250 4572 489 860
## can 0 0 0 0 2 0 0 1 0 1
## day 1 0 0 0 0 0 0 0 0 0
## get 0 1 1 0 1 0 0 0 0 0
## good 1 0 0 1 0 0 0 1 0 0
## just 1 0 1 1 1 0 1 1 0 1
## like 0 0 0 2 0 1 0 1 2 1
## love 0 0 0 0 0 1 3 0 0 0
## now 0 0 0 0 0 3 0 1 0 0
## thanks 0 0 1 0 0 0 0 0 0 1
## will 0 1 0 0 0 1 0 0 0 0
terms_tweets_df = create_terms_df(tweets_corpus)
ggplot(head(terms_tweets_df, 20), aes(x = word, y = freq)) + geom_bar(stat = "identity") +
  xlab("Terms") + ylab("Frequency") + coord_flip() +
  theme(axis.text = element_text(size = 7))
wordcloud(terms_tweets_df$word, terms_tweets_df$freq, random.order=FALSE, colors=brewer.pal(12, "Paired"))
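As a step toward the prospective data product mentioned above, the same term extraction could be extended to word pairs (bigrams). A minimal sketch, assuming the NLP package (a dependency of tm) and using a VCorpus because TermDocumentMatrix() on a SimpleCorpus ignores custom tokenizers; the object names are only illustrative:
library(NLP)
# Illustrative bigram tokenizer based on NLP::ngrams()
bigram_tokenizer = function(x){
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
tweets_vcorpus = VCorpus(VectorSource(tweets_sample))
terms_tweets_bigrams = TermDocumentMatrix(tweets_vcorpus, control = list(tokenize = bigram_tokenizer))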