Load libraries

rm(list=ls())
library(httr)
library(tm)
library(SnowballC)
library(ggplot2)
library(wordcloud)
library(stringi)
library(stringr)
library(RColorBrewer)  # provides brewer.pal(), used below for the word cloud palettes

Introduction

There are 3 datasets: “en_US.twitter.txt”, “en_US.news.txt” and “en_US.blogs.txt”. First, we sample each dataset and then apply some cleaning. The basic preprocessing consists of reducing the line-level (statement) datasets to term-level datasets. A few tables and plots then give an idea of what the data look like. Based on the term matrices extracted from the raw datasets, a prospective data product for topic identification can be developed.
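
To make the idea of a term dataset concrete, here is a minimal toy illustration (the two sentences and object names are made up and not part of the corpora): a few documents are turned into a term-document matrix whose rows are terms, whose columns are documents and whose cells are term counts.

toy_docs = c("the cat sat on the mat", "the dog sat on the log")
toy_corpus = Corpus(VectorSource(toy_docs))
toy_tdm = TermDocumentMatrix(toy_corpus)
as.matrix(toy_tdm)  # rows = terms, columns = documents, cells = counts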

Dataset dimensions

lista=c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")

create_table = function(lista){
  tab = NULL  # rows are appended below, one per file
  for(i in lista){
    data = readLines(i)
    chr = format(sum(nchar(data)), digits=9, decimal.mark=",", big.mark=" ")
    rec = format(length(data), digits=9, decimal.mark=",", big.mark=" ")
    wds = format(sum(str_count(data, "\\S+")), digits=9, decimal.mark=",", big.mark=" ")

    tab = rbind(tab, cbind(i, chr, rec, wds) )  
  }
  return(tab)
}
tab=create_table(lista)
colnames(tab) = c("Dataset", "Chars", "Entries", "Words")
(tab = as.data.frame( tab))
##             Dataset       Chars   Entries      Words
## 1   en_US.blogs.txt 208 361 438   899 288 37 334 441
## 2    en_US.news.txt  15 683 765    77 259  2 643 972
## 3 en_US.twitter.txt 162 384 825 2 360 148 30 373 792

Since the number of entries is huge, we will explore each dataset using a sample of only 5,000 records.
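
For reference, 5,000 records is roughly 0.6% of the blog entries, 6.5% of the news entries and 0.2% of the tweets (entry counts taken from the table above):

# sampling fraction per dataset, using the entry counts reported above
round(5000 / c(blogs = 899288, news = 77259, twitter = 2360148), 4)
# approximately 0.0056 (blogs), 0.0647 (news) and 0.0021 (twitter)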

Basic Functions

We define functions for sampling a dataset, for cleaning it (both as raw text and as a tm corpus) and for converting a corpus into a data frame of term frequencies.

Sampling data function

create_docs_sample = function(file, num){
  docs_raw = readLines(file)
  sample_num = sample(length(docs_raw), num)
  sample_num = sort(sample_num)
  
  # create connexion to loop through
  connexion = file(file, "r")
  
  # needed variables
  docs_sample = character(0)
  i=1;   j=1
  
  while(i<=length(docs_raw)){
    line_read = readLines(connexion, 1)
    if(is.element(i, sample_num)){
      # selected line is assigned 
      docs_sample[j] = line_read[1]
      j=j+1
    }
    i=i+1
  }
  
  close(connexion)
  return(docs_sample)
}
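
Note that create_docs_sample already reads the whole file into memory with readLines() before re-reading it line by line through a connection. For files that fit in memory, a direct subset of the lines (a hypothetical shortcut, not used in the rest of the report) gives the same sample for the same seed:

create_docs_sample_simple = function(file, num){
  docs_raw = readLines(file)                       # whole file is already in memory
  docs_raw[sort(sample(length(docs_raw), num))]    # keep num random lines, in file order
}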

Cleaning data function

clean_text_data = function(data_dirty){
  txtclean = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", data_dirty)  # remove retweet headers ("RT @user")
  txtclean = gsub("\\b(RT|via)\\b", "", txtclean)  # remove remaining standalone RT / via
  txtclean = gsub("@\\w+", "", txtclean)  # remove @mentions
  txtclean = gsub("[[:punct:]]", "", txtclean)  # remove punctuation
  txtclean = gsub("[[:digit:]]", "", txtclean)  # remove numbers
  txtclean = gsub("http\\w+", "", txtclean)  # remove links (URLs collapse into one http... token once punctuation is gone)
  txtclean = gsub("\\", "", txtclean, fixed=T)  # remove stray backslashes

  txtclean = stri_trans_tolower(txtclean)  # to lower case
  txtclean = trimws(txtclean)  # trim leading/trailing whitespace
  
  return(txtclean)
}
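
As a quick sanity check, this is how clean_text_data behaves on a made-up, tweet-like string (the string is hypothetical and only for illustration):

dirty = "RT @user1: Check http://example.com/page ... 100% AWESOME!!!"
clean_text_data(dirty)
# the retweet header, mention, link, digits and punctuation are stripped and the text
# is lower-cased; leftover runs of spaces are collapsed later by stripWhitespace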

Cleaning data function (on corpus object)

clean_corpus = function(corpus){
  docs_corpus = tm_map(corpus, removeWords, c(stopwords("english")))  # remove English stopwords
  docs_corpus <- tm_map(docs_corpus, function(x) chartr('áéíóú','aeiou', x))  # replace accented vowels
  docs_corpus <- tm_map(docs_corpus, function(x) chartr('ñ','n', x))  # replace ñ with n

  docs_corpus <- tm_map(docs_corpus, stripWhitespace)  # collapse multiple spaces
  return(docs_corpus)
}
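
A small usage illustration on two made-up sentences (the sentences and object names are only for demonstration):

toy_dirty = Corpus(VectorSource(c("the cafés are very nice", "we drank piña coladas")))
toy_clean = clean_corpus(toy_dirty)
inspect(toy_clean)
# English stopwords such as "the", "are", "very" and "we" are dropped, accented
# characters become plain ASCII ("cafes", "pina") and extra spaces are collapsed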

Conversion to term dataframe function

create_terms_df = function(docs_corpus, freq = 10){
  terms_list <- TermDocumentMatrix(docs_corpus)
  terms_matrix <- as.matrix(terms_list)
  
  wf <- sort(rowSums(terms_matrix), decreasing=TRUE)  # total frequency of each term across documents
  
  terms_df_all <- data.frame(word = names(wf), freq=wf)
  terms_df <- subset(terms_df_all, terms_df_all$freq >= freq)  # keep terms appearing at least 'freq' times
  terms_df = terms_df[order(terms_df$freq, decreasing = T), ]

  return(terms_df)
}
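
Continuing the toy corpus from the previous snippet (object names are again hypothetical), a low threshold keeps every term visible:

create_terms_df(toy_clean, freq = 1)
# returns a data frame with columns 'word' and 'freq', sorted by decreasing frequency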

US Blogs dataset

set.seed(777)
blogs_sample = create_docs_sample("en_US.blogs.txt", 5000)
head(blogs_sample, 1)
## [1] "Earlier this week, the German small business foundation filed a criminal lawsuit against the Bundesbank, accusing the board of disguising the true scale of risk born by German citizens. If they knew all that Geli wants to sign them up for, the entire nation would have a fit of the vapours."

Cleaning and preprocessing

blogs_sample = clean_text_data(blogs_sample)
head(blogs_sample, 1)
## [1] "earlier this week the german small business foundation filed a criminal lawsuit against the bundesbank accusing the board of disguising the true scale of risk born by german citizens if they knew all that geli wants to sign them up for the entire nation would have a fit of the vapours"
blogs_corpus = Corpus(VectorSource(blogs_sample))
blogs_corpus[1]
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
blogs_corpus = clean_corpus(blogs_corpus)
blogs_corpus[1]
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1

Creating terms dataframe

terms_blogs_list <- TermDocumentMatrix(blogs_corpus)
terms_non_sparse <- removeSparseTerms(terms_blogs_list, 0.97)
inspect(terms_non_sparse)
## <<TermDocumentMatrix (terms: 49, documents: 5000)>>
## Non-/sparse entries: 12150/232850
## Sparsity           : 95%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##         Docs
## Terms    1662 1930 2074 2272 2598 3005 3241 334 3945 610
##   also      1    1    0    0    0    1    2   1    1   0
##   can       3    2    0    0    2   11    4   1    1   0
##   get       1    1    0    1    0    2    0   1    1   2
##   just      3    0    2    0    1    1    3   2    0   2
##   like      1    0    1    1    1    0    3   0    1   1
##   now       1    0    0    1    1    0    0   0    2   0
##   one       3    1    0    0    0    1    2   2    0   0
##   people    0    1    1    0    0    2    1   0    0   0
##   time      1    0    0    2    1    0    0   0    0   3
##   will      0    0    0    3    0    1    0   2    2   0

Most frequent terms

terms_blogs_df = create_terms_df(blogs_corpus)

ggplot(head(terms_blogs_df, 20), aes(x = reorder(word, freq), y = freq)) + geom_bar(stat="identity") +
  xlab("Terms") + ylab("Frequency") + coord_flip() +
  theme(axis.text=element_text(size=7))

wordcloud(terms_blogs_df$word, terms_blogs_df$freq, random.order=FALSE, colors=brewer.pal(12, "Paired"))

US News dataset

set.seed(777)
news_sample = create_docs_sample("en_US.news.txt", 5000)
## Warning in readLines(file): incomplete final line found on 'en_US.news.txt'
## Warning in readLines(connexion, 1): incomplete final line found on
## 'en_US.news.txt'
head(news_sample, 1)
## [1] "\"Itâ\200\231s just another in a long line of failed attempts to subsidize Atlantic City,\" said Americans for Prosperity New Jersey Director Steve Lonegan, a conservative who lost to Christie in the 2009 GOP primary. \"The Revel Casino hit the jackpot here at government expense.\""

Cleaning and preprocessing

news_sample = clean_text_data(news_sample)
head(news_sample, 1)
## [1] "itâ\200\231s just another in a long line of failed attempts to subsidize atlantic city said americans for prosperity new jersey director steve lonegan a conservative who lost to christie in the  gop primary the revel casino hit the jackpot here at government expense"
news_corpus = Corpus(VectorSource(news_sample))
news_corpus[1]
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
news_corpus = clean_corpus(news_corpus)
news_corpus[1]
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1

Creating terms dataframe

terms_news_list <- TermDocumentMatrix(news_corpus)
terms_non_sparse <- removeSparseTerms(terms_news_list, 0.97)
inspect(terms_non_sparse)
## <<TermDocumentMatrix (terms: 23, documents: 5000)>>
## Non-/sparse entries: 6320/108680
## Sparsity           : 95%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##        Docs
## Terms   2471 2633 318 4353 4618 472 519 725 812 873
##   also     0    0   1    0    0   0   1   0   0   0
##   can      0    1   1    0    1   0   0   0   0   0
##   first    1    0   0    0    1   0   1   0   0   1
##   new      0    0   0    1    0   3   0   0   1   0
##   one      1    0   0    0    0   0   0   1   1   0
##   said     0    2   0    0    0   1   2   0   1   1
##   time     2    0   1    0    1   0   0   1   2   1
##   two      0    1   0    0    0   0   0   0   0   0
##   will     3    1   0    3    5   1   0   4   0   0
##   year     1    1   1    1    0   0   1   0   0   1

Most frequent terms

terms_news_df = create_terms_df(news_corpus)

ggplot(head(terms_news_df, 20), aes(x = reorder(word, freq), y = freq)) + geom_bar(stat="identity") +
  xlab("Terms") + ylab("Frequency") + coord_flip() +
  theme(axis.text=element_text(size=7))

wordcloud(terms_news_df$word, terms_news_df$freq, random.order=FALSE, colors=brewer.pal(12, "Paired"))

US Twitter dataset

set.seed(777)
tweets_sample = create_docs_sample("en_US.twitter.txt", 5000)
head(tweets_sample, 1)
## [1] "is our new Twitter pg. Follow 4 exciting deals, ATX challenges, & insights on #fashion, epicurean, entertainment & the arts"

Cleaning and preprocessing

tweets_sample = clean_text_data(tweets_sample)
head(tweets_sample, 1)
## [1] "is our new twitter pg follow  exciting deals atx challenges  insights on fashion epicurean entertainment  the arts"
tweets_corpus = Corpus(VectorSource(tweets_sample))
tweets_corpus[1]
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
tweets_corpus = clean_corpus(tweets_corpus)
tweets_corpus[1]
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1

Creating terms dataframe

terms_tweets_list <- TermDocumentMatrix(tweets_corpus)
terms_non_sparse <- removeSparseTerms(terms_tweets_list, 0.97)
inspect(terms_non_sparse)
## <<TermDocumentMatrix (terms: 13, documents: 5000)>>
## Non-/sparse entries: 2556/62444
## Sparsity           : 96%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##         Docs
## Terms    1124 114 1263 1304 1545 160 250 4572 489 860
##   can       0   0    0    0    2   0   0    1   0   1
##   day       1   0    0    0    0   0   0    0   0   0
##   get       0   1    1    0    1   0   0    0   0   0
##   good      1   0    0    1    0   0   0    1   0   0
##   just      1   0    1    1    1   0   1    1   0   1
##   like      0   0    0    2    0   1   0    1   2   1
##   love      0   0    0    0    0   1   3    0   0   0
##   now       0   0    0    0    0   3   0    1   0   0
##   thanks    0   0    1    0    0   0   0    0   0   1
##   will      0   1    0    0    0   1   0    0   0   0

Most frequent terms

terms_tweets_df = create_terms_df(tweets_corpus)

ggplot(head(terms_tweets_df, 20), aes(x = reorder(word, freq), y = freq)) + geom_bar(stat="identity") +
  xlab("Terms") + ylab("Frequency") + coord_flip() +
  theme(axis.text=element_text(size=7))

wordcloud(terms_tweets_df$word, terms_tweets_df$freq, random.order=FALSE, colors=brewer.pal(12, "Paired"))